blob: 17d711696b34c1f318026fe82764efd3f340f7a3 [file] [log] [blame]
Damien George04b91472014-05-03 23:27:38 +01001/*
2 * This file is part of the Micro Python project, http://micropython.org/
3 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26
Damien429d7192013-10-04 19:53:11 +010027#include <stdio.h>
28#include <assert.h>
29
Damien Georgeb4b10fd2015-01-01 23:30:53 +000030#include "py/mpstate.h"
Damien George51dfcb42015-01-01 20:27:54 +000031#include "py/lexer.h"
Damien429d7192013-10-04 19:53:11 +010032
33#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010034
// TODO it seems that CPython allows a NUL byte in the input stream; it is not
// clear whether that is intentional or not, but we don't allow it
37
Damien George9528cd62014-01-15 21:23:31 +000038// TODO replace with a call to a standard function
Damien Georgea4c52c52014-12-05 19:35:18 +000039STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
Damien George54eb4e72014-07-03 13:47:47 +010040 mp_uint_t i = 0;
Damien429d7192013-10-04 19:53:11 +010041
Damiena5185f42013-10-20 14:41:27 +010042 while (i < len && *str == *strn) {
Damien429d7192013-10-04 19:53:11 +010043 ++i;
Damien429d7192013-10-04 19:53:11 +010044 ++str;
Damiena5185f42013-10-20 14:41:27 +010045 ++strn;
Damien429d7192013-10-04 19:53:11 +010046 }
47
Damiena5185f42013-10-20 14:41:27 +010048 return i == len && *str == 0;
Damien429d7192013-10-04 19:53:11 +010049}
50
Damiena5185f42013-10-20 14:41:27 +010051#define CUR_CHAR(lex) ((lex)->chr0)
52
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020053STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010054 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010055}
56
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020057STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien George32bade12015-01-30 00:27:46 +000058 return lex->chr0 == '\n';
Damien429d7192013-10-04 19:53:11 +010059}
60
Damien George2e2e4042015-03-19 00:21:29 +000061STATIC bool is_char(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010062 return lex->chr0 == c;
63}
64
Damien George2e2e4042015-03-19 00:21:29 +000065STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010066 return lex->chr0 == c1 || lex->chr0 == c2;
67}
68
Damien George2e2e4042015-03-19 00:21:29 +000069STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
Damien429d7192013-10-04 19:53:11 +010070 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
71}
72
73/*
Damien George2e2e4042015-03-19 00:21:29 +000074STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010075 return lex->chr1 == c;
76}
77*/
78
Damien George2e2e4042015-03-19 00:21:29 +000079STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010080 return lex->chr1 == c1 || lex->chr1 == c2;
81}
82
Damien George2e2e4042015-03-19 00:21:29 +000083STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010084 return lex->chr2 == c1 || lex->chr2 == c2;
85}
86
Damien George2e2e4042015-03-19 00:21:29 +000087STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010088 return lex->chr0 == c1 && lex->chr1 == c2;
89}
90
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020091STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000092 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010093}
94
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020095STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000096 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010097}
98
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020099STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000100 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100101}
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000104 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100105}
106
Damien George7d414a12015-02-08 01:57:40 +0000107STATIC bool is_following_letter(mp_lexer_t *lex) {
108 return unichar_isalpha(lex->chr1);
109}
110
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200111STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200112 return lex->chr1 >= '0' && lex->chr1 <= '7';
113}
114
Damien George7ed58cb2015-06-09 10:58:07 +0000115// to easily parse utf-8 identifiers we allow any raw byte with high bit set
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200116STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien George7ed58cb2015-06-09 10:58:07 +0000117 return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
Damien429d7192013-10-04 19:53:11 +0100118}
119
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200120STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100121 return is_head_of_identifier(lex) || is_digit(lex);
122}
123
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200124STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100125 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100126 return;
127 }
128
Damien429d7192013-10-04 19:53:11 +0100129 if (lex->chr0 == '\n') {
Damien George32bade12015-01-30 00:27:46 +0000130 // a new line
Damien429d7192013-10-04 19:53:11 +0100131 ++lex->line;
132 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100133 } else if (lex->chr0 == '\t') {
134 // a tab
135 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
136 } else {
137 // a character worth one column
138 ++lex->column;
139 }
140
Damien George32bade12015-01-30 00:27:46 +0000141 lex->chr0 = lex->chr1;
142 lex->chr1 = lex->chr2;
143 lex->chr2 = lex->stream_next_byte(lex->stream_data);
144
145 if (lex->chr0 == '\r') {
146 // CR is a new line, converted to LF
147 lex->chr0 = '\n';
148 if (lex->chr1 == '\n') {
149 // CR LF is a single new line
150 lex->chr1 = lex->chr2;
151 lex->chr2 = lex->stream_next_byte(lex->stream_data);
152 }
153 }
154
155 if (lex->chr2 == MP_LEXER_EOF) {
156 // EOF, check if we need to insert a newline at end of file
157 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
158 // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
159 // otherwise it just inserts a LF
160 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100161 }
162 }
163}
164
Damien Georgea4c52c52014-12-05 19:35:18 +0000165STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100166 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100167 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100168 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
169 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100170 }
171 lex->indent_level[lex->num_indent_level++] = indent;
172}
173
Damien Georgea4c52c52014-12-05 19:35:18 +0000174STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100175 return lex->indent_level[lex->num_indent_level - 1];
176}
177
Damien Georgea4c52c52014-12-05 19:35:18 +0000178STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100179 lex->num_indent_level -= 1;
180}
181
182// some tricky operator encoding:
183// <op> = begin with <op>, if this opchar matches then begin here
184// e<op> = end with <op>, if this opchar matches then end
185// E<op> = mandatory end with <op>, this opchar must match, then end
186// c<op> = continue with <op>, if this opchar matches then continue matching
187// this means if the start of two ops are the same then they are equal til the last char
188
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200189STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100190 "()[]{},:;@~" // singles
191 "<e=c<e=" // < <= << <<=
192 ">e=c>e=" // > >= >> >>=
193 "*e=c*e=" // * *= ** **=
194 "+e=" // + +=
195 "-e=e>" // - -= ->
196 "&e=" // & &=
197 "|e=" // | |=
198 "/e=c/e=" // / /= // //=
199 "%e=" // % %=
200 "^e=" // ^ ^=
201 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100202 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100203
204// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200205STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000206 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
207 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
208 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
209 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100210
Damiend99b0522013-12-21 18:17:45 +0000211 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
212 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
213 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
214 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
215 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
216 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
217 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
218 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
219 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
220 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
221 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
222 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100223};
224
225// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200226STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100227 "False",
228 "None",
229 "True",
230 "and",
231 "as",
232 "assert",
233 "break",
234 "class",
235 "continue",
236 "def",
237 "del",
238 "elif",
239 "else",
240 "except",
241 "finally",
242 "for",
243 "from",
244 "global",
245 "if",
246 "import",
247 "in",
248 "is",
249 "lambda",
250 "nonlocal",
251 "not",
252 "or",
253 "pass",
254 "raise",
255 "return",
256 "try",
257 "while",
258 "with",
259 "yield",
Damien George97f9a282014-05-12 23:07:34 +0100260 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100261};
262
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200263// This is called with CUR_CHAR() before first hex digit, and should return with
264// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100265// num_digits must be greater than zero
266STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
267 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200268 while (num_digits-- != 0) {
269 next_char(lex);
270 unichar c = CUR_CHAR(lex);
271 if (!unichar_isxdigit(c)) {
272 return false;
273 }
Dave Hylands3ad94d62015-05-18 14:41:25 -0700274 num = (num << 4) + unichar_xdigit_value(c);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200275 }
276 *result = num;
277 return true;
278}
279
Damien Georgea4c52c52014-12-05 19:35:18 +0000280STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
281 // start new token text
282 vstr_reset(&lex->vstr);
283
Damiena5185f42013-10-20 14:41:27 +0100284 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100285 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100286 while (!is_end(lex)) {
287 if (is_physical_newline(lex)) {
288 had_physical_newline = true;
289 next_char(lex);
290 } else if (is_whitespace(lex)) {
291 next_char(lex);
292 } else if (is_char(lex, '#')) {
293 next_char(lex);
294 while (!is_end(lex) && !is_physical_newline(lex)) {
295 next_char(lex);
296 }
297 // had_physical_newline will be set on next loop
298 } else if (is_char(lex, '\\')) {
299 // backslash (outside string literals) must appear just before a physical newline
300 next_char(lex);
301 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000302 // SyntaxError: unexpected character after line continuation character
Damien Georgea4c52c52014-12-05 19:35:18 +0000303 lex->tok_line = lex->line;
304 lex->tok_column = lex->column;
305 lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
Damien George69a818d2014-01-12 13:55:24 +0000306 return;
Damien429d7192013-10-04 19:53:11 +0100307 } else {
308 next_char(lex);
309 }
310 } else {
311 break;
312 }
313 }
314
Damiena5185f42013-10-20 14:41:27 +0100315 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000316 lex->tok_line = lex->line;
317 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100318
319 if (first_token && lex->line == 1 && lex->column != 1) {
320 // check that the first token is in the first column
321 // if first token is not on first line, we get a physical newline and
322 // this check is done as part of normal indent/dedent checking below
323 // (done to get equivalence with CPython)
Damien Georgea4c52c52014-12-05 19:35:18 +0000324 lex->tok_kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100325
326 } else if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000327 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100328 lex->emit_dent += 1;
329
330 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000331 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100332 lex->emit_dent -= 1;
333
Damien91d387d2013-10-09 15:09:52 +0100334 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000335 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100336
Damien George54eb4e72014-07-03 13:47:47 +0100337 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100338 lex->emit_dent = 0;
339 if (num_spaces == indent_top(lex)) {
340 } else if (num_spaces > indent_top(lex)) {
341 indent_push(lex, num_spaces);
342 lex->emit_dent += 1;
343 } else {
344 while (num_spaces < indent_top(lex)) {
345 indent_pop(lex);
346 lex->emit_dent -= 1;
347 }
348 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000349 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100350 }
351 }
352
353 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100354 if (indent_top(lex) > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000355 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100356 lex->emit_dent = 0;
357 while (indent_top(lex) > 0) {
358 indent_pop(lex);
359 lex->emit_dent -= 1;
360 }
361 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000362 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100363 }
364
365 } else if (is_char_or(lex, '\'', '\"')
366 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
367 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
368 // a string or bytes literal
369
370 // parse type codes
371 bool is_raw = false;
372 bool is_bytes = false;
373 if (is_char(lex, 'u')) {
374 next_char(lex);
375 } else if (is_char(lex, 'b')) {
376 is_bytes = true;
377 next_char(lex);
378 if (is_char(lex, 'r')) {
379 is_raw = true;
380 next_char(lex);
381 }
382 } else if (is_char(lex, 'r')) {
383 is_raw = true;
384 next_char(lex);
385 if (is_char(lex, 'b')) {
386 is_bytes = true;
387 next_char(lex);
388 }
389 }
390
391 // set token kind
392 if (is_bytes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000393 lex->tok_kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100394 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000395 lex->tok_kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100396 }
397
398 // get first quoting character
399 char quote_char = '\'';
400 if (is_char(lex, '\"')) {
401 quote_char = '\"';
402 }
403 next_char(lex);
404
405 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100406 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100407 if (is_char_and(lex, quote_char, quote_char)) {
408 // triple quotes
409 next_char(lex);
410 next_char(lex);
411 num_quotes = 3;
412 } else {
413 // single quotes
414 num_quotes = 1;
415 }
416
Damien429d7192013-10-04 19:53:11 +0100417 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100418 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100419 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
420 if (is_char(lex, quote_char)) {
421 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100422 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100423 } else {
424 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100425 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100426 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100427 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100428 if (is_raw) {
429 // raw strings allow escaping of quotes, but the backslash is also emitted
430 vstr_add_char(&lex->vstr, '\\');
431 } else {
432 switch (c) {
Damien George94fbe972014-07-30 11:46:05 +0100433 case MP_LEXER_EOF: break; // TODO a proper error message?
434 case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100435 case '\\': break;
436 case '\'': break;
437 case '"': break;
438 case 'a': c = 0x07; break;
439 case 'b': c = 0x08; break;
440 case 't': c = 0x09; break;
441 case 'n': c = 0x0a; break;
442 case 'v': c = 0x0b; break;
443 case 'f': c = 0x0c; break;
444 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000445 case 'u':
446 case 'U':
447 if (is_bytes) {
448 // b'\u1234' == b'\\u1234'
449 vstr_add_char(&lex->vstr, '\\');
450 break;
451 }
452 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100453 case 'x':
454 {
Damien George54eb4e72014-07-03 13:47:47 +0100455 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000456 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georged241c2a2015-07-23 23:20:37 +0100457 // not enough hex chars for escape sequence
458 lex->tok_kind = MP_TOKEN_INVALID;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200459 }
460 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100461 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200462 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000463 case 'N':
464 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
465 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
466 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
467 // roughly half a meg of storage. This form of Unicode escape may be added
468 // later on, but it's definitely not a priority right now. -- CJA 20140607
469 assert(!"Unicode name escapes not supported");
470 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100471 default:
472 if (c >= '0' && c <= '7') {
473 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100474 mp_uint_t digits = 3;
475 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100476 while (is_following_odigit(lex) && --digits != 0) {
477 next_char(lex);
478 num = num * 8 + (CUR_CHAR(lex) - '0');
479 }
480 c = num;
481 } else {
482 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
483 vstr_add_char(&lex->vstr, '\\');
484 }
485 break;
486 }
Damiena5185f42013-10-20 14:41:27 +0100487 }
Damien George94fbe972014-07-30 11:46:05 +0100488 if (c != MP_LEXER_EOF) {
Damien George16677ce2015-01-28 14:07:11 +0000489 #if MICROPY_PY_BUILTINS_STR_UNICODE
Chris Angelico2ba22992014-06-04 05:28:12 +1000490 if (c < 0x110000 && !is_bytes) {
491 vstr_add_char(&lex->vstr, c);
492 } else if (c < 0x100 && is_bytes) {
493 vstr_add_byte(&lex->vstr, c);
Damien George16677ce2015-01-28 14:07:11 +0000494 }
495 #else
496 // without unicode everything is just added as an 8-bit byte
497 if (c < 0x100) {
498 vstr_add_byte(&lex->vstr, c);
499 }
500 #endif
501 else {
Chris Angelico2ba22992014-06-04 05:28:12 +1000502 assert(!"TODO: Throw an error, invalid escape code probably");
503 }
Damiena5185f42013-10-20 14:41:27 +0100504 }
505 } else {
Damien George94fbe972014-07-30 11:46:05 +0100506 // Add the "character" as a byte so that we remain 8-bit clean.
507 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
508 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100509 }
510 }
511 next_char(lex);
512 }
513
514 // check we got the required end quotes
515 if (n_closing < num_quotes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000516 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100517 }
518
Damiena5185f42013-10-20 14:41:27 +0100519 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000520 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100521
522 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000523 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100524
Damien George7ed58cb2015-06-09 10:58:07 +0000525 // get first char (add as byte to remain 8-bit clean and support utf-8)
526 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100527 next_char(lex);
528
Damiena5185f42013-10-20 14:41:27 +0100529 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100530 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damien George7ed58cb2015-06-09 10:58:07 +0000531 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100532 next_char(lex);
533 }
534
535 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien George7d414a12015-02-08 01:57:40 +0000536 bool forced_integer = false;
537 if (is_char(lex, '.')) {
538 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
539 } else {
540 lex->tok_kind = MP_TOKEN_INTEGER;
541 if (is_char(lex, '0') && is_following_letter(lex)) {
542 forced_integer = true;
543 }
544 }
Damien429d7192013-10-04 19:53:11 +0100545
Damiena5185f42013-10-20 14:41:27 +0100546 // get first char
547 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100548 next_char(lex);
549
Damiena5185f42013-10-20 14:41:27 +0100550 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100551 while (!is_end(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000552 if (!forced_integer && is_char_or(lex, 'e', 'E')) {
553 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
Damiena5185f42013-10-20 14:41:27 +0100554 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100555 next_char(lex);
556 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100557 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100558 next_char(lex);
559 }
Damien George7d414a12015-02-08 01:57:40 +0000560 } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
561 if (is_char_or3(lex, '.', 'j', 'J')) {
562 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
563 }
Damiena5185f42013-10-20 14:41:27 +0100564 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100565 next_char(lex);
566 } else {
567 break;
568 }
569 }
570
Damien George2e9eb2d2014-04-10 12:19:33 +0100571 } else if (is_char(lex, '.')) {
572 // special handling for . and ... operators, because .. is not a valid operator
573
574 // get first char
575 vstr_add_char(&lex->vstr, '.');
576 next_char(lex);
577
578 if (is_char_and(lex, '.', '.')) {
579 vstr_add_char(&lex->vstr, '.');
580 vstr_add_char(&lex->vstr, '.');
581 next_char(lex);
582 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000583 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100584 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000585 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100586 }
587
Damien429d7192013-10-04 19:53:11 +0100588 } else {
589 // search for encoded delimiter or operator
590
591 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100592 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100593 for (; *t != 0 && !is_char(lex, *t); t += 1) {
594 if (*t == 'e' || *t == 'c') {
595 t += 1;
596 } else if (*t == 'E') {
597 tok_enc_index -= 1;
598 t += 1;
599 }
600 tok_enc_index += 1;
601 }
602
603 next_char(lex);
604
605 if (*t == 0) {
606 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000607 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100608
609 } else {
610 // matched a delimiter or operator character
611
612 // get the maximum characters for a valid token
613 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100614 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100615 for (;;) {
616 for (; *t == 'e'; t += 1) {
617 t += 1;
618 t_index += 1;
619 if (is_char(lex, *t)) {
620 next_char(lex);
621 tok_enc_index = t_index;
622 break;
623 }
624 }
625
626 if (*t == 'E') {
627 t += 1;
628 if (is_char(lex, *t)) {
629 next_char(lex);
630 tok_enc_index = t_index;
631 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000632 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100633 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100634 }
635 break;
636 }
637
638 if (*t == 'c') {
639 t += 1;
640 t_index += 1;
641 if (is_char(lex, *t)) {
642 next_char(lex);
643 tok_enc_index = t_index;
644 t += 1;
645 } else {
646 break;
647 }
648 } else {
649 break;
650 }
651 }
652
653 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000654 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100655
Damien George2e9eb2d2014-04-10 12:19:33 +0100656 tok_enc_no_match:
657
Damien429d7192013-10-04 19:53:11 +0100658 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000659 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100660 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000661 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100662 lex->nested_bracket_level -= 1;
663 }
664 }
665 }
666
Damiena5185f42013-10-20 14:41:27 +0100667 // check for keywords
Damien Georgea4c52c52014-12-05 19:35:18 +0000668 if (lex->tok_kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100669 // We check for __debug__ here and convert it to its value. This is so
670 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
671 // need to check for this special token in many places in the compiler.
672 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100673 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
Damien George963a5a32015-01-16 17:47:07 +0000674 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000675 if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200676 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
677 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Damien Georgeb4b10fd2015-01-01 23:30:53 +0000678 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100679 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000680 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
Damien George97f9a282014-05-12 23:07:34 +0100681 }
Damien429d7192013-10-04 19:53:11 +0100682 break;
683 }
684 }
685 }
686}
687
Damien George94fbe972014-07-30 11:46:05 +0100688mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
Damien George9bf5f282014-10-09 16:53:37 +0100689 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100690
691 // check for memory allocation error
692 if (lex == NULL) {
693 if (stream_close) {
694 stream_close(stream_data);
695 }
696 return NULL;
697 }
Damien429d7192013-10-04 19:53:11 +0100698
Damien Georgeb829b5c2014-01-25 13:51:19 +0000699 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100700 lex->stream_data = stream_data;
Damien George94fbe972014-07-30 11:46:05 +0100701 lex->stream_next_byte = stream_next_byte;
Damienfa2162b2013-10-20 17:42:00 +0100702 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100703 lex->line = 1;
704 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100705 lex->emit_dent = 0;
706 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100707 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100708 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100709 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200710 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100711
Damien Georgee1199ec2014-05-10 17:48:01 +0100712 // check for memory allocation error
713 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
714 mp_lexer_free(lex);
715 return NULL;
716 }
717
718 // store sentinel for first indentation level
719 lex->indent_level[0] = 0;
720
Damien429d7192013-10-04 19:53:11 +0100721 // preload characters
Damien George94fbe972014-07-30 11:46:05 +0100722 lex->chr0 = stream_next_byte(stream_data);
723 lex->chr1 = stream_next_byte(stream_data);
724 lex->chr2 = stream_next_byte(stream_data);
Damiena5185f42013-10-20 14:41:27 +0100725
726 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100727 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100728 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100729 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000730 if (lex->chr0 == '\r') {
731 lex->chr0 = '\n';
732 } else if (lex->chr0 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100733 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100734 }
Damien George94fbe972014-07-30 11:46:05 +0100735 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000736 if (lex->chr1 == '\r') {
737 lex->chr1 = '\n';
738 } else if (lex->chr1 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100739 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100740 }
Damien429d7192013-10-04 19:53:11 +0100741 }
742
Damiena5185f42013-10-20 14:41:27 +0100743 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000744 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100745
746 return lex;
747}
748
Damiend99b0522013-12-21 18:17:45 +0000749void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100750 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100751 if (lex->stream_close) {
752 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100753 }
Damienbb5316b2013-10-22 21:12:29 +0100754 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200755 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000756 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100757 }
Damien429d7192013-10-04 19:53:11 +0100758}
759
Damiend99b0522013-12-21 18:17:45 +0000760void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000761 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100762}
763
#if MICROPY_DEBUG_PRINTERS
// Debugging aid: print the position, kind, text buffer address and text
// length of the current token on one line, followed by the token text itself
// with every non-printable character shown as '?'.
void mp_lexer_show_token(const mp_lexer_t *lex) {
    printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
    if (lex->vstr.len > 0) {
        const byte *end = (const byte *)lex->vstr.buf + lex->vstr.len;
        printf(" ");
        // walk the token text one utf-8 encoded character at a time
        for (const byte *p = (const byte *)lex->vstr.buf; p < end; p = utf8_next_char(p)) {
            unichar c = utf8_get_char(p);
            if (!unichar_isprint(c)) {
                printf("?");
            } else {
                printf("%c", (int)c);
            }
        }
    }
    printf("\n");
}
#endif