blob: 6017d69d6dc647823127c6a6d9843e18834fc3c0 [file] [log] [blame]
Damien George04b91472014-05-03 23:27:38 +01001/*
Alexander Steffen55f33242017-06-30 09:22:17 +02002 * This file is part of the MicroPython project, http://micropython.org/
Damien George04b91472014-05-03 23:27:38 +01003 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26
Damien429d7192013-10-04 19:53:11 +010027#include <stdio.h>
Damien Georgeae436792017-02-17 11:10:35 +110028#include <string.h>
Damien429d7192013-10-04 19:53:11 +010029#include <assert.h>
30
Damien George511c0832016-11-16 16:22:08 +110031#include "py/reader.h"
Damien George51dfcb42015-01-01 20:27:54 +000032#include "py/lexer.h"
Damien George081f9322015-09-07 17:08:49 +010033#include "py/runtime.h"
Damien429d7192013-10-04 19:53:11 +010034
Damien Georgedd5353a2015-12-18 12:35:44 +000035#if MICROPY_ENABLE_COMPILER
36
// Distance between tab stops, in columns; used when a tab advances the column.
#define TAB_SIZE (8)

// TODO: CPython appears to accept a NUL byte in the input stream. It is not
// clear whether that is intentional; this lexer does not allow it.

// The reader's end-of-input value, converted to the lexer's character type.
#define MP_LEXER_EOF ((unichar)MP_READER_EOF)
// The character the lexer is currently positioned on.
#define CUR_CHAR(lex) ((lex)->chr0)
44
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020045STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010046 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010047}
48
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020049STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien George32bade12015-01-30 00:27:46 +000050 return lex->chr0 == '\n';
Damien429d7192013-10-04 19:53:11 +010051}
52
Damien George2e2e4042015-03-19 00:21:29 +000053STATIC bool is_char(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010054 return lex->chr0 == c;
55}
56
Damien George2e2e4042015-03-19 00:21:29 +000057STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010058 return lex->chr0 == c1 || lex->chr0 == c2;
59}
60
Damien George2e2e4042015-03-19 00:21:29 +000061STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
Damien429d7192013-10-04 19:53:11 +010062 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
63}
64
Damien George2e2e4042015-03-19 00:21:29 +000065STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010066 return lex->chr1 == c;
67}
Damien429d7192013-10-04 19:53:11 +010068
Damien George2e2e4042015-03-19 00:21:29 +000069STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010070 return lex->chr1 == c1 || lex->chr1 == c2;
71}
72
Damien George2e2e4042015-03-19 00:21:29 +000073STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010074 return lex->chr2 == c1 || lex->chr2 == c2;
75}
76
Damien George2e2e4042015-03-19 00:21:29 +000077STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010078 return lex->chr0 == c1 && lex->chr1 == c2;
79}
80
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020081STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000082 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010083}
84
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020085STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000086 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010087}
88
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020089STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000090 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010091}
92
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020093STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000094 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +010095}
96
Damien George2b000472015-09-07 17:33:44 +010097STATIC bool is_following_base_char(mp_lexer_t *lex) {
98 const unichar chr1 = lex->chr1 | 0x20;
99 return chr1 == 'b' || chr1 == 'o' || chr1 == 'x';
Damien George7d414a12015-02-08 01:57:40 +0000100}
101
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200102STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200103 return lex->chr1 >= '0' && lex->chr1 <= '7';
104}
105
Damien George534b7c32017-02-17 12:12:40 +1100106STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
107 return is_char_or(lex, '\'', '\"')
108 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
109 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
110 && is_char_following_following_or(lex, '\'', '\"'));
111}
112
Damien George7ed58cb2015-06-09 10:58:07 +0000113// to easily parse utf-8 identifiers we allow any raw byte with high bit set
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200114STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien George7ed58cb2015-06-09 10:58:07 +0000115 return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
Damien429d7192013-10-04 19:53:11 +0100116}
117
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200118STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100119 return is_head_of_identifier(lex) || is_digit(lex);
120}
121
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200122STATIC void next_char(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100123 if (lex->chr0 == '\n') {
Damien George32bade12015-01-30 00:27:46 +0000124 // a new line
Damien429d7192013-10-04 19:53:11 +0100125 ++lex->line;
126 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100127 } else if (lex->chr0 == '\t') {
128 // a tab
129 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
130 } else {
131 // a character worth one column
132 ++lex->column;
133 }
134
Damien George32bade12015-01-30 00:27:46 +0000135 lex->chr0 = lex->chr1;
136 lex->chr1 = lex->chr2;
Damien George5bdf1652016-11-16 18:27:20 +1100137 lex->chr2 = lex->reader.readbyte(lex->reader.data);
Damien George32bade12015-01-30 00:27:46 +0000138
Tom Collins6f564122017-05-09 13:19:46 -0700139 if (lex->chr1 == '\r') {
Damien George32bade12015-01-30 00:27:46 +0000140 // CR is a new line, converted to LF
Tom Collins6f564122017-05-09 13:19:46 -0700141 lex->chr1 = '\n';
142 if (lex->chr2 == '\n') {
143 // CR LF is a single new line, throw out the extra LF
Damien George5bdf1652016-11-16 18:27:20 +1100144 lex->chr2 = lex->reader.readbyte(lex->reader.data);
Damien George32bade12015-01-30 00:27:46 +0000145 }
146 }
147
Tom Collins6f564122017-05-09 13:19:46 -0700148 // check if we need to insert a newline at end of file
149 if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
150 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100151 }
152}
153
Damien George5124a942017-02-17 12:44:24 +1100154STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
Damien429d7192013-10-04 19:53:11 +0100155 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien George58ebde42014-05-21 20:32:59 +0100156 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
157 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100158 }
159 lex->indent_level[lex->num_indent_level++] = indent;
160}
161
Damien George5124a942017-02-17 12:44:24 +1100162STATIC size_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100163 return lex->indent_level[lex->num_indent_level - 1];
164}
165
Damien Georgea4c52c52014-12-05 19:35:18 +0000166STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100167 lex->num_indent_level -= 1;
168}
169
170// some tricky operator encoding:
171// <op> = begin with <op>, if this opchar matches then begin here
172// e<op> = end with <op>, if this opchar matches then end
Damien429d7192013-10-04 19:53:11 +0100173// c<op> = continue with <op>, if this opchar matches then continue matching
174// this means if the start of two ops are the same then they are equal til the last char
175
Damien George3ff16ff2016-05-20 12:38:15 +0100176STATIC const char *const tok_enc =
Damien429d7192013-10-04 19:53:11 +0100177 "()[]{},:;@~" // singles
178 "<e=c<e=" // < <= << <<=
179 ">e=c>e=" // > >= >> >>=
180 "*e=c*e=" // * *= ** **=
181 "+e=" // + +=
182 "-e=e>" // - -= ->
183 "&e=" // & &=
184 "|e=" // | |=
185 "/e=c/e=" // / /= // //=
186 "%e=" // % %=
187 "^e=" // ^ ^=
188 "=e=" // = ==
Damien George5010d192017-03-29 10:55:36 +1100189 "!."; // start of special cases: != . ...
Damien429d7192013-10-04 19:53:11 +0100190
191// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200192STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000193 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
194 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
195 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
196 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100197
Damiend99b0522013-12-21 18:17:45 +0000198 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
199 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
200 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
201 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
202 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
203 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
204 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
205 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
206 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
207 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
208 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100209};
210
211// must have the same order as enum in lexer.h
Damien Georgeae436792017-02-17 11:10:35 +1100212// must be sorted according to strcmp
Damien George3ff16ff2016-05-20 12:38:15 +0100213STATIC const char *const tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100214 "False",
215 "None",
216 "True",
Damien Georgeae436792017-02-17 11:10:35 +1100217 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100218 "and",
219 "as",
220 "assert",
pohmelie81ebba72016-01-27 23:23:11 +0300221 #if MICROPY_PY_ASYNC_AWAIT
222 "async",
223 "await",
224 #endif
Damien429d7192013-10-04 19:53:11 +0100225 "break",
226 "class",
227 "continue",
228 "def",
229 "del",
230 "elif",
231 "else",
232 "except",
233 "finally",
234 "for",
235 "from",
236 "global",
237 "if",
238 "import",
239 "in",
240 "is",
241 "lambda",
242 "nonlocal",
243 "not",
244 "or",
245 "pass",
246 "raise",
247 "return",
248 "try",
249 "while",
250 "with",
251 "yield",
Damien429d7192013-10-04 19:53:11 +0100252};
253
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200254// This is called with CUR_CHAR() before first hex digit, and should return with
255// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100256// num_digits must be greater than zero
Damien George5124a942017-02-17 12:44:24 +1100257STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
Damien George54eb4e72014-07-03 13:47:47 +0100258 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200259 while (num_digits-- != 0) {
260 next_char(lex);
261 unichar c = CUR_CHAR(lex);
262 if (!unichar_isxdigit(c)) {
263 return false;
264 }
Dave Hylands3ad94d62015-05-18 14:41:25 -0700265 num = (num << 4) + unichar_xdigit_value(c);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200266 }
267 *result = num;
268 return true;
269}
270
Damien George534b7c32017-02-17 12:12:40 +1100271STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
272 // get first quoting character
273 char quote_char = '\'';
274 if (is_char(lex, '\"')) {
275 quote_char = '\"';
276 }
277 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000278
Damien George534b7c32017-02-17 12:12:40 +1100279 // work out if it's a single or triple quoted literal
280 size_t num_quotes;
281 if (is_char_and(lex, quote_char, quote_char)) {
282 // triple quotes
283 next_char(lex);
284 next_char(lex);
285 num_quotes = 3;
286 } else {
287 // single quotes
288 num_quotes = 1;
289 }
290
291 size_t n_closing = 0;
292 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
293 if (is_char(lex, quote_char)) {
294 n_closing += 1;
295 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
296 } else {
297 n_closing = 0;
298 if (is_char(lex, '\\')) {
299 next_char(lex);
300 unichar c = CUR_CHAR(lex);
301 if (is_raw) {
302 // raw strings allow escaping of quotes, but the backslash is also emitted
303 vstr_add_char(&lex->vstr, '\\');
304 } else {
305 switch (c) {
306 // note: "c" can never be MP_LEXER_EOF because next_char
307 // always inserts a newline at the end of the input stream
308 case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
309 case '\\': break;
310 case '\'': break;
311 case '"': break;
312 case 'a': c = 0x07; break;
313 case 'b': c = 0x08; break;
314 case 't': c = 0x09; break;
315 case 'n': c = 0x0a; break;
316 case 'v': c = 0x0b; break;
317 case 'f': c = 0x0c; break;
318 case 'r': c = 0x0d; break;
319 case 'u':
320 case 'U':
321 if (lex->tok_kind == MP_TOKEN_BYTES) {
322 // b'\u1234' == b'\\u1234'
323 vstr_add_char(&lex->vstr, '\\');
324 break;
325 }
326 // Otherwise fall through.
327 case 'x':
328 {
329 mp_uint_t num = 0;
330 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
331 // not enough hex chars for escape sequence
332 lex->tok_kind = MP_TOKEN_INVALID;
333 }
334 c = num;
335 break;
336 }
337 case 'N':
338 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
339 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
340 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
341 // roughly half a meg of storage. This form of Unicode escape may be added
342 // later on, but it's definitely not a priority right now. -- CJA 20140607
Javier Candeira35a1fea2017-08-09 14:40:45 +1000343 mp_raise_NotImplementedError("unicode name escapes");
Damien George534b7c32017-02-17 12:12:40 +1100344 break;
345 default:
346 if (c >= '0' && c <= '7') {
347 // Octal sequence, 1-3 chars
Damien George5124a942017-02-17 12:44:24 +1100348 size_t digits = 3;
Damien George534b7c32017-02-17 12:12:40 +1100349 mp_uint_t num = c - '0';
350 while (is_following_odigit(lex) && --digits != 0) {
351 next_char(lex);
352 num = num * 8 + (CUR_CHAR(lex) - '0');
353 }
354 c = num;
355 } else {
356 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
357 vstr_add_char(&lex->vstr, '\\');
358 }
359 break;
360 }
361 }
362 if (c != MP_LEXER_EOF) {
363 if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
364 if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
365 vstr_add_char(&lex->vstr, c);
366 } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
367 vstr_add_byte(&lex->vstr, c);
368 } else {
369 // unicode character out of range
370 // this raises a generic SyntaxError; could provide more info
371 lex->tok_kind = MP_TOKEN_INVALID;
372 }
373 } else {
374 // without unicode everything is just added as an 8-bit byte
375 if (c < 0x100) {
376 vstr_add_byte(&lex->vstr, c);
377 } else {
378 // 8-bit character out of range
379 // this raises a generic SyntaxError; could provide more info
380 lex->tok_kind = MP_TOKEN_INVALID;
381 }
382 }
383 }
384 } else {
385 // Add the "character" as a byte so that we remain 8-bit clean.
386 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
387 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
388 }
389 }
390 next_char(lex);
391 }
392
393 // check we got the required end quotes
394 if (n_closing < num_quotes) {
395 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
396 }
397
398 // cut off the end quotes from the token text
399 vstr_cut_tail_bytes(&lex->vstr, n_closing);
400}
401
402STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
Damien429d7192013-10-04 19:53:11 +0100403 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100404 while (!is_end(lex)) {
405 if (is_physical_newline(lex)) {
Damien George534b7c32017-02-17 12:12:40 +1100406 if (stop_at_newline && lex->nested_bracket_level == 0) {
407 break;
408 }
Damien429d7192013-10-04 19:53:11 +0100409 had_physical_newline = true;
410 next_char(lex);
411 } else if (is_whitespace(lex)) {
412 next_char(lex);
413 } else if (is_char(lex, '#')) {
414 next_char(lex);
415 while (!is_end(lex) && !is_physical_newline(lex)) {
416 next_char(lex);
417 }
418 // had_physical_newline will be set on next loop
Damien George773278e2017-02-17 11:30:14 +1100419 } else if (is_char_and(lex, '\\', '\n')) {
420 // line-continuation, so don't set had_physical_newline
Damien429d7192013-10-04 19:53:11 +0100421 next_char(lex);
Damien George773278e2017-02-17 11:30:14 +1100422 next_char(lex);
Damien429d7192013-10-04 19:53:11 +0100423 } else {
424 break;
425 }
426 }
Damien George534b7c32017-02-17 12:12:40 +1100427 return had_physical_newline;
428}
429
430void mp_lexer_to_next(mp_lexer_t *lex) {
431 // start new token text
432 vstr_reset(&lex->vstr);
433
434 // skip white space and comments
435 bool had_physical_newline = skip_whitespace(lex, false);
Damien429d7192013-10-04 19:53:11 +0100436
Damiena5185f42013-10-20 14:41:27 +0100437 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000438 lex->tok_line = lex->line;
439 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100440
Damien George98b30722017-02-17 10:56:06 +1100441 if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000442 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100443 lex->emit_dent += 1;
444
445 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000446 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100447 lex->emit_dent -= 1;
448
Damien91d387d2013-10-09 15:09:52 +0100449 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000450 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100451
Damien George5124a942017-02-17 12:44:24 +1100452 size_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100453 if (num_spaces == indent_top(lex)) {
454 } else if (num_spaces > indent_top(lex)) {
455 indent_push(lex, num_spaces);
456 lex->emit_dent += 1;
457 } else {
458 while (num_spaces < indent_top(lex)) {
459 indent_pop(lex);
460 lex->emit_dent -= 1;
461 }
462 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000463 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100464 }
465 }
466
467 } else if (is_end(lex)) {
Damien George31101d92016-10-12 11:00:17 +1100468 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100469
Damien George534b7c32017-02-17 12:12:40 +1100470 } else if (is_string_or_bytes(lex)) {
Damien429d7192013-10-04 19:53:11 +0100471 // a string or bytes literal
472
Damien George534b7c32017-02-17 12:12:40 +1100473 // Python requires adjacent string/bytes literals to be automatically
474 // concatenated. We do it here in the tokeniser to make efficient use of RAM,
475 // because then the lexer's vstr can be used to accumulate the string literal,
476 // in contrast to creating a parse tree of strings and then joining them later
477 // in the compiler. It's also more compact in code size to do it here.
478
479 // MP_TOKEN_END is used to indicate that this is the first string token
480 lex->tok_kind = MP_TOKEN_END;
481
482 // Loop to accumulate string/bytes literals
483 do {
484 // parse type codes
485 bool is_raw = false;
486 mp_token_kind_t kind = MP_TOKEN_STRING;
487 int n_char = 0;
488 if (is_char(lex, 'u')) {
489 n_char = 1;
490 } else if (is_char(lex, 'b')) {
491 kind = MP_TOKEN_BYTES;
492 n_char = 1;
493 if (is_char_following(lex, 'r')) {
494 is_raw = true;
495 n_char = 2;
496 }
497 } else if (is_char(lex, 'r')) {
Damien429d7192013-10-04 19:53:11 +0100498 is_raw = true;
Damien George534b7c32017-02-17 12:12:40 +1100499 n_char = 1;
500 if (is_char_following(lex, 'b')) {
501 kind = MP_TOKEN_BYTES;
502 n_char = 2;
Damien429d7192013-10-04 19:53:11 +0100503 }
504 }
Damien429d7192013-10-04 19:53:11 +0100505
Damien George534b7c32017-02-17 12:12:40 +1100506 // Set or check token kind
507 if (lex->tok_kind == MP_TOKEN_END) {
508 lex->tok_kind = kind;
509 } else if (lex->tok_kind != kind) {
510 // Can't concatenate string with bytes
511 break;
512 }
Damien429d7192013-10-04 19:53:11 +0100513
Damien George534b7c32017-02-17 12:12:40 +1100514 // Skip any type code characters
515 if (n_char != 0) {
516 next_char(lex);
517 if (n_char == 2) {
518 next_char(lex);
519 }
520 }
521
522 // Parse the literal
523 parse_string_literal(lex, is_raw);
524
525 // Skip whitespace so we can check if there's another string following
526 skip_whitespace(lex, true);
527
528 } while (is_string_or_bytes(lex));
Damien429d7192013-10-04 19:53:11 +0100529
530 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000531 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100532
Damien George7ed58cb2015-06-09 10:58:07 +0000533 // get first char (add as byte to remain 8-bit clean and support utf-8)
534 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100535 next_char(lex);
536
Damiena5185f42013-10-20 14:41:27 +0100537 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100538 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damien George7ed58cb2015-06-09 10:58:07 +0000539 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100540 next_char(lex);
541 }
542
Damien Georgea68c7542017-02-17 10:59:57 +1100543 // Check if the name is a keyword.
544 // We also check for __debug__ here and convert it to its value. This is
545 // so the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
546 // need to check for this special token in many places in the compiler.
Damien Georgeae436792017-02-17 11:10:35 +1100547 const char *s = vstr_null_terminated_str(&lex->vstr);
Damien Georgea68c7542017-02-17 10:59:57 +1100548 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgeae436792017-02-17 11:10:35 +1100549 int cmp = strcmp(s, tok_kw[i]);
550 if (cmp == 0) {
551 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
552 if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
Damien Georgea68c7542017-02-17 10:59:57 +1100553 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien Georgea68c7542017-02-17 10:59:57 +1100554 }
555 break;
Damien Georgeae436792017-02-17 11:10:35 +1100556 } else if (cmp < 0) {
557 // Table is sorted and comparison was less-than, so stop searching
558 break;
Damien Georgea68c7542017-02-17 10:59:57 +1100559 }
560 }
561
Damien429d7192013-10-04 19:53:11 +0100562 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien George7d414a12015-02-08 01:57:40 +0000563 bool forced_integer = false;
564 if (is_char(lex, '.')) {
565 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
566 } else {
567 lex->tok_kind = MP_TOKEN_INTEGER;
Damien George2b000472015-09-07 17:33:44 +0100568 if (is_char(lex, '0') && is_following_base_char(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000569 forced_integer = true;
570 }
571 }
Damien429d7192013-10-04 19:53:11 +0100572
Damiena5185f42013-10-20 14:41:27 +0100573 // get first char
574 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100575 next_char(lex);
576
Damiena5185f42013-10-20 14:41:27 +0100577 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100578 while (!is_end(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000579 if (!forced_integer && is_char_or(lex, 'e', 'E')) {
580 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
Damiena5185f42013-10-20 14:41:27 +0100581 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100582 next_char(lex);
583 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100584 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100585 next_char(lex);
586 }
Damien George7d414a12015-02-08 01:57:40 +0000587 } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
588 if (is_char_or3(lex, '.', 'j', 'J')) {
589 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
590 }
Damiena5185f42013-10-20 14:41:27 +0100591 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100592 next_char(lex);
593 } else {
594 break;
595 }
596 }
597
598 } else {
599 // search for encoded delimiter or operator
600
601 const char *t = tok_enc;
Damien George5124a942017-02-17 12:44:24 +1100602 size_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100603 for (; *t != 0 && !is_char(lex, *t); t += 1) {
604 if (*t == 'e' || *t == 'c') {
605 t += 1;
Damien429d7192013-10-04 19:53:11 +0100606 }
607 tok_enc_index += 1;
608 }
609
610 next_char(lex);
611
612 if (*t == 0) {
613 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000614 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100615
Damien George5010d192017-03-29 10:55:36 +1100616 } else if (*t == '!') {
617 // "!=" is a special case because "!" is not a valid operator
618 if (is_char(lex, '=')) {
619 next_char(lex);
620 lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL;
621 } else {
622 lex->tok_kind = MP_TOKEN_INVALID;
623 }
624
625 } else if (*t == '.') {
626 // "." and "..." are special cases because ".." is not a valid operator
627 if (is_char_and(lex, '.', '.')) {
628 next_char(lex);
629 next_char(lex);
630 lex->tok_kind = MP_TOKEN_ELLIPSIS;
631 } else {
632 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
633 }
634
Damien429d7192013-10-04 19:53:11 +0100635 } else {
636 // matched a delimiter or operator character
637
638 // get the maximum characters for a valid token
639 t += 1;
Damien George5124a942017-02-17 12:44:24 +1100640 size_t t_index = tok_enc_index;
Damien George5010d192017-03-29 10:55:36 +1100641 while (*t == 'c' || *t == 'e') {
642 t_index += 1;
643 if (is_char(lex, t[1])) {
644 next_char(lex);
645 tok_enc_index = t_index;
646 if (*t == 'e') {
Damien429d7192013-10-04 19:53:11 +0100647 break;
648 }
Damien George5010d192017-03-29 10:55:36 +1100649 } else if (*t == 'c') {
Damien429d7192013-10-04 19:53:11 +0100650 break;
651 }
Damien George5010d192017-03-29 10:55:36 +1100652 t += 2;
Damien429d7192013-10-04 19:53:11 +0100653 }
654
655 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000656 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100657
658 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000659 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100660 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000661 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100662 lex->nested_bracket_level -= 1;
663 }
664 }
665 }
Damien429d7192013-10-04 19:53:11 +0100666}
667
Damien George5bdf1652016-11-16 18:27:20 +1100668mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
Damien George18310342017-03-14 11:16:31 +1100669 mp_lexer_t *lex = m_new_obj(mp_lexer_t);
Damien429d7192013-10-04 19:53:11 +0100670
Damien Georgeb829b5c2014-01-25 13:51:19 +0000671 lex->source_name = src_name;
Damien George5bdf1652016-11-16 18:27:20 +1100672 lex->reader = reader;
Damien429d7192013-10-04 19:53:11 +0100673 lex->line = 1;
Tom Collins145796f2017-06-30 16:23:29 -0700674 lex->column = (size_t)-2; // account for 3 dummy bytes
Damien429d7192013-10-04 19:53:11 +0100675 lex->emit_dent = 0;
676 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100677 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100678 lex->num_indent_level = 1;
Damien George18310342017-03-14 11:16:31 +1100679 lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200680 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100681
Damien Georgee1199ec2014-05-10 17:48:01 +0100682 // store sentinel for first indentation level
683 lex->indent_level[0] = 0;
684
Tom Collins29986472017-05-04 16:31:08 -0700685 // load lexer with start of file, advancing lex->column to 1
686 // start with dummy bytes and use next_char() for proper EOL/EOF handling
687 lex->chr0 = lex->chr1 = lex->chr2 = 0;
688 next_char(lex);
689 next_char(lex);
690 next_char(lex);
Damien429d7192013-10-04 19:53:11 +0100691
Damiena5185f42013-10-20 14:41:27 +0100692 // preload first token
Damien George98b30722017-02-17 10:56:06 +1100693 mp_lexer_to_next(lex);
694
695 // Check that the first token is in the first column. If it's not then we
696 // convert the token kind to INDENT so that the parser gives a syntax error.
697 if (lex->tok_column != 1) {
698 lex->tok_kind = MP_TOKEN_INDENT;
699 }
Damien429d7192013-10-04 19:53:11 +0100700
701 return lex;
702}
703
Damien George5124a942017-02-17 12:44:24 +1100704mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, size_t len, size_t free_len) {
Damien George511c0832016-11-16 16:22:08 +1100705 mp_reader_t reader;
Damien George18310342017-03-14 11:16:31 +1100706 mp_reader_new_mem(&reader, (const byte*)str, len, free_len);
Damien George5bdf1652016-11-16 18:27:20 +1100707 return mp_lexer_new(src_name, reader);
Damien George511c0832016-11-16 16:22:08 +1100708}
709
Damien George8beba732017-01-29 15:16:51 +1100710#if MICROPY_READER_POSIX || MICROPY_READER_VFS
Damien Georgee5ef15a2016-11-16 16:25:06 +1100711
712mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
713 mp_reader_t reader;
Damien George18310342017-03-14 11:16:31 +1100714 mp_reader_new_file(&reader, filename);
Damien George5bdf1652016-11-16 18:27:20 +1100715 return mp_lexer_new(qstr_from_str(filename), reader);
Damien Georgee5ef15a2016-11-16 16:25:06 +1100716}
717
Damien George66d955c2016-11-16 18:12:55 +1100718#if MICROPY_HELPER_LEXER_UNIX
719
720mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd) {
721 mp_reader_t reader;
Damien George18310342017-03-14 11:16:31 +1100722 mp_reader_new_file_from_fd(&reader, fd, close_fd);
Damien George5bdf1652016-11-16 18:27:20 +1100723 return mp_lexer_new(filename, reader);
Damien George66d955c2016-11-16 18:12:55 +1100724}
725
726#endif
727
Damien Georgee5ef15a2016-11-16 16:25:06 +1100728#endif
729
Damiend99b0522013-12-21 18:17:45 +0000730void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100731 if (lex) {
Damien George5bdf1652016-11-16 18:27:20 +1100732 lex->reader.close(lex->reader.data);
Damienbb5316b2013-10-22 21:12:29 +0100733 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200734 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000735 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100736 }
Damien429d7192013-10-04 19:53:11 +0100737}
738
Damien Georgec305ae32016-12-22 10:49:54 +1100739#if 0
740// This function is used to print the current token and should only be
741// needed to debug the lexer, so it's not available via a config option.
Damien Georgea4c52c52014-12-05 19:35:18 +0000742void mp_lexer_show_token(const mp_lexer_t *lex) {
Damien George451a0872014-12-05 22:50:16 +0000743 printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
Damien Georgea4c52c52014-12-05 19:35:18 +0000744 if (lex->vstr.len > 0) {
745 const byte *i = (const byte *)lex->vstr.buf;
746 const byte *j = (const byte *)i + lex->vstr.len;
747 printf(" ");
748 while (i < j) {
749 unichar c = utf8_get_char(i);
750 i = utf8_next_char(i);
751 if (unichar_isprint(c)) {
Damien George7f19a392015-06-22 17:40:12 +0100752 printf("%c", (int)c);
Damien Georgea4c52c52014-12-05 19:35:18 +0000753 } else {
754 printf("?");
755 }
756 }
757 }
758 printf("\n");
Damien429d7192013-10-04 19:53:11 +0100759}
Damien Georgea4c52c52014-12-05 19:35:18 +0000760#endif
Damien Georgedd5353a2015-12-18 12:35:44 +0000761
762#endif // MICROPY_ENABLE_COMPILER