blob: 9dcdd19eb51eeeab80c03bac41f0ab3632c3d720 [file] [log] [blame]
Damien George04b91472014-05-03 23:27:38 +01001/*
2 * This file is part of the Micro Python project, http://micropython.org/
3 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26
Damien429d7192013-10-04 19:53:11 +010027#include <stdio.h>
Damien Georgeae436792017-02-17 11:10:35 +110028#include <string.h>
Damien429d7192013-10-04 19:53:11 +010029#include <assert.h>
30
Damien Georgeb4b10fd2015-01-01 23:30:53 +000031#include "py/mpstate.h"
Damien George511c0832016-11-16 16:22:08 +110032#include "py/reader.h"
Damien George51dfcb42015-01-01 20:27:54 +000033#include "py/lexer.h"
Damien George081f9322015-09-07 17:08:49 +010034#include "py/runtime.h"
Damien429d7192013-10-04 19:53:11 +010035
Damien Georgedd5353a2015-12-18 12:35:44 +000036#if MICROPY_ENABLE_COMPILER
37
// Width of a tab stop, used when advancing the column counter over a tab.
#define TAB_SIZE (8)

// TODO: CPython appears to allow a NUL byte in the input stream. It is not
// clear whether that is intentional, but this lexer does not allow it.

// The reader's end-of-input marker, converted to the lexer's character type.
#define MP_LEXER_EOF ((unichar)MP_READER_EOF)
// The character currently under the lexer's cursor.
#define CUR_CHAR(lex) ((lex)->chr0)
45
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020046STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010047 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010048}
49
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020050STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien George32bade12015-01-30 00:27:46 +000051 return lex->chr0 == '\n';
Damien429d7192013-10-04 19:53:11 +010052}
53
Damien George2e2e4042015-03-19 00:21:29 +000054STATIC bool is_char(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010055 return lex->chr0 == c;
56}
57
Damien George2e2e4042015-03-19 00:21:29 +000058STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010059 return lex->chr0 == c1 || lex->chr0 == c2;
60}
61
Damien George2e2e4042015-03-19 00:21:29 +000062STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
Damien429d7192013-10-04 19:53:11 +010063 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
64}
65
Damien George2e2e4042015-03-19 00:21:29 +000066STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010067 return lex->chr1 == c;
68}
Damien429d7192013-10-04 19:53:11 +010069
Damien George2e2e4042015-03-19 00:21:29 +000070STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010071 return lex->chr1 == c1 || lex->chr1 == c2;
72}
73
Damien George2e2e4042015-03-19 00:21:29 +000074STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010075 return lex->chr2 == c1 || lex->chr2 == c2;
76}
77
Damien George2e2e4042015-03-19 00:21:29 +000078STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010079 return lex->chr0 == c1 && lex->chr1 == c2;
80}
81
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020082STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000083 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010084}
85
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020086STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000087 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010088}
89
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020090STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000091 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010092}
93
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020094STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000095 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +010096}
97
Damien George2b000472015-09-07 17:33:44 +010098STATIC bool is_following_base_char(mp_lexer_t *lex) {
99 const unichar chr1 = lex->chr1 | 0x20;
100 return chr1 == 'b' || chr1 == 'o' || chr1 == 'x';
Damien George7d414a12015-02-08 01:57:40 +0000101}
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200104 return lex->chr1 >= '0' && lex->chr1 <= '7';
105}
106
Damien George534b7c32017-02-17 12:12:40 +1100107STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
108 return is_char_or(lex, '\'', '\"')
109 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
110 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
111 && is_char_following_following_or(lex, '\'', '\"'));
112}
113
Damien George7ed58cb2015-06-09 10:58:07 +0000114// to easily parse utf-8 identifiers we allow any raw byte with high bit set
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200115STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien George7ed58cb2015-06-09 10:58:07 +0000116 return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
Damien429d7192013-10-04 19:53:11 +0100117}
118
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200119STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100120 return is_head_of_identifier(lex) || is_digit(lex);
121}
122
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200123STATIC void next_char(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100124 if (lex->chr0 == '\n') {
Damien George32bade12015-01-30 00:27:46 +0000125 // a new line
Damien429d7192013-10-04 19:53:11 +0100126 ++lex->line;
127 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100128 } else if (lex->chr0 == '\t') {
129 // a tab
130 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
131 } else {
132 // a character worth one column
133 ++lex->column;
134 }
135
Damien George32bade12015-01-30 00:27:46 +0000136 lex->chr0 = lex->chr1;
137 lex->chr1 = lex->chr2;
Damien George5bdf1652016-11-16 18:27:20 +1100138 lex->chr2 = lex->reader.readbyte(lex->reader.data);
Damien George32bade12015-01-30 00:27:46 +0000139
140 if (lex->chr0 == '\r') {
141 // CR is a new line, converted to LF
142 lex->chr0 = '\n';
143 if (lex->chr1 == '\n') {
144 // CR LF is a single new line
145 lex->chr1 = lex->chr2;
Damien George5bdf1652016-11-16 18:27:20 +1100146 lex->chr2 = lex->reader.readbyte(lex->reader.data);
Damien George32bade12015-01-30 00:27:46 +0000147 }
148 }
149
150 if (lex->chr2 == MP_LEXER_EOF) {
151 // EOF, check if we need to insert a newline at end of file
152 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
153 // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
154 // otherwise it just inserts a LF
155 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100156 }
157 }
158}
159
Damien George5124a942017-02-17 12:44:24 +1100160STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
Damien429d7192013-10-04 19:53:11 +0100161 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100162 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100163 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
164 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100165 }
166 lex->indent_level[lex->num_indent_level++] = indent;
167}
168
Damien George5124a942017-02-17 12:44:24 +1100169STATIC size_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100170 return lex->indent_level[lex->num_indent_level - 1];
171}
172
Damien Georgea4c52c52014-12-05 19:35:18 +0000173STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100174 lex->num_indent_level -= 1;
175}
176
177// some tricky operator encoding:
178// <op> = begin with <op>, if this opchar matches then begin here
179// e<op> = end with <op>, if this opchar matches then end
180// E<op> = mandatory end with <op>, this opchar must match, then end
181// c<op> = continue with <op>, if this opchar matches then continue matching
182// this means if the start of two ops are the same then they are equal til the last char
183
Damien George3ff16ff2016-05-20 12:38:15 +0100184STATIC const char *const tok_enc =
Damien429d7192013-10-04 19:53:11 +0100185 "()[]{},:;@~" // singles
186 "<e=c<e=" // < <= << <<=
187 ">e=c>e=" // > >= >> >>=
188 "*e=c*e=" // * *= ** **=
189 "+e=" // + +=
190 "-e=e>" // - -= ->
191 "&e=" // & &=
192 "|e=" // | |=
193 "/e=c/e=" // / /= // //=
194 "%e=" // % %=
195 "^e=" // ^ ^=
196 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100197 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100198
199// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200200STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000201 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
202 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
203 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
204 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100205
Damiend99b0522013-12-21 18:17:45 +0000206 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
207 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
208 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
209 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
210 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
211 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
212 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
213 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
214 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
215 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
216 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
217 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100218};
219
220// must have the same order as enum in lexer.h
Damien Georgeae436792017-02-17 11:10:35 +1100221// must be sorted according to strcmp
Damien George3ff16ff2016-05-20 12:38:15 +0100222STATIC const char *const tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100223 "False",
224 "None",
225 "True",
Damien Georgeae436792017-02-17 11:10:35 +1100226 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100227 "and",
228 "as",
229 "assert",
pohmelie81ebba72016-01-27 23:23:11 +0300230 #if MICROPY_PY_ASYNC_AWAIT
231 "async",
232 "await",
233 #endif
Damien429d7192013-10-04 19:53:11 +0100234 "break",
235 "class",
236 "continue",
237 "def",
238 "del",
239 "elif",
240 "else",
241 "except",
242 "finally",
243 "for",
244 "from",
245 "global",
246 "if",
247 "import",
248 "in",
249 "is",
250 "lambda",
251 "nonlocal",
252 "not",
253 "or",
254 "pass",
255 "raise",
256 "return",
257 "try",
258 "while",
259 "with",
260 "yield",
Damien429d7192013-10-04 19:53:11 +0100261};
262
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200263// This is called with CUR_CHAR() before first hex digit, and should return with
264// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100265// num_digits must be greater than zero
Damien George5124a942017-02-17 12:44:24 +1100266STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
Damien George54eb4e72014-07-03 13:47:47 +0100267 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200268 while (num_digits-- != 0) {
269 next_char(lex);
270 unichar c = CUR_CHAR(lex);
271 if (!unichar_isxdigit(c)) {
272 return false;
273 }
Dave Hylands3ad94d62015-05-18 14:41:25 -0700274 num = (num << 4) + unichar_xdigit_value(c);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200275 }
276 *result = num;
277 return true;
278}
279
Damien George534b7c32017-02-17 12:12:40 +1100280STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
281 // get first quoting character
282 char quote_char = '\'';
283 if (is_char(lex, '\"')) {
284 quote_char = '\"';
285 }
286 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000287
Damien George534b7c32017-02-17 12:12:40 +1100288 // work out if it's a single or triple quoted literal
289 size_t num_quotes;
290 if (is_char_and(lex, quote_char, quote_char)) {
291 // triple quotes
292 next_char(lex);
293 next_char(lex);
294 num_quotes = 3;
295 } else {
296 // single quotes
297 num_quotes = 1;
298 }
299
300 size_t n_closing = 0;
301 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
302 if (is_char(lex, quote_char)) {
303 n_closing += 1;
304 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
305 } else {
306 n_closing = 0;
307 if (is_char(lex, '\\')) {
308 next_char(lex);
309 unichar c = CUR_CHAR(lex);
310 if (is_raw) {
311 // raw strings allow escaping of quotes, but the backslash is also emitted
312 vstr_add_char(&lex->vstr, '\\');
313 } else {
314 switch (c) {
315 // note: "c" can never be MP_LEXER_EOF because next_char
316 // always inserts a newline at the end of the input stream
317 case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
318 case '\\': break;
319 case '\'': break;
320 case '"': break;
321 case 'a': c = 0x07; break;
322 case 'b': c = 0x08; break;
323 case 't': c = 0x09; break;
324 case 'n': c = 0x0a; break;
325 case 'v': c = 0x0b; break;
326 case 'f': c = 0x0c; break;
327 case 'r': c = 0x0d; break;
328 case 'u':
329 case 'U':
330 if (lex->tok_kind == MP_TOKEN_BYTES) {
331 // b'\u1234' == b'\\u1234'
332 vstr_add_char(&lex->vstr, '\\');
333 break;
334 }
335 // Otherwise fall through.
336 case 'x':
337 {
338 mp_uint_t num = 0;
339 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
340 // not enough hex chars for escape sequence
341 lex->tok_kind = MP_TOKEN_INVALID;
342 }
343 c = num;
344 break;
345 }
346 case 'N':
347 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
348 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
349 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
350 // roughly half a meg of storage. This form of Unicode escape may be added
351 // later on, but it's definitely not a priority right now. -- CJA 20140607
352 mp_not_implemented("unicode name escapes");
353 break;
354 default:
355 if (c >= '0' && c <= '7') {
356 // Octal sequence, 1-3 chars
Damien George5124a942017-02-17 12:44:24 +1100357 size_t digits = 3;
Damien George534b7c32017-02-17 12:12:40 +1100358 mp_uint_t num = c - '0';
359 while (is_following_odigit(lex) && --digits != 0) {
360 next_char(lex);
361 num = num * 8 + (CUR_CHAR(lex) - '0');
362 }
363 c = num;
364 } else {
365 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
366 vstr_add_char(&lex->vstr, '\\');
367 }
368 break;
369 }
370 }
371 if (c != MP_LEXER_EOF) {
372 if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
373 if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
374 vstr_add_char(&lex->vstr, c);
375 } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
376 vstr_add_byte(&lex->vstr, c);
377 } else {
378 // unicode character out of range
379 // this raises a generic SyntaxError; could provide more info
380 lex->tok_kind = MP_TOKEN_INVALID;
381 }
382 } else {
383 // without unicode everything is just added as an 8-bit byte
384 if (c < 0x100) {
385 vstr_add_byte(&lex->vstr, c);
386 } else {
387 // 8-bit character out of range
388 // this raises a generic SyntaxError; could provide more info
389 lex->tok_kind = MP_TOKEN_INVALID;
390 }
391 }
392 }
393 } else {
394 // Add the "character" as a byte so that we remain 8-bit clean.
395 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
396 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
397 }
398 }
399 next_char(lex);
400 }
401
402 // check we got the required end quotes
403 if (n_closing < num_quotes) {
404 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
405 }
406
407 // cut off the end quotes from the token text
408 vstr_cut_tail_bytes(&lex->vstr, n_closing);
409}
410
411STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
Damien429d7192013-10-04 19:53:11 +0100412 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100413 while (!is_end(lex)) {
414 if (is_physical_newline(lex)) {
Damien George534b7c32017-02-17 12:12:40 +1100415 if (stop_at_newline && lex->nested_bracket_level == 0) {
416 break;
417 }
Damien429d7192013-10-04 19:53:11 +0100418 had_physical_newline = true;
419 next_char(lex);
420 } else if (is_whitespace(lex)) {
421 next_char(lex);
422 } else if (is_char(lex, '#')) {
423 next_char(lex);
424 while (!is_end(lex) && !is_physical_newline(lex)) {
425 next_char(lex);
426 }
427 // had_physical_newline will be set on next loop
Damien George773278e2017-02-17 11:30:14 +1100428 } else if (is_char_and(lex, '\\', '\n')) {
429 // line-continuation, so don't set had_physical_newline
Damien429d7192013-10-04 19:53:11 +0100430 next_char(lex);
Damien George773278e2017-02-17 11:30:14 +1100431 next_char(lex);
Damien429d7192013-10-04 19:53:11 +0100432 } else {
433 break;
434 }
435 }
Damien George534b7c32017-02-17 12:12:40 +1100436 return had_physical_newline;
437}
438
439void mp_lexer_to_next(mp_lexer_t *lex) {
440 // start new token text
441 vstr_reset(&lex->vstr);
442
443 // skip white space and comments
444 bool had_physical_newline = skip_whitespace(lex, false);
Damien429d7192013-10-04 19:53:11 +0100445
Damiena5185f42013-10-20 14:41:27 +0100446 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000447 lex->tok_line = lex->line;
448 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100449
Damien George98b30722017-02-17 10:56:06 +1100450 if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000451 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100452 lex->emit_dent += 1;
453
454 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000455 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100456 lex->emit_dent -= 1;
457
Damien91d387d2013-10-09 15:09:52 +0100458 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000459 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100460
Damien George5124a942017-02-17 12:44:24 +1100461 size_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100462 if (num_spaces == indent_top(lex)) {
463 } else if (num_spaces > indent_top(lex)) {
464 indent_push(lex, num_spaces);
465 lex->emit_dent += 1;
466 } else {
467 while (num_spaces < indent_top(lex)) {
468 indent_pop(lex);
469 lex->emit_dent -= 1;
470 }
471 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000472 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100473 }
474 }
475
476 } else if (is_end(lex)) {
Damien George31101d92016-10-12 11:00:17 +1100477 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100478
Damien George534b7c32017-02-17 12:12:40 +1100479 } else if (is_string_or_bytes(lex)) {
Damien429d7192013-10-04 19:53:11 +0100480 // a string or bytes literal
481
Damien George534b7c32017-02-17 12:12:40 +1100482 // Python requires adjacent string/bytes literals to be automatically
483 // concatenated. We do it here in the tokeniser to make efficient use of RAM,
484 // because then the lexer's vstr can be used to accumulate the string literal,
485 // in contrast to creating a parse tree of strings and then joining them later
486 // in the compiler. It's also more compact in code size to do it here.
487
488 // MP_TOKEN_END is used to indicate that this is the first string token
489 lex->tok_kind = MP_TOKEN_END;
490
491 // Loop to accumulate string/bytes literals
492 do {
493 // parse type codes
494 bool is_raw = false;
495 mp_token_kind_t kind = MP_TOKEN_STRING;
496 int n_char = 0;
497 if (is_char(lex, 'u')) {
498 n_char = 1;
499 } else if (is_char(lex, 'b')) {
500 kind = MP_TOKEN_BYTES;
501 n_char = 1;
502 if (is_char_following(lex, 'r')) {
503 is_raw = true;
504 n_char = 2;
505 }
506 } else if (is_char(lex, 'r')) {
Damien429d7192013-10-04 19:53:11 +0100507 is_raw = true;
Damien George534b7c32017-02-17 12:12:40 +1100508 n_char = 1;
509 if (is_char_following(lex, 'b')) {
510 kind = MP_TOKEN_BYTES;
511 n_char = 2;
Damien429d7192013-10-04 19:53:11 +0100512 }
513 }
Damien429d7192013-10-04 19:53:11 +0100514
Damien George534b7c32017-02-17 12:12:40 +1100515 // Set or check token kind
516 if (lex->tok_kind == MP_TOKEN_END) {
517 lex->tok_kind = kind;
518 } else if (lex->tok_kind != kind) {
519 // Can't concatenate string with bytes
520 break;
521 }
Damien429d7192013-10-04 19:53:11 +0100522
Damien George534b7c32017-02-17 12:12:40 +1100523 // Skip any type code characters
524 if (n_char != 0) {
525 next_char(lex);
526 if (n_char == 2) {
527 next_char(lex);
528 }
529 }
530
531 // Parse the literal
532 parse_string_literal(lex, is_raw);
533
534 // Skip whitespace so we can check if there's another string following
535 skip_whitespace(lex, true);
536
537 } while (is_string_or_bytes(lex));
Damien429d7192013-10-04 19:53:11 +0100538
539 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000540 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100541
Damien George7ed58cb2015-06-09 10:58:07 +0000542 // get first char (add as byte to remain 8-bit clean and support utf-8)
543 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100544 next_char(lex);
545
Damiena5185f42013-10-20 14:41:27 +0100546 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100547 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damien George7ed58cb2015-06-09 10:58:07 +0000548 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100549 next_char(lex);
550 }
551
Damien Georgea68c7542017-02-17 10:59:57 +1100552 // Check if the name is a keyword.
553 // We also check for __debug__ here and convert it to its value. This is
554 // so the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
555 // need to check for this special token in many places in the compiler.
Damien Georgeae436792017-02-17 11:10:35 +1100556 const char *s = vstr_null_terminated_str(&lex->vstr);
Damien Georgea68c7542017-02-17 10:59:57 +1100557 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgeae436792017-02-17 11:10:35 +1100558 int cmp = strcmp(s, tok_kw[i]);
559 if (cmp == 0) {
560 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
561 if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
Damien Georgea68c7542017-02-17 10:59:57 +1100562 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien Georgea68c7542017-02-17 10:59:57 +1100563 }
564 break;
Damien Georgeae436792017-02-17 11:10:35 +1100565 } else if (cmp < 0) {
566 // Table is sorted and comparison was less-than, so stop searching
567 break;
Damien Georgea68c7542017-02-17 10:59:57 +1100568 }
569 }
570
Damien429d7192013-10-04 19:53:11 +0100571 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien George7d414a12015-02-08 01:57:40 +0000572 bool forced_integer = false;
573 if (is_char(lex, '.')) {
574 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
575 } else {
576 lex->tok_kind = MP_TOKEN_INTEGER;
Damien George2b000472015-09-07 17:33:44 +0100577 if (is_char(lex, '0') && is_following_base_char(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000578 forced_integer = true;
579 }
580 }
Damien429d7192013-10-04 19:53:11 +0100581
Damiena5185f42013-10-20 14:41:27 +0100582 // get first char
583 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100584 next_char(lex);
585
Damiena5185f42013-10-20 14:41:27 +0100586 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100587 while (!is_end(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000588 if (!forced_integer && is_char_or(lex, 'e', 'E')) {
589 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
Damiena5185f42013-10-20 14:41:27 +0100590 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100591 next_char(lex);
592 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100593 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100594 next_char(lex);
595 }
Damien George7d414a12015-02-08 01:57:40 +0000596 } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
597 if (is_char_or3(lex, '.', 'j', 'J')) {
598 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
599 }
Damiena5185f42013-10-20 14:41:27 +0100600 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100601 next_char(lex);
602 } else {
603 break;
604 }
605 }
606
Damien George2e9eb2d2014-04-10 12:19:33 +0100607 } else if (is_char(lex, '.')) {
608 // special handling for . and ... operators, because .. is not a valid operator
609
610 // get first char
Damien George2e9eb2d2014-04-10 12:19:33 +0100611 next_char(lex);
612
613 if (is_char_and(lex, '.', '.')) {
Damien George2e9eb2d2014-04-10 12:19:33 +0100614 next_char(lex);
615 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000616 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100617 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000618 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100619 }
620
Damien429d7192013-10-04 19:53:11 +0100621 } else {
622 // search for encoded delimiter or operator
623
624 const char *t = tok_enc;
Damien George5124a942017-02-17 12:44:24 +1100625 size_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100626 for (; *t != 0 && !is_char(lex, *t); t += 1) {
627 if (*t == 'e' || *t == 'c') {
628 t += 1;
629 } else if (*t == 'E') {
630 tok_enc_index -= 1;
631 t += 1;
632 }
633 tok_enc_index += 1;
634 }
635
636 next_char(lex);
637
638 if (*t == 0) {
639 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000640 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100641
642 } else {
643 // matched a delimiter or operator character
644
645 // get the maximum characters for a valid token
646 t += 1;
Damien George5124a942017-02-17 12:44:24 +1100647 size_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100648 for (;;) {
649 for (; *t == 'e'; t += 1) {
650 t += 1;
651 t_index += 1;
652 if (is_char(lex, *t)) {
653 next_char(lex);
654 tok_enc_index = t_index;
655 break;
656 }
657 }
658
659 if (*t == 'E') {
660 t += 1;
661 if (is_char(lex, *t)) {
662 next_char(lex);
663 tok_enc_index = t_index;
664 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000665 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100666 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100667 }
668 break;
669 }
670
671 if (*t == 'c') {
672 t += 1;
673 t_index += 1;
674 if (is_char(lex, *t)) {
675 next_char(lex);
676 tok_enc_index = t_index;
677 t += 1;
678 } else {
679 break;
680 }
681 } else {
682 break;
683 }
684 }
685
686 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000687 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100688
Damien George2e9eb2d2014-04-10 12:19:33 +0100689 tok_enc_no_match:
690
Damien429d7192013-10-04 19:53:11 +0100691 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000692 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100693 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000694 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100695 lex->nested_bracket_level -= 1;
696 }
697 }
698 }
Damien429d7192013-10-04 19:53:11 +0100699}
700
Damien George5bdf1652016-11-16 18:27:20 +1100701mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
Damien George9bf5f282014-10-09 16:53:37 +0100702 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100703
704 // check for memory allocation error
705 if (lex == NULL) {
Damien George5bdf1652016-11-16 18:27:20 +1100706 reader.close(reader.data);
Damien Georgee1199ec2014-05-10 17:48:01 +0100707 return NULL;
708 }
Damien429d7192013-10-04 19:53:11 +0100709
Damien Georgeb829b5c2014-01-25 13:51:19 +0000710 lex->source_name = src_name;
Damien George5bdf1652016-11-16 18:27:20 +1100711 lex->reader = reader;
Damien429d7192013-10-04 19:53:11 +0100712 lex->line = 1;
713 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100714 lex->emit_dent = 0;
715 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100716 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100717 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100718 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200719 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100720
Damien Georgee1199ec2014-05-10 17:48:01 +0100721 // check for memory allocation error
Damien George98b30722017-02-17 10:56:06 +1100722 // note: vstr_init above may fail on malloc, but so may mp_lexer_to_next below
Damien George5da0d292016-09-19 11:17:02 +1000723 if (lex->indent_level == NULL) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100724 mp_lexer_free(lex);
725 return NULL;
726 }
727
728 // store sentinel for first indentation level
729 lex->indent_level[0] = 0;
730
Damien429d7192013-10-04 19:53:11 +0100731 // preload characters
Damien George5bdf1652016-11-16 18:27:20 +1100732 lex->chr0 = reader.readbyte(reader.data);
733 lex->chr1 = reader.readbyte(reader.data);
734 lex->chr2 = reader.readbyte(reader.data);
Damiena5185f42013-10-20 14:41:27 +0100735
736 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100737 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100738 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100739 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000740 if (lex->chr0 == '\r') {
741 lex->chr0 = '\n';
742 } else if (lex->chr0 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100743 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100744 }
Damien George94fbe972014-07-30 11:46:05 +0100745 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000746 if (lex->chr1 == '\r') {
747 lex->chr1 = '\n';
748 } else if (lex->chr1 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100749 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100750 }
Damien429d7192013-10-04 19:53:11 +0100751 }
752
Damiena5185f42013-10-20 14:41:27 +0100753 // preload first token
Damien George98b30722017-02-17 10:56:06 +1100754 mp_lexer_to_next(lex);
755
756 // Check that the first token is in the first column. If it's not then we
757 // convert the token kind to INDENT so that the parser gives a syntax error.
758 if (lex->tok_column != 1) {
759 lex->tok_kind = MP_TOKEN_INDENT;
760 }
Damien429d7192013-10-04 19:53:11 +0100761
762 return lex;
763}
764
Damien George5124a942017-02-17 12:44:24 +1100765mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, size_t len, size_t free_len) {
Damien George511c0832016-11-16 16:22:08 +1100766 mp_reader_t reader;
767 if (!mp_reader_new_mem(&reader, (const byte*)str, len, free_len)) {
768 return NULL;
769 }
Damien George5bdf1652016-11-16 18:27:20 +1100770 return mp_lexer_new(src_name, reader);
Damien George511c0832016-11-16 16:22:08 +1100771}
772
Damien George8beba732017-01-29 15:16:51 +1100773#if MICROPY_READER_POSIX || MICROPY_READER_VFS
Damien Georgee5ef15a2016-11-16 16:25:06 +1100774
775mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
776 mp_reader_t reader;
777 int ret = mp_reader_new_file(&reader, filename);
778 if (ret != 0) {
779 return NULL;
780 }
Damien George5bdf1652016-11-16 18:27:20 +1100781 return mp_lexer_new(qstr_from_str(filename), reader);
Damien Georgee5ef15a2016-11-16 16:25:06 +1100782}
783
Damien George66d955c2016-11-16 18:12:55 +1100784#if MICROPY_HELPER_LEXER_UNIX
785
786mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd) {
787 mp_reader_t reader;
788 int ret = mp_reader_new_file_from_fd(&reader, fd, close_fd);
789 if (ret != 0) {
790 return NULL;
791 }
Damien George5bdf1652016-11-16 18:27:20 +1100792 return mp_lexer_new(filename, reader);
Damien George66d955c2016-11-16 18:12:55 +1100793}
794
795#endif
796
Damien Georgee5ef15a2016-11-16 16:25:06 +1100797#endif
798
Damiend99b0522013-12-21 18:17:45 +0000799void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100800 if (lex) {
Damien George5bdf1652016-11-16 18:27:20 +1100801 lex->reader.close(lex->reader.data);
Damienbb5316b2013-10-22 21:12:29 +0100802 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200803 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000804 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100805 }
Damien429d7192013-10-04 19:53:11 +0100806}
807
Damien Georgec305ae32016-12-22 10:49:54 +1100808#if 0
809// This function is used to print the current token and should only be
810// needed to debug the lexer, so it's not available via a config option.
Damien Georgea4c52c52014-12-05 19:35:18 +0000811void mp_lexer_show_token(const mp_lexer_t *lex) {
Damien George451a0872014-12-05 22:50:16 +0000812 printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
Damien Georgea4c52c52014-12-05 19:35:18 +0000813 if (lex->vstr.len > 0) {
814 const byte *i = (const byte *)lex->vstr.buf;
815 const byte *j = (const byte *)i + lex->vstr.len;
816 printf(" ");
817 while (i < j) {
818 unichar c = utf8_get_char(i);
819 i = utf8_next_char(i);
820 if (unichar_isprint(c)) {
Damien George7f19a392015-06-22 17:40:12 +0100821 printf("%c", (int)c);
Damien Georgea4c52c52014-12-05 19:35:18 +0000822 } else {
823 printf("?");
824 }
825 }
826 }
827 printf("\n");
Damien429d7192013-10-04 19:53:11 +0100828}
Damien Georgea4c52c52014-12-05 19:35:18 +0000829#endif
Damien Georgedd5353a2015-12-18 12:35:44 +0000830
831#endif // MICROPY_ENABLE_COMPILER