blob: 9342ce8ccf3a814b2e0c0ce1db1117377914cd72 [file] [log] [blame]
/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
26
Damien429d7192013-10-04 19:53:11 +010027#include <stdio.h>
28#include <assert.h>
29
Damien Georgeb4b10fd2015-01-01 23:30:53 +000030#include "py/mpstate.h"
Damien George511c0832016-11-16 16:22:08 +110031#include "py/reader.h"
Damien George51dfcb42015-01-01 20:27:54 +000032#include "py/lexer.h"
Damien George081f9322015-09-07 17:08:49 +010033#include "py/runtime.h"
Damien429d7192013-10-04 19:53:11 +010034
Damien Georgedd5353a2015-12-18 12:35:44 +000035#if MICROPY_ENABLE_COMPILER
36
// number of columns between tab stops; next_char() uses it to advance the
// column counter when a tab character is consumed
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010038
// TODO it seems that CPython allows a NUL byte in the input stream;
// don't know if that's intentional or not, but we don't allow it
41
Damien George9528cd62014-01-15 21:23:31 +000042// TODO replace with a call to a standard function
Damien Georgea4c52c52014-12-05 19:35:18 +000043STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
Damien George54eb4e72014-07-03 13:47:47 +010044 mp_uint_t i = 0;
Damien429d7192013-10-04 19:53:11 +010045
Damiena5185f42013-10-20 14:41:27 +010046 while (i < len && *str == *strn) {
Damien429d7192013-10-04 19:53:11 +010047 ++i;
Damien429d7192013-10-04 19:53:11 +010048 ++str;
Damiena5185f42013-10-20 14:41:27 +010049 ++strn;
Damien429d7192013-10-04 19:53:11 +010050 }
51
Damiena5185f42013-10-20 14:41:27 +010052 return i == len && *str == 0;
Damien429d7192013-10-04 19:53:11 +010053}
54
// the character the lexer is currently positioned on (first of the three
// look-ahead characters chr0, chr1, chr2)
#define CUR_CHAR(lex) ((lex)->chr0)
56
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020057STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010058 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010059}
60
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020061STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien George32bade12015-01-30 00:27:46 +000062 return lex->chr0 == '\n';
Damien429d7192013-10-04 19:53:11 +010063}
64
Damien George2e2e4042015-03-19 00:21:29 +000065STATIC bool is_char(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010066 return lex->chr0 == c;
67}
68
Damien George2e2e4042015-03-19 00:21:29 +000069STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010070 return lex->chr0 == c1 || lex->chr0 == c2;
71}
72
Damien George2e2e4042015-03-19 00:21:29 +000073STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
Damien429d7192013-10-04 19:53:11 +010074 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
75}
76
/*
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
    return lex->chr1 == c;
}
*/
82
Damien George2e2e4042015-03-19 00:21:29 +000083STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010084 return lex->chr1 == c1 || lex->chr1 == c2;
85}
86
Damien George2e2e4042015-03-19 00:21:29 +000087STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010088 return lex->chr2 == c1 || lex->chr2 == c2;
89}
90
Damien George2e2e4042015-03-19 00:21:29 +000091STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010092 return lex->chr0 == c1 && lex->chr1 == c2;
93}
94
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020095STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000096 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010097}
98
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020099STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000100 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100101}
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000104 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100105}
106
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200107STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000108 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100109}
110
Damien George2b000472015-09-07 17:33:44 +0100111STATIC bool is_following_base_char(mp_lexer_t *lex) {
112 const unichar chr1 = lex->chr1 | 0x20;
113 return chr1 == 'b' || chr1 == 'o' || chr1 == 'x';
Damien George7d414a12015-02-08 01:57:40 +0000114}
115
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200116STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200117 return lex->chr1 >= '0' && lex->chr1 <= '7';
118}
119
Damien George7ed58cb2015-06-09 10:58:07 +0000120// to easily parse utf-8 identifiers we allow any raw byte with high bit set
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200121STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien George7ed58cb2015-06-09 10:58:07 +0000122 return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
Damien429d7192013-10-04 19:53:11 +0100123}
124
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200125STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100126 return is_head_of_identifier(lex) || is_digit(lex);
127}
128
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200129STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100130 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100131 return;
132 }
133
Damien429d7192013-10-04 19:53:11 +0100134 if (lex->chr0 == '\n') {
Damien George32bade12015-01-30 00:27:46 +0000135 // a new line
Damien429d7192013-10-04 19:53:11 +0100136 ++lex->line;
137 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100138 } else if (lex->chr0 == '\t') {
139 // a tab
140 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
141 } else {
142 // a character worth one column
143 ++lex->column;
144 }
145
Damien George32bade12015-01-30 00:27:46 +0000146 lex->chr0 = lex->chr1;
147 lex->chr1 = lex->chr2;
148 lex->chr2 = lex->stream_next_byte(lex->stream_data);
149
150 if (lex->chr0 == '\r') {
151 // CR is a new line, converted to LF
152 lex->chr0 = '\n';
153 if (lex->chr1 == '\n') {
154 // CR LF is a single new line
155 lex->chr1 = lex->chr2;
156 lex->chr2 = lex->stream_next_byte(lex->stream_data);
157 }
158 }
159
160 if (lex->chr2 == MP_LEXER_EOF) {
161 // EOF, check if we need to insert a newline at end of file
162 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
163 // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
164 // otherwise it just inserts a LF
165 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100166 }
167 }
168}
169
Damien Georgea4c52c52014-12-05 19:35:18 +0000170STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100171 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100172 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100173 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
174 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100175 }
176 lex->indent_level[lex->num_indent_level++] = indent;
177}
178
Damien Georgea4c52c52014-12-05 19:35:18 +0000179STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100180 return lex->indent_level[lex->num_indent_level - 1];
181}
182
Damien Georgea4c52c52014-12-05 19:35:18 +0000183STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100184 lex->num_indent_level -= 1;
185}
186
// some tricky operator encoding:
//     <op>  = begin with <op>, if this opchar matches then begin here
//     e<op> = end with <op>, if this opchar matches then end
//     E<op> = mandatory end with <op>, this opchar must match, then end
//     c<op> = continue with <op>, if this opchar matches then continue matching
// this means that if two operators start the same way then they are
// identical up until their last character
193
Damien George3ff16ff2016-05-20 12:38:15 +0100194STATIC const char *const tok_enc =
Damien429d7192013-10-04 19:53:11 +0100195 "()[]{},:;@~" // singles
196 "<e=c<e=" // < <= << <<=
197 ">e=c>e=" // > >= >> >>=
198 "*e=c*e=" // * *= ** **=
199 "+e=" // + +=
200 "-e=e>" // - -= ->
201 "&e=" // & &=
202 "|e=" // | |=
203 "/e=c/e=" // / /= // //=
204 "%e=" // % %=
205 "^e=" // ^ ^=
206 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100207 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100208
209// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200210STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000211 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
212 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
213 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
214 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100215
Damiend99b0522013-12-21 18:17:45 +0000216 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
217 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
218 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
219 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
220 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
221 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
222 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
223 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
224 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
225 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
226 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
227 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100228};
229
230// must have the same order as enum in lexer.h
Damien George3ff16ff2016-05-20 12:38:15 +0100231STATIC const char *const tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100232 "False",
233 "None",
234 "True",
235 "and",
236 "as",
237 "assert",
pohmelie81ebba72016-01-27 23:23:11 +0300238 #if MICROPY_PY_ASYNC_AWAIT
239 "async",
240 "await",
241 #endif
Damien429d7192013-10-04 19:53:11 +0100242 "break",
243 "class",
244 "continue",
245 "def",
246 "del",
247 "elif",
248 "else",
249 "except",
250 "finally",
251 "for",
252 "from",
253 "global",
254 "if",
255 "import",
256 "in",
257 "is",
258 "lambda",
259 "nonlocal",
260 "not",
261 "or",
262 "pass",
263 "raise",
264 "return",
265 "try",
266 "while",
267 "with",
268 "yield",
Damien George97f9a282014-05-12 23:07:34 +0100269 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100270};
271
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200272// This is called with CUR_CHAR() before first hex digit, and should return with
273// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100274// num_digits must be greater than zero
275STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
276 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200277 while (num_digits-- != 0) {
278 next_char(lex);
279 unichar c = CUR_CHAR(lex);
280 if (!unichar_isxdigit(c)) {
281 return false;
282 }
Dave Hylands3ad94d62015-05-18 14:41:25 -0700283 num = (num << 4) + unichar_xdigit_value(c);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200284 }
285 *result = num;
286 return true;
287}
288
Damien Georgea4c52c52014-12-05 19:35:18 +0000289STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
290 // start new token text
291 vstr_reset(&lex->vstr);
292
Damiena5185f42013-10-20 14:41:27 +0100293 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100294 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100295 while (!is_end(lex)) {
296 if (is_physical_newline(lex)) {
297 had_physical_newline = true;
298 next_char(lex);
299 } else if (is_whitespace(lex)) {
300 next_char(lex);
301 } else if (is_char(lex, '#')) {
302 next_char(lex);
303 while (!is_end(lex) && !is_physical_newline(lex)) {
304 next_char(lex);
305 }
306 // had_physical_newline will be set on next loop
307 } else if (is_char(lex, '\\')) {
308 // backslash (outside string literals) must appear just before a physical newline
309 next_char(lex);
310 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000311 // SyntaxError: unexpected character after line continuation character
Damien Georgea4c52c52014-12-05 19:35:18 +0000312 lex->tok_line = lex->line;
313 lex->tok_column = lex->column;
314 lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
Damien George69a818d2014-01-12 13:55:24 +0000315 return;
Damien429d7192013-10-04 19:53:11 +0100316 } else {
317 next_char(lex);
318 }
319 } else {
320 break;
321 }
322 }
323
Damiena5185f42013-10-20 14:41:27 +0100324 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000325 lex->tok_line = lex->line;
326 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100327
328 if (first_token && lex->line == 1 && lex->column != 1) {
329 // check that the first token is in the first column
330 // if first token is not on first line, we get a physical newline and
331 // this check is done as part of normal indent/dedent checking below
332 // (done to get equivalence with CPython)
Damien Georgea4c52c52014-12-05 19:35:18 +0000333 lex->tok_kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100334
335 } else if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000336 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100337 lex->emit_dent += 1;
338
339 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000340 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100341 lex->emit_dent -= 1;
342
Damien91d387d2013-10-09 15:09:52 +0100343 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000344 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100345
Damien George54eb4e72014-07-03 13:47:47 +0100346 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100347 if (num_spaces == indent_top(lex)) {
348 } else if (num_spaces > indent_top(lex)) {
349 indent_push(lex, num_spaces);
350 lex->emit_dent += 1;
351 } else {
352 while (num_spaces < indent_top(lex)) {
353 indent_pop(lex);
354 lex->emit_dent -= 1;
355 }
356 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000357 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100358 }
359 }
360
361 } else if (is_end(lex)) {
Damien George31101d92016-10-12 11:00:17 +1100362 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100363
364 } else if (is_char_or(lex, '\'', '\"')
365 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
366 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
367 // a string or bytes literal
368
369 // parse type codes
370 bool is_raw = false;
371 bool is_bytes = false;
372 if (is_char(lex, 'u')) {
373 next_char(lex);
374 } else if (is_char(lex, 'b')) {
375 is_bytes = true;
376 next_char(lex);
377 if (is_char(lex, 'r')) {
378 is_raw = true;
379 next_char(lex);
380 }
381 } else if (is_char(lex, 'r')) {
382 is_raw = true;
383 next_char(lex);
384 if (is_char(lex, 'b')) {
385 is_bytes = true;
386 next_char(lex);
387 }
388 }
389
390 // set token kind
391 if (is_bytes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000392 lex->tok_kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100393 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000394 lex->tok_kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100395 }
396
397 // get first quoting character
398 char quote_char = '\'';
399 if (is_char(lex, '\"')) {
400 quote_char = '\"';
401 }
402 next_char(lex);
403
404 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100405 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100406 if (is_char_and(lex, quote_char, quote_char)) {
407 // triple quotes
408 next_char(lex);
409 next_char(lex);
410 num_quotes = 3;
411 } else {
412 // single quotes
413 num_quotes = 1;
414 }
415
Damien429d7192013-10-04 19:53:11 +0100416 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100417 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100418 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
419 if (is_char(lex, quote_char)) {
420 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100421 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100422 } else {
423 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100424 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100425 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100426 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100427 if (is_raw) {
428 // raw strings allow escaping of quotes, but the backslash is also emitted
429 vstr_add_char(&lex->vstr, '\\');
430 } else {
431 switch (c) {
Damien George94fbe972014-07-30 11:46:05 +0100432 case MP_LEXER_EOF: break; // TODO a proper error message?
433 case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100434 case '\\': break;
435 case '\'': break;
436 case '"': break;
437 case 'a': c = 0x07; break;
438 case 'b': c = 0x08; break;
439 case 't': c = 0x09; break;
440 case 'n': c = 0x0a; break;
441 case 'v': c = 0x0b; break;
442 case 'f': c = 0x0c; break;
443 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000444 case 'u':
445 case 'U':
446 if (is_bytes) {
447 // b'\u1234' == b'\\u1234'
448 vstr_add_char(&lex->vstr, '\\');
449 break;
450 }
451 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100452 case 'x':
453 {
Damien George54eb4e72014-07-03 13:47:47 +0100454 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000455 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georged241c2a2015-07-23 23:20:37 +0100456 // not enough hex chars for escape sequence
457 lex->tok_kind = MP_TOKEN_INVALID;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200458 }
459 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100460 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200461 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000462 case 'N':
463 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
464 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
465 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
466 // roughly half a meg of storage. This form of Unicode escape may be added
467 // later on, but it's definitely not a priority right now. -- CJA 20140607
Damien George081f9322015-09-07 17:08:49 +0100468 mp_not_implemented("unicode name escapes");
Chris Angelico2ba22992014-06-04 05:28:12 +1000469 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100470 default:
471 if (c >= '0' && c <= '7') {
472 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100473 mp_uint_t digits = 3;
474 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100475 while (is_following_odigit(lex) && --digits != 0) {
476 next_char(lex);
477 num = num * 8 + (CUR_CHAR(lex) - '0');
478 }
479 c = num;
480 } else {
481 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
482 vstr_add_char(&lex->vstr, '\\');
483 }
484 break;
485 }
Damiena5185f42013-10-20 14:41:27 +0100486 }
Damien George94fbe972014-07-30 11:46:05 +0100487 if (c != MP_LEXER_EOF) {
Damien Georgeea235202016-02-11 22:30:53 +0000488 if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
489 if (c < 0x110000 && !is_bytes) {
490 vstr_add_char(&lex->vstr, c);
491 } else if (c < 0x100 && is_bytes) {
492 vstr_add_byte(&lex->vstr, c);
493 } else {
494 // unicode character out of range
495 // this raises a generic SyntaxError; could provide more info
496 lex->tok_kind = MP_TOKEN_INVALID;
497 }
498 } else {
499 // without unicode everything is just added as an 8-bit byte
500 if (c < 0x100) {
501 vstr_add_byte(&lex->vstr, c);
502 } else {
503 // 8-bit character out of range
504 // this raises a generic SyntaxError; could provide more info
505 lex->tok_kind = MP_TOKEN_INVALID;
506 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000507 }
Damiena5185f42013-10-20 14:41:27 +0100508 }
509 } else {
Damien George94fbe972014-07-30 11:46:05 +0100510 // Add the "character" as a byte so that we remain 8-bit clean.
511 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
512 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100513 }
514 }
515 next_char(lex);
516 }
517
518 // check we got the required end quotes
519 if (n_closing < num_quotes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000520 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100521 }
522
Damiena5185f42013-10-20 14:41:27 +0100523 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000524 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100525
526 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000527 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100528
Damien George7ed58cb2015-06-09 10:58:07 +0000529 // get first char (add as byte to remain 8-bit clean and support utf-8)
530 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100531 next_char(lex);
532
Damiena5185f42013-10-20 14:41:27 +0100533 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100534 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damien George7ed58cb2015-06-09 10:58:07 +0000535 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100536 next_char(lex);
537 }
538
539 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien George7d414a12015-02-08 01:57:40 +0000540 bool forced_integer = false;
541 if (is_char(lex, '.')) {
542 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
543 } else {
544 lex->tok_kind = MP_TOKEN_INTEGER;
Damien George2b000472015-09-07 17:33:44 +0100545 if (is_char(lex, '0') && is_following_base_char(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000546 forced_integer = true;
547 }
548 }
Damien429d7192013-10-04 19:53:11 +0100549
Damiena5185f42013-10-20 14:41:27 +0100550 // get first char
551 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100552 next_char(lex);
553
Damiena5185f42013-10-20 14:41:27 +0100554 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100555 while (!is_end(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000556 if (!forced_integer && is_char_or(lex, 'e', 'E')) {
557 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
Damiena5185f42013-10-20 14:41:27 +0100558 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100559 next_char(lex);
560 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100561 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100562 next_char(lex);
563 }
Damien George7d414a12015-02-08 01:57:40 +0000564 } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
565 if (is_char_or3(lex, '.', 'j', 'J')) {
566 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
567 }
Damiena5185f42013-10-20 14:41:27 +0100568 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100569 next_char(lex);
570 } else {
571 break;
572 }
573 }
574
Damien George2e9eb2d2014-04-10 12:19:33 +0100575 } else if (is_char(lex, '.')) {
576 // special handling for . and ... operators, because .. is not a valid operator
577
578 // get first char
579 vstr_add_char(&lex->vstr, '.');
580 next_char(lex);
581
582 if (is_char_and(lex, '.', '.')) {
583 vstr_add_char(&lex->vstr, '.');
584 vstr_add_char(&lex->vstr, '.');
585 next_char(lex);
586 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000587 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100588 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000589 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100590 }
591
Damien429d7192013-10-04 19:53:11 +0100592 } else {
593 // search for encoded delimiter or operator
594
595 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100596 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100597 for (; *t != 0 && !is_char(lex, *t); t += 1) {
598 if (*t == 'e' || *t == 'c') {
599 t += 1;
600 } else if (*t == 'E') {
601 tok_enc_index -= 1;
602 t += 1;
603 }
604 tok_enc_index += 1;
605 }
606
607 next_char(lex);
608
609 if (*t == 0) {
610 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000611 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100612
613 } else {
614 // matched a delimiter or operator character
615
616 // get the maximum characters for a valid token
617 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100618 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100619 for (;;) {
620 for (; *t == 'e'; t += 1) {
621 t += 1;
622 t_index += 1;
623 if (is_char(lex, *t)) {
624 next_char(lex);
625 tok_enc_index = t_index;
626 break;
627 }
628 }
629
630 if (*t == 'E') {
631 t += 1;
632 if (is_char(lex, *t)) {
633 next_char(lex);
634 tok_enc_index = t_index;
635 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000636 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100637 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100638 }
639 break;
640 }
641
642 if (*t == 'c') {
643 t += 1;
644 t_index += 1;
645 if (is_char(lex, *t)) {
646 next_char(lex);
647 tok_enc_index = t_index;
648 t += 1;
649 } else {
650 break;
651 }
652 } else {
653 break;
654 }
655 }
656
657 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000658 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100659
Damien George2e9eb2d2014-04-10 12:19:33 +0100660 tok_enc_no_match:
661
Damien429d7192013-10-04 19:53:11 +0100662 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000663 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100664 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000665 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100666 lex->nested_bracket_level -= 1;
667 }
668 }
669 }
670
Damiena5185f42013-10-20 14:41:27 +0100671 // check for keywords
Damien Georgea4c52c52014-12-05 19:35:18 +0000672 if (lex->tok_kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100673 // We check for __debug__ here and convert it to its value. This is so
674 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
675 // need to check for this special token in many places in the compiler.
676 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100677 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
Damien George963a5a32015-01-16 17:47:07 +0000678 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000679 if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200680 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
681 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Damien Georgeb4b10fd2015-01-01 23:30:53 +0000682 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100683 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000684 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
Damien George97f9a282014-05-12 23:07:34 +0100685 }
Damien429d7192013-10-04 19:53:11 +0100686 break;
687 }
688 }
689 }
690}
691
Damien George94fbe972014-07-30 11:46:05 +0100692mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
Damien George9bf5f282014-10-09 16:53:37 +0100693 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100694
695 // check for memory allocation error
696 if (lex == NULL) {
697 if (stream_close) {
698 stream_close(stream_data);
699 }
700 return NULL;
701 }
Damien429d7192013-10-04 19:53:11 +0100702
Damien Georgeb829b5c2014-01-25 13:51:19 +0000703 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100704 lex->stream_data = stream_data;
Damien George94fbe972014-07-30 11:46:05 +0100705 lex->stream_next_byte = stream_next_byte;
Damienfa2162b2013-10-20 17:42:00 +0100706 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100707 lex->line = 1;
708 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100709 lex->emit_dent = 0;
710 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100711 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100712 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100713 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200714 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100715
Damien Georgee1199ec2014-05-10 17:48:01 +0100716 // check for memory allocation error
Damien George5da0d292016-09-19 11:17:02 +1000717 // note: vstr_init above may fail on malloc, but so may mp_lexer_next_token_into below
718 if (lex->indent_level == NULL) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100719 mp_lexer_free(lex);
720 return NULL;
721 }
722
723 // store sentinel for first indentation level
724 lex->indent_level[0] = 0;
725
Damien429d7192013-10-04 19:53:11 +0100726 // preload characters
Damien George94fbe972014-07-30 11:46:05 +0100727 lex->chr0 = stream_next_byte(stream_data);
728 lex->chr1 = stream_next_byte(stream_data);
729 lex->chr2 = stream_next_byte(stream_data);
Damiena5185f42013-10-20 14:41:27 +0100730
731 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100732 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100733 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100734 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000735 if (lex->chr0 == '\r') {
736 lex->chr0 = '\n';
737 } else if (lex->chr0 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100738 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100739 }
Damien George94fbe972014-07-30 11:46:05 +0100740 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000741 if (lex->chr1 == '\r') {
742 lex->chr1 = '\n';
743 } else if (lex->chr1 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100744 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100745 }
Damien429d7192013-10-04 19:53:11 +0100746 }
747
Damiena5185f42013-10-20 14:41:27 +0100748 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000749 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100750
751 return lex;
752}
753
Damien George511c0832016-11-16 16:22:08 +1100754mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t len, mp_uint_t free_len) {
755 mp_reader_t reader;
756 if (!mp_reader_new_mem(&reader, (const byte*)str, len, free_len)) {
757 return NULL;
758 }
759 return mp_lexer_new(src_name, reader.data, (mp_lexer_stream_next_byte_t)reader.readbyte, (mp_lexer_stream_close_t)reader.close);
760}
761
Damien Georgee5ef15a2016-11-16 16:25:06 +1100762#if MICROPY_READER_POSIX || MICROPY_READER_FATFS
763
764mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
765 mp_reader_t reader;
766 int ret = mp_reader_new_file(&reader, filename);
767 if (ret != 0) {
768 return NULL;
769 }
770 return mp_lexer_new(qstr_from_str(filename), reader.data, (mp_lexer_stream_next_byte_t)reader.readbyte, (mp_lexer_stream_close_t)reader.close);
771}
772
Damien George66d955c2016-11-16 18:12:55 +1100773#if MICROPY_HELPER_LEXER_UNIX
774
775mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd) {
776 mp_reader_t reader;
777 int ret = mp_reader_new_file_from_fd(&reader, fd, close_fd);
778 if (ret != 0) {
779 return NULL;
780 }
781 return mp_lexer_new(filename, reader.data, (mp_lexer_stream_next_byte_t)reader.readbyte, (mp_lexer_stream_close_t)reader.close);
782}
783
784#endif
785
Damien Georgee5ef15a2016-11-16 16:25:06 +1100786#endif
787
Damiend99b0522013-12-21 18:17:45 +0000788void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100789 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100790 if (lex->stream_close) {
791 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100792 }
Damienbb5316b2013-10-22 21:12:29 +0100793 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200794 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000795 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100796 }
Damien429d7192013-10-04 19:53:11 +0100797}
798
Damiend99b0522013-12-21 18:17:45 +0000799void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000800 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100801}
802
Damien Georgea4c52c52014-12-05 19:35:18 +0000803#if MICROPY_DEBUG_PRINTERS
804void mp_lexer_show_token(const mp_lexer_t *lex) {
Damien George451a0872014-12-05 22:50:16 +0000805 printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
Damien Georgea4c52c52014-12-05 19:35:18 +0000806 if (lex->vstr.len > 0) {
807 const byte *i = (const byte *)lex->vstr.buf;
808 const byte *j = (const byte *)i + lex->vstr.len;
809 printf(" ");
810 while (i < j) {
811 unichar c = utf8_get_char(i);
812 i = utf8_next_char(i);
813 if (unichar_isprint(c)) {
Damien George7f19a392015-06-22 17:40:12 +0100814 printf("%c", (int)c);
Damien Georgea4c52c52014-12-05 19:35:18 +0000815 } else {
816 printf("?");
817 }
818 }
819 }
820 printf("\n");
Damien429d7192013-10-04 19:53:11 +0100821}
Damien Georgea4c52c52014-12-05 19:35:18 +0000822#endif
Damien Georgedd5353a2015-12-18 12:35:44 +0000823
824#endif // MICROPY_ENABLE_COMPILER