/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
26
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "py/mpstate.h"
#include "py/reader.h"
#include "py/lexer.h"
#include "py/runtime.h"
Damien429d7192013-10-04 19:53:11 +010034
Damien Georgedd5353a2015-12-18 12:35:44 +000035#if MICROPY_ENABLE_COMPILER
36
// Distance in columns between tab stops; next_char() uses it to advance
// lex->column when it moves past a '\t'.
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010038
// TODO: CPython appears to allow a NUL byte in the input stream. It is unclear
// whether that is intentional, but this lexer does not allow it.
41
Damien George9528cd62014-01-15 21:23:31 +000042// TODO replace with a call to a standard function
Damien Georgea4c52c52014-12-05 19:35:18 +000043STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
Damien George54eb4e72014-07-03 13:47:47 +010044 mp_uint_t i = 0;
Damien429d7192013-10-04 19:53:11 +010045
Damiena5185f42013-10-20 14:41:27 +010046 while (i < len && *str == *strn) {
Damien429d7192013-10-04 19:53:11 +010047 ++i;
Damien429d7192013-10-04 19:53:11 +010048 ++str;
Damiena5185f42013-10-20 14:41:27 +010049 ++strn;
Damien429d7192013-10-04 19:53:11 +010050 }
51
Damiena5185f42013-10-20 14:41:27 +010052 return i == len && *str == 0;
Damien429d7192013-10-04 19:53:11 +010053}
54
// End-of-input marker: the reader's EOF value converted to the lexer's character type.
#define MP_LEXER_EOF ((unichar)MP_READER_EOF)
// The character currently under the lexer's cursor.
#define CUR_CHAR(lex) ((lex)->chr0)
57
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020058STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010059 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010060}
61
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020062STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien George32bade12015-01-30 00:27:46 +000063 return lex->chr0 == '\n';
Damien429d7192013-10-04 19:53:11 +010064}
65
Damien George2e2e4042015-03-19 00:21:29 +000066STATIC bool is_char(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010067 return lex->chr0 == c;
68}
69
Damien George2e2e4042015-03-19 00:21:29 +000070STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010071 return lex->chr0 == c1 || lex->chr0 == c2;
72}
73
Damien George2e2e4042015-03-19 00:21:29 +000074STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
Damien429d7192013-10-04 19:53:11 +010075 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
76}
77
78/*
Damien George2e2e4042015-03-19 00:21:29 +000079STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010080 return lex->chr1 == c;
81}
82*/
83
Damien George2e2e4042015-03-19 00:21:29 +000084STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010085 return lex->chr1 == c1 || lex->chr1 == c2;
86}
87
Damien George2e2e4042015-03-19 00:21:29 +000088STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010089 return lex->chr2 == c1 || lex->chr2 == c2;
90}
91
Damien George2e2e4042015-03-19 00:21:29 +000092STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010093 return lex->chr0 == c1 && lex->chr1 == c2;
94}
95
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020096STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000097 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010098}
99
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200100STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000101 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100102}
103
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200104STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000105 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100106}
107
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200108STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000109 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100110}
111
Damien George2b000472015-09-07 17:33:44 +0100112STATIC bool is_following_base_char(mp_lexer_t *lex) {
113 const unichar chr1 = lex->chr1 | 0x20;
114 return chr1 == 'b' || chr1 == 'o' || chr1 == 'x';
Damien George7d414a12015-02-08 01:57:40 +0000115}
116
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200117STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200118 return lex->chr1 >= '0' && lex->chr1 <= '7';
119}
120
Damien George7ed58cb2015-06-09 10:58:07 +0000121// to easily parse utf-8 identifiers we allow any raw byte with high bit set
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200122STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien George7ed58cb2015-06-09 10:58:07 +0000123 return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
Damien429d7192013-10-04 19:53:11 +0100124}
125
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200126STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100127 return is_head_of_identifier(lex) || is_digit(lex);
128}
129
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200130STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100131 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100132 return;
133 }
134
Damien429d7192013-10-04 19:53:11 +0100135 if (lex->chr0 == '\n') {
Damien George32bade12015-01-30 00:27:46 +0000136 // a new line
Damien429d7192013-10-04 19:53:11 +0100137 ++lex->line;
138 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100139 } else if (lex->chr0 == '\t') {
140 // a tab
141 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
142 } else {
143 // a character worth one column
144 ++lex->column;
145 }
146
Damien George32bade12015-01-30 00:27:46 +0000147 lex->chr0 = lex->chr1;
148 lex->chr1 = lex->chr2;
Damien George5bdf1652016-11-16 18:27:20 +1100149 lex->chr2 = lex->reader.readbyte(lex->reader.data);
Damien George32bade12015-01-30 00:27:46 +0000150
151 if (lex->chr0 == '\r') {
152 // CR is a new line, converted to LF
153 lex->chr0 = '\n';
154 if (lex->chr1 == '\n') {
155 // CR LF is a single new line
156 lex->chr1 = lex->chr2;
Damien George5bdf1652016-11-16 18:27:20 +1100157 lex->chr2 = lex->reader.readbyte(lex->reader.data);
Damien George32bade12015-01-30 00:27:46 +0000158 }
159 }
160
161 if (lex->chr2 == MP_LEXER_EOF) {
162 // EOF, check if we need to insert a newline at end of file
163 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
164 // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
165 // otherwise it just inserts a LF
166 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100167 }
168 }
169}
170
Damien Georgea4c52c52014-12-05 19:35:18 +0000171STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100172 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100173 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100174 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
175 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100176 }
177 lex->indent_level[lex->num_indent_level++] = indent;
178}
179
Damien Georgea4c52c52014-12-05 19:35:18 +0000180STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100181 return lex->indent_level[lex->num_indent_level - 1];
182}
183
Damien Georgea4c52c52014-12-05 19:35:18 +0000184STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100185 lex->num_indent_level -= 1;
186}
187
// Compact encoding of all delimiters and operators, decoded by the operator
// branch of mp_lexer_next_token_into():
//     <op> = begin with <op>: a bare character starts a new operator
//     e<op> = end with <op>: if the next input character is <op> the token ends here
//     E<op> = mandatory end with <op>: the next input character must be <op>,
//             otherwise the token is invalid
//     c<op> = continue with <op>: if the next input character is <op>, keep
//             matching against the entries that follow
// This relies on operators that share a prefix being identical up to their
// last character.
// Every bare character and every e/c pair corresponds, in order, to one entry
// of tok_enc_kind; an E pair shares the entry of the character that precedes it.

STATIC const char *const tok_enc =
    "()[]{},:;@~" // singles
    "<e=c<e=" // < <= << <<=
    ">e=c>e=" // > >= >> >>=
    "*e=c*e=" // * *= ** **=
    "+e=" // + +=
    "-e=e>" // - -= ->
    "&e=" // & &=
    "|e=" // | |=
    "/e=c/e=" // / /= // //=
    "%e=" // % %=
    "^e=" // ^ ^=
    "=e=" // = ==
    "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100209
// Token kind for each operator encoded in tok_enc, in the same order; indexed
// by the position computed while decoding tok_enc.
// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
STATIC const uint8_t tok_enc_kind[] = {
    MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
    MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
    MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
    MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,

    MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
    MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
    MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
    MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
    MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
    MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
    MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
    MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
    MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
    MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
    MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
    MP_TOKEN_OP_NOT_EQUAL,
};
230
// Keyword spellings.  Must have the same order as the keyword enum in lexer.h:
// mp_lexer_next_token_into() turns a match at index i into the token kind
// MP_TOKEN_KW_FALSE + i.  The final entry, "__debug__", is the exception: it
// is converted to MP_TOKEN_KW_TRUE or MP_TOKEN_KW_FALSE instead, so it must
// stay last.
STATIC const char *const tok_kw[] = {
    "False",
    "None",
    "True",
    "and",
    "as",
    "assert",
    #if MICROPY_PY_ASYNC_AWAIT
    "async",
    "await",
    #endif
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
    "__debug__",
};
272
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200273// This is called with CUR_CHAR() before first hex digit, and should return with
274// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100275// num_digits must be greater than zero
276STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
277 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200278 while (num_digits-- != 0) {
279 next_char(lex);
280 unichar c = CUR_CHAR(lex);
281 if (!unichar_isxdigit(c)) {
282 return false;
283 }
Dave Hylands3ad94d62015-05-18 14:41:25 -0700284 num = (num << 4) + unichar_xdigit_value(c);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200285 }
286 *result = num;
287 return true;
288}
289
Damien Georgea4c52c52014-12-05 19:35:18 +0000290STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
291 // start new token text
292 vstr_reset(&lex->vstr);
293
Damiena5185f42013-10-20 14:41:27 +0100294 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100295 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100296 while (!is_end(lex)) {
297 if (is_physical_newline(lex)) {
298 had_physical_newline = true;
299 next_char(lex);
300 } else if (is_whitespace(lex)) {
301 next_char(lex);
302 } else if (is_char(lex, '#')) {
303 next_char(lex);
304 while (!is_end(lex) && !is_physical_newline(lex)) {
305 next_char(lex);
306 }
307 // had_physical_newline will be set on next loop
308 } else if (is_char(lex, '\\')) {
309 // backslash (outside string literals) must appear just before a physical newline
310 next_char(lex);
311 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000312 // SyntaxError: unexpected character after line continuation character
Damien Georgea4c52c52014-12-05 19:35:18 +0000313 lex->tok_line = lex->line;
314 lex->tok_column = lex->column;
315 lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
Damien George69a818d2014-01-12 13:55:24 +0000316 return;
Damien429d7192013-10-04 19:53:11 +0100317 } else {
318 next_char(lex);
319 }
320 } else {
321 break;
322 }
323 }
324
Damiena5185f42013-10-20 14:41:27 +0100325 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000326 lex->tok_line = lex->line;
327 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100328
329 if (first_token && lex->line == 1 && lex->column != 1) {
330 // check that the first token is in the first column
331 // if first token is not on first line, we get a physical newline and
332 // this check is done as part of normal indent/dedent checking below
333 // (done to get equivalence with CPython)
Damien Georgea4c52c52014-12-05 19:35:18 +0000334 lex->tok_kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100335
336 } else if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000337 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100338 lex->emit_dent += 1;
339
340 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000341 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100342 lex->emit_dent -= 1;
343
Damien91d387d2013-10-09 15:09:52 +0100344 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000345 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100346
Damien George54eb4e72014-07-03 13:47:47 +0100347 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100348 if (num_spaces == indent_top(lex)) {
349 } else if (num_spaces > indent_top(lex)) {
350 indent_push(lex, num_spaces);
351 lex->emit_dent += 1;
352 } else {
353 while (num_spaces < indent_top(lex)) {
354 indent_pop(lex);
355 lex->emit_dent -= 1;
356 }
357 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000358 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100359 }
360 }
361
362 } else if (is_end(lex)) {
Damien George31101d92016-10-12 11:00:17 +1100363 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100364
365 } else if (is_char_or(lex, '\'', '\"')
366 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
367 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
368 // a string or bytes literal
369
370 // parse type codes
371 bool is_raw = false;
372 bool is_bytes = false;
373 if (is_char(lex, 'u')) {
374 next_char(lex);
375 } else if (is_char(lex, 'b')) {
376 is_bytes = true;
377 next_char(lex);
378 if (is_char(lex, 'r')) {
379 is_raw = true;
380 next_char(lex);
381 }
382 } else if (is_char(lex, 'r')) {
383 is_raw = true;
384 next_char(lex);
385 if (is_char(lex, 'b')) {
386 is_bytes = true;
387 next_char(lex);
388 }
389 }
390
391 // set token kind
392 if (is_bytes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000393 lex->tok_kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100394 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000395 lex->tok_kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100396 }
397
398 // get first quoting character
399 char quote_char = '\'';
400 if (is_char(lex, '\"')) {
401 quote_char = '\"';
402 }
403 next_char(lex);
404
405 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100406 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100407 if (is_char_and(lex, quote_char, quote_char)) {
408 // triple quotes
409 next_char(lex);
410 next_char(lex);
411 num_quotes = 3;
412 } else {
413 // single quotes
414 num_quotes = 1;
415 }
416
Damien429d7192013-10-04 19:53:11 +0100417 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100418 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100419 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
420 if (is_char(lex, quote_char)) {
421 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100422 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100423 } else {
424 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100425 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100426 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100427 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100428 if (is_raw) {
429 // raw strings allow escaping of quotes, but the backslash is also emitted
430 vstr_add_char(&lex->vstr, '\\');
431 } else {
432 switch (c) {
Damien Georgeb9c47832016-12-22 10:37:13 +1100433 // note: "c" can never be MP_LEXER_EOF because next_char
434 // always inserts a newline at the end of the input stream
Damien Georgeadccafb2016-12-22 10:32:06 +1100435 case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100436 case '\\': break;
437 case '\'': break;
438 case '"': break;
439 case 'a': c = 0x07; break;
440 case 'b': c = 0x08; break;
441 case 't': c = 0x09; break;
442 case 'n': c = 0x0a; break;
443 case 'v': c = 0x0b; break;
444 case 'f': c = 0x0c; break;
445 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000446 case 'u':
447 case 'U':
448 if (is_bytes) {
449 // b'\u1234' == b'\\u1234'
450 vstr_add_char(&lex->vstr, '\\');
451 break;
452 }
453 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100454 case 'x':
455 {
Damien George54eb4e72014-07-03 13:47:47 +0100456 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000457 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georged241c2a2015-07-23 23:20:37 +0100458 // not enough hex chars for escape sequence
459 lex->tok_kind = MP_TOKEN_INVALID;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200460 }
461 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100462 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200463 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000464 case 'N':
465 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
466 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
467 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
468 // roughly half a meg of storage. This form of Unicode escape may be added
469 // later on, but it's definitely not a priority right now. -- CJA 20140607
Damien George081f9322015-09-07 17:08:49 +0100470 mp_not_implemented("unicode name escapes");
Chris Angelico2ba22992014-06-04 05:28:12 +1000471 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100472 default:
473 if (c >= '0' && c <= '7') {
474 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100475 mp_uint_t digits = 3;
476 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100477 while (is_following_odigit(lex) && --digits != 0) {
478 next_char(lex);
479 num = num * 8 + (CUR_CHAR(lex) - '0');
480 }
481 c = num;
482 } else {
483 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
484 vstr_add_char(&lex->vstr, '\\');
485 }
486 break;
487 }
Damiena5185f42013-10-20 14:41:27 +0100488 }
Damien George94fbe972014-07-30 11:46:05 +0100489 if (c != MP_LEXER_EOF) {
Damien Georgeea235202016-02-11 22:30:53 +0000490 if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
491 if (c < 0x110000 && !is_bytes) {
492 vstr_add_char(&lex->vstr, c);
493 } else if (c < 0x100 && is_bytes) {
494 vstr_add_byte(&lex->vstr, c);
495 } else {
496 // unicode character out of range
497 // this raises a generic SyntaxError; could provide more info
498 lex->tok_kind = MP_TOKEN_INVALID;
499 }
500 } else {
501 // without unicode everything is just added as an 8-bit byte
502 if (c < 0x100) {
503 vstr_add_byte(&lex->vstr, c);
504 } else {
505 // 8-bit character out of range
506 // this raises a generic SyntaxError; could provide more info
507 lex->tok_kind = MP_TOKEN_INVALID;
508 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000509 }
Damiena5185f42013-10-20 14:41:27 +0100510 }
511 } else {
Damien George94fbe972014-07-30 11:46:05 +0100512 // Add the "character" as a byte so that we remain 8-bit clean.
513 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
514 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100515 }
516 }
517 next_char(lex);
518 }
519
520 // check we got the required end quotes
521 if (n_closing < num_quotes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000522 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100523 }
524
Damiena5185f42013-10-20 14:41:27 +0100525 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000526 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100527
528 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000529 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100530
Damien George7ed58cb2015-06-09 10:58:07 +0000531 // get first char (add as byte to remain 8-bit clean and support utf-8)
532 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100533 next_char(lex);
534
Damiena5185f42013-10-20 14:41:27 +0100535 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100536 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damien George7ed58cb2015-06-09 10:58:07 +0000537 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100538 next_char(lex);
539 }
540
541 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien George7d414a12015-02-08 01:57:40 +0000542 bool forced_integer = false;
543 if (is_char(lex, '.')) {
544 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
545 } else {
546 lex->tok_kind = MP_TOKEN_INTEGER;
Damien George2b000472015-09-07 17:33:44 +0100547 if (is_char(lex, '0') && is_following_base_char(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000548 forced_integer = true;
549 }
550 }
Damien429d7192013-10-04 19:53:11 +0100551
Damiena5185f42013-10-20 14:41:27 +0100552 // get first char
553 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100554 next_char(lex);
555
Damiena5185f42013-10-20 14:41:27 +0100556 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100557 while (!is_end(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000558 if (!forced_integer && is_char_or(lex, 'e', 'E')) {
559 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
Damiena5185f42013-10-20 14:41:27 +0100560 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100561 next_char(lex);
562 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100563 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100564 next_char(lex);
565 }
Damien George7d414a12015-02-08 01:57:40 +0000566 } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
567 if (is_char_or3(lex, '.', 'j', 'J')) {
568 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
569 }
Damiena5185f42013-10-20 14:41:27 +0100570 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100571 next_char(lex);
572 } else {
573 break;
574 }
575 }
576
Damien George2e9eb2d2014-04-10 12:19:33 +0100577 } else if (is_char(lex, '.')) {
578 // special handling for . and ... operators, because .. is not a valid operator
579
580 // get first char
581 vstr_add_char(&lex->vstr, '.');
582 next_char(lex);
583
584 if (is_char_and(lex, '.', '.')) {
585 vstr_add_char(&lex->vstr, '.');
586 vstr_add_char(&lex->vstr, '.');
587 next_char(lex);
588 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000589 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100590 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000591 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100592 }
593
Damien429d7192013-10-04 19:53:11 +0100594 } else {
595 // search for encoded delimiter or operator
596
597 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100598 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100599 for (; *t != 0 && !is_char(lex, *t); t += 1) {
600 if (*t == 'e' || *t == 'c') {
601 t += 1;
602 } else if (*t == 'E') {
603 tok_enc_index -= 1;
604 t += 1;
605 }
606 tok_enc_index += 1;
607 }
608
609 next_char(lex);
610
611 if (*t == 0) {
612 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000613 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100614
615 } else {
616 // matched a delimiter or operator character
617
618 // get the maximum characters for a valid token
619 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100620 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100621 for (;;) {
622 for (; *t == 'e'; t += 1) {
623 t += 1;
624 t_index += 1;
625 if (is_char(lex, *t)) {
626 next_char(lex);
627 tok_enc_index = t_index;
628 break;
629 }
630 }
631
632 if (*t == 'E') {
633 t += 1;
634 if (is_char(lex, *t)) {
635 next_char(lex);
636 tok_enc_index = t_index;
637 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000638 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100639 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100640 }
641 break;
642 }
643
644 if (*t == 'c') {
645 t += 1;
646 t_index += 1;
647 if (is_char(lex, *t)) {
648 next_char(lex);
649 tok_enc_index = t_index;
650 t += 1;
651 } else {
652 break;
653 }
654 } else {
655 break;
656 }
657 }
658
659 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000660 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100661
Damien George2e9eb2d2014-04-10 12:19:33 +0100662 tok_enc_no_match:
663
Damien429d7192013-10-04 19:53:11 +0100664 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000665 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100666 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000667 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100668 lex->nested_bracket_level -= 1;
669 }
670 }
671 }
672
Damiena5185f42013-10-20 14:41:27 +0100673 // check for keywords
Damien Georgea4c52c52014-12-05 19:35:18 +0000674 if (lex->tok_kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100675 // We check for __debug__ here and convert it to its value. This is so
676 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
677 // need to check for this special token in many places in the compiler.
678 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100679 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
Damien George963a5a32015-01-16 17:47:07 +0000680 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000681 if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200682 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
683 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Damien Georgeb4b10fd2015-01-01 23:30:53 +0000684 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100685 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000686 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
Damien George97f9a282014-05-12 23:07:34 +0100687 }
Damien429d7192013-10-04 19:53:11 +0100688 break;
689 }
690 }
691 }
692}
693
Damien George5bdf1652016-11-16 18:27:20 +1100694mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
Damien George9bf5f282014-10-09 16:53:37 +0100695 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100696
697 // check for memory allocation error
698 if (lex == NULL) {
Damien George5bdf1652016-11-16 18:27:20 +1100699 reader.close(reader.data);
Damien Georgee1199ec2014-05-10 17:48:01 +0100700 return NULL;
701 }
Damien429d7192013-10-04 19:53:11 +0100702
Damien Georgeb829b5c2014-01-25 13:51:19 +0000703 lex->source_name = src_name;
Damien George5bdf1652016-11-16 18:27:20 +1100704 lex->reader = reader;
Damien429d7192013-10-04 19:53:11 +0100705 lex->line = 1;
706 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100707 lex->emit_dent = 0;
708 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100709 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100710 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100711 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200712 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100713
Damien Georgee1199ec2014-05-10 17:48:01 +0100714 // check for memory allocation error
Damien George5da0d292016-09-19 11:17:02 +1000715 // note: vstr_init above may fail on malloc, but so may mp_lexer_next_token_into below
716 if (lex->indent_level == NULL) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100717 mp_lexer_free(lex);
718 return NULL;
719 }
720
721 // store sentinel for first indentation level
722 lex->indent_level[0] = 0;
723
Damien429d7192013-10-04 19:53:11 +0100724 // preload characters
Damien George5bdf1652016-11-16 18:27:20 +1100725 lex->chr0 = reader.readbyte(reader.data);
726 lex->chr1 = reader.readbyte(reader.data);
727 lex->chr2 = reader.readbyte(reader.data);
Damiena5185f42013-10-20 14:41:27 +0100728
729 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100730 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100731 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100732 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000733 if (lex->chr0 == '\r') {
734 lex->chr0 = '\n';
735 } else if (lex->chr0 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100736 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100737 }
Damien George94fbe972014-07-30 11:46:05 +0100738 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000739 if (lex->chr1 == '\r') {
740 lex->chr1 = '\n';
741 } else if (lex->chr1 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100742 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100743 }
Damien429d7192013-10-04 19:53:11 +0100744 }
745
Damiena5185f42013-10-20 14:41:27 +0100746 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000747 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100748
749 return lex;
750}
751
Damien George511c0832016-11-16 16:22:08 +1100752mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t len, mp_uint_t free_len) {
753 mp_reader_t reader;
754 if (!mp_reader_new_mem(&reader, (const byte*)str, len, free_len)) {
755 return NULL;
756 }
Damien George5bdf1652016-11-16 18:27:20 +1100757 return mp_lexer_new(src_name, reader);
Damien George511c0832016-11-16 16:22:08 +1100758}
759
Damien Georgee5ef15a2016-11-16 16:25:06 +1100760#if MICROPY_READER_POSIX || MICROPY_READER_FATFS
761
762mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
763 mp_reader_t reader;
764 int ret = mp_reader_new_file(&reader, filename);
765 if (ret != 0) {
766 return NULL;
767 }
Damien George5bdf1652016-11-16 18:27:20 +1100768 return mp_lexer_new(qstr_from_str(filename), reader);
Damien Georgee5ef15a2016-11-16 16:25:06 +1100769}
770
Damien George66d955c2016-11-16 18:12:55 +1100771#if MICROPY_HELPER_LEXER_UNIX
772
773mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd) {
774 mp_reader_t reader;
775 int ret = mp_reader_new_file_from_fd(&reader, fd, close_fd);
776 if (ret != 0) {
777 return NULL;
778 }
Damien George5bdf1652016-11-16 18:27:20 +1100779 return mp_lexer_new(filename, reader);
Damien George66d955c2016-11-16 18:12:55 +1100780}
781
782#endif
783
Damien Georgee5ef15a2016-11-16 16:25:06 +1100784#endif
785
Damiend99b0522013-12-21 18:17:45 +0000786void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100787 if (lex) {
Damien George5bdf1652016-11-16 18:27:20 +1100788 lex->reader.close(lex->reader.data);
Damienbb5316b2013-10-22 21:12:29 +0100789 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200790 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000791 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100792 }
Damien429d7192013-10-04 19:53:11 +0100793}
794
Damiend99b0522013-12-21 18:17:45 +0000795void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000796 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100797}
798
Damien Georgea4c52c52014-12-05 19:35:18 +0000799#if MICROPY_DEBUG_PRINTERS
800void mp_lexer_show_token(const mp_lexer_t *lex) {
Damien George451a0872014-12-05 22:50:16 +0000801 printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
Damien Georgea4c52c52014-12-05 19:35:18 +0000802 if (lex->vstr.len > 0) {
803 const byte *i = (const byte *)lex->vstr.buf;
804 const byte *j = (const byte *)i + lex->vstr.len;
805 printf(" ");
806 while (i < j) {
807 unichar c = utf8_get_char(i);
808 i = utf8_next_char(i);
809 if (unichar_isprint(c)) {
Damien George7f19a392015-06-22 17:40:12 +0100810 printf("%c", (int)c);
Damien Georgea4c52c52014-12-05 19:35:18 +0000811 } else {
812 printf("?");
813 }
814 }
815 }
816 printf("\n");
Damien429d7192013-10-04 19:53:11 +0100817}
Damien Georgea4c52c52014-12-05 19:35:18 +0000818#endif
Damien Georgedd5353a2015-12-18 12:35:44 +0000819
820#endif // MICROPY_ENABLE_COMPILER