blob: c6ecdf1f8e0f5f8ac14b0aee84780a2aa7057ade [file] [log] [blame]
/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
26
Damien429d7192013-10-04 19:53:11 +010027#include <stdio.h>
28#include <assert.h>
29
Damien Georgeb4b10fd2015-01-01 23:30:53 +000030#include "py/mpstate.h"
Damien George511c0832016-11-16 16:22:08 +110031#include "py/reader.h"
Damien George51dfcb42015-01-01 20:27:54 +000032#include "py/lexer.h"
Damien George081f9322015-09-07 17:08:49 +010033#include "py/runtime.h"
Damien429d7192013-10-04 19:53:11 +010034
Damien Georgedd5353a2015-12-18 12:35:44 +000035#if MICROPY_ENABLE_COMPILER
36
// Width of a tab stop in columns; next_char() uses it to advance lex->column over '\t'.
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010038
Damien92c06562013-10-22 22:32:27 +010039// TODO seems that CPython allows NULL byte in the input stream
40// don't know if that's intentional or not, but we don't allow it
41
Damien George9528cd62014-01-15 21:23:31 +000042// TODO replace with a call to a standard function
Damien Georgea4c52c52014-12-05 19:35:18 +000043STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
Damien George54eb4e72014-07-03 13:47:47 +010044 mp_uint_t i = 0;
Damien429d7192013-10-04 19:53:11 +010045
Damiena5185f42013-10-20 14:41:27 +010046 while (i < len && *str == *strn) {
Damien429d7192013-10-04 19:53:11 +010047 ++i;
Damien429d7192013-10-04 19:53:11 +010048 ++str;
Damiena5185f42013-10-20 14:41:27 +010049 ++strn;
Damien429d7192013-10-04 19:53:11 +010050 }
51
Damiena5185f42013-10-20 14:41:27 +010052 return i == len && *str == 0;
Damien429d7192013-10-04 19:53:11 +010053}
54
// End-of-input marker: the reader's EOF value converted to the lexer's character type.
#define MP_LEXER_EOF ((unichar)MP_READER_EOF)
// The character the lexer is currently positioned on.
#define CUR_CHAR(lex) ((lex)->chr0)
57
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020058STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010059 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010060}
61
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020062STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien George32bade12015-01-30 00:27:46 +000063 return lex->chr0 == '\n';
Damien429d7192013-10-04 19:53:11 +010064}
65
Damien George2e2e4042015-03-19 00:21:29 +000066STATIC bool is_char(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010067 return lex->chr0 == c;
68}
69
Damien George2e2e4042015-03-19 00:21:29 +000070STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010071 return lex->chr0 == c1 || lex->chr0 == c2;
72}
73
Damien George2e2e4042015-03-19 00:21:29 +000074STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
Damien429d7192013-10-04 19:53:11 +010075 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
76}
77
78/*
Damien George2e2e4042015-03-19 00:21:29 +000079STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010080 return lex->chr1 == c;
81}
82*/
83
Damien George2e2e4042015-03-19 00:21:29 +000084STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010085 return lex->chr1 == c1 || lex->chr1 == c2;
86}
87
Damien George2e2e4042015-03-19 00:21:29 +000088STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010089 return lex->chr2 == c1 || lex->chr2 == c2;
90}
91
Damien George2e2e4042015-03-19 00:21:29 +000092STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010093 return lex->chr0 == c1 && lex->chr1 == c2;
94}
95
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020096STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000097 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010098}
99
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200100STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000101 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100102}
103
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200104STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000105 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100106}
107
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200108STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000109 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100110}
111
Damien George2b000472015-09-07 17:33:44 +0100112STATIC bool is_following_base_char(mp_lexer_t *lex) {
113 const unichar chr1 = lex->chr1 | 0x20;
114 return chr1 == 'b' || chr1 == 'o' || chr1 == 'x';
Damien George7d414a12015-02-08 01:57:40 +0000115}
116
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200117STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200118 return lex->chr1 >= '0' && lex->chr1 <= '7';
119}
120
Damien George7ed58cb2015-06-09 10:58:07 +0000121// to easily parse utf-8 identifiers we allow any raw byte with high bit set
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200122STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien George7ed58cb2015-06-09 10:58:07 +0000123 return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
Damien429d7192013-10-04 19:53:11 +0100124}
125
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200126STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100127 return is_head_of_identifier(lex) || is_digit(lex);
128}
129
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200130STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100131 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100132 return;
133 }
134
Damien429d7192013-10-04 19:53:11 +0100135 if (lex->chr0 == '\n') {
Damien George32bade12015-01-30 00:27:46 +0000136 // a new line
Damien429d7192013-10-04 19:53:11 +0100137 ++lex->line;
138 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100139 } else if (lex->chr0 == '\t') {
140 // a tab
141 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
142 } else {
143 // a character worth one column
144 ++lex->column;
145 }
146
Damien George32bade12015-01-30 00:27:46 +0000147 lex->chr0 = lex->chr1;
148 lex->chr1 = lex->chr2;
Damien George5bdf1652016-11-16 18:27:20 +1100149 lex->chr2 = lex->reader.readbyte(lex->reader.data);
Damien George32bade12015-01-30 00:27:46 +0000150
151 if (lex->chr0 == '\r') {
152 // CR is a new line, converted to LF
153 lex->chr0 = '\n';
154 if (lex->chr1 == '\n') {
155 // CR LF is a single new line
156 lex->chr1 = lex->chr2;
Damien George5bdf1652016-11-16 18:27:20 +1100157 lex->chr2 = lex->reader.readbyte(lex->reader.data);
Damien George32bade12015-01-30 00:27:46 +0000158 }
159 }
160
161 if (lex->chr2 == MP_LEXER_EOF) {
162 // EOF, check if we need to insert a newline at end of file
163 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
164 // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
165 // otherwise it just inserts a LF
166 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100167 }
168 }
169}
170
Damien Georgea4c52c52014-12-05 19:35:18 +0000171STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100172 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100173 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100174 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
175 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100176 }
177 lex->indent_level[lex->num_indent_level++] = indent;
178}
179
Damien Georgea4c52c52014-12-05 19:35:18 +0000180STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100181 return lex->indent_level[lex->num_indent_level - 1];
182}
183
Damien Georgea4c52c52014-12-05 19:35:18 +0000184STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100185 lex->num_indent_level -= 1;
186}
187
// Compact encoding of all operator and delimiter tokens except '.' and '...'.
//
// some tricky operator encoding:
//     <op>  = begin with <op>, if this opchar matches then begin here
//     e<op> = end with <op>, if this opchar matches then end
//     E<op> = mandatory end with <op>, this opchar must match, then end
//     c<op> = continue with <op>, if this opchar matches then continue matching
// this means if the start of two ops are the same then they are equal til the last char
//
// The token kinds for these entries are listed, in the same order, in tok_enc_kind.

STATIC const char *const tok_enc =
    "()[]{},:;@~" // singles
    "<e=c<e=" // < <= << <<=
    ">e=c>e=" // > >= >> >>=
    "*e=c*e=" // * *= ** **=
    "+e=" // + +=
    "-e=e>" // - -= ->
    "&e=" // & &=
    "|e=" // | |=
    "/e=c/e=" // / /= // //=
    "%e=" // % %=
    "^e=" // ^ ^=
    "=e=" // = ==
    "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100209
// Token kind for each operator/delimiter encoded in tok_enc, in the same order
// as they appear there; indexed by the position computed while matching tok_enc.
// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
STATIC const uint8_t tok_enc_kind[] = {
    // singles: ( ) [ ] { } , : ; @ ~
    MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
    MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
    MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
    MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,

    // multi-character groups, one row per group of tok_enc
    MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
    MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
    MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
    MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
    MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
    MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
    MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
    MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
    MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
    MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
    MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
    MP_TOKEN_OP_NOT_EQUAL,
};
230
// Keyword spellings recognised by the lexer.
// must have the same order as enum in lexer.h
// (the lexer maps entry i to token kind MP_TOKEN_KW_FALSE + i).
// "__debug__" must stay last: the lexer treats the final entry specially and
// replaces it with a True/False keyword token.
STATIC const char *const tok_kw[] = {
    "False",
    "None",
    "True",
    "and",
    "as",
    "assert",
    #if MICROPY_PY_ASYNC_AWAIT
    "async",
    "await",
    #endif
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
    "__debug__",
};
272
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200273// This is called with CUR_CHAR() before first hex digit, and should return with
274// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100275// num_digits must be greater than zero
276STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
277 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200278 while (num_digits-- != 0) {
279 next_char(lex);
280 unichar c = CUR_CHAR(lex);
281 if (!unichar_isxdigit(c)) {
282 return false;
283 }
Dave Hylands3ad94d62015-05-18 14:41:25 -0700284 num = (num << 4) + unichar_xdigit_value(c);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200285 }
286 *result = num;
287 return true;
288}
289
// Lex the next token from the input and record it in lex: the kind goes in
// lex->tok_kind, the position in lex->tok_line / lex->tok_column and the token
// text (where the token has text) in lex->vstr.
// first_token enables the check that a token on line 1 starts in column 1;
// mp_lexer_new() passes true for the token it preloads.
STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
    // start new token text
    vstr_reset(&lex->vstr);

    // skip white space and comments
    bool had_physical_newline = false;
    while (!is_end(lex)) {
        if (is_physical_newline(lex)) {
            had_physical_newline = true;
            next_char(lex);
        } else if (is_whitespace(lex)) {
            next_char(lex);
        } else if (is_char(lex, '#')) {
            // a comment runs up to (but not including) the end of the line
            next_char(lex);
            while (!is_end(lex) && !is_physical_newline(lex)) {
                next_char(lex);
            }
            // had_physical_newline will be set on next loop
        } else if (is_char(lex, '\\')) {
            // backslash (outside string literals) must appear just before a physical newline
            next_char(lex);
            if (!is_physical_newline(lex)) {
                // SyntaxError: unexpected character after line continuation character
                lex->tok_line = lex->line;
                lex->tok_column = lex->column;
                lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
                return;
            } else {
                // line continuation: swallow the newline without recording it
                next_char(lex);
            }
        } else {
            break;
        }
    }

    // set token source information
    lex->tok_line = lex->line;
    lex->tok_column = lex->column;

    if (first_token && lex->line == 1 && lex->column != 1) {
        // check that the first token is in the first column
        // if first token is not on first line, we get a physical newline and
        // this check is done as part of normal indent/dedent checking below
        // (done to get equivalence with CPython)
        lex->tok_kind = MP_TOKEN_INDENT;

    } else if (lex->emit_dent < 0) {
        // a pending dedent: emit one DEDENT per call, without consuming input
        lex->tok_kind = MP_TOKEN_DEDENT;
        lex->emit_dent += 1;

    } else if (lex->emit_dent > 0) {
        // a pending indent: emit one INDENT per call, without consuming input
        lex->tok_kind = MP_TOKEN_INDENT;
        lex->emit_dent -= 1;

    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
        // a newline outside of any brackets ends the logical line
        lex->tok_kind = MP_TOKEN_NEWLINE;

        // compare the indentation of the new line with the top of the indent
        // stack and record the indents/dedents to emit on the following calls
        mp_uint_t num_spaces = lex->column - 1;
        if (num_spaces == indent_top(lex)) {
            // same indentation: nothing to record
        } else if (num_spaces > indent_top(lex)) {
            indent_push(lex, num_spaces);
            lex->emit_dent += 1;
        } else {
            while (num_spaces < indent_top(lex)) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
            if (num_spaces != indent_top(lex)) {
                // the new indentation matches none of the enclosing levels
                lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
            }
        }

    } else if (is_end(lex)) {
        lex->tok_kind = MP_TOKEN_END;

    } else if (is_char_or(lex, '\'', '\"')
               || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
               || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
        // a string or bytes literal

        // parse type codes
        bool is_raw = false;
        bool is_bytes = false;
        if (is_char(lex, 'u')) {
            next_char(lex);
        } else if (is_char(lex, 'b')) {
            is_bytes = true;
            next_char(lex);
            if (is_char(lex, 'r')) {
                is_raw = true;
                next_char(lex);
            }
        } else if (is_char(lex, 'r')) {
            is_raw = true;
            next_char(lex);
            if (is_char(lex, 'b')) {
                is_bytes = true;
                next_char(lex);
            }
        }

        // set token kind
        if (is_bytes) {
            lex->tok_kind = MP_TOKEN_BYTES;
        } else {
            lex->tok_kind = MP_TOKEN_STRING;
        }

        // get first quoting character
        char quote_char = '\'';
        if (is_char(lex, '\"')) {
            quote_char = '\"';
        }
        next_char(lex);

        // work out if it's a single or triple quoted literal
        mp_uint_t num_quotes;
        if (is_char_and(lex, quote_char, quote_char)) {
            // triple quotes
            next_char(lex);
            next_char(lex);
            num_quotes = 3;
        } else {
            // single quotes
            num_quotes = 1;
        }

        // parse the literal
        // n_closing counts consecutive quote characters seen so far; the loop ends
        // when enough have been seen, at end of input, or (for a single-quoted
        // literal) at a newline
        mp_uint_t n_closing = 0;
        while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
            if (is_char(lex, quote_char)) {
                // possibly part of the closing quotes; keep it in the text for now,
                // the closing quotes are cut off after the loop
                n_closing += 1;
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
            } else {
                n_closing = 0;
                if (is_char(lex, '\\')) {
                    // an escape sequence: c is the character following the backslash
                    next_char(lex);
                    unichar c = CUR_CHAR(lex);
                    if (is_raw) {
                        // raw strings allow escaping of quotes, but the backslash is also emitted
                        vstr_add_char(&lex->vstr, '\\');
                    } else {
                        // translate the escape; c == MP_LEXER_EOF on exit means "add nothing"
                        switch (c) {
                            case MP_LEXER_EOF: break; // TODO a proper error message?
                            case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
                            case '\\': break;
                            case '\'': break;
                            case '"': break;
                            case 'a': c = 0x07; break;
                            case 'b': c = 0x08; break;
                            case 't': c = 0x09; break;
                            case 'n': c = 0x0a; break;
                            case 'v': c = 0x0b; break;
                            case 'f': c = 0x0c; break;
                            case 'r': c = 0x0d; break;
                            case 'u':
                            case 'U':
                                if (is_bytes) {
                                    // b'\u1234' == b'\\u1234'
                                    // (the 'u'/'U' itself is added by the common code below)
                                    vstr_add_char(&lex->vstr, '\\');
                                    break;
                                }
                                // Otherwise fall through.
                            case 'x':
                            {
                                // \x takes 2 hex digits, \u takes 4 and \U takes 8
                                mp_uint_t num = 0;
                                if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
                                    // not enough hex chars for escape sequence
                                    lex->tok_kind = MP_TOKEN_INVALID;
                                }
                                c = num;
                                break;
                            }
                            case 'N':
                                // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
                                // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
                                // 3MB of text; even gzip-compressed and with minimal structure, it'll take
                                // roughly half a meg of storage. This form of Unicode escape may be added
                                // later on, but it's definitely not a priority right now. -- CJA 20140607
                                mp_not_implemented("unicode name escapes");
                                break;
                            default:
                                if (c >= '0' && c <= '7') {
                                    // Octal sequence, 1-3 chars
                                    mp_uint_t digits = 3;
                                    mp_uint_t num = c - '0';
                                    while (is_following_odigit(lex) && --digits != 0) {
                                        next_char(lex);
                                        num = num * 8 + (CUR_CHAR(lex) - '0');
                                    }
                                    c = num;
                                } else {
                                    // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
                                    vstr_add_char(&lex->vstr, '\\');
                                }
                                break;
                        }
                    }
                    if (c != MP_LEXER_EOF) {
                        if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
                            if (c < 0x110000 && !is_bytes) {
                                vstr_add_char(&lex->vstr, c);
                            } else if (c < 0x100 && is_bytes) {
                                vstr_add_byte(&lex->vstr, c);
                            } else {
                                // unicode character out of range
                                // this raises a generic SyntaxError; could provide more info
                                lex->tok_kind = MP_TOKEN_INVALID;
                            }
                        } else {
                            // without unicode everything is just added as an 8-bit byte
                            if (c < 0x100) {
                                vstr_add_byte(&lex->vstr, c);
                            } else {
                                // 8-bit character out of range
                                // this raises a generic SyntaxError; could provide more info
                                lex->tok_kind = MP_TOKEN_INVALID;
                            }
                        }
                    }
                } else {
                    // Add the "character" as a byte so that we remain 8-bit clean.
                    // This way, strings are parsed correctly whether or not they contain utf-8 chars.
                    vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
                }
            }
            next_char(lex);
        }

        // check we got the required end quotes
        if (n_closing < num_quotes) {
            lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
        }

        // cut off the end quotes from the token text
        vstr_cut_tail_bytes(&lex->vstr, n_closing);

    } else if (is_head_of_identifier(lex)) {
        // a name; it is checked against the keyword table at the end of this function
        lex->tok_kind = MP_TOKEN_NAME;

        // get first char (add as byte to remain 8-bit clean and support utf-8)
        vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex) && is_tail_of_identifier(lex)) {
            vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
        }

    } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
        // a number
        // forced_integer is set for a literal starting with 0 followed by a base
        // letter (b, o or x in either case); 'e'/'E' is then not an exponent marker
        bool forced_integer = false;
        if (is_char(lex, '.')) {
            lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
        } else {
            lex->tok_kind = MP_TOKEN_INTEGER;
            if (is_char(lex, '0') && is_following_base_char(lex)) {
                forced_integer = true;
            }
        }

        // get first char
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex)) {
            if (!forced_integer && is_char_or(lex, 'e', 'E')) {
                // exponent marker (always stored as lower-case 'e'), optionally
                // followed by a sign
                lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
                vstr_add_char(&lex->vstr, 'e');
                next_char(lex);
                if (is_char(lex, '+') || is_char(lex, '-')) {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                    next_char(lex);
                }
            } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
                // a '.', 'j' or 'J' anywhere in the literal makes it a float/imaginary
                if (is_char_or3(lex, '.', 'j', 'J')) {
                    lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
                }
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                next_char(lex);
            } else {
                break;
            }
        }

    } else if (is_char(lex, '.')) {
        // special handling for . and ... operators, because .. is not a valid operator

        // get first char
        vstr_add_char(&lex->vstr, '.');
        next_char(lex);

        if (is_char_and(lex, '.', '.')) {
            vstr_add_char(&lex->vstr, '.');
            vstr_add_char(&lex->vstr, '.');
            next_char(lex);
            next_char(lex);
            lex->tok_kind = MP_TOKEN_ELLIPSIS;
        } else {
            lex->tok_kind = MP_TOKEN_DEL_PERIOD;
        }

    } else {
        // search for encoded delimiter or operator

        // walk tok_enc looking for a begin character equal to the current
        // character; tok_enc_index counts the table entries passed on the way.
        // The opchar after 'e', 'c' or 'E' is skipped; an 'E' pair adds nothing
        // to the count (the -1 below cancels the +1 at the end of the loop body).
        const char *t = tok_enc;
        mp_uint_t tok_enc_index = 0;
        for (; *t != 0 && !is_char(lex, *t); t += 1) {
            if (*t == 'e' || *t == 'c') {
                t += 1;
            } else if (*t == 'E') {
                tok_enc_index -= 1;
                t += 1;
            }
            tok_enc_index += 1;
        }

        next_char(lex);

        if (*t == 0) {
            // didn't match any delimiter or operator characters
            lex->tok_kind = MP_TOKEN_INVALID;

        } else {
            // matched a delimiter or operator character

            // get the maximum characters for a valid token
            // t_index tracks the entry being tried; tok_enc_index is only updated
            // when that entry's opchar actually matches the input
            t += 1;
            mp_uint_t t_index = tok_enc_index;
            for (;;) {
                // try each optional ending in turn
                for (; *t == 'e'; t += 1) {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        break;
                    }
                }

                // a mandatory ending: the token is invalid without it
                if (*t == 'E') {
                    t += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                    } else {
                        lex->tok_kind = MP_TOKEN_INVALID;
                        goto tok_enc_no_match;
                    }
                    break;
                }

                // a continuation: if it matches, go round again for its endings
                if (*t == 'c') {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        t += 1;
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }

            // set token kind
            lex->tok_kind = tok_enc_kind[tok_enc_index];

            tok_enc_no_match:

            // compute bracket level for implicit line joining
            if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
                lex->nested_bracket_level += 1;
            } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
                lex->nested_bracket_level -= 1;
            }
        }
    }

    // check for keywords
    if (lex->tok_kind == MP_TOKEN_NAME) {
        // We check for __debug__ here and convert it to its value. This is so
        // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
        // need to check for this special token in many places in the compiler.
        // TODO improve speed of these string comparisons
        //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
        for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
            if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
                if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
                    // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
                    lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
                } else {
                    // keyword token kinds follow MP_TOKEN_KW_FALSE in tok_kw order
                    lex->tok_kind = MP_TOKEN_KW_FALSE + i;
                }
                break;
            }
        }
    }
}
692
Damien George5bdf1652016-11-16 18:27:20 +1100693mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
Damien George9bf5f282014-10-09 16:53:37 +0100694 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100695
696 // check for memory allocation error
697 if (lex == NULL) {
Damien George5bdf1652016-11-16 18:27:20 +1100698 reader.close(reader.data);
Damien Georgee1199ec2014-05-10 17:48:01 +0100699 return NULL;
700 }
Damien429d7192013-10-04 19:53:11 +0100701
Damien Georgeb829b5c2014-01-25 13:51:19 +0000702 lex->source_name = src_name;
Damien George5bdf1652016-11-16 18:27:20 +1100703 lex->reader = reader;
Damien429d7192013-10-04 19:53:11 +0100704 lex->line = 1;
705 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100706 lex->emit_dent = 0;
707 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100708 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100709 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100710 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200711 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100712
Damien Georgee1199ec2014-05-10 17:48:01 +0100713 // check for memory allocation error
Damien George5da0d292016-09-19 11:17:02 +1000714 // note: vstr_init above may fail on malloc, but so may mp_lexer_next_token_into below
715 if (lex->indent_level == NULL) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100716 mp_lexer_free(lex);
717 return NULL;
718 }
719
720 // store sentinel for first indentation level
721 lex->indent_level[0] = 0;
722
Damien429d7192013-10-04 19:53:11 +0100723 // preload characters
Damien George5bdf1652016-11-16 18:27:20 +1100724 lex->chr0 = reader.readbyte(reader.data);
725 lex->chr1 = reader.readbyte(reader.data);
726 lex->chr2 = reader.readbyte(reader.data);
Damiena5185f42013-10-20 14:41:27 +0100727
728 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100729 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100730 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100731 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000732 if (lex->chr0 == '\r') {
733 lex->chr0 = '\n';
734 } else if (lex->chr0 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100735 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100736 }
Damien George94fbe972014-07-30 11:46:05 +0100737 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000738 if (lex->chr1 == '\r') {
739 lex->chr1 = '\n';
740 } else if (lex->chr1 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100741 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100742 }
Damien429d7192013-10-04 19:53:11 +0100743 }
744
Damiena5185f42013-10-20 14:41:27 +0100745 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000746 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100747
748 return lex;
749}
750
Damien George511c0832016-11-16 16:22:08 +1100751mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t len, mp_uint_t free_len) {
752 mp_reader_t reader;
753 if (!mp_reader_new_mem(&reader, (const byte*)str, len, free_len)) {
754 return NULL;
755 }
Damien George5bdf1652016-11-16 18:27:20 +1100756 return mp_lexer_new(src_name, reader);
Damien George511c0832016-11-16 16:22:08 +1100757}
758
Damien Georgee5ef15a2016-11-16 16:25:06 +1100759#if MICROPY_READER_POSIX || MICROPY_READER_FATFS
760
761mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
762 mp_reader_t reader;
763 int ret = mp_reader_new_file(&reader, filename);
764 if (ret != 0) {
765 return NULL;
766 }
Damien George5bdf1652016-11-16 18:27:20 +1100767 return mp_lexer_new(qstr_from_str(filename), reader);
Damien Georgee5ef15a2016-11-16 16:25:06 +1100768}
769
Damien George66d955c2016-11-16 18:12:55 +1100770#if MICROPY_HELPER_LEXER_UNIX
771
772mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd) {
773 mp_reader_t reader;
774 int ret = mp_reader_new_file_from_fd(&reader, fd, close_fd);
775 if (ret != 0) {
776 return NULL;
777 }
Damien George5bdf1652016-11-16 18:27:20 +1100778 return mp_lexer_new(filename, reader);
Damien George66d955c2016-11-16 18:12:55 +1100779}
780
781#endif
782
Damien Georgee5ef15a2016-11-16 16:25:06 +1100783#endif
784
Damiend99b0522013-12-21 18:17:45 +0000785void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100786 if (lex) {
Damien George5bdf1652016-11-16 18:27:20 +1100787 lex->reader.close(lex->reader.data);
Damienbb5316b2013-10-22 21:12:29 +0100788 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200789 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000790 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100791 }
Damien429d7192013-10-04 19:53:11 +0100792}
793
Damiend99b0522013-12-21 18:17:45 +0000794void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000795 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100796}
797
Damien Georgea4c52c52014-12-05 19:35:18 +0000798#if MICROPY_DEBUG_PRINTERS
799void mp_lexer_show_token(const mp_lexer_t *lex) {
Damien George451a0872014-12-05 22:50:16 +0000800 printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
Damien Georgea4c52c52014-12-05 19:35:18 +0000801 if (lex->vstr.len > 0) {
802 const byte *i = (const byte *)lex->vstr.buf;
803 const byte *j = (const byte *)i + lex->vstr.len;
804 printf(" ");
805 while (i < j) {
806 unichar c = utf8_get_char(i);
807 i = utf8_next_char(i);
808 if (unichar_isprint(c)) {
Damien George7f19a392015-06-22 17:40:12 +0100809 printf("%c", (int)c);
Damien Georgea4c52c52014-12-05 19:35:18 +0000810 } else {
811 printf("?");
812 }
813 }
814 }
815 printf("\n");
Damien429d7192013-10-04 19:53:11 +0100816}
Damien Georgea4c52c52014-12-05 19:35:18 +0000817#endif
Damien Georgedd5353a2015-12-18 12:35:44 +0000818
819#endif // MICROPY_ENABLE_COMPILER