/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

Damien429d7192013-10-04 19:53:11 +010027#include <stdio.h>
28#include <assert.h>
29
Damien Georgeb4b10fd2015-01-01 23:30:53 +000030#include "py/mpstate.h"
Damien George51dfcb42015-01-01 20:27:54 +000031#include "py/lexer.h"
Damien George081f9322015-09-07 17:08:49 +010032#include "py/runtime.h"
Damien429d7192013-10-04 19:53:11 +010033
Damien Georgedd5353a2015-12-18 12:35:44 +000034#if MICROPY_ENABLE_COMPILER
35
Damien429d7192013-10-04 19:53:11 +010036#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010037
// TODO: CPython appears to allow a NUL byte in the input stream.
// It is unclear whether that is intentional; this lexer does not allow it.

Damien George9528cd62014-01-15 21:23:31 +000041// TODO replace with a call to a standard function
Damien Georgea4c52c52014-12-05 19:35:18 +000042STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
Damien George54eb4e72014-07-03 13:47:47 +010043 mp_uint_t i = 0;
Damien429d7192013-10-04 19:53:11 +010044
Damiena5185f42013-10-20 14:41:27 +010045 while (i < len && *str == *strn) {
Damien429d7192013-10-04 19:53:11 +010046 ++i;
Damien429d7192013-10-04 19:53:11 +010047 ++str;
Damiena5185f42013-10-20 14:41:27 +010048 ++strn;
Damien429d7192013-10-04 19:53:11 +010049 }
50
Damiena5185f42013-10-20 14:41:27 +010051 return i == len && *str == 0;
Damien429d7192013-10-04 19:53:11 +010052}
53
// The character currently under the cursor (first of the look-ahead slots chr0/chr1/chr2).
#define CUR_CHAR(lex) ((lex)->chr0)

Paul Sokolovsky520e2f52014-02-12 18:31:30 +020056STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010057 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010058}
59
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020060STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien George32bade12015-01-30 00:27:46 +000061 return lex->chr0 == '\n';
Damien429d7192013-10-04 19:53:11 +010062}
63
Damien George2e2e4042015-03-19 00:21:29 +000064STATIC bool is_char(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010065 return lex->chr0 == c;
66}
67
Damien George2e2e4042015-03-19 00:21:29 +000068STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010069 return lex->chr0 == c1 || lex->chr0 == c2;
70}
71
Damien George2e2e4042015-03-19 00:21:29 +000072STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
Damien429d7192013-10-04 19:53:11 +010073 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
74}
75
/*
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
    return lex->chr1 == c;
}
*/

Damien George2e2e4042015-03-19 00:21:29 +000082STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010083 return lex->chr1 == c1 || lex->chr1 == c2;
84}
85
Damien George2e2e4042015-03-19 00:21:29 +000086STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010087 return lex->chr2 == c1 || lex->chr2 == c2;
88}
89
Damien George2e2e4042015-03-19 00:21:29 +000090STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010091 return lex->chr0 == c1 && lex->chr1 == c2;
92}
93
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020094STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000095 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010096}
97
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020098STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000099 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100100}
101
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200102STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000103 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100104}
105
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200106STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000107 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100108}
109
Damien George2b000472015-09-07 17:33:44 +0100110STATIC bool is_following_base_char(mp_lexer_t *lex) {
111 const unichar chr1 = lex->chr1 | 0x20;
112 return chr1 == 'b' || chr1 == 'o' || chr1 == 'x';
Damien George7d414a12015-02-08 01:57:40 +0000113}
114
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200115STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200116 return lex->chr1 >= '0' && lex->chr1 <= '7';
117}
118
Damien George7ed58cb2015-06-09 10:58:07 +0000119// to easily parse utf-8 identifiers we allow any raw byte with high bit set
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200120STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien George7ed58cb2015-06-09 10:58:07 +0000121 return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
Damien429d7192013-10-04 19:53:11 +0100122}
123
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200124STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100125 return is_head_of_identifier(lex) || is_digit(lex);
126}
127
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200128STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100129 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100130 return;
131 }
132
Damien429d7192013-10-04 19:53:11 +0100133 if (lex->chr0 == '\n') {
Damien George32bade12015-01-30 00:27:46 +0000134 // a new line
Damien429d7192013-10-04 19:53:11 +0100135 ++lex->line;
136 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100137 } else if (lex->chr0 == '\t') {
138 // a tab
139 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
140 } else {
141 // a character worth one column
142 ++lex->column;
143 }
144
Damien George32bade12015-01-30 00:27:46 +0000145 lex->chr0 = lex->chr1;
146 lex->chr1 = lex->chr2;
147 lex->chr2 = lex->stream_next_byte(lex->stream_data);
148
149 if (lex->chr0 == '\r') {
150 // CR is a new line, converted to LF
151 lex->chr0 = '\n';
152 if (lex->chr1 == '\n') {
153 // CR LF is a single new line
154 lex->chr1 = lex->chr2;
155 lex->chr2 = lex->stream_next_byte(lex->stream_data);
156 }
157 }
158
159 if (lex->chr2 == MP_LEXER_EOF) {
160 // EOF, check if we need to insert a newline at end of file
161 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
162 // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
163 // otherwise it just inserts a LF
164 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100165 }
166 }
167}
168
Damien Georgea4c52c52014-12-05 19:35:18 +0000169STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100170 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100171 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100172 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
173 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100174 }
175 lex->indent_level[lex->num_indent_level++] = indent;
176}
177
Damien Georgea4c52c52014-12-05 19:35:18 +0000178STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100179 return lex->indent_level[lex->num_indent_level - 1];
180}
181
Damien Georgea4c52c52014-12-05 19:35:18 +0000182STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100183 lex->num_indent_level -= 1;
184}
185
186// some tricky operator encoding:
187// <op> = begin with <op>, if this opchar matches then begin here
188// e<op> = end with <op>, if this opchar matches then end
189// E<op> = mandatory end with <op>, this opchar must match, then end
190// c<op> = continue with <op>, if this opchar matches then continue matching
191// this means if the start of two ops are the same then they are equal til the last char
192
Damien George3ff16ff2016-05-20 12:38:15 +0100193STATIC const char *const tok_enc =
Damien429d7192013-10-04 19:53:11 +0100194 "()[]{},:;@~" // singles
195 "<e=c<e=" // < <= << <<=
196 ">e=c>e=" // > >= >> >>=
197 "*e=c*e=" // * *= ** **=
198 "+e=" // + +=
199 "-e=e>" // - -= ->
200 "&e=" // & &=
201 "|e=" // | |=
202 "/e=c/e=" // / /= // //=
203 "%e=" // % %=
204 "^e=" // ^ ^=
205 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100206 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100207
208// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200209STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000210 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
211 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
212 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
213 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100214
Damiend99b0522013-12-21 18:17:45 +0000215 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
216 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
217 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
218 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
219 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
220 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
221 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
222 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
223 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
224 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
225 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
226 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100227};
228
229// must have the same order as enum in lexer.h
Damien George3ff16ff2016-05-20 12:38:15 +0100230STATIC const char *const tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100231 "False",
232 "None",
233 "True",
234 "and",
235 "as",
236 "assert",
pohmelie81ebba72016-01-27 23:23:11 +0300237 #if MICROPY_PY_ASYNC_AWAIT
238 "async",
239 "await",
240 #endif
Damien429d7192013-10-04 19:53:11 +0100241 "break",
242 "class",
243 "continue",
244 "def",
245 "del",
246 "elif",
247 "else",
248 "except",
249 "finally",
250 "for",
251 "from",
252 "global",
253 "if",
254 "import",
255 "in",
256 "is",
257 "lambda",
258 "nonlocal",
259 "not",
260 "or",
261 "pass",
262 "raise",
263 "return",
264 "try",
265 "while",
266 "with",
267 "yield",
Damien George97f9a282014-05-12 23:07:34 +0100268 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100269};
270
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200271// This is called with CUR_CHAR() before first hex digit, and should return with
272// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100273// num_digits must be greater than zero
274STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
275 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200276 while (num_digits-- != 0) {
277 next_char(lex);
278 unichar c = CUR_CHAR(lex);
279 if (!unichar_isxdigit(c)) {
280 return false;
281 }
Dave Hylands3ad94d62015-05-18 14:41:25 -0700282 num = (num << 4) + unichar_xdigit_value(c);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200283 }
284 *result = num;
285 return true;
286}
287
Damien Georgea4c52c52014-12-05 19:35:18 +0000288STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
289 // start new token text
290 vstr_reset(&lex->vstr);
291
Damiena5185f42013-10-20 14:41:27 +0100292 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100293 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100294 while (!is_end(lex)) {
295 if (is_physical_newline(lex)) {
296 had_physical_newline = true;
297 next_char(lex);
298 } else if (is_whitespace(lex)) {
299 next_char(lex);
300 } else if (is_char(lex, '#')) {
301 next_char(lex);
302 while (!is_end(lex) && !is_physical_newline(lex)) {
303 next_char(lex);
304 }
305 // had_physical_newline will be set on next loop
306 } else if (is_char(lex, '\\')) {
307 // backslash (outside string literals) must appear just before a physical newline
308 next_char(lex);
309 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000310 // SyntaxError: unexpected character after line continuation character
Damien Georgea4c52c52014-12-05 19:35:18 +0000311 lex->tok_line = lex->line;
312 lex->tok_column = lex->column;
313 lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
Damien George69a818d2014-01-12 13:55:24 +0000314 return;
Damien429d7192013-10-04 19:53:11 +0100315 } else {
316 next_char(lex);
317 }
318 } else {
319 break;
320 }
321 }
322
Damiena5185f42013-10-20 14:41:27 +0100323 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000324 lex->tok_line = lex->line;
325 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100326
327 if (first_token && lex->line == 1 && lex->column != 1) {
328 // check that the first token is in the first column
329 // if first token is not on first line, we get a physical newline and
330 // this check is done as part of normal indent/dedent checking below
331 // (done to get equivalence with CPython)
Damien Georgea4c52c52014-12-05 19:35:18 +0000332 lex->tok_kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100333
334 } else if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000335 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100336 lex->emit_dent += 1;
337
338 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000339 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100340 lex->emit_dent -= 1;
341
Damien91d387d2013-10-09 15:09:52 +0100342 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000343 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100344
Damien George54eb4e72014-07-03 13:47:47 +0100345 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100346 lex->emit_dent = 0;
347 if (num_spaces == indent_top(lex)) {
348 } else if (num_spaces > indent_top(lex)) {
349 indent_push(lex, num_spaces);
350 lex->emit_dent += 1;
351 } else {
352 while (num_spaces < indent_top(lex)) {
353 indent_pop(lex);
354 lex->emit_dent -= 1;
355 }
356 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000357 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100358 }
359 }
360
361 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100362 if (indent_top(lex) > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000363 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100364 lex->emit_dent = 0;
365 while (indent_top(lex) > 0) {
366 indent_pop(lex);
367 lex->emit_dent -= 1;
368 }
369 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000370 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100371 }
372
373 } else if (is_char_or(lex, '\'', '\"')
374 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
375 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
376 // a string or bytes literal
377
378 // parse type codes
379 bool is_raw = false;
380 bool is_bytes = false;
381 if (is_char(lex, 'u')) {
382 next_char(lex);
383 } else if (is_char(lex, 'b')) {
384 is_bytes = true;
385 next_char(lex);
386 if (is_char(lex, 'r')) {
387 is_raw = true;
388 next_char(lex);
389 }
390 } else if (is_char(lex, 'r')) {
391 is_raw = true;
392 next_char(lex);
393 if (is_char(lex, 'b')) {
394 is_bytes = true;
395 next_char(lex);
396 }
397 }
398
399 // set token kind
400 if (is_bytes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000401 lex->tok_kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100402 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000403 lex->tok_kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100404 }
405
406 // get first quoting character
407 char quote_char = '\'';
408 if (is_char(lex, '\"')) {
409 quote_char = '\"';
410 }
411 next_char(lex);
412
413 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100414 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100415 if (is_char_and(lex, quote_char, quote_char)) {
416 // triple quotes
417 next_char(lex);
418 next_char(lex);
419 num_quotes = 3;
420 } else {
421 // single quotes
422 num_quotes = 1;
423 }
424
Damien429d7192013-10-04 19:53:11 +0100425 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100426 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100427 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
428 if (is_char(lex, quote_char)) {
429 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100430 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100431 } else {
432 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100433 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100434 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100435 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100436 if (is_raw) {
437 // raw strings allow escaping of quotes, but the backslash is also emitted
438 vstr_add_char(&lex->vstr, '\\');
439 } else {
440 switch (c) {
Damien George94fbe972014-07-30 11:46:05 +0100441 case MP_LEXER_EOF: break; // TODO a proper error message?
442 case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100443 case '\\': break;
444 case '\'': break;
445 case '"': break;
446 case 'a': c = 0x07; break;
447 case 'b': c = 0x08; break;
448 case 't': c = 0x09; break;
449 case 'n': c = 0x0a; break;
450 case 'v': c = 0x0b; break;
451 case 'f': c = 0x0c; break;
452 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000453 case 'u':
454 case 'U':
455 if (is_bytes) {
456 // b'\u1234' == b'\\u1234'
457 vstr_add_char(&lex->vstr, '\\');
458 break;
459 }
460 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100461 case 'x':
462 {
Damien George54eb4e72014-07-03 13:47:47 +0100463 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000464 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georged241c2a2015-07-23 23:20:37 +0100465 // not enough hex chars for escape sequence
466 lex->tok_kind = MP_TOKEN_INVALID;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200467 }
468 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100469 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200470 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000471 case 'N':
472 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
473 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
474 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
475 // roughly half a meg of storage. This form of Unicode escape may be added
476 // later on, but it's definitely not a priority right now. -- CJA 20140607
Damien George081f9322015-09-07 17:08:49 +0100477 mp_not_implemented("unicode name escapes");
Chris Angelico2ba22992014-06-04 05:28:12 +1000478 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100479 default:
480 if (c >= '0' && c <= '7') {
481 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100482 mp_uint_t digits = 3;
483 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100484 while (is_following_odigit(lex) && --digits != 0) {
485 next_char(lex);
486 num = num * 8 + (CUR_CHAR(lex) - '0');
487 }
488 c = num;
489 } else {
490 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
491 vstr_add_char(&lex->vstr, '\\');
492 }
493 break;
494 }
Damiena5185f42013-10-20 14:41:27 +0100495 }
Damien George94fbe972014-07-30 11:46:05 +0100496 if (c != MP_LEXER_EOF) {
Damien Georgeea235202016-02-11 22:30:53 +0000497 if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
498 if (c < 0x110000 && !is_bytes) {
499 vstr_add_char(&lex->vstr, c);
500 } else if (c < 0x100 && is_bytes) {
501 vstr_add_byte(&lex->vstr, c);
502 } else {
503 // unicode character out of range
504 // this raises a generic SyntaxError; could provide more info
505 lex->tok_kind = MP_TOKEN_INVALID;
506 }
507 } else {
508 // without unicode everything is just added as an 8-bit byte
509 if (c < 0x100) {
510 vstr_add_byte(&lex->vstr, c);
511 } else {
512 // 8-bit character out of range
513 // this raises a generic SyntaxError; could provide more info
514 lex->tok_kind = MP_TOKEN_INVALID;
515 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000516 }
Damiena5185f42013-10-20 14:41:27 +0100517 }
518 } else {
Damien George94fbe972014-07-30 11:46:05 +0100519 // Add the "character" as a byte so that we remain 8-bit clean.
520 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
521 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100522 }
523 }
524 next_char(lex);
525 }
526
527 // check we got the required end quotes
528 if (n_closing < num_quotes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000529 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100530 }
531
Damiena5185f42013-10-20 14:41:27 +0100532 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000533 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100534
535 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000536 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100537
Damien George7ed58cb2015-06-09 10:58:07 +0000538 // get first char (add as byte to remain 8-bit clean and support utf-8)
539 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100540 next_char(lex);
541
Damiena5185f42013-10-20 14:41:27 +0100542 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100543 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damien George7ed58cb2015-06-09 10:58:07 +0000544 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100545 next_char(lex);
546 }
547
548 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien George7d414a12015-02-08 01:57:40 +0000549 bool forced_integer = false;
550 if (is_char(lex, '.')) {
551 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
552 } else {
553 lex->tok_kind = MP_TOKEN_INTEGER;
Damien George2b000472015-09-07 17:33:44 +0100554 if (is_char(lex, '0') && is_following_base_char(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000555 forced_integer = true;
556 }
557 }
Damien429d7192013-10-04 19:53:11 +0100558
Damiena5185f42013-10-20 14:41:27 +0100559 // get first char
560 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100561 next_char(lex);
562
Damiena5185f42013-10-20 14:41:27 +0100563 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100564 while (!is_end(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000565 if (!forced_integer && is_char_or(lex, 'e', 'E')) {
566 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
Damiena5185f42013-10-20 14:41:27 +0100567 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100568 next_char(lex);
569 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100570 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100571 next_char(lex);
572 }
Damien George7d414a12015-02-08 01:57:40 +0000573 } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
574 if (is_char_or3(lex, '.', 'j', 'J')) {
575 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
576 }
Damiena5185f42013-10-20 14:41:27 +0100577 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100578 next_char(lex);
579 } else {
580 break;
581 }
582 }
583
Damien George2e9eb2d2014-04-10 12:19:33 +0100584 } else if (is_char(lex, '.')) {
585 // special handling for . and ... operators, because .. is not a valid operator
586
587 // get first char
588 vstr_add_char(&lex->vstr, '.');
589 next_char(lex);
590
591 if (is_char_and(lex, '.', '.')) {
592 vstr_add_char(&lex->vstr, '.');
593 vstr_add_char(&lex->vstr, '.');
594 next_char(lex);
595 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000596 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100597 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000598 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100599 }
600
Damien429d7192013-10-04 19:53:11 +0100601 } else {
602 // search for encoded delimiter or operator
603
604 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100605 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100606 for (; *t != 0 && !is_char(lex, *t); t += 1) {
607 if (*t == 'e' || *t == 'c') {
608 t += 1;
609 } else if (*t == 'E') {
610 tok_enc_index -= 1;
611 t += 1;
612 }
613 tok_enc_index += 1;
614 }
615
616 next_char(lex);
617
618 if (*t == 0) {
619 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000620 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100621
622 } else {
623 // matched a delimiter or operator character
624
625 // get the maximum characters for a valid token
626 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100627 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100628 for (;;) {
629 for (; *t == 'e'; t += 1) {
630 t += 1;
631 t_index += 1;
632 if (is_char(lex, *t)) {
633 next_char(lex);
634 tok_enc_index = t_index;
635 break;
636 }
637 }
638
639 if (*t == 'E') {
640 t += 1;
641 if (is_char(lex, *t)) {
642 next_char(lex);
643 tok_enc_index = t_index;
644 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000645 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100646 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100647 }
648 break;
649 }
650
651 if (*t == 'c') {
652 t += 1;
653 t_index += 1;
654 if (is_char(lex, *t)) {
655 next_char(lex);
656 tok_enc_index = t_index;
657 t += 1;
658 } else {
659 break;
660 }
661 } else {
662 break;
663 }
664 }
665
666 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000667 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100668
Damien George2e9eb2d2014-04-10 12:19:33 +0100669 tok_enc_no_match:
670
Damien429d7192013-10-04 19:53:11 +0100671 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000672 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100673 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000674 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100675 lex->nested_bracket_level -= 1;
676 }
677 }
678 }
679
Damiena5185f42013-10-20 14:41:27 +0100680 // check for keywords
Damien Georgea4c52c52014-12-05 19:35:18 +0000681 if (lex->tok_kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100682 // We check for __debug__ here and convert it to its value. This is so
683 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
684 // need to check for this special token in many places in the compiler.
685 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100686 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
Damien George963a5a32015-01-16 17:47:07 +0000687 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000688 if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200689 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
690 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Damien Georgeb4b10fd2015-01-01 23:30:53 +0000691 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100692 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000693 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
Damien George97f9a282014-05-12 23:07:34 +0100694 }
Damien429d7192013-10-04 19:53:11 +0100695 break;
696 }
697 }
698 }
699}
700
Damien George94fbe972014-07-30 11:46:05 +0100701mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
Damien George9bf5f282014-10-09 16:53:37 +0100702 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100703
704 // check for memory allocation error
705 if (lex == NULL) {
706 if (stream_close) {
707 stream_close(stream_data);
708 }
709 return NULL;
710 }
Damien429d7192013-10-04 19:53:11 +0100711
Damien Georgeb829b5c2014-01-25 13:51:19 +0000712 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100713 lex->stream_data = stream_data;
Damien George94fbe972014-07-30 11:46:05 +0100714 lex->stream_next_byte = stream_next_byte;
Damienfa2162b2013-10-20 17:42:00 +0100715 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100716 lex->line = 1;
717 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100718 lex->emit_dent = 0;
719 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100720 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100721 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100722 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200723 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100724
Damien Georgee1199ec2014-05-10 17:48:01 +0100725 // check for memory allocation error
726 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
727 mp_lexer_free(lex);
728 return NULL;
729 }
730
731 // store sentinel for first indentation level
732 lex->indent_level[0] = 0;
733
Damien429d7192013-10-04 19:53:11 +0100734 // preload characters
Damien George94fbe972014-07-30 11:46:05 +0100735 lex->chr0 = stream_next_byte(stream_data);
736 lex->chr1 = stream_next_byte(stream_data);
737 lex->chr2 = stream_next_byte(stream_data);
Damiena5185f42013-10-20 14:41:27 +0100738
739 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100740 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100741 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100742 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000743 if (lex->chr0 == '\r') {
744 lex->chr0 = '\n';
745 } else if (lex->chr0 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100746 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100747 }
Damien George94fbe972014-07-30 11:46:05 +0100748 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000749 if (lex->chr1 == '\r') {
750 lex->chr1 = '\n';
751 } else if (lex->chr1 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100752 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100753 }
Damien429d7192013-10-04 19:53:11 +0100754 }
755
Damiena5185f42013-10-20 14:41:27 +0100756 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000757 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100758
759 return lex;
760}
761
Damiend99b0522013-12-21 18:17:45 +0000762void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100763 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100764 if (lex->stream_close) {
765 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100766 }
Damienbb5316b2013-10-22 21:12:29 +0100767 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200768 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000769 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100770 }
Damien429d7192013-10-04 19:53:11 +0100771}
772
Damiend99b0522013-12-21 18:17:45 +0000773void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000774 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100775}
776
#if MICROPY_DEBUG_PRINTERS
// Debug helper: print the current token's position, kind, text buffer pointer
// and length on one line, followed by the token text decoded as utf-8 with
// every non-printable character shown as '?'.
void mp_lexer_show_token(const mp_lexer_t *lex) {
    printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
    if (lex->vstr.len > 0) {
        const byte *i = (const byte *)lex->vstr.buf;
        const byte *j = (const byte *)i + lex->vstr.len;
        printf(" ");
        while (i < j) {
            unichar c = utf8_get_char(i);
            i = utf8_next_char(i);
            if (unichar_isprint(c)) {
                printf("%c", (int)c);
            } else {
                printf("?");
            }
        }
    }
    printf("\n");
}
#endif

#endif // MICROPY_ENABLE_COMPILER