blob: 4a7c8f580a4219608aa54c9c760ee518e7d5b299 [file] [log] [blame]
Damien George04b91472014-05-03 23:27:38 +01001/*
2 * This file is part of the Micro Python project, http://micropython.org/
3 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26
Damien429d7192013-10-04 19:53:11 +010027#include <stdio.h>
28#include <assert.h>
29
Damien Georgeb4b10fd2015-01-01 23:30:53 +000030#include "py/mpstate.h"
Damien George51dfcb42015-01-01 20:27:54 +000031#include "py/lexer.h"
Damien George081f9322015-09-07 17:08:49 +010032#include "py/runtime.h"
Damien429d7192013-10-04 19:53:11 +010033
Damien Georgedd5353a2015-12-18 12:35:44 +000034#if MICROPY_ENABLE_COMPILER
35
Damien429d7192013-10-04 19:53:11 +010036#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010037
Damien92c06562013-10-22 22:32:27 +010038// TODO seems that CPython allows NULL byte in the input stream
39// don't know if that's intentional or not, but we don't allow it
40
Damien George9528cd62014-01-15 21:23:31 +000041// TODO replace with a call to a standard function
Damien Georgea4c52c52014-12-05 19:35:18 +000042STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
Damien George54eb4e72014-07-03 13:47:47 +010043 mp_uint_t i = 0;
Damien429d7192013-10-04 19:53:11 +010044
Damiena5185f42013-10-20 14:41:27 +010045 while (i < len && *str == *strn) {
Damien429d7192013-10-04 19:53:11 +010046 ++i;
Damien429d7192013-10-04 19:53:11 +010047 ++str;
Damiena5185f42013-10-20 14:41:27 +010048 ++strn;
Damien429d7192013-10-04 19:53:11 +010049 }
50
Damiena5185f42013-10-20 14:41:27 +010051 return i == len && *str == 0;
Damien429d7192013-10-04 19:53:11 +010052}
53
Damiena5185f42013-10-20 14:41:27 +010054#define CUR_CHAR(lex) ((lex)->chr0)
55
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020056STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010057 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010058}
59
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020060STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien George32bade12015-01-30 00:27:46 +000061 return lex->chr0 == '\n';
Damien429d7192013-10-04 19:53:11 +010062}
63
Damien George2e2e4042015-03-19 00:21:29 +000064STATIC bool is_char(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010065 return lex->chr0 == c;
66}
67
Damien George2e2e4042015-03-19 00:21:29 +000068STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010069 return lex->chr0 == c1 || lex->chr0 == c2;
70}
71
Damien George2e2e4042015-03-19 00:21:29 +000072STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
Damien429d7192013-10-04 19:53:11 +010073 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
74}
75
76/*
Damien George2e2e4042015-03-19 00:21:29 +000077STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010078 return lex->chr1 == c;
79}
80*/
81
Damien George2e2e4042015-03-19 00:21:29 +000082STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010083 return lex->chr1 == c1 || lex->chr1 == c2;
84}
85
Damien George2e2e4042015-03-19 00:21:29 +000086STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010087 return lex->chr2 == c1 || lex->chr2 == c2;
88}
89
Damien George2e2e4042015-03-19 00:21:29 +000090STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010091 return lex->chr0 == c1 && lex->chr1 == c2;
92}
93
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020094STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000095 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010096}
97
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020098STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000099 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100100}
101
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200102STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000103 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100104}
105
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200106STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000107 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100108}
109
Damien George2b000472015-09-07 17:33:44 +0100110STATIC bool is_following_base_char(mp_lexer_t *lex) {
111 const unichar chr1 = lex->chr1 | 0x20;
112 return chr1 == 'b' || chr1 == 'o' || chr1 == 'x';
Damien George7d414a12015-02-08 01:57:40 +0000113}
114
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200115STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200116 return lex->chr1 >= '0' && lex->chr1 <= '7';
117}
118
Damien George7ed58cb2015-06-09 10:58:07 +0000119// to easily parse utf-8 identifiers we allow any raw byte with high bit set
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200120STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien George7ed58cb2015-06-09 10:58:07 +0000121 return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
Damien429d7192013-10-04 19:53:11 +0100122}
123
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200124STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100125 return is_head_of_identifier(lex) || is_digit(lex);
126}
127
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200128STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100129 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100130 return;
131 }
132
Damien429d7192013-10-04 19:53:11 +0100133 if (lex->chr0 == '\n') {
Damien George32bade12015-01-30 00:27:46 +0000134 // a new line
Damien429d7192013-10-04 19:53:11 +0100135 ++lex->line;
136 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100137 } else if (lex->chr0 == '\t') {
138 // a tab
139 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
140 } else {
141 // a character worth one column
142 ++lex->column;
143 }
144
Damien George32bade12015-01-30 00:27:46 +0000145 lex->chr0 = lex->chr1;
146 lex->chr1 = lex->chr2;
147 lex->chr2 = lex->stream_next_byte(lex->stream_data);
148
149 if (lex->chr0 == '\r') {
150 // CR is a new line, converted to LF
151 lex->chr0 = '\n';
152 if (lex->chr1 == '\n') {
153 // CR LF is a single new line
154 lex->chr1 = lex->chr2;
155 lex->chr2 = lex->stream_next_byte(lex->stream_data);
156 }
157 }
158
159 if (lex->chr2 == MP_LEXER_EOF) {
160 // EOF, check if we need to insert a newline at end of file
161 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
162 // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
163 // otherwise it just inserts a LF
164 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100165 }
166 }
167}
168
Damien Georgea4c52c52014-12-05 19:35:18 +0000169STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100170 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100171 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100172 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
173 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100174 }
175 lex->indent_level[lex->num_indent_level++] = indent;
176}
177
Damien Georgea4c52c52014-12-05 19:35:18 +0000178STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100179 return lex->indent_level[lex->num_indent_level - 1];
180}
181
Damien Georgea4c52c52014-12-05 19:35:18 +0000182STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100183 lex->num_indent_level -= 1;
184}
185
186// some tricky operator encoding:
187// <op> = begin with <op>, if this opchar matches then begin here
188// e<op> = end with <op>, if this opchar matches then end
189// E<op> = mandatory end with <op>, this opchar must match, then end
190// c<op> = continue with <op>, if this opchar matches then continue matching
191// this means if the start of two ops are the same then they are equal til the last char
192
Damien George3ff16ff2016-05-20 12:38:15 +0100193STATIC const char *const tok_enc =
Damien429d7192013-10-04 19:53:11 +0100194 "()[]{},:;@~" // singles
195 "<e=c<e=" // < <= << <<=
196 ">e=c>e=" // > >= >> >>=
197 "*e=c*e=" // * *= ** **=
198 "+e=" // + +=
199 "-e=e>" // - -= ->
200 "&e=" // & &=
201 "|e=" // | |=
202 "/e=c/e=" // / /= // //=
203 "%e=" // % %=
204 "^e=" // ^ ^=
205 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100206 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100207
208// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200209STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000210 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
211 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
212 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
213 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100214
Damiend99b0522013-12-21 18:17:45 +0000215 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
216 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
217 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
218 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
219 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
220 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
221 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
222 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
223 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
224 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
225 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
226 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100227};
228
229// must have the same order as enum in lexer.h
Damien George3ff16ff2016-05-20 12:38:15 +0100230STATIC const char *const tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100231 "False",
232 "None",
233 "True",
234 "and",
235 "as",
236 "assert",
pohmelie81ebba72016-01-27 23:23:11 +0300237 #if MICROPY_PY_ASYNC_AWAIT
238 "async",
239 "await",
240 #endif
Damien429d7192013-10-04 19:53:11 +0100241 "break",
242 "class",
243 "continue",
244 "def",
245 "del",
246 "elif",
247 "else",
248 "except",
249 "finally",
250 "for",
251 "from",
252 "global",
253 "if",
254 "import",
255 "in",
256 "is",
257 "lambda",
258 "nonlocal",
259 "not",
260 "or",
261 "pass",
262 "raise",
263 "return",
264 "try",
265 "while",
266 "with",
267 "yield",
Damien George97f9a282014-05-12 23:07:34 +0100268 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100269};
270
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200271// This is called with CUR_CHAR() before first hex digit, and should return with
272// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100273// num_digits must be greater than zero
274STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
275 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200276 while (num_digits-- != 0) {
277 next_char(lex);
278 unichar c = CUR_CHAR(lex);
279 if (!unichar_isxdigit(c)) {
280 return false;
281 }
Dave Hylands3ad94d62015-05-18 14:41:25 -0700282 num = (num << 4) + unichar_xdigit_value(c);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200283 }
284 *result = num;
285 return true;
286}
287
Damien Georgea4c52c52014-12-05 19:35:18 +0000288STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
289 // start new token text
290 vstr_reset(&lex->vstr);
291
Damiena5185f42013-10-20 14:41:27 +0100292 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100293 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100294 while (!is_end(lex)) {
295 if (is_physical_newline(lex)) {
296 had_physical_newline = true;
297 next_char(lex);
298 } else if (is_whitespace(lex)) {
299 next_char(lex);
300 } else if (is_char(lex, '#')) {
301 next_char(lex);
302 while (!is_end(lex) && !is_physical_newline(lex)) {
303 next_char(lex);
304 }
305 // had_physical_newline will be set on next loop
306 } else if (is_char(lex, '\\')) {
307 // backslash (outside string literals) must appear just before a physical newline
308 next_char(lex);
309 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000310 // SyntaxError: unexpected character after line continuation character
Damien Georgea4c52c52014-12-05 19:35:18 +0000311 lex->tok_line = lex->line;
312 lex->tok_column = lex->column;
313 lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
Damien George69a818d2014-01-12 13:55:24 +0000314 return;
Damien429d7192013-10-04 19:53:11 +0100315 } else {
316 next_char(lex);
317 }
318 } else {
319 break;
320 }
321 }
322
Damiena5185f42013-10-20 14:41:27 +0100323 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000324 lex->tok_line = lex->line;
325 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100326
327 if (first_token && lex->line == 1 && lex->column != 1) {
328 // check that the first token is in the first column
329 // if first token is not on first line, we get a physical newline and
330 // this check is done as part of normal indent/dedent checking below
331 // (done to get equivalence with CPython)
Damien Georgea4c52c52014-12-05 19:35:18 +0000332 lex->tok_kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100333
334 } else if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000335 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100336 lex->emit_dent += 1;
337
338 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000339 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100340 lex->emit_dent -= 1;
341
Damien91d387d2013-10-09 15:09:52 +0100342 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000343 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100344
Damien George54eb4e72014-07-03 13:47:47 +0100345 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100346 if (num_spaces == indent_top(lex)) {
347 } else if (num_spaces > indent_top(lex)) {
348 indent_push(lex, num_spaces);
349 lex->emit_dent += 1;
350 } else {
351 while (num_spaces < indent_top(lex)) {
352 indent_pop(lex);
353 lex->emit_dent -= 1;
354 }
355 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000356 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100357 }
358 }
359
360 } else if (is_end(lex)) {
Damien George31101d92016-10-12 11:00:17 +1100361 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100362
363 } else if (is_char_or(lex, '\'', '\"')
364 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
365 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
366 // a string or bytes literal
367
368 // parse type codes
369 bool is_raw = false;
370 bool is_bytes = false;
371 if (is_char(lex, 'u')) {
372 next_char(lex);
373 } else if (is_char(lex, 'b')) {
374 is_bytes = true;
375 next_char(lex);
376 if (is_char(lex, 'r')) {
377 is_raw = true;
378 next_char(lex);
379 }
380 } else if (is_char(lex, 'r')) {
381 is_raw = true;
382 next_char(lex);
383 if (is_char(lex, 'b')) {
384 is_bytes = true;
385 next_char(lex);
386 }
387 }
388
389 // set token kind
390 if (is_bytes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000391 lex->tok_kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100392 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000393 lex->tok_kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100394 }
395
396 // get first quoting character
397 char quote_char = '\'';
398 if (is_char(lex, '\"')) {
399 quote_char = '\"';
400 }
401 next_char(lex);
402
403 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100404 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100405 if (is_char_and(lex, quote_char, quote_char)) {
406 // triple quotes
407 next_char(lex);
408 next_char(lex);
409 num_quotes = 3;
410 } else {
411 // single quotes
412 num_quotes = 1;
413 }
414
Damien429d7192013-10-04 19:53:11 +0100415 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100416 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100417 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
418 if (is_char(lex, quote_char)) {
419 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100420 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100421 } else {
422 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100423 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100424 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100425 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100426 if (is_raw) {
427 // raw strings allow escaping of quotes, but the backslash is also emitted
428 vstr_add_char(&lex->vstr, '\\');
429 } else {
430 switch (c) {
Damien George94fbe972014-07-30 11:46:05 +0100431 case MP_LEXER_EOF: break; // TODO a proper error message?
432 case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100433 case '\\': break;
434 case '\'': break;
435 case '"': break;
436 case 'a': c = 0x07; break;
437 case 'b': c = 0x08; break;
438 case 't': c = 0x09; break;
439 case 'n': c = 0x0a; break;
440 case 'v': c = 0x0b; break;
441 case 'f': c = 0x0c; break;
442 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000443 case 'u':
444 case 'U':
445 if (is_bytes) {
446 // b'\u1234' == b'\\u1234'
447 vstr_add_char(&lex->vstr, '\\');
448 break;
449 }
450 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100451 case 'x':
452 {
Damien George54eb4e72014-07-03 13:47:47 +0100453 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000454 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georged241c2a2015-07-23 23:20:37 +0100455 // not enough hex chars for escape sequence
456 lex->tok_kind = MP_TOKEN_INVALID;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200457 }
458 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100459 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200460 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000461 case 'N':
462 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
463 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
464 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
465 // roughly half a meg of storage. This form of Unicode escape may be added
466 // later on, but it's definitely not a priority right now. -- CJA 20140607
Damien George081f9322015-09-07 17:08:49 +0100467 mp_not_implemented("unicode name escapes");
Chris Angelico2ba22992014-06-04 05:28:12 +1000468 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100469 default:
470 if (c >= '0' && c <= '7') {
471 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100472 mp_uint_t digits = 3;
473 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100474 while (is_following_odigit(lex) && --digits != 0) {
475 next_char(lex);
476 num = num * 8 + (CUR_CHAR(lex) - '0');
477 }
478 c = num;
479 } else {
480 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
481 vstr_add_char(&lex->vstr, '\\');
482 }
483 break;
484 }
Damiena5185f42013-10-20 14:41:27 +0100485 }
Damien George94fbe972014-07-30 11:46:05 +0100486 if (c != MP_LEXER_EOF) {
Damien Georgeea235202016-02-11 22:30:53 +0000487 if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
488 if (c < 0x110000 && !is_bytes) {
489 vstr_add_char(&lex->vstr, c);
490 } else if (c < 0x100 && is_bytes) {
491 vstr_add_byte(&lex->vstr, c);
492 } else {
493 // unicode character out of range
494 // this raises a generic SyntaxError; could provide more info
495 lex->tok_kind = MP_TOKEN_INVALID;
496 }
497 } else {
498 // without unicode everything is just added as an 8-bit byte
499 if (c < 0x100) {
500 vstr_add_byte(&lex->vstr, c);
501 } else {
502 // 8-bit character out of range
503 // this raises a generic SyntaxError; could provide more info
504 lex->tok_kind = MP_TOKEN_INVALID;
505 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000506 }
Damiena5185f42013-10-20 14:41:27 +0100507 }
508 } else {
Damien George94fbe972014-07-30 11:46:05 +0100509 // Add the "character" as a byte so that we remain 8-bit clean.
510 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
511 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100512 }
513 }
514 next_char(lex);
515 }
516
517 // check we got the required end quotes
518 if (n_closing < num_quotes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000519 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100520 }
521
Damiena5185f42013-10-20 14:41:27 +0100522 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000523 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100524
525 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000526 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100527
Damien George7ed58cb2015-06-09 10:58:07 +0000528 // get first char (add as byte to remain 8-bit clean and support utf-8)
529 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100530 next_char(lex);
531
Damiena5185f42013-10-20 14:41:27 +0100532 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100533 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damien George7ed58cb2015-06-09 10:58:07 +0000534 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100535 next_char(lex);
536 }
537
538 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien George7d414a12015-02-08 01:57:40 +0000539 bool forced_integer = false;
540 if (is_char(lex, '.')) {
541 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
542 } else {
543 lex->tok_kind = MP_TOKEN_INTEGER;
Damien George2b000472015-09-07 17:33:44 +0100544 if (is_char(lex, '0') && is_following_base_char(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000545 forced_integer = true;
546 }
547 }
Damien429d7192013-10-04 19:53:11 +0100548
Damiena5185f42013-10-20 14:41:27 +0100549 // get first char
550 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100551 next_char(lex);
552
Damiena5185f42013-10-20 14:41:27 +0100553 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100554 while (!is_end(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000555 if (!forced_integer && is_char_or(lex, 'e', 'E')) {
556 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
Damiena5185f42013-10-20 14:41:27 +0100557 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100558 next_char(lex);
559 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100560 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100561 next_char(lex);
562 }
Damien George7d414a12015-02-08 01:57:40 +0000563 } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
564 if (is_char_or3(lex, '.', 'j', 'J')) {
565 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
566 }
Damiena5185f42013-10-20 14:41:27 +0100567 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100568 next_char(lex);
569 } else {
570 break;
571 }
572 }
573
Damien George2e9eb2d2014-04-10 12:19:33 +0100574 } else if (is_char(lex, '.')) {
575 // special handling for . and ... operators, because .. is not a valid operator
576
577 // get first char
578 vstr_add_char(&lex->vstr, '.');
579 next_char(lex);
580
581 if (is_char_and(lex, '.', '.')) {
582 vstr_add_char(&lex->vstr, '.');
583 vstr_add_char(&lex->vstr, '.');
584 next_char(lex);
585 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000586 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100587 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000588 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100589 }
590
Damien429d7192013-10-04 19:53:11 +0100591 } else {
592 // search for encoded delimiter or operator
593
594 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100595 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100596 for (; *t != 0 && !is_char(lex, *t); t += 1) {
597 if (*t == 'e' || *t == 'c') {
598 t += 1;
599 } else if (*t == 'E') {
600 tok_enc_index -= 1;
601 t += 1;
602 }
603 tok_enc_index += 1;
604 }
605
606 next_char(lex);
607
608 if (*t == 0) {
609 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000610 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100611
612 } else {
613 // matched a delimiter or operator character
614
615 // get the maximum characters for a valid token
616 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100617 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100618 for (;;) {
619 for (; *t == 'e'; t += 1) {
620 t += 1;
621 t_index += 1;
622 if (is_char(lex, *t)) {
623 next_char(lex);
624 tok_enc_index = t_index;
625 break;
626 }
627 }
628
629 if (*t == 'E') {
630 t += 1;
631 if (is_char(lex, *t)) {
632 next_char(lex);
633 tok_enc_index = t_index;
634 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000635 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100636 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100637 }
638 break;
639 }
640
641 if (*t == 'c') {
642 t += 1;
643 t_index += 1;
644 if (is_char(lex, *t)) {
645 next_char(lex);
646 tok_enc_index = t_index;
647 t += 1;
648 } else {
649 break;
650 }
651 } else {
652 break;
653 }
654 }
655
656 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000657 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100658
Damien George2e9eb2d2014-04-10 12:19:33 +0100659 tok_enc_no_match:
660
Damien429d7192013-10-04 19:53:11 +0100661 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000662 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100663 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000664 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100665 lex->nested_bracket_level -= 1;
666 }
667 }
668 }
669
Damiena5185f42013-10-20 14:41:27 +0100670 // check for keywords
Damien Georgea4c52c52014-12-05 19:35:18 +0000671 if (lex->tok_kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100672 // We check for __debug__ here and convert it to its value. This is so
673 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
674 // need to check for this special token in many places in the compiler.
675 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100676 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
Damien George963a5a32015-01-16 17:47:07 +0000677 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000678 if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200679 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
680 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Damien Georgeb4b10fd2015-01-01 23:30:53 +0000681 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100682 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000683 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
Damien George97f9a282014-05-12 23:07:34 +0100684 }
Damien429d7192013-10-04 19:53:11 +0100685 break;
686 }
687 }
688 }
689}
690
Damien George94fbe972014-07-30 11:46:05 +0100691mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
Damien George9bf5f282014-10-09 16:53:37 +0100692 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100693
694 // check for memory allocation error
695 if (lex == NULL) {
696 if (stream_close) {
697 stream_close(stream_data);
698 }
699 return NULL;
700 }
Damien429d7192013-10-04 19:53:11 +0100701
Damien Georgeb829b5c2014-01-25 13:51:19 +0000702 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100703 lex->stream_data = stream_data;
Damien George94fbe972014-07-30 11:46:05 +0100704 lex->stream_next_byte = stream_next_byte;
Damienfa2162b2013-10-20 17:42:00 +0100705 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100706 lex->line = 1;
707 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100708 lex->emit_dent = 0;
709 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100710 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100711 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100712 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200713 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100714
Damien Georgee1199ec2014-05-10 17:48:01 +0100715 // check for memory allocation error
Damien George5da0d292016-09-19 11:17:02 +1000716 // note: vstr_init above may fail on malloc, but so may mp_lexer_next_token_into below
717 if (lex->indent_level == NULL) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100718 mp_lexer_free(lex);
719 return NULL;
720 }
721
722 // store sentinel for first indentation level
723 lex->indent_level[0] = 0;
724
Damien429d7192013-10-04 19:53:11 +0100725 // preload characters
Damien George94fbe972014-07-30 11:46:05 +0100726 lex->chr0 = stream_next_byte(stream_data);
727 lex->chr1 = stream_next_byte(stream_data);
728 lex->chr2 = stream_next_byte(stream_data);
Damiena5185f42013-10-20 14:41:27 +0100729
730 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100731 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100732 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100733 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000734 if (lex->chr0 == '\r') {
735 lex->chr0 = '\n';
736 } else if (lex->chr0 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100737 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100738 }
Damien George94fbe972014-07-30 11:46:05 +0100739 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000740 if (lex->chr1 == '\r') {
741 lex->chr1 = '\n';
742 } else if (lex->chr1 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100743 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100744 }
Damien429d7192013-10-04 19:53:11 +0100745 }
746
Damiena5185f42013-10-20 14:41:27 +0100747 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000748 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100749
750 return lex;
751}
752
Damiend99b0522013-12-21 18:17:45 +0000753void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100754 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100755 if (lex->stream_close) {
756 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100757 }
Damienbb5316b2013-10-22 21:12:29 +0100758 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200759 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000760 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100761 }
Damien429d7192013-10-04 19:53:11 +0100762}
763
Damiend99b0522013-12-21 18:17:45 +0000764void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000765 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100766}
767
Damien Georgea4c52c52014-12-05 19:35:18 +0000768#if MICROPY_DEBUG_PRINTERS
769void mp_lexer_show_token(const mp_lexer_t *lex) {
Damien George451a0872014-12-05 22:50:16 +0000770 printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
Damien Georgea4c52c52014-12-05 19:35:18 +0000771 if (lex->vstr.len > 0) {
772 const byte *i = (const byte *)lex->vstr.buf;
773 const byte *j = (const byte *)i + lex->vstr.len;
774 printf(" ");
775 while (i < j) {
776 unichar c = utf8_get_char(i);
777 i = utf8_next_char(i);
778 if (unichar_isprint(c)) {
Damien George7f19a392015-06-22 17:40:12 +0100779 printf("%c", (int)c);
Damien Georgea4c52c52014-12-05 19:35:18 +0000780 } else {
781 printf("?");
782 }
783 }
784 }
785 printf("\n");
Damien429d7192013-10-04 19:53:11 +0100786}
Damien Georgea4c52c52014-12-05 19:35:18 +0000787#endif
Damien Georgedd5353a2015-12-18 12:35:44 +0000788
789#endif // MICROPY_ENABLE_COMPILER