/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
26
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "py/mpstate.h"
#include "py/lexer.h"
#include "py/runtime.h"
Damien429d7192013-10-04 19:53:11 +010033
Damien Georgedd5353a2015-12-18 12:35:44 +000034#if MICROPY_ENABLE_COMPILER
35
// Distance between tab stops, in columns; used when advancing the column
// counter over a '\t' character.
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010037
// TODO seems that CPython allows a NUL byte in the input stream;
// don't know if that's intentional or not, but we don't allow it
40
Damien George9528cd62014-01-15 21:23:31 +000041// TODO replace with a call to a standard function
Damien Georgea4c52c52014-12-05 19:35:18 +000042STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
Damien George54eb4e72014-07-03 13:47:47 +010043 mp_uint_t i = 0;
Damien429d7192013-10-04 19:53:11 +010044
Damiena5185f42013-10-20 14:41:27 +010045 while (i < len && *str == *strn) {
Damien429d7192013-10-04 19:53:11 +010046 ++i;
Damien429d7192013-10-04 19:53:11 +010047 ++str;
Damiena5185f42013-10-20 14:41:27 +010048 ++strn;
Damien429d7192013-10-04 19:53:11 +010049 }
50
Damiena5185f42013-10-20 14:41:27 +010051 return i == len && *str == 0;
Damien429d7192013-10-04 19:53:11 +010052}
53
// The character currently under the lexer's cursor.
#define CUR_CHAR(lex) ((lex)->chr0)
55
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020056STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010057 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010058}
59
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020060STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien George32bade12015-01-30 00:27:46 +000061 return lex->chr0 == '\n';
Damien429d7192013-10-04 19:53:11 +010062}
63
Damien George2e2e4042015-03-19 00:21:29 +000064STATIC bool is_char(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010065 return lex->chr0 == c;
66}
67
Damien George2e2e4042015-03-19 00:21:29 +000068STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010069 return lex->chr0 == c1 || lex->chr0 == c2;
70}
71
Damien George2e2e4042015-03-19 00:21:29 +000072STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
Damien429d7192013-10-04 19:53:11 +010073 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
74}
75
/*
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
    return lex->chr1 == c;
}
*/
81
Damien George2e2e4042015-03-19 00:21:29 +000082STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010083 return lex->chr1 == c1 || lex->chr1 == c2;
84}
85
Damien George2e2e4042015-03-19 00:21:29 +000086STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010087 return lex->chr2 == c1 || lex->chr2 == c2;
88}
89
Damien George2e2e4042015-03-19 00:21:29 +000090STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010091 return lex->chr0 == c1 && lex->chr1 == c2;
92}
93
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020094STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000095 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010096}
97
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020098STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000099 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100100}
101
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200102STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000103 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100104}
105
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200106STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000107 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100108}
109
Damien George2b000472015-09-07 17:33:44 +0100110STATIC bool is_following_base_char(mp_lexer_t *lex) {
111 const unichar chr1 = lex->chr1 | 0x20;
112 return chr1 == 'b' || chr1 == 'o' || chr1 == 'x';
Damien George7d414a12015-02-08 01:57:40 +0000113}
114
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200115STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200116 return lex->chr1 >= '0' && lex->chr1 <= '7';
117}
118
Damien George7ed58cb2015-06-09 10:58:07 +0000119// to easily parse utf-8 identifiers we allow any raw byte with high bit set
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200120STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien George7ed58cb2015-06-09 10:58:07 +0000121 return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
Damien429d7192013-10-04 19:53:11 +0100122}
123
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200124STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100125 return is_head_of_identifier(lex) || is_digit(lex);
126}
127
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200128STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100129 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100130 return;
131 }
132
Damien429d7192013-10-04 19:53:11 +0100133 if (lex->chr0 == '\n') {
Damien George32bade12015-01-30 00:27:46 +0000134 // a new line
Damien429d7192013-10-04 19:53:11 +0100135 ++lex->line;
136 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100137 } else if (lex->chr0 == '\t') {
138 // a tab
139 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
140 } else {
141 // a character worth one column
142 ++lex->column;
143 }
144
Damien George32bade12015-01-30 00:27:46 +0000145 lex->chr0 = lex->chr1;
146 lex->chr1 = lex->chr2;
147 lex->chr2 = lex->stream_next_byte(lex->stream_data);
148
149 if (lex->chr0 == '\r') {
150 // CR is a new line, converted to LF
151 lex->chr0 = '\n';
152 if (lex->chr1 == '\n') {
153 // CR LF is a single new line
154 lex->chr1 = lex->chr2;
155 lex->chr2 = lex->stream_next_byte(lex->stream_data);
156 }
157 }
158
159 if (lex->chr2 == MP_LEXER_EOF) {
160 // EOF, check if we need to insert a newline at end of file
161 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
162 // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
163 // otherwise it just inserts a LF
164 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100165 }
166 }
167}
168
Damien Georgea4c52c52014-12-05 19:35:18 +0000169STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100170 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100171 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100172 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
173 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100174 }
175 lex->indent_level[lex->num_indent_level++] = indent;
176}
177
Damien Georgea4c52c52014-12-05 19:35:18 +0000178STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100179 return lex->indent_level[lex->num_indent_level - 1];
180}
181
Damien Georgea4c52c52014-12-05 19:35:18 +0000182STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100183 lex->num_indent_level -= 1;
184}
185
186// some tricky operator encoding:
187// <op> = begin with <op>, if this opchar matches then begin here
188// e<op> = end with <op>, if this opchar matches then end
189// E<op> = mandatory end with <op>, this opchar must match, then end
190// c<op> = continue with <op>, if this opchar matches then continue matching
191// this means if the start of two ops are the same then they are equal til the last char
192
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200193STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100194 "()[]{},:;@~" // singles
195 "<e=c<e=" // < <= << <<=
196 ">e=c>e=" // > >= >> >>=
197 "*e=c*e=" // * *= ** **=
198 "+e=" // + +=
199 "-e=e>" // - -= ->
200 "&e=" // & &=
201 "|e=" // | |=
202 "/e=c/e=" // / /= // //=
203 "%e=" // % %=
204 "^e=" // ^ ^=
205 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100206 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100207
208// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200209STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000210 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
211 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
212 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
213 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100214
Damiend99b0522013-12-21 18:17:45 +0000215 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
216 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
217 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
218 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
219 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
220 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
221 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
222 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
223 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
224 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
225 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
226 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100227};
228
229// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200230STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100231 "False",
232 "None",
233 "True",
234 "and",
235 "as",
236 "assert",
237 "break",
238 "class",
239 "continue",
240 "def",
241 "del",
242 "elif",
243 "else",
244 "except",
245 "finally",
246 "for",
247 "from",
248 "global",
249 "if",
250 "import",
251 "in",
252 "is",
253 "lambda",
254 "nonlocal",
255 "not",
256 "or",
257 "pass",
258 "raise",
259 "return",
260 "try",
261 "while",
262 "with",
263 "yield",
Damien George97f9a282014-05-12 23:07:34 +0100264 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100265};
266
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200267// This is called with CUR_CHAR() before first hex digit, and should return with
268// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100269// num_digits must be greater than zero
270STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
271 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200272 while (num_digits-- != 0) {
273 next_char(lex);
274 unichar c = CUR_CHAR(lex);
275 if (!unichar_isxdigit(c)) {
276 return false;
277 }
Dave Hylands3ad94d62015-05-18 14:41:25 -0700278 num = (num << 4) + unichar_xdigit_value(c);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200279 }
280 *result = num;
281 return true;
282}
283
Damien Georgea4c52c52014-12-05 19:35:18 +0000284STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
285 // start new token text
286 vstr_reset(&lex->vstr);
287
Damiena5185f42013-10-20 14:41:27 +0100288 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100289 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100290 while (!is_end(lex)) {
291 if (is_physical_newline(lex)) {
292 had_physical_newline = true;
293 next_char(lex);
294 } else if (is_whitespace(lex)) {
295 next_char(lex);
296 } else if (is_char(lex, '#')) {
297 next_char(lex);
298 while (!is_end(lex) && !is_physical_newline(lex)) {
299 next_char(lex);
300 }
301 // had_physical_newline will be set on next loop
302 } else if (is_char(lex, '\\')) {
303 // backslash (outside string literals) must appear just before a physical newline
304 next_char(lex);
305 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000306 // SyntaxError: unexpected character after line continuation character
Damien Georgea4c52c52014-12-05 19:35:18 +0000307 lex->tok_line = lex->line;
308 lex->tok_column = lex->column;
309 lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
Damien George69a818d2014-01-12 13:55:24 +0000310 return;
Damien429d7192013-10-04 19:53:11 +0100311 } else {
312 next_char(lex);
313 }
314 } else {
315 break;
316 }
317 }
318
Damiena5185f42013-10-20 14:41:27 +0100319 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000320 lex->tok_line = lex->line;
321 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100322
323 if (first_token && lex->line == 1 && lex->column != 1) {
324 // check that the first token is in the first column
325 // if first token is not on first line, we get a physical newline and
326 // this check is done as part of normal indent/dedent checking below
327 // (done to get equivalence with CPython)
Damien Georgea4c52c52014-12-05 19:35:18 +0000328 lex->tok_kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100329
330 } else if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000331 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100332 lex->emit_dent += 1;
333
334 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000335 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100336 lex->emit_dent -= 1;
337
Damien91d387d2013-10-09 15:09:52 +0100338 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000339 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100340
Damien George54eb4e72014-07-03 13:47:47 +0100341 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100342 lex->emit_dent = 0;
343 if (num_spaces == indent_top(lex)) {
344 } else if (num_spaces > indent_top(lex)) {
345 indent_push(lex, num_spaces);
346 lex->emit_dent += 1;
347 } else {
348 while (num_spaces < indent_top(lex)) {
349 indent_pop(lex);
350 lex->emit_dent -= 1;
351 }
352 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000353 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100354 }
355 }
356
357 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100358 if (indent_top(lex) > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000359 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100360 lex->emit_dent = 0;
361 while (indent_top(lex) > 0) {
362 indent_pop(lex);
363 lex->emit_dent -= 1;
364 }
365 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000366 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100367 }
368
369 } else if (is_char_or(lex, '\'', '\"')
370 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
371 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
372 // a string or bytes literal
373
374 // parse type codes
375 bool is_raw = false;
376 bool is_bytes = false;
377 if (is_char(lex, 'u')) {
378 next_char(lex);
379 } else if (is_char(lex, 'b')) {
380 is_bytes = true;
381 next_char(lex);
382 if (is_char(lex, 'r')) {
383 is_raw = true;
384 next_char(lex);
385 }
386 } else if (is_char(lex, 'r')) {
387 is_raw = true;
388 next_char(lex);
389 if (is_char(lex, 'b')) {
390 is_bytes = true;
391 next_char(lex);
392 }
393 }
394
395 // set token kind
396 if (is_bytes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000397 lex->tok_kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100398 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000399 lex->tok_kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100400 }
401
402 // get first quoting character
403 char quote_char = '\'';
404 if (is_char(lex, '\"')) {
405 quote_char = '\"';
406 }
407 next_char(lex);
408
409 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100410 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100411 if (is_char_and(lex, quote_char, quote_char)) {
412 // triple quotes
413 next_char(lex);
414 next_char(lex);
415 num_quotes = 3;
416 } else {
417 // single quotes
418 num_quotes = 1;
419 }
420
Damien429d7192013-10-04 19:53:11 +0100421 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100422 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100423 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
424 if (is_char(lex, quote_char)) {
425 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100426 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100427 } else {
428 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100429 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100430 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100431 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100432 if (is_raw) {
433 // raw strings allow escaping of quotes, but the backslash is also emitted
434 vstr_add_char(&lex->vstr, '\\');
435 } else {
436 switch (c) {
Damien George94fbe972014-07-30 11:46:05 +0100437 case MP_LEXER_EOF: break; // TODO a proper error message?
438 case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100439 case '\\': break;
440 case '\'': break;
441 case '"': break;
442 case 'a': c = 0x07; break;
443 case 'b': c = 0x08; break;
444 case 't': c = 0x09; break;
445 case 'n': c = 0x0a; break;
446 case 'v': c = 0x0b; break;
447 case 'f': c = 0x0c; break;
448 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000449 case 'u':
450 case 'U':
451 if (is_bytes) {
452 // b'\u1234' == b'\\u1234'
453 vstr_add_char(&lex->vstr, '\\');
454 break;
455 }
456 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100457 case 'x':
458 {
Damien George54eb4e72014-07-03 13:47:47 +0100459 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000460 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georged241c2a2015-07-23 23:20:37 +0100461 // not enough hex chars for escape sequence
462 lex->tok_kind = MP_TOKEN_INVALID;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200463 }
464 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100465 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200466 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000467 case 'N':
468 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
469 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
470 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
471 // roughly half a meg of storage. This form of Unicode escape may be added
472 // later on, but it's definitely not a priority right now. -- CJA 20140607
Damien George081f9322015-09-07 17:08:49 +0100473 mp_not_implemented("unicode name escapes");
Chris Angelico2ba22992014-06-04 05:28:12 +1000474 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100475 default:
476 if (c >= '0' && c <= '7') {
477 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100478 mp_uint_t digits = 3;
479 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100480 while (is_following_odigit(lex) && --digits != 0) {
481 next_char(lex);
482 num = num * 8 + (CUR_CHAR(lex) - '0');
483 }
484 c = num;
485 } else {
486 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
487 vstr_add_char(&lex->vstr, '\\');
488 }
489 break;
490 }
Damiena5185f42013-10-20 14:41:27 +0100491 }
Damien George94fbe972014-07-30 11:46:05 +0100492 if (c != MP_LEXER_EOF) {
Damien Georgeea235202016-02-11 22:30:53 +0000493 if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
494 if (c < 0x110000 && !is_bytes) {
495 vstr_add_char(&lex->vstr, c);
496 } else if (c < 0x100 && is_bytes) {
497 vstr_add_byte(&lex->vstr, c);
498 } else {
499 // unicode character out of range
500 // this raises a generic SyntaxError; could provide more info
501 lex->tok_kind = MP_TOKEN_INVALID;
502 }
503 } else {
504 // without unicode everything is just added as an 8-bit byte
505 if (c < 0x100) {
506 vstr_add_byte(&lex->vstr, c);
507 } else {
508 // 8-bit character out of range
509 // this raises a generic SyntaxError; could provide more info
510 lex->tok_kind = MP_TOKEN_INVALID;
511 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000512 }
Damiena5185f42013-10-20 14:41:27 +0100513 }
514 } else {
Damien George94fbe972014-07-30 11:46:05 +0100515 // Add the "character" as a byte so that we remain 8-bit clean.
516 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
517 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100518 }
519 }
520 next_char(lex);
521 }
522
523 // check we got the required end quotes
524 if (n_closing < num_quotes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000525 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100526 }
527
Damiena5185f42013-10-20 14:41:27 +0100528 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000529 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100530
531 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000532 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100533
Damien George7ed58cb2015-06-09 10:58:07 +0000534 // get first char (add as byte to remain 8-bit clean and support utf-8)
535 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100536 next_char(lex);
537
Damiena5185f42013-10-20 14:41:27 +0100538 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100539 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damien George7ed58cb2015-06-09 10:58:07 +0000540 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100541 next_char(lex);
542 }
543
544 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien George7d414a12015-02-08 01:57:40 +0000545 bool forced_integer = false;
546 if (is_char(lex, '.')) {
547 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
548 } else {
549 lex->tok_kind = MP_TOKEN_INTEGER;
Damien George2b000472015-09-07 17:33:44 +0100550 if (is_char(lex, '0') && is_following_base_char(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000551 forced_integer = true;
552 }
553 }
Damien429d7192013-10-04 19:53:11 +0100554
Damiena5185f42013-10-20 14:41:27 +0100555 // get first char
556 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100557 next_char(lex);
558
Damiena5185f42013-10-20 14:41:27 +0100559 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100560 while (!is_end(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000561 if (!forced_integer && is_char_or(lex, 'e', 'E')) {
562 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
Damiena5185f42013-10-20 14:41:27 +0100563 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100564 next_char(lex);
565 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100566 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100567 next_char(lex);
568 }
Damien George7d414a12015-02-08 01:57:40 +0000569 } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
570 if (is_char_or3(lex, '.', 'j', 'J')) {
571 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
572 }
Damiena5185f42013-10-20 14:41:27 +0100573 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100574 next_char(lex);
575 } else {
576 break;
577 }
578 }
579
Damien George2e9eb2d2014-04-10 12:19:33 +0100580 } else if (is_char(lex, '.')) {
581 // special handling for . and ... operators, because .. is not a valid operator
582
583 // get first char
584 vstr_add_char(&lex->vstr, '.');
585 next_char(lex);
586
587 if (is_char_and(lex, '.', '.')) {
588 vstr_add_char(&lex->vstr, '.');
589 vstr_add_char(&lex->vstr, '.');
590 next_char(lex);
591 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000592 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100593 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000594 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100595 }
596
Damien429d7192013-10-04 19:53:11 +0100597 } else {
598 // search for encoded delimiter or operator
599
600 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100601 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100602 for (; *t != 0 && !is_char(lex, *t); t += 1) {
603 if (*t == 'e' || *t == 'c') {
604 t += 1;
605 } else if (*t == 'E') {
606 tok_enc_index -= 1;
607 t += 1;
608 }
609 tok_enc_index += 1;
610 }
611
612 next_char(lex);
613
614 if (*t == 0) {
615 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000616 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100617
618 } else {
619 // matched a delimiter or operator character
620
621 // get the maximum characters for a valid token
622 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100623 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100624 for (;;) {
625 for (; *t == 'e'; t += 1) {
626 t += 1;
627 t_index += 1;
628 if (is_char(lex, *t)) {
629 next_char(lex);
630 tok_enc_index = t_index;
631 break;
632 }
633 }
634
635 if (*t == 'E') {
636 t += 1;
637 if (is_char(lex, *t)) {
638 next_char(lex);
639 tok_enc_index = t_index;
640 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000641 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100642 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100643 }
644 break;
645 }
646
647 if (*t == 'c') {
648 t += 1;
649 t_index += 1;
650 if (is_char(lex, *t)) {
651 next_char(lex);
652 tok_enc_index = t_index;
653 t += 1;
654 } else {
655 break;
656 }
657 } else {
658 break;
659 }
660 }
661
662 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000663 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100664
Damien George2e9eb2d2014-04-10 12:19:33 +0100665 tok_enc_no_match:
666
Damien429d7192013-10-04 19:53:11 +0100667 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000668 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100669 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000670 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100671 lex->nested_bracket_level -= 1;
672 }
673 }
674 }
675
Damiena5185f42013-10-20 14:41:27 +0100676 // check for keywords
Damien Georgea4c52c52014-12-05 19:35:18 +0000677 if (lex->tok_kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100678 // We check for __debug__ here and convert it to its value. This is so
679 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
680 // need to check for this special token in many places in the compiler.
681 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100682 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
Damien George963a5a32015-01-16 17:47:07 +0000683 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000684 if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200685 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
686 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Damien Georgeb4b10fd2015-01-01 23:30:53 +0000687 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100688 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000689 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
Damien George97f9a282014-05-12 23:07:34 +0100690 }
Damien429d7192013-10-04 19:53:11 +0100691 break;
692 }
693 }
694 }
695}
696
Damien George94fbe972014-07-30 11:46:05 +0100697mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
Damien George9bf5f282014-10-09 16:53:37 +0100698 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100699
700 // check for memory allocation error
701 if (lex == NULL) {
702 if (stream_close) {
703 stream_close(stream_data);
704 }
705 return NULL;
706 }
Damien429d7192013-10-04 19:53:11 +0100707
Damien Georgeb829b5c2014-01-25 13:51:19 +0000708 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100709 lex->stream_data = stream_data;
Damien George94fbe972014-07-30 11:46:05 +0100710 lex->stream_next_byte = stream_next_byte;
Damienfa2162b2013-10-20 17:42:00 +0100711 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100712 lex->line = 1;
713 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100714 lex->emit_dent = 0;
715 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100716 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100717 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100718 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200719 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100720
Damien Georgee1199ec2014-05-10 17:48:01 +0100721 // check for memory allocation error
722 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
723 mp_lexer_free(lex);
724 return NULL;
725 }
726
727 // store sentinel for first indentation level
728 lex->indent_level[0] = 0;
729
Damien429d7192013-10-04 19:53:11 +0100730 // preload characters
Damien George94fbe972014-07-30 11:46:05 +0100731 lex->chr0 = stream_next_byte(stream_data);
732 lex->chr1 = stream_next_byte(stream_data);
733 lex->chr2 = stream_next_byte(stream_data);
Damiena5185f42013-10-20 14:41:27 +0100734
735 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100736 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100737 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100738 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000739 if (lex->chr0 == '\r') {
740 lex->chr0 = '\n';
741 } else if (lex->chr0 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100742 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100743 }
Damien George94fbe972014-07-30 11:46:05 +0100744 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000745 if (lex->chr1 == '\r') {
746 lex->chr1 = '\n';
747 } else if (lex->chr1 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100748 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100749 }
Damien429d7192013-10-04 19:53:11 +0100750 }
751
Damiena5185f42013-10-20 14:41:27 +0100752 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000753 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100754
755 return lex;
756}
757
Damiend99b0522013-12-21 18:17:45 +0000758void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100759 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100760 if (lex->stream_close) {
761 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100762 }
Damienbb5316b2013-10-22 21:12:29 +0100763 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200764 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000765 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100766 }
Damien429d7192013-10-04 19:53:11 +0100767}
768
Damiend99b0522013-12-21 18:17:45 +0000769void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000770 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100771}
772
#if MICROPY_DEBUG_PRINTERS
// Debugging aid: print the current token's position, kind, text buffer
// address and length, followed by the token text.  The text is decoded as
// utf-8 and any character rejected by unichar_isprint() is shown as '?'.
void mp_lexer_show_token(const mp_lexer_t *lex) {
    // %p requires a pointer to void, hence the cast
    printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, (void *)lex->vstr.buf, lex->vstr.len);
    if (lex->vstr.len > 0) {
        const byte *i = (const byte *)lex->vstr.buf;
        const byte *j = (const byte *)i + lex->vstr.len;
        printf(" ");
        while (i < j) {
            unichar c = utf8_get_char(i);
            i = utf8_next_char(i);
            if (unichar_isprint(c)) {
                printf("%c", (int)c);
            } else {
                printf("?");
            }
        }
    }
    printf("\n");
}
#endif
Damien Georgedd5353a2015-12-18 12:35:44 +0000793
794#endif // MICROPY_ENABLE_COMPILER