blob: 42f755ed982187bd65dac65bf2214a7d339c5915 [file] [log] [blame]
Damien George04b91472014-05-03 23:27:38 +01001/*
2 * This file is part of the Micro Python project, http://micropython.org/
3 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26
Damien429d7192013-10-04 19:53:11 +010027/* lexer.c -- simple tokeniser for Python implementation
28 */
29
xbeefe34222014-03-16 00:14:26 -070030#include <stdbool.h>
Damien429d7192013-10-04 19:53:11 +010031#include <stdint.h>
32#include <stdio.h>
33#include <assert.h>
34
35#include "misc.h"
Damien George55baff42014-01-21 21:40:13 +000036#include "mpconfig.h"
37#include "qstr.h"
Damien429d7192013-10-04 19:53:11 +010038#include "lexer.h"
39
// Distance between tab stops, in columns; used when a tab character
// advances the lexer's column counter.
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010041
// TODO: CPython appears to allow a NUL byte in the input stream.
// It is unclear whether that is intentional; this lexer does not allow it.
44
Damiend99b0522013-12-21 18:17:45 +000045struct _mp_lexer_t {
Damien George08335002014-01-18 23:24:36 +000046 qstr source_name; // name of source
Damiena5185f42013-10-20 14:41:27 +010047 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000048 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
49 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010050
Damiena5185f42013-10-20 14:41:27 +010051 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010052
53 uint line; // source line
54 uint column; // source column
55
Damiena5185f42013-10-20 14:41:27 +010056 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
57 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010058
59 uint alloc_indent_level;
60 uint num_indent_level;
61 uint16_t *indent_level;
62
Damiena5185f42013-10-20 14:41:27 +010063 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000064 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010065};
66
Damien George97f9a282014-05-12 23:07:34 +010067// debug flag for __debug__ constant
68STATIC mp_token_kind_t mp_debug_value;
69
70void mp_set_debug(bool value) {
71 mp_debug_value = value ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE;
72}
73
Damien George9528cd62014-01-15 21:23:31 +000074// TODO replace with a call to a standard function
// Compare a NUL-terminated string with a length-delimited byte sequence.
//
//   str   NUL-terminated string
//   strn  byte sequence of len bytes; it need not be NUL-terminated
//   len   number of bytes of strn to compare
//
// Returns true only when str is exactly len bytes long and those bytes are
// identical to the first len bytes of strn. A negative len never matches.
bool str_strn_equal(const char *str, const char *strn, int len) {
    int i = 0;

    // Stop at the terminator of str: without this check a NUL byte inside
    // strn would compare equal to the terminator and the scan would carry on
    // reading past the end of str.
    while (i < len && *str != '\0' && *str == *strn) {
        ++i;
        ++str;
        ++strn;
    }

    // equal only if all of strn was matched and str ends at the same place
    return i == len && *str == '\0';
}
86
#ifdef MICROPY_DEBUG_PRINTERS
// Debugging aid: print one line describing tok (source position, kind, text
// pointer and length). When the token has text, the text follows, with every
// character that is not printable shown as '?'.
void mp_token_show(const mp_token_t *tok) {
    printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
    if (tok->str != NULL && tok->len > 0) {
        const char *end = tok->str + tok->len;
        printf(" ");
        // walk the text one UTF-8 encoded character at a time
        for (const char *p = tok->str; p < end; p = utf8_next_char(p)) {
            unichar c = utf8_get_char(p);
            if (unichar_isprint(c)) {
                printf("%c", c);
            } else {
                printf("?");
            }
        }
    }
    printf("\n");
}
#endif
Damien429d7192013-10-04 19:53:11 +0100107
// The character the lexer is currently positioned on.
#define CUR_CHAR(lex) ((lex)->chr0)
109
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200110STATIC bool is_end(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +0000111 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +0100112}
113
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200114STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100115 return lex->chr0 == '\n' || lex->chr0 == '\r';
116}
117
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200118STATIC bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +0100119 return lex->chr0 == c;
120}
121
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200122STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100123 return lex->chr0 == c1 || lex->chr0 == c2;
124}
125
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200126STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +0100127 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
128}
129
130/*
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200131STATIC bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +0100132 return lex->chr1 == c;
133}
134*/
135
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200136STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100137 return lex->chr1 == c1 || lex->chr1 == c2;
138}
139
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200140STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100141 return lex->chr2 == c1 || lex->chr2 == c2;
142}
143
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200144STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100145 return lex->chr0 == c1 && lex->chr1 == c2;
146}
147
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200148STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000149 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100150}
151
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200152STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000153 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100154}
155
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200156STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000157 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100158}
159
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200160STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000161 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100162}
163
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200164STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200165 return lex->chr1 >= '0' && lex->chr1 <= '7';
166}
167
Damien429d7192013-10-04 19:53:11 +0100168// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200169STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100170 return is_letter(lex) || lex->chr0 == '_';
171}
172
173// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200174STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100175 return is_head_of_identifier(lex) || is_digit(lex);
176}
177
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200178STATIC void next_char(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +0000179 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100180 return;
181 }
182
183 int advance = 1;
184
185 if (lex->chr0 == '\n') {
186 // LF is a new line
187 ++lex->line;
188 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100189 } else if (lex->chr0 == '\r') {
190 // CR is a new line
191 ++lex->line;
192 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100193 if (lex->chr1 == '\n') {
194 // CR LF is a single new line
195 advance = 2;
196 }
197 } else if (lex->chr0 == '\t') {
198 // a tab
199 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
200 } else {
201 // a character worth one column
202 ++lex->column;
203 }
204
205 for (; advance > 0; advance--) {
206 lex->chr0 = lex->chr1;
207 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100208 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000209 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100210 // EOF
Damiend99b0522013-12-21 18:17:45 +0000211 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100212 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100213 }
214 }
215 }
216}
217
Damiend99b0522013-12-21 18:17:45 +0000218void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100219 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100220 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100221 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
222 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100223 }
224 lex->indent_level[lex->num_indent_level++] = indent;
225}
226
Damiend99b0522013-12-21 18:17:45 +0000227uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100228 return lex->indent_level[lex->num_indent_level - 1];
229}
230
Damiend99b0522013-12-21 18:17:45 +0000231void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100232 lex->num_indent_level -= 1;
233}
234
235// some tricky operator encoding:
236// <op> = begin with <op>, if this opchar matches then begin here
237// e<op> = end with <op>, if this opchar matches then end
238// E<op> = mandatory end with <op>, this opchar must match, then end
239// c<op> = continue with <op>, if this opchar matches then continue matching
240// this means if the start of two ops are the same then they are equal til the last char
241
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200242STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100243 "()[]{},:;@~" // singles
244 "<e=c<e=" // < <= << <<=
245 ">e=c>e=" // > >= >> >>=
246 "*e=c*e=" // * *= ** **=
247 "+e=" // + +=
248 "-e=e>" // - -= ->
249 "&e=" // & &=
250 "|e=" // | |=
251 "/e=c/e=" // / /= // //=
252 "%e=" // % %=
253 "^e=" // ^ ^=
254 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100255 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100256
257// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200258STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000259 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
260 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
261 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
262 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100263
Damiend99b0522013-12-21 18:17:45 +0000264 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
265 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
266 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
267 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
268 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
269 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
270 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
271 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
272 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
273 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
274 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
275 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100276};
277
278// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200279STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100280 "False",
281 "None",
282 "True",
283 "and",
284 "as",
285 "assert",
286 "break",
287 "class",
288 "continue",
289 "def",
290 "del",
291 "elif",
292 "else",
293 "except",
294 "finally",
295 "for",
296 "from",
297 "global",
298 "if",
299 "import",
300 "in",
301 "is",
302 "lambda",
303 "nonlocal",
304 "not",
305 "or",
306 "pass",
307 "raise",
308 "return",
309 "try",
310 "while",
311 "with",
312 "yield",
Damien George97f9a282014-05-12 23:07:34 +0100313 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100314};
315
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200316STATIC int hex_digit(unichar c) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200317 // c is assumed to be hex digit
318 int n = c - '0';
319 if (n > 9) {
320 n &= ~('a' - 'A');
321 n -= ('A' - ('9' + 1));
322 }
323 return n;
324}
325
326// This is called with CUR_CHAR() before first hex digit, and should return with
327// it pointing to last hex digit
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200328STATIC bool get_hex(mp_lexer_t *lex, int num_digits, uint *result) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200329 uint num = 0;
330 while (num_digits-- != 0) {
331 next_char(lex);
332 unichar c = CUR_CHAR(lex);
333 if (!unichar_isxdigit(c)) {
334 return false;
335 }
336 num = (num << 4) + hex_digit(c);
337 }
338 *result = num;
339 return true;
340}
341
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200342STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100343 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100344 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100345 while (!is_end(lex)) {
346 if (is_physical_newline(lex)) {
347 had_physical_newline = true;
348 next_char(lex);
349 } else if (is_whitespace(lex)) {
350 next_char(lex);
351 } else if (is_char(lex, '#')) {
352 next_char(lex);
353 while (!is_end(lex) && !is_physical_newline(lex)) {
354 next_char(lex);
355 }
356 // had_physical_newline will be set on next loop
357 } else if (is_char(lex, '\\')) {
358 // backslash (outside string literals) must appear just before a physical newline
359 next_char(lex);
360 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000361 // SyntaxError: unexpected character after line continuation character
Damien George69a818d2014-01-12 13:55:24 +0000362 tok->src_line = lex->line;
363 tok->src_column = lex->column;
364 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
365 vstr_reset(&lex->vstr);
366 tok->str = vstr_str(&lex->vstr);
367 tok->len = 0;
368 return;
Damien429d7192013-10-04 19:53:11 +0100369 } else {
370 next_char(lex);
371 }
372 } else {
373 break;
374 }
375 }
376
Damiena5185f42013-10-20 14:41:27 +0100377 // set token source information
Damien429d7192013-10-04 19:53:11 +0100378 tok->src_line = lex->line;
379 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100380
Damiena5185f42013-10-20 14:41:27 +0100381 // start new token text
382 vstr_reset(&lex->vstr);
383
384 if (first_token && lex->line == 1 && lex->column != 1) {
385 // check that the first token is in the first column
386 // if first token is not on first line, we get a physical newline and
387 // this check is done as part of normal indent/dedent checking below
388 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000389 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100390
391 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000392 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100393 lex->emit_dent += 1;
394
395 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000396 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100397 lex->emit_dent -= 1;
398
Damien91d387d2013-10-09 15:09:52 +0100399 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000400 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100401
402 uint num_spaces = lex->column - 1;
403 lex->emit_dent = 0;
404 if (num_spaces == indent_top(lex)) {
405 } else if (num_spaces > indent_top(lex)) {
406 indent_push(lex, num_spaces);
407 lex->emit_dent += 1;
408 } else {
409 while (num_spaces < indent_top(lex)) {
410 indent_pop(lex);
411 lex->emit_dent -= 1;
412 }
413 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000414 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100415 }
416 }
417
418 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100419 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000420 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100421 lex->emit_dent = 0;
422 while (indent_top(lex) > 0) {
423 indent_pop(lex);
424 lex->emit_dent -= 1;
425 }
426 } else {
Damiend99b0522013-12-21 18:17:45 +0000427 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100428 }
429
430 } else if (is_char_or(lex, '\'', '\"')
431 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
432 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
433 // a string or bytes literal
434
435 // parse type codes
436 bool is_raw = false;
437 bool is_bytes = false;
438 if (is_char(lex, 'u')) {
439 next_char(lex);
440 } else if (is_char(lex, 'b')) {
441 is_bytes = true;
442 next_char(lex);
443 if (is_char(lex, 'r')) {
444 is_raw = true;
445 next_char(lex);
446 }
447 } else if (is_char(lex, 'r')) {
448 is_raw = true;
449 next_char(lex);
450 if (is_char(lex, 'b')) {
451 is_bytes = true;
452 next_char(lex);
453 }
454 }
455
456 // set token kind
457 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000458 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100459 } else {
Damiend99b0522013-12-21 18:17:45 +0000460 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100461 }
462
463 // get first quoting character
464 char quote_char = '\'';
465 if (is_char(lex, '\"')) {
466 quote_char = '\"';
467 }
468 next_char(lex);
469
470 // work out if it's a single or triple quoted literal
471 int num_quotes;
472 if (is_char_and(lex, quote_char, quote_char)) {
473 // triple quotes
474 next_char(lex);
475 next_char(lex);
476 num_quotes = 3;
477 } else {
478 // single quotes
479 num_quotes = 1;
480 }
481
Damien429d7192013-10-04 19:53:11 +0100482 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100483 int n_closing = 0;
484 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
485 if (is_char(lex, quote_char)) {
486 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100487 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100488 } else {
489 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100490 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100491 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100492 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100493 if (is_raw) {
494 // raw strings allow escaping of quotes, but the backslash is also emitted
495 vstr_add_char(&lex->vstr, '\\');
496 } else {
497 switch (c) {
498 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
499 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
500 case '\\': break;
501 case '\'': break;
502 case '"': break;
503 case 'a': c = 0x07; break;
504 case 'b': c = 0x08; break;
505 case 't': c = 0x09; break;
506 case 'n': c = 0x0a; break;
507 case 'v': c = 0x0b; break;
508 case 'f': c = 0x0c; break;
509 case 'r': c = 0x0d; break;
510 case 'x':
511 {
512 uint num = 0;
513 if (!get_hex(lex, 2, &num)) {
514 // TODO error message
515 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200516 }
517 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100518 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200519 }
Damien Georgea91f4142014-04-10 11:30:55 +0100520 case 'N': break; // TODO \N{name} only in strings
521 case 'u': break; // TODO \uxxxx only in strings
522 case 'U': break; // TODO \Uxxxxxxxx only in strings
523 default:
524 if (c >= '0' && c <= '7') {
525 // Octal sequence, 1-3 chars
526 int digits = 3;
527 int num = c - '0';
528 while (is_following_odigit(lex) && --digits != 0) {
529 next_char(lex);
530 num = num * 8 + (CUR_CHAR(lex) - '0');
531 }
532 c = num;
533 } else {
534 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
535 vstr_add_char(&lex->vstr, '\\');
536 }
537 break;
538 }
Damiena5185f42013-10-20 14:41:27 +0100539 }
Damiend99b0522013-12-21 18:17:45 +0000540 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100541 vstr_add_char(&lex->vstr, c);
542 }
543 } else {
544 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100545 }
546 }
547 next_char(lex);
548 }
549
550 // check we got the required end quotes
551 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000552 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100553 }
554
Damiena5185f42013-10-20 14:41:27 +0100555 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000556 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100557
558 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000559 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100560
Damiena5185f42013-10-20 14:41:27 +0100561 // get first char
562 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100563 next_char(lex);
564
Damiena5185f42013-10-20 14:41:27 +0100565 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100566 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100567 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100568 next_char(lex);
569 }
570
571 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000572 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100573
Damiena5185f42013-10-20 14:41:27 +0100574 // get first char
575 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100576 next_char(lex);
577
Damiena5185f42013-10-20 14:41:27 +0100578 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100579 while (!is_end(lex)) {
580 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100581 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100582 next_char(lex);
583 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100584 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100585 next_char(lex);
586 }
587 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100588 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100589 next_char(lex);
590 } else {
591 break;
592 }
593 }
594
Damien George2e9eb2d2014-04-10 12:19:33 +0100595 } else if (is_char(lex, '.')) {
596 // special handling for . and ... operators, because .. is not a valid operator
597
598 // get first char
599 vstr_add_char(&lex->vstr, '.');
600 next_char(lex);
601
602 if (is_char_and(lex, '.', '.')) {
603 vstr_add_char(&lex->vstr, '.');
604 vstr_add_char(&lex->vstr, '.');
605 next_char(lex);
606 next_char(lex);
607 tok->kind = MP_TOKEN_ELLIPSIS;
608 } else {
609 tok->kind = MP_TOKEN_DEL_PERIOD;
610 }
611
Damien429d7192013-10-04 19:53:11 +0100612 } else {
613 // search for encoded delimiter or operator
614
615 const char *t = tok_enc;
616 uint tok_enc_index = 0;
617 for (; *t != 0 && !is_char(lex, *t); t += 1) {
618 if (*t == 'e' || *t == 'c') {
619 t += 1;
620 } else if (*t == 'E') {
621 tok_enc_index -= 1;
622 t += 1;
623 }
624 tok_enc_index += 1;
625 }
626
627 next_char(lex);
628
629 if (*t == 0) {
630 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000631 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100632
633 } else {
634 // matched a delimiter or operator character
635
636 // get the maximum characters for a valid token
637 t += 1;
638 uint t_index = tok_enc_index;
639 for (;;) {
640 for (; *t == 'e'; t += 1) {
641 t += 1;
642 t_index += 1;
643 if (is_char(lex, *t)) {
644 next_char(lex);
645 tok_enc_index = t_index;
646 break;
647 }
648 }
649
650 if (*t == 'E') {
651 t += 1;
652 if (is_char(lex, *t)) {
653 next_char(lex);
654 tok_enc_index = t_index;
655 } else {
Damiend99b0522013-12-21 18:17:45 +0000656 tok->kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100657 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100658 }
659 break;
660 }
661
662 if (*t == 'c') {
663 t += 1;
664 t_index += 1;
665 if (is_char(lex, *t)) {
666 next_char(lex);
667 tok_enc_index = t_index;
668 t += 1;
669 } else {
670 break;
671 }
672 } else {
673 break;
674 }
675 }
676
677 // set token kind
678 tok->kind = tok_enc_kind[tok_enc_index];
679
Damien George2e9eb2d2014-04-10 12:19:33 +0100680 tok_enc_no_match:
681
Damien429d7192013-10-04 19:53:11 +0100682 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000683 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100684 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000685 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100686 lex->nested_bracket_level -= 1;
687 }
688 }
689 }
690
Damiena5185f42013-10-20 14:41:27 +0100691 // point token text to vstr buffer
692 tok->str = vstr_str(&lex->vstr);
693 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100694
Damiena5185f42013-10-20 14:41:27 +0100695 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000696 if (tok->kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100697 // We check for __debug__ here and convert it to its value. This is so
698 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
699 // need to check for this special token in many places in the compiler.
700 // TODO improve speed of these string comparisons
701 //for (int i = 0; tok_kw[i] != NULL; i++) {
702 for (int i = 0; i < ARRAY_SIZE(tok_kw); i++) {
Damiena5185f42013-10-20 14:41:27 +0100703 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Damien George97f9a282014-05-12 23:07:34 +0100704 if (i == ARRAY_SIZE(tok_kw) - 1) {
705 tok->kind = mp_debug_value;
706 } else {
707 tok->kind = MP_TOKEN_KW_FALSE + i;
708 }
Damien429d7192013-10-04 19:53:11 +0100709 break;
710 }
711 }
712 }
713}
714
Damien Georgeb829b5c2014-01-25 13:51:19 +0000715mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100716 mp_lexer_t *lex = m_new_maybe(mp_lexer_t, 1);
717
718 // check for memory allocation error
719 if (lex == NULL) {
720 if (stream_close) {
721 stream_close(stream_data);
722 }
723 return NULL;
724 }
Damien429d7192013-10-04 19:53:11 +0100725
Damien Georgeb829b5c2014-01-25 13:51:19 +0000726 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100727 lex->stream_data = stream_data;
728 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100729 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100730 lex->line = 1;
731 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100732 lex->emit_dent = 0;
733 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100734 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100735 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100736 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200737 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100738
Damien Georgee1199ec2014-05-10 17:48:01 +0100739 // check for memory allocation error
740 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
741 mp_lexer_free(lex);
742 return NULL;
743 }
744
745 // store sentinel for first indentation level
746 lex->indent_level[0] = 0;
747
Damien429d7192013-10-04 19:53:11 +0100748 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100749 lex->chr0 = stream_next_char(stream_data);
750 lex->chr1 = stream_next_char(stream_data);
751 lex->chr2 = stream_next_char(stream_data);
752
753 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000754 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100755 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000756 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100757 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100758 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100759 }
Damiend99b0522013-12-21 18:17:45 +0000760 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100761 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100762 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100763 }
Damien429d7192013-10-04 19:53:11 +0100764 }
765
Damiena5185f42013-10-20 14:41:27 +0100766 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000767 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100768
769 return lex;
770}
771
Damiend99b0522013-12-21 18:17:45 +0000772void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100773 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100774 if (lex->stream_close) {
775 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100776 }
Damienbb5316b2013-10-22 21:12:29 +0100777 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200778 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000779 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100780 }
Damien429d7192013-10-04 19:53:11 +0100781}
782
Damien George08335002014-01-18 23:24:36 +0000783qstr mp_lexer_source_name(mp_lexer_t *lex) {
784 return lex->source_name;
785}
786
Damiend99b0522013-12-21 18:17:45 +0000787void mp_lexer_to_next(mp_lexer_t *lex) {
788 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100789}
790
Damiend99b0522013-12-21 18:17:45 +0000791const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100792 return &lex->tok_cur;
793}
794
Damiend99b0522013-12-21 18:17:45 +0000795bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100796 return lex->tok_cur.kind == kind;
797}