blob: f69c395e7e4510a9dab282ec0087ceff5407090e [file] [log] [blame]
Damien George04b91472014-05-03 23:27:38 +01001/*
2 * This file is part of the Micro Python project, http://micropython.org/
3 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26
Damien429d7192013-10-04 19:53:11 +010027/* lexer.c -- simple tokeniser for Python implementation
28 */
29
xbeefe34222014-03-16 00:14:26 -070030#include <stdbool.h>
Damien429d7192013-10-04 19:53:11 +010031#include <stdint.h>
32#include <stdio.h>
33#include <assert.h>
34
35#include "misc.h"
Damien George55baff42014-01-21 21:40:13 +000036#include "mpconfig.h"
37#include "qstr.h"
Damien429d7192013-10-04 19:53:11 +010038#include "lexer.h"
39
40#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010041
Damien92c06562013-10-22 22:32:27 +010042// TODO seems that CPython allows NULL byte in the input stream
43// don't know if that's intentional or not, but we don't allow it
44
Damiend99b0522013-12-21 18:17:45 +000045struct _mp_lexer_t {
Damien George08335002014-01-18 23:24:36 +000046 qstr source_name; // name of source
Damiena5185f42013-10-20 14:41:27 +010047 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000048 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
49 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010050
Damiena5185f42013-10-20 14:41:27 +010051 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010052
53 uint line; // source line
54 uint column; // source column
55
Damiena5185f42013-10-20 14:41:27 +010056 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
57 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010058
59 uint alloc_indent_level;
60 uint num_indent_level;
61 uint16_t *indent_level;
62
Damiena5185f42013-10-20 14:41:27 +010063 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000064 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010065};
66
Paul Sokolovskyd3439d02014-06-02 19:37:55 +030067uint mp_optimise_value;
Damien George97f9a282014-05-12 23:07:34 +010068
// Compare a NUL-terminated string against a length-delimited one.
//
//   str  - NUL-terminated string
//   strn - buffer of at least len bytes; need not be NUL-terminated
//   len  - number of bytes of strn to compare
//
// Returns true only when str is exactly len bytes long and those bytes equal
// the first len bytes of strn.  A negative len never matches.
//
// Note: strncmp() is not a drop-in replacement because it stops at a NUL that
// appears in both inputs, after which checking str[len] would read past the
// end of str.
bool str_strn_equal(const char *str, const char *strn, int len) {
    if (len < 0) {
        return false;
    }

    for (int i = 0; i < len; i++) {
        // Bail out as soon as str ends: this keeps us inside str even when
        // strn contains an embedded NUL at the same offset.
        if (str[i] == '\0' || str[i] != strn[i]) {
            return false;
        }
    }

    // all len bytes matched; str must end exactly here
    return str[len] == '\0';
}
81
#ifdef MICROPY_DEBUG_PRINTERS
// Debug helper: print one token on a single line of stdout.
// The line contains the token's source position, kind, text pointer and
// length, followed (when the token has text) by the text itself with every
// non-printable character shown as '?'.
void mp_token_show(const mp_token_t *tok) {
    printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
    if (tok->str != NULL && tok->len > 0) {
        const byte *end = (const byte *)tok->str + tok->len;
        printf(" ");
        // walk the token text one UTF-8 character at a time
        for (const byte *p = (const byte *)tok->str; p < end; p = utf8_next_char(p)) {
            unichar c = utf8_get_char(p);
            if (!unichar_isprint(c)) {
                printf("?");
            } else {
                printf("%c", c);
            }
        }
    }
    printf("\n");
}
#endif
Damien429d7192013-10-04 19:53:11 +0100102
Damiena5185f42013-10-20 14:41:27 +0100103#define CUR_CHAR(lex) ((lex)->chr0)
104
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200105STATIC bool is_end(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +0000106 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +0100107}
108
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200109STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100110 return lex->chr0 == '\n' || lex->chr0 == '\r';
111}
112
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200113STATIC bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +0100114 return lex->chr0 == c;
115}
116
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200117STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100118 return lex->chr0 == c1 || lex->chr0 == c2;
119}
120
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200121STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +0100122 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
123}
124
125/*
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200126STATIC bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +0100127 return lex->chr1 == c;
128}
129*/
130
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200131STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100132 return lex->chr1 == c1 || lex->chr1 == c2;
133}
134
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200135STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100136 return lex->chr2 == c1 || lex->chr2 == c2;
137}
138
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200139STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100140 return lex->chr0 == c1 && lex->chr1 == c2;
141}
142
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200143STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000144 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100145}
146
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200147STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000148 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100149}
150
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200151STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000152 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100153}
154
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200155STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000156 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100157}
158
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200159STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200160 return lex->chr1 >= '0' && lex->chr1 <= '7';
161}
162
Damien429d7192013-10-04 19:53:11 +0100163// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200164STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100165 return is_letter(lex) || lex->chr0 == '_';
166}
167
168// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200169STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100170 return is_head_of_identifier(lex) || is_digit(lex);
171}
172
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200173STATIC void next_char(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +0000174 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100175 return;
176 }
177
178 int advance = 1;
179
180 if (lex->chr0 == '\n') {
181 // LF is a new line
182 ++lex->line;
183 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100184 } else if (lex->chr0 == '\r') {
185 // CR is a new line
186 ++lex->line;
187 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100188 if (lex->chr1 == '\n') {
189 // CR LF is a single new line
190 advance = 2;
191 }
192 } else if (lex->chr0 == '\t') {
193 // a tab
194 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
195 } else {
196 // a character worth one column
197 ++lex->column;
198 }
199
200 for (; advance > 0; advance--) {
201 lex->chr0 = lex->chr1;
202 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100203 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000204 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100205 // EOF
Damiend99b0522013-12-21 18:17:45 +0000206 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100207 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100208 }
209 }
210 }
211}
212
Damiend99b0522013-12-21 18:17:45 +0000213void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100214 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100215 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100216 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
217 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100218 }
219 lex->indent_level[lex->num_indent_level++] = indent;
220}
221
Damiend99b0522013-12-21 18:17:45 +0000222uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100223 return lex->indent_level[lex->num_indent_level - 1];
224}
225
Damiend99b0522013-12-21 18:17:45 +0000226void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100227 lex->num_indent_level -= 1;
228}
229
230// some tricky operator encoding:
231// <op> = begin with <op>, if this opchar matches then begin here
232// e<op> = end with <op>, if this opchar matches then end
233// E<op> = mandatory end with <op>, this opchar must match, then end
234// c<op> = continue with <op>, if this opchar matches then continue matching
235// this means if the start of two ops are the same then they are equal til the last char
236
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200237STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100238 "()[]{},:;@~" // singles
239 "<e=c<e=" // < <= << <<=
240 ">e=c>e=" // > >= >> >>=
241 "*e=c*e=" // * *= ** **=
242 "+e=" // + +=
243 "-e=e>" // - -= ->
244 "&e=" // & &=
245 "|e=" // | |=
246 "/e=c/e=" // / /= // //=
247 "%e=" // % %=
248 "^e=" // ^ ^=
249 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100250 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100251
252// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200253STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000254 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
255 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
256 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
257 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100258
Damiend99b0522013-12-21 18:17:45 +0000259 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
260 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
261 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
262 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
263 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
264 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
265 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
266 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
267 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
268 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
269 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
270 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100271};
272
273// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200274STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100275 "False",
276 "None",
277 "True",
278 "and",
279 "as",
280 "assert",
281 "break",
282 "class",
283 "continue",
284 "def",
285 "del",
286 "elif",
287 "else",
288 "except",
289 "finally",
290 "for",
291 "from",
292 "global",
293 "if",
294 "import",
295 "in",
296 "is",
297 "lambda",
298 "nonlocal",
299 "not",
300 "or",
301 "pass",
302 "raise",
303 "return",
304 "try",
305 "while",
306 "with",
307 "yield",
Damien George97f9a282014-05-12 23:07:34 +0100308 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100309};
310
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200311STATIC int hex_digit(unichar c) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200312 // c is assumed to be hex digit
313 int n = c - '0';
314 if (n > 9) {
315 n &= ~('a' - 'A');
316 n -= ('A' - ('9' + 1));
317 }
318 return n;
319}
320
321// This is called with CUR_CHAR() before first hex digit, and should return with
322// it pointing to last hex digit
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200323STATIC bool get_hex(mp_lexer_t *lex, int num_digits, uint *result) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200324 uint num = 0;
325 while (num_digits-- != 0) {
326 next_char(lex);
327 unichar c = CUR_CHAR(lex);
328 if (!unichar_isxdigit(c)) {
329 return false;
330 }
331 num = (num << 4) + hex_digit(c);
332 }
333 *result = num;
334 return true;
335}
336
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200337STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100338 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100339 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100340 while (!is_end(lex)) {
341 if (is_physical_newline(lex)) {
342 had_physical_newline = true;
343 next_char(lex);
344 } else if (is_whitespace(lex)) {
345 next_char(lex);
346 } else if (is_char(lex, '#')) {
347 next_char(lex);
348 while (!is_end(lex) && !is_physical_newline(lex)) {
349 next_char(lex);
350 }
351 // had_physical_newline will be set on next loop
352 } else if (is_char(lex, '\\')) {
353 // backslash (outside string literals) must appear just before a physical newline
354 next_char(lex);
355 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000356 // SyntaxError: unexpected character after line continuation character
Damien George69a818d2014-01-12 13:55:24 +0000357 tok->src_line = lex->line;
358 tok->src_column = lex->column;
359 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
360 vstr_reset(&lex->vstr);
361 tok->str = vstr_str(&lex->vstr);
362 tok->len = 0;
363 return;
Damien429d7192013-10-04 19:53:11 +0100364 } else {
365 next_char(lex);
366 }
367 } else {
368 break;
369 }
370 }
371
Damiena5185f42013-10-20 14:41:27 +0100372 // set token source information
Damien429d7192013-10-04 19:53:11 +0100373 tok->src_line = lex->line;
374 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100375
Damiena5185f42013-10-20 14:41:27 +0100376 // start new token text
377 vstr_reset(&lex->vstr);
378
379 if (first_token && lex->line == 1 && lex->column != 1) {
380 // check that the first token is in the first column
381 // if first token is not on first line, we get a physical newline and
382 // this check is done as part of normal indent/dedent checking below
383 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000384 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100385
386 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000387 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100388 lex->emit_dent += 1;
389
390 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000391 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100392 lex->emit_dent -= 1;
393
Damien91d387d2013-10-09 15:09:52 +0100394 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000395 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100396
397 uint num_spaces = lex->column - 1;
398 lex->emit_dent = 0;
399 if (num_spaces == indent_top(lex)) {
400 } else if (num_spaces > indent_top(lex)) {
401 indent_push(lex, num_spaces);
402 lex->emit_dent += 1;
403 } else {
404 while (num_spaces < indent_top(lex)) {
405 indent_pop(lex);
406 lex->emit_dent -= 1;
407 }
408 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000409 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100410 }
411 }
412
413 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100414 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000415 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100416 lex->emit_dent = 0;
417 while (indent_top(lex) > 0) {
418 indent_pop(lex);
419 lex->emit_dent -= 1;
420 }
421 } else {
Damiend99b0522013-12-21 18:17:45 +0000422 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100423 }
424
425 } else if (is_char_or(lex, '\'', '\"')
426 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
427 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
428 // a string or bytes literal
429
430 // parse type codes
431 bool is_raw = false;
432 bool is_bytes = false;
433 if (is_char(lex, 'u')) {
434 next_char(lex);
435 } else if (is_char(lex, 'b')) {
436 is_bytes = true;
437 next_char(lex);
438 if (is_char(lex, 'r')) {
439 is_raw = true;
440 next_char(lex);
441 }
442 } else if (is_char(lex, 'r')) {
443 is_raw = true;
444 next_char(lex);
445 if (is_char(lex, 'b')) {
446 is_bytes = true;
447 next_char(lex);
448 }
449 }
450
451 // set token kind
452 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000453 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100454 } else {
Damiend99b0522013-12-21 18:17:45 +0000455 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100456 }
457
458 // get first quoting character
459 char quote_char = '\'';
460 if (is_char(lex, '\"')) {
461 quote_char = '\"';
462 }
463 next_char(lex);
464
465 // work out if it's a single or triple quoted literal
466 int num_quotes;
467 if (is_char_and(lex, quote_char, quote_char)) {
468 // triple quotes
469 next_char(lex);
470 next_char(lex);
471 num_quotes = 3;
472 } else {
473 // single quotes
474 num_quotes = 1;
475 }
476
Damien429d7192013-10-04 19:53:11 +0100477 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100478 int n_closing = 0;
479 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
480 if (is_char(lex, quote_char)) {
481 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100482 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100483 } else {
484 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100485 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100486 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100487 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100488 if (is_raw) {
489 // raw strings allow escaping of quotes, but the backslash is also emitted
490 vstr_add_char(&lex->vstr, '\\');
491 } else {
492 switch (c) {
493 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
494 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
495 case '\\': break;
496 case '\'': break;
497 case '"': break;
498 case 'a': c = 0x07; break;
499 case 'b': c = 0x08; break;
500 case 't': c = 0x09; break;
501 case 'n': c = 0x0a; break;
502 case 'v': c = 0x0b; break;
503 case 'f': c = 0x0c; break;
504 case 'r': c = 0x0d; break;
505 case 'x':
506 {
507 uint num = 0;
508 if (!get_hex(lex, 2, &num)) {
509 // TODO error message
510 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200511 }
512 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100513 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200514 }
Damien Georgea91f4142014-04-10 11:30:55 +0100515 case 'N': break; // TODO \N{name} only in strings
516 case 'u': break; // TODO \uxxxx only in strings
517 case 'U': break; // TODO \Uxxxxxxxx only in strings
518 default:
519 if (c >= '0' && c <= '7') {
520 // Octal sequence, 1-3 chars
521 int digits = 3;
522 int num = c - '0';
523 while (is_following_odigit(lex) && --digits != 0) {
524 next_char(lex);
525 num = num * 8 + (CUR_CHAR(lex) - '0');
526 }
527 c = num;
528 } else {
529 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
530 vstr_add_char(&lex->vstr, '\\');
531 }
532 break;
533 }
Damiena5185f42013-10-20 14:41:27 +0100534 }
Damiend99b0522013-12-21 18:17:45 +0000535 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100536 vstr_add_char(&lex->vstr, c);
537 }
538 } else {
539 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100540 }
541 }
542 next_char(lex);
543 }
544
545 // check we got the required end quotes
546 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000547 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100548 }
549
Damiena5185f42013-10-20 14:41:27 +0100550 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000551 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100552
553 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000554 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100555
Damiena5185f42013-10-20 14:41:27 +0100556 // get first char
557 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100558 next_char(lex);
559
Damiena5185f42013-10-20 14:41:27 +0100560 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100561 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100562 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100563 next_char(lex);
564 }
565
566 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000567 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100568
Damiena5185f42013-10-20 14:41:27 +0100569 // get first char
570 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100571 next_char(lex);
572
Damiena5185f42013-10-20 14:41:27 +0100573 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100574 while (!is_end(lex)) {
575 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100576 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100577 next_char(lex);
578 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100579 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100580 next_char(lex);
581 }
582 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100583 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100584 next_char(lex);
585 } else {
586 break;
587 }
588 }
589
Damien George2e9eb2d2014-04-10 12:19:33 +0100590 } else if (is_char(lex, '.')) {
591 // special handling for . and ... operators, because .. is not a valid operator
592
593 // get first char
594 vstr_add_char(&lex->vstr, '.');
595 next_char(lex);
596
597 if (is_char_and(lex, '.', '.')) {
598 vstr_add_char(&lex->vstr, '.');
599 vstr_add_char(&lex->vstr, '.');
600 next_char(lex);
601 next_char(lex);
602 tok->kind = MP_TOKEN_ELLIPSIS;
603 } else {
604 tok->kind = MP_TOKEN_DEL_PERIOD;
605 }
606
Damien429d7192013-10-04 19:53:11 +0100607 } else {
608 // search for encoded delimiter or operator
609
610 const char *t = tok_enc;
611 uint tok_enc_index = 0;
612 for (; *t != 0 && !is_char(lex, *t); t += 1) {
613 if (*t == 'e' || *t == 'c') {
614 t += 1;
615 } else if (*t == 'E') {
616 tok_enc_index -= 1;
617 t += 1;
618 }
619 tok_enc_index += 1;
620 }
621
622 next_char(lex);
623
624 if (*t == 0) {
625 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000626 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100627
628 } else {
629 // matched a delimiter or operator character
630
631 // get the maximum characters for a valid token
632 t += 1;
633 uint t_index = tok_enc_index;
634 for (;;) {
635 for (; *t == 'e'; t += 1) {
636 t += 1;
637 t_index += 1;
638 if (is_char(lex, *t)) {
639 next_char(lex);
640 tok_enc_index = t_index;
641 break;
642 }
643 }
644
645 if (*t == 'E') {
646 t += 1;
647 if (is_char(lex, *t)) {
648 next_char(lex);
649 tok_enc_index = t_index;
650 } else {
Damiend99b0522013-12-21 18:17:45 +0000651 tok->kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100652 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100653 }
654 break;
655 }
656
657 if (*t == 'c') {
658 t += 1;
659 t_index += 1;
660 if (is_char(lex, *t)) {
661 next_char(lex);
662 tok_enc_index = t_index;
663 t += 1;
664 } else {
665 break;
666 }
667 } else {
668 break;
669 }
670 }
671
672 // set token kind
673 tok->kind = tok_enc_kind[tok_enc_index];
674
Damien George2e9eb2d2014-04-10 12:19:33 +0100675 tok_enc_no_match:
676
Damien429d7192013-10-04 19:53:11 +0100677 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000678 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100679 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000680 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100681 lex->nested_bracket_level -= 1;
682 }
683 }
684 }
685
Damiena5185f42013-10-20 14:41:27 +0100686 // point token text to vstr buffer
687 tok->str = vstr_str(&lex->vstr);
688 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100689
Damiena5185f42013-10-20 14:41:27 +0100690 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000691 if (tok->kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100692 // We check for __debug__ here and convert it to its value. This is so
693 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
694 // need to check for this special token in many places in the compiler.
695 // TODO improve speed of these string comparisons
696 //for (int i = 0; tok_kw[i] != NULL; i++) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200697 for (int i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damiena5185f42013-10-20 14:41:27 +0100698 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200699 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
700 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Paul Sokolovskyd3439d02014-06-02 19:37:55 +0300701 tok->kind = (mp_optimise_value == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100702 } else {
703 tok->kind = MP_TOKEN_KW_FALSE + i;
704 }
Damien429d7192013-10-04 19:53:11 +0100705 break;
706 }
707 }
708 }
709}
710
Damien Georgeb829b5c2014-01-25 13:51:19 +0000711mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100712 mp_lexer_t *lex = m_new_maybe(mp_lexer_t, 1);
713
714 // check for memory allocation error
715 if (lex == NULL) {
716 if (stream_close) {
717 stream_close(stream_data);
718 }
719 return NULL;
720 }
Damien429d7192013-10-04 19:53:11 +0100721
Damien Georgeb829b5c2014-01-25 13:51:19 +0000722 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100723 lex->stream_data = stream_data;
724 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100725 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100726 lex->line = 1;
727 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100728 lex->emit_dent = 0;
729 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100730 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100731 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100732 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200733 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100734
Damien Georgee1199ec2014-05-10 17:48:01 +0100735 // check for memory allocation error
736 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
737 mp_lexer_free(lex);
738 return NULL;
739 }
740
741 // store sentinel for first indentation level
742 lex->indent_level[0] = 0;
743
Damien429d7192013-10-04 19:53:11 +0100744 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100745 lex->chr0 = stream_next_char(stream_data);
746 lex->chr1 = stream_next_char(stream_data);
747 lex->chr2 = stream_next_char(stream_data);
748
749 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000750 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100751 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000752 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100753 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100754 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100755 }
Damiend99b0522013-12-21 18:17:45 +0000756 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100757 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100758 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100759 }
Damien429d7192013-10-04 19:53:11 +0100760 }
761
Damiena5185f42013-10-20 14:41:27 +0100762 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000763 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100764
765 return lex;
766}
767
Damiend99b0522013-12-21 18:17:45 +0000768void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100769 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100770 if (lex->stream_close) {
771 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100772 }
Damienbb5316b2013-10-22 21:12:29 +0100773 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200774 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000775 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100776 }
Damien429d7192013-10-04 19:53:11 +0100777}
778
Damien George08335002014-01-18 23:24:36 +0000779qstr mp_lexer_source_name(mp_lexer_t *lex) {
780 return lex->source_name;
781}
782
Damiend99b0522013-12-21 18:17:45 +0000783void mp_lexer_to_next(mp_lexer_t *lex) {
784 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100785}
786
Damiend99b0522013-12-21 18:17:45 +0000787const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100788 return &lex->tok_cur;
789}
790
Damiend99b0522013-12-21 18:17:45 +0000791bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100792 return lex->tok_cur.kind == kind;
793}