/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* lexer.c -- simple tokeniser for Python implementation
 */
29
xbeefe34222014-03-16 00:14:26 -070030#include <stdbool.h>
Damien429d7192013-10-04 19:53:11 +010031#include <stdint.h>
32#include <stdio.h>
33#include <assert.h>
34
Damien George55baff42014-01-21 21:40:13 +000035#include "mpconfig.h"
Paul Sokolovsky59c675a2014-06-21 22:43:22 +030036#include "misc.h"
Damien George55baff42014-01-21 21:40:13 +000037#include "qstr.h"
Damien429d7192013-10-04 19:53:11 +010038#include "lexer.h"
39
// Distance between tab stops, in columns; used when advancing the source
// column past a '\t' character.
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010041
// TODO: CPython appears to allow a NUL byte in the input stream. It is unclear
// whether that is intentional; we do not allow it.
44
Damiend99b0522013-12-21 18:17:45 +000045struct _mp_lexer_t {
Damien George08335002014-01-18 23:24:36 +000046 qstr source_name; // name of source
Damiena5185f42013-10-20 14:41:27 +010047 void *stream_data; // data for stream
Damien George94fbe972014-07-30 11:46:05 +010048 mp_lexer_stream_next_byte_t stream_next_byte; // stream callback to get next byte
Damiend99b0522013-12-21 18:17:45 +000049 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010050
Damiena5185f42013-10-20 14:41:27 +010051 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010052
Damien George54eb4e72014-07-03 13:47:47 +010053 mp_uint_t line; // source line
54 mp_uint_t column; // source column
Damien429d7192013-10-04 19:53:11 +010055
Damien George54eb4e72014-07-03 13:47:47 +010056 mp_int_t emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
57 mp_int_t nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010058
Damien George54eb4e72014-07-03 13:47:47 +010059 mp_uint_t alloc_indent_level;
60 mp_uint_t num_indent_level;
Damien429d7192013-10-04 19:53:11 +010061 uint16_t *indent_level;
62
Damiena5185f42013-10-20 14:41:27 +010063 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000064 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010065};
66
Damien George54eb4e72014-07-03 13:47:47 +010067mp_uint_t mp_optimise_value;
Damien George97f9a282014-05-12 23:07:34 +010068
// Compares the NUL-terminated string `str` with the `len` bytes starting at
// `strn` (which does not need to be NUL-terminated).
//
// Returns true only when `str` is exactly `len` bytes long and every byte
// matches the corresponding byte of `strn`.
//
// TODO replace with a call to a standard function
bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
    mp_uint_t i = 0;

    // Stop as soon as str is exhausted: without the explicit terminator check
    // a NUL byte inside strn would compare equal to str's terminator and the
    // scan would continue reading beyond the end of str.
    while (i < len && *str != 0 && *str == *strn) {
        ++i;
        ++str;
        ++strn;
    }

    // equal only if all len bytes matched and str ends exactly here
    return i == len && *str == 0;
}
81
#ifdef MICROPY_DEBUG_PRINTERS
// Debug helper: prints one line describing `tok` -- its source position, kind,
// text pointer and length, followed by the token text when there is any.
// Characters of the text that are not printable are shown as '?'.
void mp_token_show(const mp_token_t *tok) {
    printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:" UINT_FMT, tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);

    if (tok->str != NULL && tok->len > 0) {
        const byte *text_end = (const byte *)tok->str + tok->len;

        printf(" ");
        // walk the text one utf-8 encoded character at a time
        for (const byte *p = (const byte *)tok->str; p < text_end; p = utf8_next_char(p)) {
            unichar ch = utf8_get_char(p);
            if (unichar_isprint(ch)) {
                printf("%c", ch);
            } else {
                printf("?");
            }
        }
    }

    printf("\n");
}
#endif
Damien429d7192013-10-04 19:53:11 +0100102
Damiena5185f42013-10-20 14:41:27 +0100103#define CUR_CHAR(lex) ((lex)->chr0)
104
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200105STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100106 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +0100107}
108
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200109STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100110 return lex->chr0 == '\n' || lex->chr0 == '\r';
111}
112
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200113STATIC bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +0100114 return lex->chr0 == c;
115}
116
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200117STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100118 return lex->chr0 == c1 || lex->chr0 == c2;
119}
120
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200121STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +0100122 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
123}
124
125/*
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200126STATIC bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +0100127 return lex->chr1 == c;
128}
129*/
130
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200131STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100132 return lex->chr1 == c1 || lex->chr1 == c2;
133}
134
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200135STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100136 return lex->chr2 == c1 || lex->chr2 == c2;
137}
138
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200139STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100140 return lex->chr0 == c1 && lex->chr1 == c2;
141}
142
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200143STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000144 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100145}
146
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200147STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000148 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100149}
150
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200151STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000152 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100153}
154
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200155STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000156 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100157}
158
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200159STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200160 return lex->chr1 >= '0' && lex->chr1 <= '7';
161}
162
Damien429d7192013-10-04 19:53:11 +0100163// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200164STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100165 return is_letter(lex) || lex->chr0 == '_';
166}
167
168// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200169STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100170 return is_head_of_identifier(lex) || is_digit(lex);
171}
172
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200173STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100174 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100175 return;
176 }
177
Damien George54eb4e72014-07-03 13:47:47 +0100178 mp_uint_t advance = 1;
Damien429d7192013-10-04 19:53:11 +0100179
180 if (lex->chr0 == '\n') {
181 // LF is a new line
182 ++lex->line;
183 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100184 } else if (lex->chr0 == '\r') {
185 // CR is a new line
186 ++lex->line;
187 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100188 if (lex->chr1 == '\n') {
189 // CR LF is a single new line
190 advance = 2;
191 }
192 } else if (lex->chr0 == '\t') {
193 // a tab
194 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
195 } else {
196 // a character worth one column
197 ++lex->column;
198 }
199
200 for (; advance > 0; advance--) {
201 lex->chr0 = lex->chr1;
202 lex->chr1 = lex->chr2;
Damien George94fbe972014-07-30 11:46:05 +0100203 lex->chr2 = lex->stream_next_byte(lex->stream_data);
204 if (lex->chr2 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100205 // EOF
Damien George94fbe972014-07-30 11:46:05 +0100206 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100207 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100208 }
209 }
210 }
211}
212
Damien George54eb4e72014-07-03 13:47:47 +0100213void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100214 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100215 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100216 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
217 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100218 }
219 lex->indent_level[lex->num_indent_level++] = indent;
220}
221
Damien George54eb4e72014-07-03 13:47:47 +0100222mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100223 return lex->indent_level[lex->num_indent_level - 1];
224}
225
Damiend99b0522013-12-21 18:17:45 +0000226void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100227 lex->num_indent_level -= 1;
228}
229
230// some tricky operator encoding:
231// <op> = begin with <op>, if this opchar matches then begin here
232// e<op> = end with <op>, if this opchar matches then end
233// E<op> = mandatory end with <op>, this opchar must match, then end
234// c<op> = continue with <op>, if this opchar matches then continue matching
235// this means if the start of two ops are the same then they are equal til the last char
236
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200237STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100238 "()[]{},:;@~" // singles
239 "<e=c<e=" // < <= << <<=
240 ">e=c>e=" // > >= >> >>=
241 "*e=c*e=" // * *= ** **=
242 "+e=" // + +=
243 "-e=e>" // - -= ->
244 "&e=" // & &=
245 "|e=" // | |=
246 "/e=c/e=" // / /= // //=
247 "%e=" // % %=
248 "^e=" // ^ ^=
249 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100250 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100251
252// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200253STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000254 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
255 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
256 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
257 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100258
Damiend99b0522013-12-21 18:17:45 +0000259 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
260 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
261 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
262 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
263 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
264 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
265 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
266 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
267 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
268 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
269 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
270 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100271};
272
273// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200274STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100275 "False",
276 "None",
277 "True",
278 "and",
279 "as",
280 "assert",
281 "break",
282 "class",
283 "continue",
284 "def",
285 "del",
286 "elif",
287 "else",
288 "except",
289 "finally",
290 "for",
291 "from",
292 "global",
293 "if",
294 "import",
295 "in",
296 "is",
297 "lambda",
298 "nonlocal",
299 "not",
300 "or",
301 "pass",
302 "raise",
303 "return",
304 "try",
305 "while",
306 "with",
307 "yield",
Damien George97f9a282014-05-12 23:07:34 +0100308 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100309};
310
Damien George54eb4e72014-07-03 13:47:47 +0100311STATIC mp_uint_t hex_digit(unichar c) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200312 // c is assumed to be hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100313 mp_uint_t n = c - '0';
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200314 if (n > 9) {
315 n &= ~('a' - 'A');
316 n -= ('A' - ('9' + 1));
317 }
318 return n;
319}
320
321// This is called with CUR_CHAR() before first hex digit, and should return with
322// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100323// num_digits must be greater than zero
324STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
325 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200326 while (num_digits-- != 0) {
327 next_char(lex);
328 unichar c = CUR_CHAR(lex);
329 if (!unichar_isxdigit(c)) {
330 return false;
331 }
332 num = (num << 4) + hex_digit(c);
333 }
334 *result = num;
335 return true;
336}
337
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200338STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100339 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100340 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100341 while (!is_end(lex)) {
342 if (is_physical_newline(lex)) {
343 had_physical_newline = true;
344 next_char(lex);
345 } else if (is_whitespace(lex)) {
346 next_char(lex);
347 } else if (is_char(lex, '#')) {
348 next_char(lex);
349 while (!is_end(lex) && !is_physical_newline(lex)) {
350 next_char(lex);
351 }
352 // had_physical_newline will be set on next loop
353 } else if (is_char(lex, '\\')) {
354 // backslash (outside string literals) must appear just before a physical newline
355 next_char(lex);
356 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000357 // SyntaxError: unexpected character after line continuation character
Damien George69a818d2014-01-12 13:55:24 +0000358 tok->src_line = lex->line;
359 tok->src_column = lex->column;
360 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
361 vstr_reset(&lex->vstr);
362 tok->str = vstr_str(&lex->vstr);
363 tok->len = 0;
364 return;
Damien429d7192013-10-04 19:53:11 +0100365 } else {
366 next_char(lex);
367 }
368 } else {
369 break;
370 }
371 }
372
Damiena5185f42013-10-20 14:41:27 +0100373 // set token source information
Damien429d7192013-10-04 19:53:11 +0100374 tok->src_line = lex->line;
375 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100376
Damiena5185f42013-10-20 14:41:27 +0100377 // start new token text
378 vstr_reset(&lex->vstr);
379
380 if (first_token && lex->line == 1 && lex->column != 1) {
381 // check that the first token is in the first column
382 // if first token is not on first line, we get a physical newline and
383 // this check is done as part of normal indent/dedent checking below
384 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000385 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100386
387 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000388 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100389 lex->emit_dent += 1;
390
391 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000392 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100393 lex->emit_dent -= 1;
394
Damien91d387d2013-10-09 15:09:52 +0100395 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000396 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100397
Damien George54eb4e72014-07-03 13:47:47 +0100398 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100399 lex->emit_dent = 0;
400 if (num_spaces == indent_top(lex)) {
401 } else if (num_spaces > indent_top(lex)) {
402 indent_push(lex, num_spaces);
403 lex->emit_dent += 1;
404 } else {
405 while (num_spaces < indent_top(lex)) {
406 indent_pop(lex);
407 lex->emit_dent -= 1;
408 }
409 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000410 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100411 }
412 }
413
414 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100415 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000416 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100417 lex->emit_dent = 0;
418 while (indent_top(lex) > 0) {
419 indent_pop(lex);
420 lex->emit_dent -= 1;
421 }
422 } else {
Damiend99b0522013-12-21 18:17:45 +0000423 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100424 }
425
426 } else if (is_char_or(lex, '\'', '\"')
427 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
428 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
429 // a string or bytes literal
430
431 // parse type codes
432 bool is_raw = false;
433 bool is_bytes = false;
434 if (is_char(lex, 'u')) {
435 next_char(lex);
436 } else if (is_char(lex, 'b')) {
437 is_bytes = true;
438 next_char(lex);
439 if (is_char(lex, 'r')) {
440 is_raw = true;
441 next_char(lex);
442 }
443 } else if (is_char(lex, 'r')) {
444 is_raw = true;
445 next_char(lex);
446 if (is_char(lex, 'b')) {
447 is_bytes = true;
448 next_char(lex);
449 }
450 }
451
452 // set token kind
453 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000454 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100455 } else {
Damiend99b0522013-12-21 18:17:45 +0000456 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100457 }
458
459 // get first quoting character
460 char quote_char = '\'';
461 if (is_char(lex, '\"')) {
462 quote_char = '\"';
463 }
464 next_char(lex);
465
466 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100467 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100468 if (is_char_and(lex, quote_char, quote_char)) {
469 // triple quotes
470 next_char(lex);
471 next_char(lex);
472 num_quotes = 3;
473 } else {
474 // single quotes
475 num_quotes = 1;
476 }
477
Damien429d7192013-10-04 19:53:11 +0100478 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100479 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100480 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
481 if (is_char(lex, quote_char)) {
482 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100483 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100484 } else {
485 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100486 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100487 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100488 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100489 if (is_raw) {
490 // raw strings allow escaping of quotes, but the backslash is also emitted
491 vstr_add_char(&lex->vstr, '\\');
492 } else {
493 switch (c) {
Damien George94fbe972014-07-30 11:46:05 +0100494 case MP_LEXER_EOF: break; // TODO a proper error message?
495 case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100496 case '\\': break;
497 case '\'': break;
498 case '"': break;
499 case 'a': c = 0x07; break;
500 case 'b': c = 0x08; break;
501 case 't': c = 0x09; break;
502 case 'n': c = 0x0a; break;
503 case 'v': c = 0x0b; break;
504 case 'f': c = 0x0c; break;
505 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000506 case 'u':
507 case 'U':
508 if (is_bytes) {
509 // b'\u1234' == b'\\u1234'
510 vstr_add_char(&lex->vstr, '\\');
511 break;
512 }
513 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100514 case 'x':
515 {
Damien George54eb4e72014-07-03 13:47:47 +0100516 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000517 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georgea91f4142014-04-10 11:30:55 +0100518 // TODO error message
519 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200520 }
521 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100522 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200523 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000524 case 'N':
525 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
526 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
527 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
528 // roughly half a meg of storage. This form of Unicode escape may be added
529 // later on, but it's definitely not a priority right now. -- CJA 20140607
530 assert(!"Unicode name escapes not supported");
531 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100532 default:
533 if (c >= '0' && c <= '7') {
534 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100535 mp_uint_t digits = 3;
536 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100537 while (is_following_odigit(lex) && --digits != 0) {
538 next_char(lex);
539 num = num * 8 + (CUR_CHAR(lex) - '0');
540 }
541 c = num;
542 } else {
543 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
544 vstr_add_char(&lex->vstr, '\\');
545 }
546 break;
547 }
Damiena5185f42013-10-20 14:41:27 +0100548 }
Damien George94fbe972014-07-30 11:46:05 +0100549 if (c != MP_LEXER_EOF) {
Chris Angelico2ba22992014-06-04 05:28:12 +1000550 if (c < 0x110000 && !is_bytes) {
551 vstr_add_char(&lex->vstr, c);
552 } else if (c < 0x100 && is_bytes) {
553 vstr_add_byte(&lex->vstr, c);
554 } else {
555 assert(!"TODO: Throw an error, invalid escape code probably");
556 }
Damiena5185f42013-10-20 14:41:27 +0100557 }
558 } else {
Damien George94fbe972014-07-30 11:46:05 +0100559 // Add the "character" as a byte so that we remain 8-bit clean.
560 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
561 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100562 }
563 }
564 next_char(lex);
565 }
566
567 // check we got the required end quotes
568 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000569 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100570 }
571
Damiena5185f42013-10-20 14:41:27 +0100572 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000573 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100574
575 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000576 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100577
Damiena5185f42013-10-20 14:41:27 +0100578 // get first char
579 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100580 next_char(lex);
581
Damiena5185f42013-10-20 14:41:27 +0100582 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100583 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100584 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100585 next_char(lex);
586 }
587
588 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000589 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100590
Damiena5185f42013-10-20 14:41:27 +0100591 // get first char
592 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100593 next_char(lex);
594
Damiena5185f42013-10-20 14:41:27 +0100595 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100596 while (!is_end(lex)) {
597 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100598 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100599 next_char(lex);
600 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100601 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100602 next_char(lex);
603 }
604 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100605 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100606 next_char(lex);
607 } else {
608 break;
609 }
610 }
611
Damien George2e9eb2d2014-04-10 12:19:33 +0100612 } else if (is_char(lex, '.')) {
613 // special handling for . and ... operators, because .. is not a valid operator
614
615 // get first char
616 vstr_add_char(&lex->vstr, '.');
617 next_char(lex);
618
619 if (is_char_and(lex, '.', '.')) {
620 vstr_add_char(&lex->vstr, '.');
621 vstr_add_char(&lex->vstr, '.');
622 next_char(lex);
623 next_char(lex);
624 tok->kind = MP_TOKEN_ELLIPSIS;
625 } else {
626 tok->kind = MP_TOKEN_DEL_PERIOD;
627 }
628
Damien429d7192013-10-04 19:53:11 +0100629 } else {
630 // search for encoded delimiter or operator
631
632 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100633 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100634 for (; *t != 0 && !is_char(lex, *t); t += 1) {
635 if (*t == 'e' || *t == 'c') {
636 t += 1;
637 } else if (*t == 'E') {
638 tok_enc_index -= 1;
639 t += 1;
640 }
641 tok_enc_index += 1;
642 }
643
644 next_char(lex);
645
646 if (*t == 0) {
647 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000648 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100649
650 } else {
651 // matched a delimiter or operator character
652
653 // get the maximum characters for a valid token
654 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100655 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100656 for (;;) {
657 for (; *t == 'e'; t += 1) {
658 t += 1;
659 t_index += 1;
660 if (is_char(lex, *t)) {
661 next_char(lex);
662 tok_enc_index = t_index;
663 break;
664 }
665 }
666
667 if (*t == 'E') {
668 t += 1;
669 if (is_char(lex, *t)) {
670 next_char(lex);
671 tok_enc_index = t_index;
672 } else {
Damiend99b0522013-12-21 18:17:45 +0000673 tok->kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100674 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100675 }
676 break;
677 }
678
679 if (*t == 'c') {
680 t += 1;
681 t_index += 1;
682 if (is_char(lex, *t)) {
683 next_char(lex);
684 tok_enc_index = t_index;
685 t += 1;
686 } else {
687 break;
688 }
689 } else {
690 break;
691 }
692 }
693
694 // set token kind
695 tok->kind = tok_enc_kind[tok_enc_index];
696
Damien George2e9eb2d2014-04-10 12:19:33 +0100697 tok_enc_no_match:
698
Damien429d7192013-10-04 19:53:11 +0100699 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000700 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100701 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000702 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100703 lex->nested_bracket_level -= 1;
704 }
705 }
706 }
707
Damiena5185f42013-10-20 14:41:27 +0100708 // point token text to vstr buffer
709 tok->str = vstr_str(&lex->vstr);
710 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100711
Damiena5185f42013-10-20 14:41:27 +0100712 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000713 if (tok->kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100714 // We check for __debug__ here and convert it to its value. This is so
715 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
716 // need to check for this special token in many places in the compiler.
717 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100718 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
719 for (mp_int_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damiena5185f42013-10-20 14:41:27 +0100720 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200721 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
722 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Paul Sokolovskyd3439d02014-06-02 19:37:55 +0300723 tok->kind = (mp_optimise_value == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100724 } else {
725 tok->kind = MP_TOKEN_KW_FALSE + i;
726 }
Damien429d7192013-10-04 19:53:11 +0100727 break;
728 }
729 }
730 }
731}
732
Damien George94fbe972014-07-30 11:46:05 +0100733mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100734 mp_lexer_t *lex = m_new_maybe(mp_lexer_t, 1);
735
736 // check for memory allocation error
737 if (lex == NULL) {
738 if (stream_close) {
739 stream_close(stream_data);
740 }
741 return NULL;
742 }
Damien429d7192013-10-04 19:53:11 +0100743
Damien Georgeb829b5c2014-01-25 13:51:19 +0000744 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100745 lex->stream_data = stream_data;
Damien George94fbe972014-07-30 11:46:05 +0100746 lex->stream_next_byte = stream_next_byte;
Damienfa2162b2013-10-20 17:42:00 +0100747 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100748 lex->line = 1;
749 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100750 lex->emit_dent = 0;
751 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100752 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100753 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100754 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200755 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100756
Damien Georgee1199ec2014-05-10 17:48:01 +0100757 // check for memory allocation error
758 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
759 mp_lexer_free(lex);
760 return NULL;
761 }
762
763 // store sentinel for first indentation level
764 lex->indent_level[0] = 0;
765
Damien429d7192013-10-04 19:53:11 +0100766 // preload characters
Damien George94fbe972014-07-30 11:46:05 +0100767 lex->chr0 = stream_next_byte(stream_data);
768 lex->chr1 = stream_next_byte(stream_data);
769 lex->chr2 = stream_next_byte(stream_data);
Damiena5185f42013-10-20 14:41:27 +0100770
771 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100772 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100773 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100774 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100775 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100776 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100777 }
Damien George94fbe972014-07-30 11:46:05 +0100778 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100779 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100780 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100781 }
Damien429d7192013-10-04 19:53:11 +0100782 }
783
Damiena5185f42013-10-20 14:41:27 +0100784 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000785 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100786
787 return lex;
788}
789
Damiend99b0522013-12-21 18:17:45 +0000790void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100791 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100792 if (lex->stream_close) {
793 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100794 }
Damienbb5316b2013-10-22 21:12:29 +0100795 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200796 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000797 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100798 }
Damien429d7192013-10-04 19:53:11 +0100799}
800
Damien George08335002014-01-18 23:24:36 +0000801qstr mp_lexer_source_name(mp_lexer_t *lex) {
802 return lex->source_name;
803}
804
Damiend99b0522013-12-21 18:17:45 +0000805void mp_lexer_to_next(mp_lexer_t *lex) {
806 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100807}
808
Damiend99b0522013-12-21 18:17:45 +0000809const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100810 return &lex->tok_cur;
811}
812
Damiend99b0522013-12-21 18:17:45 +0000813bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100814 return lex->tok_cur.kind == kind;
815}