blob: f4aedb4016cb6bc5e5dec051d691171c76dbde27 [file] [log] [blame]
/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
26
Damien429d7192013-10-04 19:53:11 +010027#include <stdio.h>
28#include <assert.h>
29
Damien Georgeb4b10fd2015-01-01 23:30:53 +000030#include "py/mpstate.h"
Damien George51dfcb42015-01-01 20:27:54 +000031#include "py/lexer.h"
Damien429d7192013-10-04 19:53:11 +010032
// Distance between tab stops, in columns, used when advancing the lexer's
// column counter over a tab character.
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010034
// TODO seems that CPython allows a NUL byte in the input stream;
// don't know if that's intentional or not, but we don't allow it
37
Damien George9528cd62014-01-15 21:23:31 +000038// TODO replace with a call to a standard function
Damien Georgea4c52c52014-12-05 19:35:18 +000039STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
Damien George54eb4e72014-07-03 13:47:47 +010040 mp_uint_t i = 0;
Damien429d7192013-10-04 19:53:11 +010041
Damiena5185f42013-10-20 14:41:27 +010042 while (i < len && *str == *strn) {
Damien429d7192013-10-04 19:53:11 +010043 ++i;
Damien429d7192013-10-04 19:53:11 +010044 ++str;
Damiena5185f42013-10-20 14:41:27 +010045 ++strn;
Damien429d7192013-10-04 19:53:11 +010046 }
47
Damiena5185f42013-10-20 14:41:27 +010048 return i == len && *str == 0;
Damien429d7192013-10-04 19:53:11 +010049}
50
// The character the lexer is currently positioned on.
#define CUR_CHAR(lex) ((lex)->chr0)
52
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020053STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010054 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010055}
56
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020057STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +010058 return lex->chr0 == '\n' || lex->chr0 == '\r';
59}
60
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020061STATIC bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010062 return lex->chr0 == c;
63}
64
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020065STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010066 return lex->chr0 == c1 || lex->chr0 == c2;
67}
68
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020069STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010070 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
71}
72
73/*
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020074STATIC bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010075 return lex->chr1 == c;
76}
77*/
78
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020079STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010080 return lex->chr1 == c1 || lex->chr1 == c2;
81}
82
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020083STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010084 return lex->chr2 == c1 || lex->chr2 == c2;
85}
86
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020087STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010088 return lex->chr0 == c1 && lex->chr1 == c2;
89}
90
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020091STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000092 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010093}
94
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020095STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000096 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010097}
98
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020099STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000100 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100101}
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000104 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100105}
106
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200107STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200108 return lex->chr1 >= '0' && lex->chr1 <= '7';
109}
110
Damien429d7192013-10-04 19:53:11 +0100111// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200112STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100113 return is_letter(lex) || lex->chr0 == '_';
114}
115
116// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200117STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100118 return is_head_of_identifier(lex) || is_digit(lex);
119}
120
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200121STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100122 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100123 return;
124 }
125
Damien George54eb4e72014-07-03 13:47:47 +0100126 mp_uint_t advance = 1;
Damien429d7192013-10-04 19:53:11 +0100127
128 if (lex->chr0 == '\n') {
129 // LF is a new line
130 ++lex->line;
131 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100132 } else if (lex->chr0 == '\r') {
133 // CR is a new line
134 ++lex->line;
135 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100136 if (lex->chr1 == '\n') {
137 // CR LF is a single new line
138 advance = 2;
139 }
140 } else if (lex->chr0 == '\t') {
141 // a tab
142 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
143 } else {
144 // a character worth one column
145 ++lex->column;
146 }
147
148 for (; advance > 0; advance--) {
149 lex->chr0 = lex->chr1;
150 lex->chr1 = lex->chr2;
Damien George94fbe972014-07-30 11:46:05 +0100151 lex->chr2 = lex->stream_next_byte(lex->stream_data);
152 if (lex->chr2 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100153 // EOF
Damien George94fbe972014-07-30 11:46:05 +0100154 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100155 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100156 }
157 }
158 }
159}
160
Damien Georgea4c52c52014-12-05 19:35:18 +0000161STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100162 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100163 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100164 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
165 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100166 }
167 lex->indent_level[lex->num_indent_level++] = indent;
168}
169
Damien Georgea4c52c52014-12-05 19:35:18 +0000170STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100171 return lex->indent_level[lex->num_indent_level - 1];
172}
173
Damien Georgea4c52c52014-12-05 19:35:18 +0000174STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100175 lex->num_indent_level -= 1;
176}
177
178// some tricky operator encoding:
179// <op> = begin with <op>, if this opchar matches then begin here
180// e<op> = end with <op>, if this opchar matches then end
181// E<op> = mandatory end with <op>, this opchar must match, then end
182// c<op> = continue with <op>, if this opchar matches then continue matching
183// this means if the start of two ops are the same then they are equal til the last char
184
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200185STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100186 "()[]{},:;@~" // singles
187 "<e=c<e=" // < <= << <<=
188 ">e=c>e=" // > >= >> >>=
189 "*e=c*e=" // * *= ** **=
190 "+e=" // + +=
191 "-e=e>" // - -= ->
192 "&e=" // & &=
193 "|e=" // | |=
194 "/e=c/e=" // / /= // //=
195 "%e=" // % %=
196 "^e=" // ^ ^=
197 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100198 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100199
200// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200201STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000202 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
203 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
204 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
205 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100206
Damiend99b0522013-12-21 18:17:45 +0000207 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
208 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
209 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
210 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
211 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
212 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
213 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
214 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
215 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
216 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
217 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
218 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100219};
220
221// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200222STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100223 "False",
224 "None",
225 "True",
226 "and",
227 "as",
228 "assert",
229 "break",
230 "class",
231 "continue",
232 "def",
233 "del",
234 "elif",
235 "else",
236 "except",
237 "finally",
238 "for",
239 "from",
240 "global",
241 "if",
242 "import",
243 "in",
244 "is",
245 "lambda",
246 "nonlocal",
247 "not",
248 "or",
249 "pass",
250 "raise",
251 "return",
252 "try",
253 "while",
254 "with",
255 "yield",
Damien George97f9a282014-05-12 23:07:34 +0100256 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100257};
258
Damien George54eb4e72014-07-03 13:47:47 +0100259STATIC mp_uint_t hex_digit(unichar c) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200260 // c is assumed to be hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100261 mp_uint_t n = c - '0';
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200262 if (n > 9) {
263 n &= ~('a' - 'A');
264 n -= ('A' - ('9' + 1));
265 }
266 return n;
267}
268
269// This is called with CUR_CHAR() before first hex digit, and should return with
270// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100271// num_digits must be greater than zero
272STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
273 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200274 while (num_digits-- != 0) {
275 next_char(lex);
276 unichar c = CUR_CHAR(lex);
277 if (!unichar_isxdigit(c)) {
278 return false;
279 }
280 num = (num << 4) + hex_digit(c);
281 }
282 *result = num;
283 return true;
284}
285
Damien Georgea4c52c52014-12-05 19:35:18 +0000286STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
287 // start new token text
288 vstr_reset(&lex->vstr);
289
Damiena5185f42013-10-20 14:41:27 +0100290 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100291 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100292 while (!is_end(lex)) {
293 if (is_physical_newline(lex)) {
294 had_physical_newline = true;
295 next_char(lex);
296 } else if (is_whitespace(lex)) {
297 next_char(lex);
298 } else if (is_char(lex, '#')) {
299 next_char(lex);
300 while (!is_end(lex) && !is_physical_newline(lex)) {
301 next_char(lex);
302 }
303 // had_physical_newline will be set on next loop
304 } else if (is_char(lex, '\\')) {
305 // backslash (outside string literals) must appear just before a physical newline
306 next_char(lex);
307 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000308 // SyntaxError: unexpected character after line continuation character
Damien Georgea4c52c52014-12-05 19:35:18 +0000309 lex->tok_line = lex->line;
310 lex->tok_column = lex->column;
311 lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
Damien George69a818d2014-01-12 13:55:24 +0000312 return;
Damien429d7192013-10-04 19:53:11 +0100313 } else {
314 next_char(lex);
315 }
316 } else {
317 break;
318 }
319 }
320
Damiena5185f42013-10-20 14:41:27 +0100321 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000322 lex->tok_line = lex->line;
323 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100324
325 if (first_token && lex->line == 1 && lex->column != 1) {
326 // check that the first token is in the first column
327 // if first token is not on first line, we get a physical newline and
328 // this check is done as part of normal indent/dedent checking below
329 // (done to get equivalence with CPython)
Damien Georgea4c52c52014-12-05 19:35:18 +0000330 lex->tok_kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100331
332 } else if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000333 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100334 lex->emit_dent += 1;
335
336 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000337 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100338 lex->emit_dent -= 1;
339
Damien91d387d2013-10-09 15:09:52 +0100340 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000341 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100342
Damien George54eb4e72014-07-03 13:47:47 +0100343 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100344 lex->emit_dent = 0;
345 if (num_spaces == indent_top(lex)) {
346 } else if (num_spaces > indent_top(lex)) {
347 indent_push(lex, num_spaces);
348 lex->emit_dent += 1;
349 } else {
350 while (num_spaces < indent_top(lex)) {
351 indent_pop(lex);
352 lex->emit_dent -= 1;
353 }
354 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000355 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100356 }
357 }
358
359 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100360 if (indent_top(lex) > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000361 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100362 lex->emit_dent = 0;
363 while (indent_top(lex) > 0) {
364 indent_pop(lex);
365 lex->emit_dent -= 1;
366 }
367 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000368 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100369 }
370
371 } else if (is_char_or(lex, '\'', '\"')
372 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
373 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
374 // a string or bytes literal
375
376 // parse type codes
377 bool is_raw = false;
378 bool is_bytes = false;
379 if (is_char(lex, 'u')) {
380 next_char(lex);
381 } else if (is_char(lex, 'b')) {
382 is_bytes = true;
383 next_char(lex);
384 if (is_char(lex, 'r')) {
385 is_raw = true;
386 next_char(lex);
387 }
388 } else if (is_char(lex, 'r')) {
389 is_raw = true;
390 next_char(lex);
391 if (is_char(lex, 'b')) {
392 is_bytes = true;
393 next_char(lex);
394 }
395 }
396
397 // set token kind
398 if (is_bytes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000399 lex->tok_kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100400 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000401 lex->tok_kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100402 }
403
404 // get first quoting character
405 char quote_char = '\'';
406 if (is_char(lex, '\"')) {
407 quote_char = '\"';
408 }
409 next_char(lex);
410
411 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100412 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100413 if (is_char_and(lex, quote_char, quote_char)) {
414 // triple quotes
415 next_char(lex);
416 next_char(lex);
417 num_quotes = 3;
418 } else {
419 // single quotes
420 num_quotes = 1;
421 }
422
Damien429d7192013-10-04 19:53:11 +0100423 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100424 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100425 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
426 if (is_char(lex, quote_char)) {
427 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100428 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100429 } else {
430 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100431 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100432 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100433 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100434 if (is_raw) {
435 // raw strings allow escaping of quotes, but the backslash is also emitted
436 vstr_add_char(&lex->vstr, '\\');
437 } else {
438 switch (c) {
Damien George94fbe972014-07-30 11:46:05 +0100439 case MP_LEXER_EOF: break; // TODO a proper error message?
440 case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100441 case '\\': break;
442 case '\'': break;
443 case '"': break;
444 case 'a': c = 0x07; break;
445 case 'b': c = 0x08; break;
446 case 't': c = 0x09; break;
447 case 'n': c = 0x0a; break;
448 case 'v': c = 0x0b; break;
449 case 'f': c = 0x0c; break;
450 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000451 case 'u':
452 case 'U':
453 if (is_bytes) {
454 // b'\u1234' == b'\\u1234'
455 vstr_add_char(&lex->vstr, '\\');
456 break;
457 }
458 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100459 case 'x':
460 {
Damien George54eb4e72014-07-03 13:47:47 +0100461 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000462 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georgea91f4142014-04-10 11:30:55 +0100463 // TODO error message
464 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200465 }
466 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100467 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200468 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000469 case 'N':
470 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
471 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
472 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
473 // roughly half a meg of storage. This form of Unicode escape may be added
474 // later on, but it's definitely not a priority right now. -- CJA 20140607
475 assert(!"Unicode name escapes not supported");
476 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100477 default:
478 if (c >= '0' && c <= '7') {
479 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100480 mp_uint_t digits = 3;
481 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100482 while (is_following_odigit(lex) && --digits != 0) {
483 next_char(lex);
484 num = num * 8 + (CUR_CHAR(lex) - '0');
485 }
486 c = num;
487 } else {
488 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
489 vstr_add_char(&lex->vstr, '\\');
490 }
491 break;
492 }
Damiena5185f42013-10-20 14:41:27 +0100493 }
Damien George94fbe972014-07-30 11:46:05 +0100494 if (c != MP_LEXER_EOF) {
Chris Angelico2ba22992014-06-04 05:28:12 +1000495 if (c < 0x110000 && !is_bytes) {
496 vstr_add_char(&lex->vstr, c);
497 } else if (c < 0x100 && is_bytes) {
498 vstr_add_byte(&lex->vstr, c);
499 } else {
500 assert(!"TODO: Throw an error, invalid escape code probably");
501 }
Damiena5185f42013-10-20 14:41:27 +0100502 }
503 } else {
Damien George94fbe972014-07-30 11:46:05 +0100504 // Add the "character" as a byte so that we remain 8-bit clean.
505 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
506 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100507 }
508 }
509 next_char(lex);
510 }
511
512 // check we got the required end quotes
513 if (n_closing < num_quotes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000514 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100515 }
516
Damiena5185f42013-10-20 14:41:27 +0100517 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000518 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100519
520 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000521 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100522
Damiena5185f42013-10-20 14:41:27 +0100523 // get first char
524 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100525 next_char(lex);
526
Damiena5185f42013-10-20 14:41:27 +0100527 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100528 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100529 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100530 next_char(lex);
531 }
532
533 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000534 lex->tok_kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100535
Damiena5185f42013-10-20 14:41:27 +0100536 // get first char
537 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100538 next_char(lex);
539
Damiena5185f42013-10-20 14:41:27 +0100540 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100541 while (!is_end(lex)) {
542 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100543 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100544 next_char(lex);
545 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100546 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100547 next_char(lex);
548 }
549 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100550 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100551 next_char(lex);
552 } else {
553 break;
554 }
555 }
556
Damien George2e9eb2d2014-04-10 12:19:33 +0100557 } else if (is_char(lex, '.')) {
558 // special handling for . and ... operators, because .. is not a valid operator
559
560 // get first char
561 vstr_add_char(&lex->vstr, '.');
562 next_char(lex);
563
564 if (is_char_and(lex, '.', '.')) {
565 vstr_add_char(&lex->vstr, '.');
566 vstr_add_char(&lex->vstr, '.');
567 next_char(lex);
568 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000569 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100570 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000571 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100572 }
573
Damien429d7192013-10-04 19:53:11 +0100574 } else {
575 // search for encoded delimiter or operator
576
577 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100578 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100579 for (; *t != 0 && !is_char(lex, *t); t += 1) {
580 if (*t == 'e' || *t == 'c') {
581 t += 1;
582 } else if (*t == 'E') {
583 tok_enc_index -= 1;
584 t += 1;
585 }
586 tok_enc_index += 1;
587 }
588
589 next_char(lex);
590
591 if (*t == 0) {
592 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000593 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100594
595 } else {
596 // matched a delimiter or operator character
597
598 // get the maximum characters for a valid token
599 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100600 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100601 for (;;) {
602 for (; *t == 'e'; t += 1) {
603 t += 1;
604 t_index += 1;
605 if (is_char(lex, *t)) {
606 next_char(lex);
607 tok_enc_index = t_index;
608 break;
609 }
610 }
611
612 if (*t == 'E') {
613 t += 1;
614 if (is_char(lex, *t)) {
615 next_char(lex);
616 tok_enc_index = t_index;
617 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000618 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100619 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100620 }
621 break;
622 }
623
624 if (*t == 'c') {
625 t += 1;
626 t_index += 1;
627 if (is_char(lex, *t)) {
628 next_char(lex);
629 tok_enc_index = t_index;
630 t += 1;
631 } else {
632 break;
633 }
634 } else {
635 break;
636 }
637 }
638
639 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000640 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100641
Damien George2e9eb2d2014-04-10 12:19:33 +0100642 tok_enc_no_match:
643
Damien429d7192013-10-04 19:53:11 +0100644 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000645 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100646 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000647 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100648 lex->nested_bracket_level -= 1;
649 }
650 }
651 }
652
Damiena5185f42013-10-20 14:41:27 +0100653 // check for keywords
Damien Georgea4c52c52014-12-05 19:35:18 +0000654 if (lex->tok_kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100655 // We check for __debug__ here and convert it to its value. This is so
656 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
657 // need to check for this special token in many places in the compiler.
658 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100659 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
660 for (mp_int_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000661 if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200662 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
663 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Damien Georgeb4b10fd2015-01-01 23:30:53 +0000664 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100665 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000666 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
Damien George97f9a282014-05-12 23:07:34 +0100667 }
Damien429d7192013-10-04 19:53:11 +0100668 break;
669 }
670 }
671 }
672}
673
Damien George94fbe972014-07-30 11:46:05 +0100674mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
Damien George9bf5f282014-10-09 16:53:37 +0100675 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100676
677 // check for memory allocation error
678 if (lex == NULL) {
679 if (stream_close) {
680 stream_close(stream_data);
681 }
682 return NULL;
683 }
Damien429d7192013-10-04 19:53:11 +0100684
Damien Georgeb829b5c2014-01-25 13:51:19 +0000685 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100686 lex->stream_data = stream_data;
Damien George94fbe972014-07-30 11:46:05 +0100687 lex->stream_next_byte = stream_next_byte;
Damienfa2162b2013-10-20 17:42:00 +0100688 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100689 lex->line = 1;
690 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100691 lex->emit_dent = 0;
692 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100693 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100694 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100695 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200696 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100697
Damien Georgee1199ec2014-05-10 17:48:01 +0100698 // check for memory allocation error
699 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
700 mp_lexer_free(lex);
701 return NULL;
702 }
703
704 // store sentinel for first indentation level
705 lex->indent_level[0] = 0;
706
Damien429d7192013-10-04 19:53:11 +0100707 // preload characters
Damien George94fbe972014-07-30 11:46:05 +0100708 lex->chr0 = stream_next_byte(stream_data);
709 lex->chr1 = stream_next_byte(stream_data);
710 lex->chr2 = stream_next_byte(stream_data);
Damiena5185f42013-10-20 14:41:27 +0100711
712 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100713 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100714 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100715 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100716 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100717 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100718 }
Damien George94fbe972014-07-30 11:46:05 +0100719 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100720 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100721 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100722 }
Damien429d7192013-10-04 19:53:11 +0100723 }
724
Damiena5185f42013-10-20 14:41:27 +0100725 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000726 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100727
728 return lex;
729}
730
Damiend99b0522013-12-21 18:17:45 +0000731void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100732 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100733 if (lex->stream_close) {
734 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100735 }
Damienbb5316b2013-10-22 21:12:29 +0100736 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200737 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000738 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100739 }
Damien429d7192013-10-04 19:53:11 +0100740}
741
Damiend99b0522013-12-21 18:17:45 +0000742void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000743 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100744}
745
#if MICROPY_DEBUG_PRINTERS
// Debug helper: print the current token's position, kind, text buffer address
// and length, followed by the token text itself.  Characters for which
// unichar_isprint() is false are shown as '?'.
void mp_lexer_show_token(const mp_lexer_t *lex) {
    // %p takes a pointer to void, so the buffer pointer is converted explicitly
    printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, (const void *)lex->vstr.buf, lex->vstr.len);
    if (lex->vstr.len > 0) {
        const byte *i = (const byte *)lex->vstr.buf;
        const byte *j = (const byte *)i + lex->vstr.len;
        printf(" ");
        // walk the text one UTF-8 encoded character at a time
        while (i < j) {
            unichar c = utf8_get_char(i);
            i = utf8_next_char(i);
            if (unichar_isprint(c)) {
                // %c takes an int argument
                printf("%c", (int)c);
            } else {
                printf("?");
            }
        }
    }
    printf("\n");
}
#endif