/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
26
Damien429d7192013-10-04 19:53:11 +010027#include <stdio.h>
28#include <assert.h>
29
Damien Georgeb4b10fd2015-01-01 23:30:53 +000030#include "py/mpstate.h"
Damien George51dfcb42015-01-01 20:27:54 +000031#include "py/lexer.h"
Damien429d7192013-10-04 19:53:11 +010032
// Width of a tab stop, in columns; used by next_char when advancing the
// column counter over a '\t' character.
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010034
// TODO it seems that CPython allows a NUL byte in the input stream;
// we don't know whether that is intentional, but we don't allow it
37
Damien George9528cd62014-01-15 21:23:31 +000038// TODO replace with a call to a standard function
Damien Georgea4c52c52014-12-05 19:35:18 +000039STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
Damien George54eb4e72014-07-03 13:47:47 +010040 mp_uint_t i = 0;
Damien429d7192013-10-04 19:53:11 +010041
Damiena5185f42013-10-20 14:41:27 +010042 while (i < len && *str == *strn) {
Damien429d7192013-10-04 19:53:11 +010043 ++i;
Damien429d7192013-10-04 19:53:11 +010044 ++str;
Damiena5185f42013-10-20 14:41:27 +010045 ++strn;
Damien429d7192013-10-04 19:53:11 +010046 }
47
Damiena5185f42013-10-20 14:41:27 +010048 return i == len && *str == 0;
Damien429d7192013-10-04 19:53:11 +010049}
50
// The character the lexer is currently positioned on (the first of the
// chr0/chr1/chr2 look-ahead characters).
#define CUR_CHAR(lex) ((lex)->chr0)
52
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020053STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010054 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010055}
56
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020057STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +010058 return lex->chr0 == '\n' || lex->chr0 == '\r';
59}
60
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020061STATIC bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010062 return lex->chr0 == c;
63}
64
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020065STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010066 return lex->chr0 == c1 || lex->chr0 == c2;
67}
68
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020069STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010070 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
71}
72
/*
STATIC bool is_char_following(mp_lexer_t *lex, char c) {
    return lex->chr1 == c;
}
*/
78
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020079STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010080 return lex->chr1 == c1 || lex->chr1 == c2;
81}
82
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020083STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010084 return lex->chr2 == c1 || lex->chr2 == c2;
85}
86
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020087STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010088 return lex->chr0 == c1 && lex->chr1 == c2;
89}
90
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020091STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000092 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010093}
94
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020095STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000096 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010097}
98
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020099STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000100 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100101}
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000104 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100105}
106
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200107STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200108 return lex->chr1 >= '0' && lex->chr1 <= '7';
109}
110
Damien429d7192013-10-04 19:53:11 +0100111// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200112STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100113 return is_letter(lex) || lex->chr0 == '_';
114}
115
116// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200117STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100118 return is_head_of_identifier(lex) || is_digit(lex);
119}
120
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200121STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100122 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100123 return;
124 }
125
Damien George54eb4e72014-07-03 13:47:47 +0100126 mp_uint_t advance = 1;
Damien429d7192013-10-04 19:53:11 +0100127
128 if (lex->chr0 == '\n') {
129 // LF is a new line
130 ++lex->line;
131 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100132 } else if (lex->chr0 == '\r') {
133 // CR is a new line
134 ++lex->line;
135 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100136 if (lex->chr1 == '\n') {
137 // CR LF is a single new line
138 advance = 2;
139 }
140 } else if (lex->chr0 == '\t') {
141 // a tab
142 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
143 } else {
144 // a character worth one column
145 ++lex->column;
146 }
147
148 for (; advance > 0; advance--) {
149 lex->chr0 = lex->chr1;
150 lex->chr1 = lex->chr2;
Damien George94fbe972014-07-30 11:46:05 +0100151 lex->chr2 = lex->stream_next_byte(lex->stream_data);
152 if (lex->chr2 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100153 // EOF
Damien George94fbe972014-07-30 11:46:05 +0100154 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100155 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100156 }
157 }
158 }
159}
160
Damien Georgea4c52c52014-12-05 19:35:18 +0000161STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100162 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100163 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100164 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
165 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100166 }
167 lex->indent_level[lex->num_indent_level++] = indent;
168}
169
Damien Georgea4c52c52014-12-05 19:35:18 +0000170STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100171 return lex->indent_level[lex->num_indent_level - 1];
172}
173
Damien Georgea4c52c52014-12-05 19:35:18 +0000174STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100175 lex->num_indent_level -= 1;
176}
177
178// some tricky operator encoding:
179// <op> = begin with <op>, if this opchar matches then begin here
180// e<op> = end with <op>, if this opchar matches then end
181// E<op> = mandatory end with <op>, this opchar must match, then end
182// c<op> = continue with <op>, if this opchar matches then continue matching
183// this means if the start of two ops are the same then they are equal til the last char
184
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200185STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100186 "()[]{},:;@~" // singles
187 "<e=c<e=" // < <= << <<=
188 ">e=c>e=" // > >= >> >>=
189 "*e=c*e=" // * *= ** **=
190 "+e=" // + +=
191 "-e=e>" // - -= ->
192 "&e=" // & &=
193 "|e=" // | |=
194 "/e=c/e=" // / /= // //=
195 "%e=" // % %=
196 "^e=" // ^ ^=
197 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100198 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100199
200// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200201STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000202 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
203 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
204 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
205 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100206
Damiend99b0522013-12-21 18:17:45 +0000207 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
208 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
209 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
210 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
211 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
212 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
213 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
214 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
215 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
216 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
217 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
218 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100219};
220
221// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200222STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100223 "False",
224 "None",
225 "True",
226 "and",
227 "as",
228 "assert",
229 "break",
230 "class",
231 "continue",
232 "def",
233 "del",
234 "elif",
235 "else",
236 "except",
237 "finally",
238 "for",
239 "from",
240 "global",
241 "if",
242 "import",
243 "in",
244 "is",
245 "lambda",
246 "nonlocal",
247 "not",
248 "or",
249 "pass",
250 "raise",
251 "return",
252 "try",
253 "while",
254 "with",
255 "yield",
Damien George97f9a282014-05-12 23:07:34 +0100256 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100257};
258
Damien George54eb4e72014-07-03 13:47:47 +0100259STATIC mp_uint_t hex_digit(unichar c) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200260 // c is assumed to be hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100261 mp_uint_t n = c - '0';
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200262 if (n > 9) {
263 n &= ~('a' - 'A');
264 n -= ('A' - ('9' + 1));
265 }
266 return n;
267}
268
269// This is called with CUR_CHAR() before first hex digit, and should return with
270// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100271// num_digits must be greater than zero
272STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
273 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200274 while (num_digits-- != 0) {
275 next_char(lex);
276 unichar c = CUR_CHAR(lex);
277 if (!unichar_isxdigit(c)) {
278 return false;
279 }
280 num = (num << 4) + hex_digit(c);
281 }
282 *result = num;
283 return true;
284}
285
Damien Georgea4c52c52014-12-05 19:35:18 +0000286STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
287 // start new token text
288 vstr_reset(&lex->vstr);
289
Damiena5185f42013-10-20 14:41:27 +0100290 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100291 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100292 while (!is_end(lex)) {
293 if (is_physical_newline(lex)) {
294 had_physical_newline = true;
295 next_char(lex);
296 } else if (is_whitespace(lex)) {
297 next_char(lex);
298 } else if (is_char(lex, '#')) {
299 next_char(lex);
300 while (!is_end(lex) && !is_physical_newline(lex)) {
301 next_char(lex);
302 }
303 // had_physical_newline will be set on next loop
304 } else if (is_char(lex, '\\')) {
305 // backslash (outside string literals) must appear just before a physical newline
306 next_char(lex);
307 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000308 // SyntaxError: unexpected character after line continuation character
Damien Georgea4c52c52014-12-05 19:35:18 +0000309 lex->tok_line = lex->line;
310 lex->tok_column = lex->column;
311 lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
Damien George69a818d2014-01-12 13:55:24 +0000312 return;
Damien429d7192013-10-04 19:53:11 +0100313 } else {
314 next_char(lex);
315 }
316 } else {
317 break;
318 }
319 }
320
Damiena5185f42013-10-20 14:41:27 +0100321 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000322 lex->tok_line = lex->line;
323 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100324
325 if (first_token && lex->line == 1 && lex->column != 1) {
326 // check that the first token is in the first column
327 // if first token is not on first line, we get a physical newline and
328 // this check is done as part of normal indent/dedent checking below
329 // (done to get equivalence with CPython)
Damien Georgea4c52c52014-12-05 19:35:18 +0000330 lex->tok_kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100331
332 } else if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000333 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100334 lex->emit_dent += 1;
335
336 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000337 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100338 lex->emit_dent -= 1;
339
Damien91d387d2013-10-09 15:09:52 +0100340 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000341 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100342
Damien George54eb4e72014-07-03 13:47:47 +0100343 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100344 lex->emit_dent = 0;
345 if (num_spaces == indent_top(lex)) {
346 } else if (num_spaces > indent_top(lex)) {
347 indent_push(lex, num_spaces);
348 lex->emit_dent += 1;
349 } else {
350 while (num_spaces < indent_top(lex)) {
351 indent_pop(lex);
352 lex->emit_dent -= 1;
353 }
354 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000355 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100356 }
357 }
358
359 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100360 if (indent_top(lex) > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000361 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100362 lex->emit_dent = 0;
363 while (indent_top(lex) > 0) {
364 indent_pop(lex);
365 lex->emit_dent -= 1;
366 }
367 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000368 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100369 }
370
371 } else if (is_char_or(lex, '\'', '\"')
372 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
373 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
374 // a string or bytes literal
375
376 // parse type codes
377 bool is_raw = false;
378 bool is_bytes = false;
379 if (is_char(lex, 'u')) {
380 next_char(lex);
381 } else if (is_char(lex, 'b')) {
382 is_bytes = true;
383 next_char(lex);
384 if (is_char(lex, 'r')) {
385 is_raw = true;
386 next_char(lex);
387 }
388 } else if (is_char(lex, 'r')) {
389 is_raw = true;
390 next_char(lex);
391 if (is_char(lex, 'b')) {
392 is_bytes = true;
393 next_char(lex);
394 }
395 }
396
397 // set token kind
398 if (is_bytes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000399 lex->tok_kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100400 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000401 lex->tok_kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100402 }
403
404 // get first quoting character
405 char quote_char = '\'';
406 if (is_char(lex, '\"')) {
407 quote_char = '\"';
408 }
409 next_char(lex);
410
411 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100412 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100413 if (is_char_and(lex, quote_char, quote_char)) {
414 // triple quotes
415 next_char(lex);
416 next_char(lex);
417 num_quotes = 3;
418 } else {
419 // single quotes
420 num_quotes = 1;
421 }
422
Damien429d7192013-10-04 19:53:11 +0100423 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100424 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100425 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
426 if (is_char(lex, quote_char)) {
427 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100428 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100429 } else {
430 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100431 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100432 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100433 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100434 if (is_raw) {
435 // raw strings allow escaping of quotes, but the backslash is also emitted
436 vstr_add_char(&lex->vstr, '\\');
437 } else {
438 switch (c) {
Damien George94fbe972014-07-30 11:46:05 +0100439 case MP_LEXER_EOF: break; // TODO a proper error message?
440 case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100441 case '\\': break;
442 case '\'': break;
443 case '"': break;
444 case 'a': c = 0x07; break;
445 case 'b': c = 0x08; break;
446 case 't': c = 0x09; break;
447 case 'n': c = 0x0a; break;
448 case 'v': c = 0x0b; break;
449 case 'f': c = 0x0c; break;
450 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000451 case 'u':
452 case 'U':
453 if (is_bytes) {
454 // b'\u1234' == b'\\u1234'
455 vstr_add_char(&lex->vstr, '\\');
456 break;
457 }
458 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100459 case 'x':
460 {
Damien George54eb4e72014-07-03 13:47:47 +0100461 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000462 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georgea91f4142014-04-10 11:30:55 +0100463 // TODO error message
464 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200465 }
466 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100467 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200468 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000469 case 'N':
470 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
471 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
472 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
473 // roughly half a meg of storage. This form of Unicode escape may be added
474 // later on, but it's definitely not a priority right now. -- CJA 20140607
475 assert(!"Unicode name escapes not supported");
476 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100477 default:
478 if (c >= '0' && c <= '7') {
479 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100480 mp_uint_t digits = 3;
481 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100482 while (is_following_odigit(lex) && --digits != 0) {
483 next_char(lex);
484 num = num * 8 + (CUR_CHAR(lex) - '0');
485 }
486 c = num;
487 } else {
488 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
489 vstr_add_char(&lex->vstr, '\\');
490 }
491 break;
492 }
Damiena5185f42013-10-20 14:41:27 +0100493 }
Damien George94fbe972014-07-30 11:46:05 +0100494 if (c != MP_LEXER_EOF) {
Damien George16677ce2015-01-28 14:07:11 +0000495 #if MICROPY_PY_BUILTINS_STR_UNICODE
Chris Angelico2ba22992014-06-04 05:28:12 +1000496 if (c < 0x110000 && !is_bytes) {
497 vstr_add_char(&lex->vstr, c);
498 } else if (c < 0x100 && is_bytes) {
499 vstr_add_byte(&lex->vstr, c);
Damien George16677ce2015-01-28 14:07:11 +0000500 }
501 #else
502 // without unicode everything is just added as an 8-bit byte
503 if (c < 0x100) {
504 vstr_add_byte(&lex->vstr, c);
505 }
506 #endif
507 else {
Chris Angelico2ba22992014-06-04 05:28:12 +1000508 assert(!"TODO: Throw an error, invalid escape code probably");
509 }
Damiena5185f42013-10-20 14:41:27 +0100510 }
511 } else {
Damien George94fbe972014-07-30 11:46:05 +0100512 // Add the "character" as a byte so that we remain 8-bit clean.
513 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
514 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100515 }
516 }
517 next_char(lex);
518 }
519
520 // check we got the required end quotes
521 if (n_closing < num_quotes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000522 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100523 }
524
Damiena5185f42013-10-20 14:41:27 +0100525 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000526 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100527
528 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000529 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100530
Damiena5185f42013-10-20 14:41:27 +0100531 // get first char
532 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100533 next_char(lex);
534
Damiena5185f42013-10-20 14:41:27 +0100535 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100536 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100537 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100538 next_char(lex);
539 }
540
541 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000542 lex->tok_kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100543
Damiena5185f42013-10-20 14:41:27 +0100544 // get first char
545 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100546 next_char(lex);
547
Damiena5185f42013-10-20 14:41:27 +0100548 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100549 while (!is_end(lex)) {
550 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100551 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100552 next_char(lex);
553 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100554 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100555 next_char(lex);
556 }
557 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100558 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100559 next_char(lex);
560 } else {
561 break;
562 }
563 }
564
Damien George2e9eb2d2014-04-10 12:19:33 +0100565 } else if (is_char(lex, '.')) {
566 // special handling for . and ... operators, because .. is not a valid operator
567
568 // get first char
569 vstr_add_char(&lex->vstr, '.');
570 next_char(lex);
571
572 if (is_char_and(lex, '.', '.')) {
573 vstr_add_char(&lex->vstr, '.');
574 vstr_add_char(&lex->vstr, '.');
575 next_char(lex);
576 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000577 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100578 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000579 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100580 }
581
Damien429d7192013-10-04 19:53:11 +0100582 } else {
583 // search for encoded delimiter or operator
584
585 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100586 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100587 for (; *t != 0 && !is_char(lex, *t); t += 1) {
588 if (*t == 'e' || *t == 'c') {
589 t += 1;
590 } else if (*t == 'E') {
591 tok_enc_index -= 1;
592 t += 1;
593 }
594 tok_enc_index += 1;
595 }
596
597 next_char(lex);
598
599 if (*t == 0) {
600 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000601 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100602
603 } else {
604 // matched a delimiter or operator character
605
606 // get the maximum characters for a valid token
607 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100608 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100609 for (;;) {
610 for (; *t == 'e'; t += 1) {
611 t += 1;
612 t_index += 1;
613 if (is_char(lex, *t)) {
614 next_char(lex);
615 tok_enc_index = t_index;
616 break;
617 }
618 }
619
620 if (*t == 'E') {
621 t += 1;
622 if (is_char(lex, *t)) {
623 next_char(lex);
624 tok_enc_index = t_index;
625 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000626 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100627 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100628 }
629 break;
630 }
631
632 if (*t == 'c') {
633 t += 1;
634 t_index += 1;
635 if (is_char(lex, *t)) {
636 next_char(lex);
637 tok_enc_index = t_index;
638 t += 1;
639 } else {
640 break;
641 }
642 } else {
643 break;
644 }
645 }
646
647 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000648 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100649
Damien George2e9eb2d2014-04-10 12:19:33 +0100650 tok_enc_no_match:
651
Damien429d7192013-10-04 19:53:11 +0100652 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000653 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100654 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000655 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100656 lex->nested_bracket_level -= 1;
657 }
658 }
659 }
660
Damiena5185f42013-10-20 14:41:27 +0100661 // check for keywords
Damien Georgea4c52c52014-12-05 19:35:18 +0000662 if (lex->tok_kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100663 // We check for __debug__ here and convert it to its value. This is so
664 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
665 // need to check for this special token in many places in the compiler.
666 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100667 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
Damien George963a5a32015-01-16 17:47:07 +0000668 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000669 if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200670 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
671 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Damien Georgeb4b10fd2015-01-01 23:30:53 +0000672 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100673 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000674 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
Damien George97f9a282014-05-12 23:07:34 +0100675 }
Damien429d7192013-10-04 19:53:11 +0100676 break;
677 }
678 }
679 }
680}
681
Damien George94fbe972014-07-30 11:46:05 +0100682mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
Damien George9bf5f282014-10-09 16:53:37 +0100683 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100684
685 // check for memory allocation error
686 if (lex == NULL) {
687 if (stream_close) {
688 stream_close(stream_data);
689 }
690 return NULL;
691 }
Damien429d7192013-10-04 19:53:11 +0100692
Damien Georgeb829b5c2014-01-25 13:51:19 +0000693 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100694 lex->stream_data = stream_data;
Damien George94fbe972014-07-30 11:46:05 +0100695 lex->stream_next_byte = stream_next_byte;
Damienfa2162b2013-10-20 17:42:00 +0100696 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100697 lex->line = 1;
698 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100699 lex->emit_dent = 0;
700 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100701 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100702 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100703 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200704 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100705
Damien Georgee1199ec2014-05-10 17:48:01 +0100706 // check for memory allocation error
707 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
708 mp_lexer_free(lex);
709 return NULL;
710 }
711
712 // store sentinel for first indentation level
713 lex->indent_level[0] = 0;
714
Damien429d7192013-10-04 19:53:11 +0100715 // preload characters
Damien George94fbe972014-07-30 11:46:05 +0100716 lex->chr0 = stream_next_byte(stream_data);
717 lex->chr1 = stream_next_byte(stream_data);
718 lex->chr2 = stream_next_byte(stream_data);
Damiena5185f42013-10-20 14:41:27 +0100719
720 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100721 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100722 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100723 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100724 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100725 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100726 }
Damien George94fbe972014-07-30 11:46:05 +0100727 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100728 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100729 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100730 }
Damien429d7192013-10-04 19:53:11 +0100731 }
732
Damiena5185f42013-10-20 14:41:27 +0100733 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000734 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100735
736 return lex;
737}
738
Damiend99b0522013-12-21 18:17:45 +0000739void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100740 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100741 if (lex->stream_close) {
742 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100743 }
Damienbb5316b2013-10-22 21:12:29 +0100744 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200745 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000746 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100747 }
Damien429d7192013-10-04 19:53:11 +0100748}
749
Damiend99b0522013-12-21 18:17:45 +0000750void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000751 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100752}
753
#if MICROPY_DEBUG_PRINTERS
// Debugging aid: prints the current token's position, kind, text pointer and
// text length, followed by the text itself with every non-printable
// character shown as '?'.
void mp_lexer_show_token(const mp_lexer_t *lex) {
    printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
    if (lex->vstr.len > 0) {
        const byte *cur = (const byte *)lex->vstr.buf;
        const byte *end = (const byte *)cur + lex->vstr.len;
        printf(" ");
        // walk the text one UTF-8 character at a time
        for (; cur < end; cur = utf8_next_char(cur)) {
            unichar c = utf8_get_char(cur);
            if (!unichar_isprint(c)) {
                printf("?");
            } else {
                printf("%c", c);
            }
        }
    }
    printf("\n");
}
#endif