blob: 536208e41f7f6a831c68f1da20986f350760037a [file] [log] [blame]
Damien George04b91472014-05-03 23:27:38 +01001/*
2 * This file is part of the Micro Python project, http://micropython.org/
3 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26
Damien429d7192013-10-04 19:53:11 +010027#include <stdio.h>
28#include <assert.h>
29
Damien Georgeb4b10fd2015-01-01 23:30:53 +000030#include "py/mpstate.h"
Damien George51dfcb42015-01-01 20:27:54 +000031#include "py/lexer.h"
Damien429d7192013-10-04 19:53:11 +010032
33#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010034
Damien92c06562013-10-22 22:32:27 +010035// TODO seems that CPython allows NULL byte in the input stream
36// don't know if that's intentional or not, but we don't allow it
37
Damien George9528cd62014-01-15 21:23:31 +000038// TODO replace with a call to a standard function
// Compare the NUL-terminated string `str` with the length-delimited buffer
// `strn` of exactly `len` bytes (`strn` need not be NUL-terminated).
// Returns true only when `str` is exactly `len` bytes long and every byte matches.
STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
    mp_uint_t i = 0;

    // Stop at the terminator of str: otherwise a NUL byte inside strn would
    // compare equal to it and the scan would continue past the end of str.
    while (i < len && *str != 0 && *str == *strn) {
        ++i;
        ++str;
        ++strn;
    }

    return i == len && *str == 0;
}
50
Damiena5185f42013-10-20 14:41:27 +010051#define CUR_CHAR(lex) ((lex)->chr0)
52
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020053STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010054 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010055}
56
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020057STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien George32bade12015-01-30 00:27:46 +000058 return lex->chr0 == '\n';
Damien429d7192013-10-04 19:53:11 +010059}
60
Damien George2e2e4042015-03-19 00:21:29 +000061STATIC bool is_char(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010062 return lex->chr0 == c;
63}
64
Damien George2e2e4042015-03-19 00:21:29 +000065STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010066 return lex->chr0 == c1 || lex->chr0 == c2;
67}
68
Damien George2e2e4042015-03-19 00:21:29 +000069STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
Damien429d7192013-10-04 19:53:11 +010070 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
71}
72
73/*
Damien George2e2e4042015-03-19 00:21:29 +000074STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010075 return lex->chr1 == c;
76}
77*/
78
Damien George2e2e4042015-03-19 00:21:29 +000079STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010080 return lex->chr1 == c1 || lex->chr1 == c2;
81}
82
Damien George2e2e4042015-03-19 00:21:29 +000083STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010084 return lex->chr2 == c1 || lex->chr2 == c2;
85}
86
Damien George2e2e4042015-03-19 00:21:29 +000087STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010088 return lex->chr0 == c1 && lex->chr1 == c2;
89}
90
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020091STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000092 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010093}
94
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020095STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000096 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010097}
98
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020099STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000100 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100101}
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000104 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100105}
106
Damien George7d414a12015-02-08 01:57:40 +0000107STATIC bool is_following_letter(mp_lexer_t *lex) {
108 return unichar_isalpha(lex->chr1);
109}
110
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200111STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200112 return lex->chr1 >= '0' && lex->chr1 <= '7';
113}
114
Damien429d7192013-10-04 19:53:11 +0100115// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200116STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100117 return is_letter(lex) || lex->chr0 == '_';
118}
119
120// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200121STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100122 return is_head_of_identifier(lex) || is_digit(lex);
123}
124
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200125STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100126 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100127 return;
128 }
129
Damien429d7192013-10-04 19:53:11 +0100130 if (lex->chr0 == '\n') {
Damien George32bade12015-01-30 00:27:46 +0000131 // a new line
Damien429d7192013-10-04 19:53:11 +0100132 ++lex->line;
133 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100134 } else if (lex->chr0 == '\t') {
135 // a tab
136 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
137 } else {
138 // a character worth one column
139 ++lex->column;
140 }
141
Damien George32bade12015-01-30 00:27:46 +0000142 lex->chr0 = lex->chr1;
143 lex->chr1 = lex->chr2;
144 lex->chr2 = lex->stream_next_byte(lex->stream_data);
145
146 if (lex->chr0 == '\r') {
147 // CR is a new line, converted to LF
148 lex->chr0 = '\n';
149 if (lex->chr1 == '\n') {
150 // CR LF is a single new line
151 lex->chr1 = lex->chr2;
152 lex->chr2 = lex->stream_next_byte(lex->stream_data);
153 }
154 }
155
156 if (lex->chr2 == MP_LEXER_EOF) {
157 // EOF, check if we need to insert a newline at end of file
158 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
159 // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
160 // otherwise it just inserts a LF
161 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100162 }
163 }
164}
165
Damien Georgea4c52c52014-12-05 19:35:18 +0000166STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100167 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100168 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100169 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
170 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100171 }
172 lex->indent_level[lex->num_indent_level++] = indent;
173}
174
Damien Georgea4c52c52014-12-05 19:35:18 +0000175STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100176 return lex->indent_level[lex->num_indent_level - 1];
177}
178
Damien Georgea4c52c52014-12-05 19:35:18 +0000179STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100180 lex->num_indent_level -= 1;
181}
182
183// some tricky operator encoding:
184// <op> = begin with <op>, if this opchar matches then begin here
185// e<op> = end with <op>, if this opchar matches then end
186// E<op> = mandatory end with <op>, this opchar must match, then end
187// c<op> = continue with <op>, if this opchar matches then continue matching
188// this means if the start of two ops are the same then they are equal til the last char
189
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200190STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100191 "()[]{},:;@~" // singles
192 "<e=c<e=" // < <= << <<=
193 ">e=c>e=" // > >= >> >>=
194 "*e=c*e=" // * *= ** **=
195 "+e=" // + +=
196 "-e=e>" // - -= ->
197 "&e=" // & &=
198 "|e=" // | |=
199 "/e=c/e=" // / /= // //=
200 "%e=" // % %=
201 "^e=" // ^ ^=
202 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100203 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100204
205// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200206STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000207 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
208 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
209 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
210 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100211
Damiend99b0522013-12-21 18:17:45 +0000212 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
213 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
214 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
215 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
216 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
217 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
218 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
219 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
220 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
221 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
222 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
223 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100224};
225
226// must have the same order as enum in lexer.h
// Keyword spellings; entry i corresponds to token kind MP_TOKEN_KW_FALSE + i.
// The final entry, "__debug__", is special-cased by the lexer.
// The array itself is const (not just the strings) because it is never modified.
STATIC const char *const tok_kw[] = {
    "False",
    "None",
    "True",
    "and",
    "as",
    "assert",
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
    "__debug__",
};
263
// Convert a hexadecimal digit character to its numeric value (0..15).
// c is assumed to be a hex digit; no validation is performed.
STATIC mp_uint_t hex_digit(unichar c) {
    mp_uint_t value = c - '0';
    if (value <= 9) {
        // decimal digit
        return value;
    }
    // letter: clear the lower-case bit so 'a'..'f' fold onto 'A'..'F'
    value &= ~(mp_uint_t)('a' - 'A');
    // then close the gap between '9' and 'A' so that 'A' maps to 10
    return value - ('A' - ('9' + 1));
}
273
274// This is called with CUR_CHAR() before first hex digit, and should return with
275// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100276// num_digits must be greater than zero
277STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
278 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200279 while (num_digits-- != 0) {
280 next_char(lex);
281 unichar c = CUR_CHAR(lex);
282 if (!unichar_isxdigit(c)) {
283 return false;
284 }
285 num = (num << 4) + hex_digit(c);
286 }
287 *result = num;
288 return true;
289}
290
Damien Georgea4c52c52014-12-05 19:35:18 +0000291STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
292 // start new token text
293 vstr_reset(&lex->vstr);
294
Damiena5185f42013-10-20 14:41:27 +0100295 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100296 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100297 while (!is_end(lex)) {
298 if (is_physical_newline(lex)) {
299 had_physical_newline = true;
300 next_char(lex);
301 } else if (is_whitespace(lex)) {
302 next_char(lex);
303 } else if (is_char(lex, '#')) {
304 next_char(lex);
305 while (!is_end(lex) && !is_physical_newline(lex)) {
306 next_char(lex);
307 }
308 // had_physical_newline will be set on next loop
309 } else if (is_char(lex, '\\')) {
310 // backslash (outside string literals) must appear just before a physical newline
311 next_char(lex);
312 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000313 // SyntaxError: unexpected character after line continuation character
Damien Georgea4c52c52014-12-05 19:35:18 +0000314 lex->tok_line = lex->line;
315 lex->tok_column = lex->column;
316 lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
Damien George69a818d2014-01-12 13:55:24 +0000317 return;
Damien429d7192013-10-04 19:53:11 +0100318 } else {
319 next_char(lex);
320 }
321 } else {
322 break;
323 }
324 }
325
Damiena5185f42013-10-20 14:41:27 +0100326 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000327 lex->tok_line = lex->line;
328 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100329
330 if (first_token && lex->line == 1 && lex->column != 1) {
331 // check that the first token is in the first column
332 // if first token is not on first line, we get a physical newline and
333 // this check is done as part of normal indent/dedent checking below
334 // (done to get equivalence with CPython)
Damien Georgea4c52c52014-12-05 19:35:18 +0000335 lex->tok_kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100336
337 } else if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000338 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100339 lex->emit_dent += 1;
340
341 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000342 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100343 lex->emit_dent -= 1;
344
Damien91d387d2013-10-09 15:09:52 +0100345 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000346 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100347
Damien George54eb4e72014-07-03 13:47:47 +0100348 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100349 lex->emit_dent = 0;
350 if (num_spaces == indent_top(lex)) {
351 } else if (num_spaces > indent_top(lex)) {
352 indent_push(lex, num_spaces);
353 lex->emit_dent += 1;
354 } else {
355 while (num_spaces < indent_top(lex)) {
356 indent_pop(lex);
357 lex->emit_dent -= 1;
358 }
359 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000360 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100361 }
362 }
363
364 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100365 if (indent_top(lex) > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000366 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100367 lex->emit_dent = 0;
368 while (indent_top(lex) > 0) {
369 indent_pop(lex);
370 lex->emit_dent -= 1;
371 }
372 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000373 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100374 }
375
376 } else if (is_char_or(lex, '\'', '\"')
377 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
378 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
379 // a string or bytes literal
380
381 // parse type codes
382 bool is_raw = false;
383 bool is_bytes = false;
384 if (is_char(lex, 'u')) {
385 next_char(lex);
386 } else if (is_char(lex, 'b')) {
387 is_bytes = true;
388 next_char(lex);
389 if (is_char(lex, 'r')) {
390 is_raw = true;
391 next_char(lex);
392 }
393 } else if (is_char(lex, 'r')) {
394 is_raw = true;
395 next_char(lex);
396 if (is_char(lex, 'b')) {
397 is_bytes = true;
398 next_char(lex);
399 }
400 }
401
402 // set token kind
403 if (is_bytes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000404 lex->tok_kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100405 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000406 lex->tok_kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100407 }
408
409 // get first quoting character
410 char quote_char = '\'';
411 if (is_char(lex, '\"')) {
412 quote_char = '\"';
413 }
414 next_char(lex);
415
416 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100417 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100418 if (is_char_and(lex, quote_char, quote_char)) {
419 // triple quotes
420 next_char(lex);
421 next_char(lex);
422 num_quotes = 3;
423 } else {
424 // single quotes
425 num_quotes = 1;
426 }
427
Damien429d7192013-10-04 19:53:11 +0100428 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100429 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100430 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
431 if (is_char(lex, quote_char)) {
432 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100433 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100434 } else {
435 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100436 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100437 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100438 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100439 if (is_raw) {
440 // raw strings allow escaping of quotes, but the backslash is also emitted
441 vstr_add_char(&lex->vstr, '\\');
442 } else {
443 switch (c) {
Damien George94fbe972014-07-30 11:46:05 +0100444 case MP_LEXER_EOF: break; // TODO a proper error message?
445 case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100446 case '\\': break;
447 case '\'': break;
448 case '"': break;
449 case 'a': c = 0x07; break;
450 case 'b': c = 0x08; break;
451 case 't': c = 0x09; break;
452 case 'n': c = 0x0a; break;
453 case 'v': c = 0x0b; break;
454 case 'f': c = 0x0c; break;
455 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000456 case 'u':
457 case 'U':
458 if (is_bytes) {
459 // b'\u1234' == b'\\u1234'
460 vstr_add_char(&lex->vstr, '\\');
461 break;
462 }
463 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100464 case 'x':
465 {
Damien George54eb4e72014-07-03 13:47:47 +0100466 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000467 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georgea91f4142014-04-10 11:30:55 +0100468 // TODO error message
469 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200470 }
471 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100472 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200473 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000474 case 'N':
475 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
476 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
477 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
478 // roughly half a meg of storage. This form of Unicode escape may be added
479 // later on, but it's definitely not a priority right now. -- CJA 20140607
480 assert(!"Unicode name escapes not supported");
481 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100482 default:
483 if (c >= '0' && c <= '7') {
484 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100485 mp_uint_t digits = 3;
486 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100487 while (is_following_odigit(lex) && --digits != 0) {
488 next_char(lex);
489 num = num * 8 + (CUR_CHAR(lex) - '0');
490 }
491 c = num;
492 } else {
493 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
494 vstr_add_char(&lex->vstr, '\\');
495 }
496 break;
497 }
Damiena5185f42013-10-20 14:41:27 +0100498 }
Damien George94fbe972014-07-30 11:46:05 +0100499 if (c != MP_LEXER_EOF) {
Damien George16677ce2015-01-28 14:07:11 +0000500 #if MICROPY_PY_BUILTINS_STR_UNICODE
Chris Angelico2ba22992014-06-04 05:28:12 +1000501 if (c < 0x110000 && !is_bytes) {
502 vstr_add_char(&lex->vstr, c);
503 } else if (c < 0x100 && is_bytes) {
504 vstr_add_byte(&lex->vstr, c);
Damien George16677ce2015-01-28 14:07:11 +0000505 }
506 #else
507 // without unicode everything is just added as an 8-bit byte
508 if (c < 0x100) {
509 vstr_add_byte(&lex->vstr, c);
510 }
511 #endif
512 else {
Chris Angelico2ba22992014-06-04 05:28:12 +1000513 assert(!"TODO: Throw an error, invalid escape code probably");
514 }
Damiena5185f42013-10-20 14:41:27 +0100515 }
516 } else {
Damien George94fbe972014-07-30 11:46:05 +0100517 // Add the "character" as a byte so that we remain 8-bit clean.
518 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
519 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100520 }
521 }
522 next_char(lex);
523 }
524
525 // check we got the required end quotes
526 if (n_closing < num_quotes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000527 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100528 }
529
Damiena5185f42013-10-20 14:41:27 +0100530 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000531 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100532
533 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000534 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100535
Damiena5185f42013-10-20 14:41:27 +0100536 // get first char
537 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100538 next_char(lex);
539
Damiena5185f42013-10-20 14:41:27 +0100540 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100541 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100542 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100543 next_char(lex);
544 }
545
546 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien George7d414a12015-02-08 01:57:40 +0000547 bool forced_integer = false;
548 if (is_char(lex, '.')) {
549 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
550 } else {
551 lex->tok_kind = MP_TOKEN_INTEGER;
552 if (is_char(lex, '0') && is_following_letter(lex)) {
553 forced_integer = true;
554 }
555 }
Damien429d7192013-10-04 19:53:11 +0100556
Damiena5185f42013-10-20 14:41:27 +0100557 // get first char
558 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100559 next_char(lex);
560
Damiena5185f42013-10-20 14:41:27 +0100561 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100562 while (!is_end(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000563 if (!forced_integer && is_char_or(lex, 'e', 'E')) {
564 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
Damiena5185f42013-10-20 14:41:27 +0100565 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100566 next_char(lex);
567 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100568 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100569 next_char(lex);
570 }
Damien George7d414a12015-02-08 01:57:40 +0000571 } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
572 if (is_char_or3(lex, '.', 'j', 'J')) {
573 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
574 }
Damiena5185f42013-10-20 14:41:27 +0100575 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100576 next_char(lex);
577 } else {
578 break;
579 }
580 }
581
Damien George2e9eb2d2014-04-10 12:19:33 +0100582 } else if (is_char(lex, '.')) {
583 // special handling for . and ... operators, because .. is not a valid operator
584
585 // get first char
586 vstr_add_char(&lex->vstr, '.');
587 next_char(lex);
588
589 if (is_char_and(lex, '.', '.')) {
590 vstr_add_char(&lex->vstr, '.');
591 vstr_add_char(&lex->vstr, '.');
592 next_char(lex);
593 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000594 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100595 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000596 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100597 }
598
Damien429d7192013-10-04 19:53:11 +0100599 } else {
600 // search for encoded delimiter or operator
601
602 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100603 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100604 for (; *t != 0 && !is_char(lex, *t); t += 1) {
605 if (*t == 'e' || *t == 'c') {
606 t += 1;
607 } else if (*t == 'E') {
608 tok_enc_index -= 1;
609 t += 1;
610 }
611 tok_enc_index += 1;
612 }
613
614 next_char(lex);
615
616 if (*t == 0) {
617 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000618 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100619
620 } else {
621 // matched a delimiter or operator character
622
623 // get the maximum characters for a valid token
624 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100625 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100626 for (;;) {
627 for (; *t == 'e'; t += 1) {
628 t += 1;
629 t_index += 1;
630 if (is_char(lex, *t)) {
631 next_char(lex);
632 tok_enc_index = t_index;
633 break;
634 }
635 }
636
637 if (*t == 'E') {
638 t += 1;
639 if (is_char(lex, *t)) {
640 next_char(lex);
641 tok_enc_index = t_index;
642 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000643 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100644 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100645 }
646 break;
647 }
648
649 if (*t == 'c') {
650 t += 1;
651 t_index += 1;
652 if (is_char(lex, *t)) {
653 next_char(lex);
654 tok_enc_index = t_index;
655 t += 1;
656 } else {
657 break;
658 }
659 } else {
660 break;
661 }
662 }
663
664 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000665 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100666
Damien George2e9eb2d2014-04-10 12:19:33 +0100667 tok_enc_no_match:
668
Damien429d7192013-10-04 19:53:11 +0100669 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000670 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100671 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000672 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100673 lex->nested_bracket_level -= 1;
674 }
675 }
676 }
677
Damiena5185f42013-10-20 14:41:27 +0100678 // check for keywords
Damien Georgea4c52c52014-12-05 19:35:18 +0000679 if (lex->tok_kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100680 // We check for __debug__ here and convert it to its value. This is so
681 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
682 // need to check for this special token in many places in the compiler.
683 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100684 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
Damien George963a5a32015-01-16 17:47:07 +0000685 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000686 if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200687 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
688 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Damien Georgeb4b10fd2015-01-01 23:30:53 +0000689 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100690 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000691 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
Damien George97f9a282014-05-12 23:07:34 +0100692 }
Damien429d7192013-10-04 19:53:11 +0100693 break;
694 }
695 }
696 }
697}
698
Damien George94fbe972014-07-30 11:46:05 +0100699mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
Damien George9bf5f282014-10-09 16:53:37 +0100700 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100701
702 // check for memory allocation error
703 if (lex == NULL) {
704 if (stream_close) {
705 stream_close(stream_data);
706 }
707 return NULL;
708 }
Damien429d7192013-10-04 19:53:11 +0100709
Damien Georgeb829b5c2014-01-25 13:51:19 +0000710 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100711 lex->stream_data = stream_data;
Damien George94fbe972014-07-30 11:46:05 +0100712 lex->stream_next_byte = stream_next_byte;
Damienfa2162b2013-10-20 17:42:00 +0100713 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100714 lex->line = 1;
715 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100716 lex->emit_dent = 0;
717 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100718 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100719 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100720 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200721 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100722
Damien Georgee1199ec2014-05-10 17:48:01 +0100723 // check for memory allocation error
724 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
725 mp_lexer_free(lex);
726 return NULL;
727 }
728
729 // store sentinel for first indentation level
730 lex->indent_level[0] = 0;
731
Damien429d7192013-10-04 19:53:11 +0100732 // preload characters
Damien George94fbe972014-07-30 11:46:05 +0100733 lex->chr0 = stream_next_byte(stream_data);
734 lex->chr1 = stream_next_byte(stream_data);
735 lex->chr2 = stream_next_byte(stream_data);
Damiena5185f42013-10-20 14:41:27 +0100736
737 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100738 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100739 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100740 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000741 if (lex->chr0 == '\r') {
742 lex->chr0 = '\n';
743 } else if (lex->chr0 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100744 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100745 }
Damien George94fbe972014-07-30 11:46:05 +0100746 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000747 if (lex->chr1 == '\r') {
748 lex->chr1 = '\n';
749 } else if (lex->chr1 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100750 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100751 }
Damien429d7192013-10-04 19:53:11 +0100752 }
753
Damiena5185f42013-10-20 14:41:27 +0100754 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000755 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100756
757 return lex;
758}
759
Damiend99b0522013-12-21 18:17:45 +0000760void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100761 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100762 if (lex->stream_close) {
763 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100764 }
Damienbb5316b2013-10-22 21:12:29 +0100765 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200766 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000767 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100768 }
Damien429d7192013-10-04 19:53:11 +0100769}
770
Damiend99b0522013-12-21 18:17:45 +0000771void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000772 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100773}
774
#if MICROPY_DEBUG_PRINTERS
// Debug helper: print the current token's position, kind, buffer pointer and
// length, followed by its text with non-printable characters shown as '?'.
void mp_lexer_show_token(const mp_lexer_t *lex) {
    printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
    if (lex->vstr.len > 0) {
        const byte *end = (const byte *)lex->vstr.buf + lex->vstr.len;
        printf(" ");
        // walk the token text one utf-8 character at a time
        for (const byte *p = (const byte *)lex->vstr.buf; p < end; p = utf8_next_char(p)) {
            unichar c = utf8_get_char(p);
            if (!unichar_isprint(c)) {
                printf("?");
            } else {
                printf("%c", c);
            }
        }
    }
    printf("\n");
}
#endif