blob: 12cb5ae5b2c696b38e8ac24753ae65f080b392e8 [file] [log] [blame]
Damien George04b91472014-05-03 23:27:38 +01001/*
2 * This file is part of the Micro Python project, http://micropython.org/
3 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26
Damien429d7192013-10-04 19:53:11 +010027#include <stdio.h>
28#include <assert.h>
29
Damien Georgeb4b10fd2015-01-01 23:30:53 +000030#include "py/mpstate.h"
Damien George51dfcb42015-01-01 20:27:54 +000031#include "py/lexer.h"
Damien429d7192013-10-04 19:53:11 +010032
// Distance between tab stops, in columns; used by next_char() when a tab
// character advances the column counter.
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010034
Damien92c06562013-10-22 22:32:27 +010035// TODO seems that CPython allows NULL byte in the input stream
36// don't know if that's intentional or not, but we don't allow it
37
Damien George9528cd62014-01-15 21:23:31 +000038// TODO replace with a call to a standard function
Damien Georgea4c52c52014-12-05 19:35:18 +000039STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
Damien George54eb4e72014-07-03 13:47:47 +010040 mp_uint_t i = 0;
Damien429d7192013-10-04 19:53:11 +010041
Damiena5185f42013-10-20 14:41:27 +010042 while (i < len && *str == *strn) {
Damien429d7192013-10-04 19:53:11 +010043 ++i;
Damien429d7192013-10-04 19:53:11 +010044 ++str;
Damiena5185f42013-10-20 14:41:27 +010045 ++strn;
Damien429d7192013-10-04 19:53:11 +010046 }
47
Damiena5185f42013-10-20 14:41:27 +010048 return i == len && *str == 0;
Damien429d7192013-10-04 19:53:11 +010049}
50
// The character currently under the lexer's cursor (first of the lookahead characters).
#define CUR_CHAR(lex) ((lex)->chr0)
52
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020053STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010054 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010055}
56
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020057STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien George32bade12015-01-30 00:27:46 +000058 return lex->chr0 == '\n';
Damien429d7192013-10-04 19:53:11 +010059}
60
Damien George2e2e4042015-03-19 00:21:29 +000061STATIC bool is_char(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010062 return lex->chr0 == c;
63}
64
Damien George2e2e4042015-03-19 00:21:29 +000065STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010066 return lex->chr0 == c1 || lex->chr0 == c2;
67}
68
Damien George2e2e4042015-03-19 00:21:29 +000069STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
Damien429d7192013-10-04 19:53:11 +010070 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
71}
72
73/*
Damien George2e2e4042015-03-19 00:21:29 +000074STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
Damien429d7192013-10-04 19:53:11 +010075 return lex->chr1 == c;
76}
77*/
78
Damien George2e2e4042015-03-19 00:21:29 +000079STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010080 return lex->chr1 == c1 || lex->chr1 == c2;
81}
82
Damien George2e2e4042015-03-19 00:21:29 +000083STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010084 return lex->chr2 == c1 || lex->chr2 == c2;
85}
86
Damien George2e2e4042015-03-19 00:21:29 +000087STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
Damien429d7192013-10-04 19:53:11 +010088 return lex->chr0 == c1 && lex->chr1 == c2;
89}
90
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020091STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000092 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010093}
94
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020095STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000096 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010097}
98
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020099STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000100 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100101}
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000104 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100105}
106
Damien George7d414a12015-02-08 01:57:40 +0000107STATIC bool is_following_letter(mp_lexer_t *lex) {
108 return unichar_isalpha(lex->chr1);
109}
110
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200111STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200112 return lex->chr1 >= '0' && lex->chr1 <= '7';
113}
114
Damien429d7192013-10-04 19:53:11 +0100115// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200116STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100117 return is_letter(lex) || lex->chr0 == '_';
118}
119
120// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200121STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100122 return is_head_of_identifier(lex) || is_digit(lex);
123}
124
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200125STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100126 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100127 return;
128 }
129
Damien429d7192013-10-04 19:53:11 +0100130 if (lex->chr0 == '\n') {
Damien George32bade12015-01-30 00:27:46 +0000131 // a new line
Damien429d7192013-10-04 19:53:11 +0100132 ++lex->line;
133 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100134 } else if (lex->chr0 == '\t') {
135 // a tab
136 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
137 } else {
138 // a character worth one column
139 ++lex->column;
140 }
141
Damien George32bade12015-01-30 00:27:46 +0000142 lex->chr0 = lex->chr1;
143 lex->chr1 = lex->chr2;
144 lex->chr2 = lex->stream_next_byte(lex->stream_data);
145
146 if (lex->chr0 == '\r') {
147 // CR is a new line, converted to LF
148 lex->chr0 = '\n';
149 if (lex->chr1 == '\n') {
150 // CR LF is a single new line
151 lex->chr1 = lex->chr2;
152 lex->chr2 = lex->stream_next_byte(lex->stream_data);
153 }
154 }
155
156 if (lex->chr2 == MP_LEXER_EOF) {
157 // EOF, check if we need to insert a newline at end of file
158 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
159 // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
160 // otherwise it just inserts a LF
161 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100162 }
163 }
164}
165
Damien Georgea4c52c52014-12-05 19:35:18 +0000166STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100167 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100168 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100169 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
170 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100171 }
172 lex->indent_level[lex->num_indent_level++] = indent;
173}
174
Damien Georgea4c52c52014-12-05 19:35:18 +0000175STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100176 return lex->indent_level[lex->num_indent_level - 1];
177}
178
Damien Georgea4c52c52014-12-05 19:35:18 +0000179STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100180 lex->num_indent_level -= 1;
181}
182
183// some tricky operator encoding:
184// <op> = begin with <op>, if this opchar matches then begin here
185// e<op> = end with <op>, if this opchar matches then end
186// E<op> = mandatory end with <op>, this opchar must match, then end
187// c<op> = continue with <op>, if this opchar matches then continue matching
188// this means if the start of two ops are the same then they are equal til the last char
189
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200190STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100191 "()[]{},:;@~" // singles
192 "<e=c<e=" // < <= << <<=
193 ">e=c>e=" // > >= >> >>=
194 "*e=c*e=" // * *= ** **=
195 "+e=" // + +=
196 "-e=e>" // - -= ->
197 "&e=" // & &=
198 "|e=" // | |=
199 "/e=c/e=" // / /= // //=
200 "%e=" // % %=
201 "^e=" // ^ ^=
202 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100203 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100204
205// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200206STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000207 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
208 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
209 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
210 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100211
Damiend99b0522013-12-21 18:17:45 +0000212 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
213 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
214 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
215 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
216 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
217 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
218 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
219 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
220 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
221 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
222 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
223 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100224};
225
226// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200227STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100228 "False",
229 "None",
230 "True",
231 "and",
232 "as",
233 "assert",
234 "break",
235 "class",
236 "continue",
237 "def",
238 "del",
239 "elif",
240 "else",
241 "except",
242 "finally",
243 "for",
244 "from",
245 "global",
246 "if",
247 "import",
248 "in",
249 "is",
250 "lambda",
251 "nonlocal",
252 "not",
253 "or",
254 "pass",
255 "raise",
256 "return",
257 "try",
258 "while",
259 "with",
260 "yield",
Damien George97f9a282014-05-12 23:07:34 +0100261 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100262};
263
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200264// This is called with CUR_CHAR() before first hex digit, and should return with
265// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100266// num_digits must be greater than zero
267STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
268 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200269 while (num_digits-- != 0) {
270 next_char(lex);
271 unichar c = CUR_CHAR(lex);
272 if (!unichar_isxdigit(c)) {
273 return false;
274 }
Dave Hylands3ad94d62015-05-18 14:41:25 -0700275 num = (num << 4) + unichar_xdigit_value(c);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200276 }
277 *result = num;
278 return true;
279}
280
Damien Georgea4c52c52014-12-05 19:35:18 +0000281STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
282 // start new token text
283 vstr_reset(&lex->vstr);
284
Damiena5185f42013-10-20 14:41:27 +0100285 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100286 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100287 while (!is_end(lex)) {
288 if (is_physical_newline(lex)) {
289 had_physical_newline = true;
290 next_char(lex);
291 } else if (is_whitespace(lex)) {
292 next_char(lex);
293 } else if (is_char(lex, '#')) {
294 next_char(lex);
295 while (!is_end(lex) && !is_physical_newline(lex)) {
296 next_char(lex);
297 }
298 // had_physical_newline will be set on next loop
299 } else if (is_char(lex, '\\')) {
300 // backslash (outside string literals) must appear just before a physical newline
301 next_char(lex);
302 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000303 // SyntaxError: unexpected character after line continuation character
Damien Georgea4c52c52014-12-05 19:35:18 +0000304 lex->tok_line = lex->line;
305 lex->tok_column = lex->column;
306 lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
Damien George69a818d2014-01-12 13:55:24 +0000307 return;
Damien429d7192013-10-04 19:53:11 +0100308 } else {
309 next_char(lex);
310 }
311 } else {
312 break;
313 }
314 }
315
Damiena5185f42013-10-20 14:41:27 +0100316 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000317 lex->tok_line = lex->line;
318 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100319
320 if (first_token && lex->line == 1 && lex->column != 1) {
321 // check that the first token is in the first column
322 // if first token is not on first line, we get a physical newline and
323 // this check is done as part of normal indent/dedent checking below
324 // (done to get equivalence with CPython)
Damien Georgea4c52c52014-12-05 19:35:18 +0000325 lex->tok_kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100326
327 } else if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000328 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100329 lex->emit_dent += 1;
330
331 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000332 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100333 lex->emit_dent -= 1;
334
Damien91d387d2013-10-09 15:09:52 +0100335 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000336 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100337
Damien George54eb4e72014-07-03 13:47:47 +0100338 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100339 lex->emit_dent = 0;
340 if (num_spaces == indent_top(lex)) {
341 } else if (num_spaces > indent_top(lex)) {
342 indent_push(lex, num_spaces);
343 lex->emit_dent += 1;
344 } else {
345 while (num_spaces < indent_top(lex)) {
346 indent_pop(lex);
347 lex->emit_dent -= 1;
348 }
349 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000350 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100351 }
352 }
353
354 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100355 if (indent_top(lex) > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000356 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100357 lex->emit_dent = 0;
358 while (indent_top(lex) > 0) {
359 indent_pop(lex);
360 lex->emit_dent -= 1;
361 }
362 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000363 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100364 }
365
366 } else if (is_char_or(lex, '\'', '\"')
367 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
368 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
369 // a string or bytes literal
370
371 // parse type codes
372 bool is_raw = false;
373 bool is_bytes = false;
374 if (is_char(lex, 'u')) {
375 next_char(lex);
376 } else if (is_char(lex, 'b')) {
377 is_bytes = true;
378 next_char(lex);
379 if (is_char(lex, 'r')) {
380 is_raw = true;
381 next_char(lex);
382 }
383 } else if (is_char(lex, 'r')) {
384 is_raw = true;
385 next_char(lex);
386 if (is_char(lex, 'b')) {
387 is_bytes = true;
388 next_char(lex);
389 }
390 }
391
392 // set token kind
393 if (is_bytes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000394 lex->tok_kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100395 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000396 lex->tok_kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100397 }
398
399 // get first quoting character
400 char quote_char = '\'';
401 if (is_char(lex, '\"')) {
402 quote_char = '\"';
403 }
404 next_char(lex);
405
406 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100407 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100408 if (is_char_and(lex, quote_char, quote_char)) {
409 // triple quotes
410 next_char(lex);
411 next_char(lex);
412 num_quotes = 3;
413 } else {
414 // single quotes
415 num_quotes = 1;
416 }
417
Damien429d7192013-10-04 19:53:11 +0100418 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100419 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100420 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
421 if (is_char(lex, quote_char)) {
422 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100423 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100424 } else {
425 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100426 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100427 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100428 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100429 if (is_raw) {
430 // raw strings allow escaping of quotes, but the backslash is also emitted
431 vstr_add_char(&lex->vstr, '\\');
432 } else {
433 switch (c) {
Damien George94fbe972014-07-30 11:46:05 +0100434 case MP_LEXER_EOF: break; // TODO a proper error message?
435 case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100436 case '\\': break;
437 case '\'': break;
438 case '"': break;
439 case 'a': c = 0x07; break;
440 case 'b': c = 0x08; break;
441 case 't': c = 0x09; break;
442 case 'n': c = 0x0a; break;
443 case 'v': c = 0x0b; break;
444 case 'f': c = 0x0c; break;
445 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000446 case 'u':
447 case 'U':
448 if (is_bytes) {
449 // b'\u1234' == b'\\u1234'
450 vstr_add_char(&lex->vstr, '\\');
451 break;
452 }
453 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100454 case 'x':
455 {
Damien George54eb4e72014-07-03 13:47:47 +0100456 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000457 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georgea91f4142014-04-10 11:30:55 +0100458 // TODO error message
459 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200460 }
461 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100462 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200463 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000464 case 'N':
465 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
466 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
467 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
468 // roughly half a meg of storage. This form of Unicode escape may be added
469 // later on, but it's definitely not a priority right now. -- CJA 20140607
470 assert(!"Unicode name escapes not supported");
471 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100472 default:
473 if (c >= '0' && c <= '7') {
474 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100475 mp_uint_t digits = 3;
476 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100477 while (is_following_odigit(lex) && --digits != 0) {
478 next_char(lex);
479 num = num * 8 + (CUR_CHAR(lex) - '0');
480 }
481 c = num;
482 } else {
483 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
484 vstr_add_char(&lex->vstr, '\\');
485 }
486 break;
487 }
Damiena5185f42013-10-20 14:41:27 +0100488 }
Damien George94fbe972014-07-30 11:46:05 +0100489 if (c != MP_LEXER_EOF) {
Damien George16677ce2015-01-28 14:07:11 +0000490 #if MICROPY_PY_BUILTINS_STR_UNICODE
Chris Angelico2ba22992014-06-04 05:28:12 +1000491 if (c < 0x110000 && !is_bytes) {
492 vstr_add_char(&lex->vstr, c);
493 } else if (c < 0x100 && is_bytes) {
494 vstr_add_byte(&lex->vstr, c);
Damien George16677ce2015-01-28 14:07:11 +0000495 }
496 #else
497 // without unicode everything is just added as an 8-bit byte
498 if (c < 0x100) {
499 vstr_add_byte(&lex->vstr, c);
500 }
501 #endif
502 else {
Chris Angelico2ba22992014-06-04 05:28:12 +1000503 assert(!"TODO: Throw an error, invalid escape code probably");
504 }
Damiena5185f42013-10-20 14:41:27 +0100505 }
506 } else {
Damien George94fbe972014-07-30 11:46:05 +0100507 // Add the "character" as a byte so that we remain 8-bit clean.
508 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
509 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100510 }
511 }
512 next_char(lex);
513 }
514
515 // check we got the required end quotes
516 if (n_closing < num_quotes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000517 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100518 }
519
Damiena5185f42013-10-20 14:41:27 +0100520 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000521 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100522
523 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000524 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100525
Damiena5185f42013-10-20 14:41:27 +0100526 // get first char
527 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100528 next_char(lex);
529
Damiena5185f42013-10-20 14:41:27 +0100530 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100531 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100532 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100533 next_char(lex);
534 }
535
536 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien George7d414a12015-02-08 01:57:40 +0000537 bool forced_integer = false;
538 if (is_char(lex, '.')) {
539 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
540 } else {
541 lex->tok_kind = MP_TOKEN_INTEGER;
542 if (is_char(lex, '0') && is_following_letter(lex)) {
543 forced_integer = true;
544 }
545 }
Damien429d7192013-10-04 19:53:11 +0100546
Damiena5185f42013-10-20 14:41:27 +0100547 // get first char
548 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100549 next_char(lex);
550
Damiena5185f42013-10-20 14:41:27 +0100551 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100552 while (!is_end(lex)) {
Damien George7d414a12015-02-08 01:57:40 +0000553 if (!forced_integer && is_char_or(lex, 'e', 'E')) {
554 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
Damiena5185f42013-10-20 14:41:27 +0100555 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100556 next_char(lex);
557 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100558 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100559 next_char(lex);
560 }
Damien George7d414a12015-02-08 01:57:40 +0000561 } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
562 if (is_char_or3(lex, '.', 'j', 'J')) {
563 lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
564 }
Damiena5185f42013-10-20 14:41:27 +0100565 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100566 next_char(lex);
567 } else {
568 break;
569 }
570 }
571
Damien George2e9eb2d2014-04-10 12:19:33 +0100572 } else if (is_char(lex, '.')) {
573 // special handling for . and ... operators, because .. is not a valid operator
574
575 // get first char
576 vstr_add_char(&lex->vstr, '.');
577 next_char(lex);
578
579 if (is_char_and(lex, '.', '.')) {
580 vstr_add_char(&lex->vstr, '.');
581 vstr_add_char(&lex->vstr, '.');
582 next_char(lex);
583 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000584 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100585 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000586 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100587 }
588
Damien429d7192013-10-04 19:53:11 +0100589 } else {
590 // search for encoded delimiter or operator
591
592 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100593 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100594 for (; *t != 0 && !is_char(lex, *t); t += 1) {
595 if (*t == 'e' || *t == 'c') {
596 t += 1;
597 } else if (*t == 'E') {
598 tok_enc_index -= 1;
599 t += 1;
600 }
601 tok_enc_index += 1;
602 }
603
604 next_char(lex);
605
606 if (*t == 0) {
607 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000608 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100609
610 } else {
611 // matched a delimiter or operator character
612
613 // get the maximum characters for a valid token
614 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100615 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100616 for (;;) {
617 for (; *t == 'e'; t += 1) {
618 t += 1;
619 t_index += 1;
620 if (is_char(lex, *t)) {
621 next_char(lex);
622 tok_enc_index = t_index;
623 break;
624 }
625 }
626
627 if (*t == 'E') {
628 t += 1;
629 if (is_char(lex, *t)) {
630 next_char(lex);
631 tok_enc_index = t_index;
632 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000633 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100634 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100635 }
636 break;
637 }
638
639 if (*t == 'c') {
640 t += 1;
641 t_index += 1;
642 if (is_char(lex, *t)) {
643 next_char(lex);
644 tok_enc_index = t_index;
645 t += 1;
646 } else {
647 break;
648 }
649 } else {
650 break;
651 }
652 }
653
654 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000655 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100656
Damien George2e9eb2d2014-04-10 12:19:33 +0100657 tok_enc_no_match:
658
Damien429d7192013-10-04 19:53:11 +0100659 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000660 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100661 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000662 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100663 lex->nested_bracket_level -= 1;
664 }
665 }
666 }
667
Damiena5185f42013-10-20 14:41:27 +0100668 // check for keywords
Damien Georgea4c52c52014-12-05 19:35:18 +0000669 if (lex->tok_kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100670 // We check for __debug__ here and convert it to its value. This is so
671 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
672 // need to check for this special token in many places in the compiler.
673 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100674 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
Damien George963a5a32015-01-16 17:47:07 +0000675 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000676 if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200677 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
678 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Damien Georgeb4b10fd2015-01-01 23:30:53 +0000679 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100680 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000681 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
Damien George97f9a282014-05-12 23:07:34 +0100682 }
Damien429d7192013-10-04 19:53:11 +0100683 break;
684 }
685 }
686 }
687}
688
Damien George94fbe972014-07-30 11:46:05 +0100689mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
Damien George9bf5f282014-10-09 16:53:37 +0100690 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100691
692 // check for memory allocation error
693 if (lex == NULL) {
694 if (stream_close) {
695 stream_close(stream_data);
696 }
697 return NULL;
698 }
Damien429d7192013-10-04 19:53:11 +0100699
Damien Georgeb829b5c2014-01-25 13:51:19 +0000700 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100701 lex->stream_data = stream_data;
Damien George94fbe972014-07-30 11:46:05 +0100702 lex->stream_next_byte = stream_next_byte;
Damienfa2162b2013-10-20 17:42:00 +0100703 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100704 lex->line = 1;
705 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100706 lex->emit_dent = 0;
707 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100708 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100709 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100710 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200711 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100712
Damien Georgee1199ec2014-05-10 17:48:01 +0100713 // check for memory allocation error
714 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
715 mp_lexer_free(lex);
716 return NULL;
717 }
718
719 // store sentinel for first indentation level
720 lex->indent_level[0] = 0;
721
Damien429d7192013-10-04 19:53:11 +0100722 // preload characters
Damien George94fbe972014-07-30 11:46:05 +0100723 lex->chr0 = stream_next_byte(stream_data);
724 lex->chr1 = stream_next_byte(stream_data);
725 lex->chr2 = stream_next_byte(stream_data);
Damiena5185f42013-10-20 14:41:27 +0100726
727 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100728 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100729 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100730 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000731 if (lex->chr0 == '\r') {
732 lex->chr0 = '\n';
733 } else if (lex->chr0 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100734 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100735 }
Damien George94fbe972014-07-30 11:46:05 +0100736 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000737 if (lex->chr1 == '\r') {
738 lex->chr1 = '\n';
739 } else if (lex->chr1 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100740 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100741 }
Damien429d7192013-10-04 19:53:11 +0100742 }
743
Damiena5185f42013-10-20 14:41:27 +0100744 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000745 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100746
747 return lex;
748}
749
Damiend99b0522013-12-21 18:17:45 +0000750void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100751 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100752 if (lex->stream_close) {
753 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100754 }
Damienbb5316b2013-10-22 21:12:29 +0100755 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200756 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000757 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100758 }
Damien429d7192013-10-04 19:53:11 +0100759}
760
Damiend99b0522013-12-21 18:17:45 +0000761void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000762 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100763}
764
#if MICROPY_DEBUG_PRINTERS
// Debug helper: print the current token's position, kind, text buffer
// address and length, followed by the token text with every character that
// unichar_isprint() rejects shown as '?'.
void mp_lexer_show_token(const mp_lexer_t *lex) {
    printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
    if (lex->vstr.len > 0) {
        const byte *text = (const byte *)lex->vstr.buf;
        const byte *text_end = (const byte *)text + lex->vstr.len;
        printf(" ");
        // walk the token text one UTF-8 character at a time
        for (const byte *p = text; p < text_end; p = utf8_next_char(p)) {
            unichar c = utf8_get_char(p);
            if (unichar_isprint(c)) {
                printf("%c", c);
            } else {
                printf("?");
            }
        }
    }
    printf("\n");
}
#endif