blob: e3d52e7141e9f0b55134ec77b4f81224a9618011 [file] [log] [blame]
/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
26
Damien429d7192013-10-04 19:53:11 +010027#include <stdio.h>
28#include <assert.h>
29
Damien Georgeb4b10fd2015-01-01 23:30:53 +000030#include "py/mpstate.h"
Damien George51dfcb42015-01-01 20:27:54 +000031#include "py/lexer.h"
Damien429d7192013-10-04 19:53:11 +010032
// Width of a tab stop in columns; next_char() advances the column counter to
// the next multiple of this value (plus one) when it consumes a '\t'.
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010034
// TODO seems that CPython allows a NUL byte in the input stream;
// don't know if that's intentional or not, but we don't allow it
37
Damien George9528cd62014-01-15 21:23:31 +000038// TODO replace with a call to a standard function
Damien Georgea4c52c52014-12-05 19:35:18 +000039STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
Damien George54eb4e72014-07-03 13:47:47 +010040 mp_uint_t i = 0;
Damien429d7192013-10-04 19:53:11 +010041
Damiena5185f42013-10-20 14:41:27 +010042 while (i < len && *str == *strn) {
Damien429d7192013-10-04 19:53:11 +010043 ++i;
Damien429d7192013-10-04 19:53:11 +010044 ++str;
Damiena5185f42013-10-20 14:41:27 +010045 ++strn;
Damien429d7192013-10-04 19:53:11 +010046 }
47
Damiena5185f42013-10-20 14:41:27 +010048 return i == len && *str == 0;
Damien429d7192013-10-04 19:53:11 +010049}
50
// The character the lexer is currently positioned on (the chr0 member).
#define CUR_CHAR(lex) ((lex)->chr0)
52
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020053STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010054 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010055}
56
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020057STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien George32bade12015-01-30 00:27:46 +000058 return lex->chr0 == '\n';
Damien429d7192013-10-04 19:53:11 +010059}
60
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020061STATIC bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010062 return lex->chr0 == c;
63}
64
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020065STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010066 return lex->chr0 == c1 || lex->chr0 == c2;
67}
68
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020069STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010070 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
71}
72
/*
STATIC bool is_char_following(mp_lexer_t *lex, char c) {
    return lex->chr1 == c;
}
*/
78
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020079STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010080 return lex->chr1 == c1 || lex->chr1 == c2;
81}
82
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020083STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010084 return lex->chr2 == c1 || lex->chr2 == c2;
85}
86
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020087STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010088 return lex->chr0 == c1 && lex->chr1 == c2;
89}
90
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020091STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000092 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010093}
94
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020095STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +000096 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +010097}
98
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020099STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000100 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100101}
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000104 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100105}
106
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200107STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200108 return lex->chr1 >= '0' && lex->chr1 <= '7';
109}
110
Damien429d7192013-10-04 19:53:11 +0100111// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200112STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100113 return is_letter(lex) || lex->chr0 == '_';
114}
115
116// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200117STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100118 return is_head_of_identifier(lex) || is_digit(lex);
119}
120
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200121STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100122 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100123 return;
124 }
125
Damien429d7192013-10-04 19:53:11 +0100126 if (lex->chr0 == '\n') {
Damien George32bade12015-01-30 00:27:46 +0000127 // a new line
Damien429d7192013-10-04 19:53:11 +0100128 ++lex->line;
129 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100130 } else if (lex->chr0 == '\t') {
131 // a tab
132 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
133 } else {
134 // a character worth one column
135 ++lex->column;
136 }
137
Damien George32bade12015-01-30 00:27:46 +0000138 lex->chr0 = lex->chr1;
139 lex->chr1 = lex->chr2;
140 lex->chr2 = lex->stream_next_byte(lex->stream_data);
141
142 if (lex->chr0 == '\r') {
143 // CR is a new line, converted to LF
144 lex->chr0 = '\n';
145 if (lex->chr1 == '\n') {
146 // CR LF is a single new line
147 lex->chr1 = lex->chr2;
148 lex->chr2 = lex->stream_next_byte(lex->stream_data);
149 }
150 }
151
152 if (lex->chr2 == MP_LEXER_EOF) {
153 // EOF, check if we need to insert a newline at end of file
154 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
155 // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
156 // otherwise it just inserts a LF
157 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100158 }
159 }
160}
161
Damien Georgea4c52c52014-12-05 19:35:18 +0000162STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100163 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100164 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100165 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
166 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100167 }
168 lex->indent_level[lex->num_indent_level++] = indent;
169}
170
Damien Georgea4c52c52014-12-05 19:35:18 +0000171STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100172 return lex->indent_level[lex->num_indent_level - 1];
173}
174
Damien Georgea4c52c52014-12-05 19:35:18 +0000175STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100176 lex->num_indent_level -= 1;
177}
178
179// some tricky operator encoding:
180// <op> = begin with <op>, if this opchar matches then begin here
181// e<op> = end with <op>, if this opchar matches then end
182// E<op> = mandatory end with <op>, this opchar must match, then end
183// c<op> = continue with <op>, if this opchar matches then continue matching
184// this means if the start of two ops are the same then they are equal til the last char
185
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200186STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100187 "()[]{},:;@~" // singles
188 "<e=c<e=" // < <= << <<=
189 ">e=c>e=" // > >= >> >>=
190 "*e=c*e=" // * *= ** **=
191 "+e=" // + +=
192 "-e=e>" // - -= ->
193 "&e=" // & &=
194 "|e=" // | |=
195 "/e=c/e=" // / /= // //=
196 "%e=" // % %=
197 "^e=" // ^ ^=
198 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100199 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100200
201// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200202STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000203 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
204 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
205 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
206 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100207
Damiend99b0522013-12-21 18:17:45 +0000208 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
209 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
210 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
211 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
212 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
213 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
214 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
215 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
216 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
217 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
218 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
219 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100220};
221
222// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200223STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100224 "False",
225 "None",
226 "True",
227 "and",
228 "as",
229 "assert",
230 "break",
231 "class",
232 "continue",
233 "def",
234 "del",
235 "elif",
236 "else",
237 "except",
238 "finally",
239 "for",
240 "from",
241 "global",
242 "if",
243 "import",
244 "in",
245 "is",
246 "lambda",
247 "nonlocal",
248 "not",
249 "or",
250 "pass",
251 "raise",
252 "return",
253 "try",
254 "while",
255 "with",
256 "yield",
Damien George97f9a282014-05-12 23:07:34 +0100257 "__debug__",
Damien429d7192013-10-04 19:53:11 +0100258};
259
Damien George54eb4e72014-07-03 13:47:47 +0100260STATIC mp_uint_t hex_digit(unichar c) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200261 // c is assumed to be hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100262 mp_uint_t n = c - '0';
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200263 if (n > 9) {
264 n &= ~('a' - 'A');
265 n -= ('A' - ('9' + 1));
266 }
267 return n;
268}
269
270// This is called with CUR_CHAR() before first hex digit, and should return with
271// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100272// num_digits must be greater than zero
273STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
274 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200275 while (num_digits-- != 0) {
276 next_char(lex);
277 unichar c = CUR_CHAR(lex);
278 if (!unichar_isxdigit(c)) {
279 return false;
280 }
281 num = (num << 4) + hex_digit(c);
282 }
283 *result = num;
284 return true;
285}
286
Damien Georgea4c52c52014-12-05 19:35:18 +0000287STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
288 // start new token text
289 vstr_reset(&lex->vstr);
290
Damiena5185f42013-10-20 14:41:27 +0100291 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100292 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100293 while (!is_end(lex)) {
294 if (is_physical_newline(lex)) {
295 had_physical_newline = true;
296 next_char(lex);
297 } else if (is_whitespace(lex)) {
298 next_char(lex);
299 } else if (is_char(lex, '#')) {
300 next_char(lex);
301 while (!is_end(lex) && !is_physical_newline(lex)) {
302 next_char(lex);
303 }
304 // had_physical_newline will be set on next loop
305 } else if (is_char(lex, '\\')) {
306 // backslash (outside string literals) must appear just before a physical newline
307 next_char(lex);
308 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000309 // SyntaxError: unexpected character after line continuation character
Damien Georgea4c52c52014-12-05 19:35:18 +0000310 lex->tok_line = lex->line;
311 lex->tok_column = lex->column;
312 lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
Damien George69a818d2014-01-12 13:55:24 +0000313 return;
Damien429d7192013-10-04 19:53:11 +0100314 } else {
315 next_char(lex);
316 }
317 } else {
318 break;
319 }
320 }
321
Damiena5185f42013-10-20 14:41:27 +0100322 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000323 lex->tok_line = lex->line;
324 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100325
326 if (first_token && lex->line == 1 && lex->column != 1) {
327 // check that the first token is in the first column
328 // if first token is not on first line, we get a physical newline and
329 // this check is done as part of normal indent/dedent checking below
330 // (done to get equivalence with CPython)
Damien Georgea4c52c52014-12-05 19:35:18 +0000331 lex->tok_kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100332
333 } else if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000334 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100335 lex->emit_dent += 1;
336
337 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000338 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100339 lex->emit_dent -= 1;
340
Damien91d387d2013-10-09 15:09:52 +0100341 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000342 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100343
Damien George54eb4e72014-07-03 13:47:47 +0100344 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100345 lex->emit_dent = 0;
346 if (num_spaces == indent_top(lex)) {
347 } else if (num_spaces > indent_top(lex)) {
348 indent_push(lex, num_spaces);
349 lex->emit_dent += 1;
350 } else {
351 while (num_spaces < indent_top(lex)) {
352 indent_pop(lex);
353 lex->emit_dent -= 1;
354 }
355 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000356 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100357 }
358 }
359
360 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100361 if (indent_top(lex) > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000362 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100363 lex->emit_dent = 0;
364 while (indent_top(lex) > 0) {
365 indent_pop(lex);
366 lex->emit_dent -= 1;
367 }
368 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000369 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100370 }
371
372 } else if (is_char_or(lex, '\'', '\"')
373 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
374 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
375 // a string or bytes literal
376
377 // parse type codes
378 bool is_raw = false;
379 bool is_bytes = false;
380 if (is_char(lex, 'u')) {
381 next_char(lex);
382 } else if (is_char(lex, 'b')) {
383 is_bytes = true;
384 next_char(lex);
385 if (is_char(lex, 'r')) {
386 is_raw = true;
387 next_char(lex);
388 }
389 } else if (is_char(lex, 'r')) {
390 is_raw = true;
391 next_char(lex);
392 if (is_char(lex, 'b')) {
393 is_bytes = true;
394 next_char(lex);
395 }
396 }
397
398 // set token kind
399 if (is_bytes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000400 lex->tok_kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100401 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000402 lex->tok_kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100403 }
404
405 // get first quoting character
406 char quote_char = '\'';
407 if (is_char(lex, '\"')) {
408 quote_char = '\"';
409 }
410 next_char(lex);
411
412 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100413 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100414 if (is_char_and(lex, quote_char, quote_char)) {
415 // triple quotes
416 next_char(lex);
417 next_char(lex);
418 num_quotes = 3;
419 } else {
420 // single quotes
421 num_quotes = 1;
422 }
423
Damien429d7192013-10-04 19:53:11 +0100424 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100425 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100426 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
427 if (is_char(lex, quote_char)) {
428 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100429 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100430 } else {
431 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100432 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100433 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100434 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100435 if (is_raw) {
436 // raw strings allow escaping of quotes, but the backslash is also emitted
437 vstr_add_char(&lex->vstr, '\\');
438 } else {
439 switch (c) {
Damien George94fbe972014-07-30 11:46:05 +0100440 case MP_LEXER_EOF: break; // TODO a proper error message?
441 case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100442 case '\\': break;
443 case '\'': break;
444 case '"': break;
445 case 'a': c = 0x07; break;
446 case 'b': c = 0x08; break;
447 case 't': c = 0x09; break;
448 case 'n': c = 0x0a; break;
449 case 'v': c = 0x0b; break;
450 case 'f': c = 0x0c; break;
451 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000452 case 'u':
453 case 'U':
454 if (is_bytes) {
455 // b'\u1234' == b'\\u1234'
456 vstr_add_char(&lex->vstr, '\\');
457 break;
458 }
459 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100460 case 'x':
461 {
Damien George54eb4e72014-07-03 13:47:47 +0100462 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000463 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georgea91f4142014-04-10 11:30:55 +0100464 // TODO error message
465 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200466 }
467 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100468 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200469 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000470 case 'N':
471 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
472 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
473 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
474 // roughly half a meg of storage. This form of Unicode escape may be added
475 // later on, but it's definitely not a priority right now. -- CJA 20140607
476 assert(!"Unicode name escapes not supported");
477 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100478 default:
479 if (c >= '0' && c <= '7') {
480 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100481 mp_uint_t digits = 3;
482 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100483 while (is_following_odigit(lex) && --digits != 0) {
484 next_char(lex);
485 num = num * 8 + (CUR_CHAR(lex) - '0');
486 }
487 c = num;
488 } else {
489 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
490 vstr_add_char(&lex->vstr, '\\');
491 }
492 break;
493 }
Damiena5185f42013-10-20 14:41:27 +0100494 }
Damien George94fbe972014-07-30 11:46:05 +0100495 if (c != MP_LEXER_EOF) {
Damien George16677ce2015-01-28 14:07:11 +0000496 #if MICROPY_PY_BUILTINS_STR_UNICODE
Chris Angelico2ba22992014-06-04 05:28:12 +1000497 if (c < 0x110000 && !is_bytes) {
498 vstr_add_char(&lex->vstr, c);
499 } else if (c < 0x100 && is_bytes) {
500 vstr_add_byte(&lex->vstr, c);
Damien George16677ce2015-01-28 14:07:11 +0000501 }
502 #else
503 // without unicode everything is just added as an 8-bit byte
504 if (c < 0x100) {
505 vstr_add_byte(&lex->vstr, c);
506 }
507 #endif
508 else {
Chris Angelico2ba22992014-06-04 05:28:12 +1000509 assert(!"TODO: Throw an error, invalid escape code probably");
510 }
Damiena5185f42013-10-20 14:41:27 +0100511 }
512 } else {
Damien George94fbe972014-07-30 11:46:05 +0100513 // Add the "character" as a byte so that we remain 8-bit clean.
514 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
515 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100516 }
517 }
518 next_char(lex);
519 }
520
521 // check we got the required end quotes
522 if (n_closing < num_quotes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000523 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100524 }
525
Damiena5185f42013-10-20 14:41:27 +0100526 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000527 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100528
529 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000530 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100531
Damiena5185f42013-10-20 14:41:27 +0100532 // get first char
533 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100534 next_char(lex);
535
Damiena5185f42013-10-20 14:41:27 +0100536 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100537 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100538 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100539 next_char(lex);
540 }
541
542 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000543 lex->tok_kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100544
Damiena5185f42013-10-20 14:41:27 +0100545 // get first char
546 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100547 next_char(lex);
548
Damiena5185f42013-10-20 14:41:27 +0100549 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100550 while (!is_end(lex)) {
551 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100552 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100553 next_char(lex);
554 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100555 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100556 next_char(lex);
557 }
558 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100559 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100560 next_char(lex);
561 } else {
562 break;
563 }
564 }
565
Damien George2e9eb2d2014-04-10 12:19:33 +0100566 } else if (is_char(lex, '.')) {
567 // special handling for . and ... operators, because .. is not a valid operator
568
569 // get first char
570 vstr_add_char(&lex->vstr, '.');
571 next_char(lex);
572
573 if (is_char_and(lex, '.', '.')) {
574 vstr_add_char(&lex->vstr, '.');
575 vstr_add_char(&lex->vstr, '.');
576 next_char(lex);
577 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000578 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100579 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000580 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100581 }
582
Damien429d7192013-10-04 19:53:11 +0100583 } else {
584 // search for encoded delimiter or operator
585
586 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100587 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100588 for (; *t != 0 && !is_char(lex, *t); t += 1) {
589 if (*t == 'e' || *t == 'c') {
590 t += 1;
591 } else if (*t == 'E') {
592 tok_enc_index -= 1;
593 t += 1;
594 }
595 tok_enc_index += 1;
596 }
597
598 next_char(lex);
599
600 if (*t == 0) {
601 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000602 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100603
604 } else {
605 // matched a delimiter or operator character
606
607 // get the maximum characters for a valid token
608 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100609 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100610 for (;;) {
611 for (; *t == 'e'; t += 1) {
612 t += 1;
613 t_index += 1;
614 if (is_char(lex, *t)) {
615 next_char(lex);
616 tok_enc_index = t_index;
617 break;
618 }
619 }
620
621 if (*t == 'E') {
622 t += 1;
623 if (is_char(lex, *t)) {
624 next_char(lex);
625 tok_enc_index = t_index;
626 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000627 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100628 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100629 }
630 break;
631 }
632
633 if (*t == 'c') {
634 t += 1;
635 t_index += 1;
636 if (is_char(lex, *t)) {
637 next_char(lex);
638 tok_enc_index = t_index;
639 t += 1;
640 } else {
641 break;
642 }
643 } else {
644 break;
645 }
646 }
647
648 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000649 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100650
Damien George2e9eb2d2014-04-10 12:19:33 +0100651 tok_enc_no_match:
652
Damien429d7192013-10-04 19:53:11 +0100653 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000654 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100655 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000656 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100657 lex->nested_bracket_level -= 1;
658 }
659 }
660 }
661
Damiena5185f42013-10-20 14:41:27 +0100662 // check for keywords
Damien Georgea4c52c52014-12-05 19:35:18 +0000663 if (lex->tok_kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100664 // We check for __debug__ here and convert it to its value. This is so
665 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
666 // need to check for this special token in many places in the compiler.
667 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100668 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
Damien George963a5a32015-01-16 17:47:07 +0000669 for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000670 if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200671 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
672 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Damien Georgeb4b10fd2015-01-01 23:30:53 +0000673 lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100674 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000675 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
Damien George97f9a282014-05-12 23:07:34 +0100676 }
Damien429d7192013-10-04 19:53:11 +0100677 break;
678 }
679 }
680 }
681}
682
Damien George94fbe972014-07-30 11:46:05 +0100683mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
Damien George9bf5f282014-10-09 16:53:37 +0100684 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100685
686 // check for memory allocation error
687 if (lex == NULL) {
688 if (stream_close) {
689 stream_close(stream_data);
690 }
691 return NULL;
692 }
Damien429d7192013-10-04 19:53:11 +0100693
Damien Georgeb829b5c2014-01-25 13:51:19 +0000694 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100695 lex->stream_data = stream_data;
Damien George94fbe972014-07-30 11:46:05 +0100696 lex->stream_next_byte = stream_next_byte;
Damienfa2162b2013-10-20 17:42:00 +0100697 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100698 lex->line = 1;
699 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100700 lex->emit_dent = 0;
701 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100702 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100703 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100704 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200705 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100706
Damien Georgee1199ec2014-05-10 17:48:01 +0100707 // check for memory allocation error
708 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
709 mp_lexer_free(lex);
710 return NULL;
711 }
712
713 // store sentinel for first indentation level
714 lex->indent_level[0] = 0;
715
Damien429d7192013-10-04 19:53:11 +0100716 // preload characters
Damien George94fbe972014-07-30 11:46:05 +0100717 lex->chr0 = stream_next_byte(stream_data);
718 lex->chr1 = stream_next_byte(stream_data);
719 lex->chr2 = stream_next_byte(stream_data);
Damiena5185f42013-10-20 14:41:27 +0100720
721 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100722 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100723 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100724 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000725 if (lex->chr0 == '\r') {
726 lex->chr0 = '\n';
727 } else if (lex->chr0 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100728 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100729 }
Damien George94fbe972014-07-30 11:46:05 +0100730 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien George32bade12015-01-30 00:27:46 +0000731 if (lex->chr1 == '\r') {
732 lex->chr1 = '\n';
733 } else if (lex->chr1 != '\n') {
Damiena5185f42013-10-20 14:41:27 +0100734 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100735 }
Damien429d7192013-10-04 19:53:11 +0100736 }
737
Damiena5185f42013-10-20 14:41:27 +0100738 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000739 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100740
741 return lex;
742}
743
Damiend99b0522013-12-21 18:17:45 +0000744void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100745 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100746 if (lex->stream_close) {
747 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100748 }
Damienbb5316b2013-10-22 21:12:29 +0100749 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200750 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000751 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100752 }
Damien429d7192013-10-04 19:53:11 +0100753}
754
Damiend99b0522013-12-21 18:17:45 +0000755void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000756 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100757}
758
#if MICROPY_DEBUG_PRINTERS
// Debug helper: print the current token's position, kind, text buffer address
// and length on one line, followed by the token text itself with every
// character that unichar_isprint() rejects shown as '?'.
void mp_lexer_show_token(const mp_lexer_t *lex) {
    printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);

    if (lex->vstr.len > 0) {
        const byte *cur = (const byte *)lex->vstr.buf;
        const byte *end = (const byte *)cur + lex->vstr.len;

        printf(" ");
        // walk the token text one character at a time
        for (; cur < end; cur = utf8_next_char(cur)) {
            unichar c = utf8_get_char(cur);
            if (!unichar_isprint(c)) {
                printf("?");
            } else {
                printf("%c", c);
            }
        }
    }

    printf("\n");
}
#endif