blob: a93d8ad0d0ced7c839abbafea17e12c5809a6d26 [file] [log] [blame]
Damien George04b91472014-05-03 23:27:38 +01001/*
2 * This file is part of the Micro Python project, http://micropython.org/
3 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26
Damien429d7192013-10-04 19:53:11 +010027/* lexer.c -- simple tokeniser for Python implementation
28 */
29
xbeefe34222014-03-16 00:14:26 -070030#include <stdbool.h>
Damien429d7192013-10-04 19:53:11 +010031#include <stdint.h>
32#include <stdio.h>
33#include <assert.h>
34
Damien George55baff42014-01-21 21:40:13 +000035#include "mpconfig.h"
Paul Sokolovsky59c675a2014-06-21 22:43:22 +030036#include "misc.h"
Damien George55baff42014-01-21 21:40:13 +000037#include "qstr.h"
Damien429d7192013-10-04 19:53:11 +010038#include "lexer.h"
39
// Distance between tab stops, in columns; next_char() uses it to advance the
// column counter when it consumes a '\t'.
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010041
// TODO it seems that CPython allows a NUL byte in the input stream;
// we don't know if that's intentional or not, but we don't allow it
44
Damien George54eb4e72014-07-03 13:47:47 +010045mp_uint_t mp_optimise_value;
Damien George97f9a282014-05-12 23:07:34 +010046
// Compare the NUL-terminated string str against the first len bytes of strn
// (strn does not need to be NUL-terminated).
// Returns true only if str is exactly len bytes long and all of them match.
// TODO replace with a call to a standard function
STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
    for (mp_uint_t i = 0; i < len; ++i) {
        // Stop at str's terminator explicitly: if strn holds a NUL byte at the
        // same offset the two bytes compare equal, and without this check the
        // scan would continue past the end of str.
        if (str[i] == '\0' || str[i] != strn[i]) {
            return false;
        }
    }

    // all len bytes matched; str must end exactly here
    return str[len] == '\0';
}
59
Damiena5185f42013-10-20 14:41:27 +010060#define CUR_CHAR(lex) ((lex)->chr0)
61
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020062STATIC bool is_end(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +010063 return lex->chr0 == MP_LEXER_EOF;
Damien429d7192013-10-04 19:53:11 +010064}
65
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020066STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +010067 return lex->chr0 == '\n' || lex->chr0 == '\r';
68}
69
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020070STATIC bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010071 return lex->chr0 == c;
72}
73
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020074STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010075 return lex->chr0 == c1 || lex->chr0 == c2;
76}
77
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020078STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010079 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
80}
81
82/*
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020083STATIC bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010084 return lex->chr1 == c;
85}
86*/
87
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020088STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010089 return lex->chr1 == c1 || lex->chr1 == c2;
90}
91
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020092STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010093 return lex->chr2 == c1 || lex->chr2 == c2;
94}
95
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020096STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010097 return lex->chr0 == c1 && lex->chr1 == c2;
98}
99
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200100STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000101 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100102}
103
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200104STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000105 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100106}
107
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200108STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000109 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100110}
111
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200112STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000113 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100114}
115
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200116STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200117 return lex->chr1 >= '0' && lex->chr1 <= '7';
118}
119
Damien429d7192013-10-04 19:53:11 +0100120// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200121STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100122 return is_letter(lex) || lex->chr0 == '_';
123}
124
125// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200126STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100127 return is_head_of_identifier(lex) || is_digit(lex);
128}
129
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200130STATIC void next_char(mp_lexer_t *lex) {
Damien George94fbe972014-07-30 11:46:05 +0100131 if (lex->chr0 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100132 return;
133 }
134
Damien George54eb4e72014-07-03 13:47:47 +0100135 mp_uint_t advance = 1;
Damien429d7192013-10-04 19:53:11 +0100136
137 if (lex->chr0 == '\n') {
138 // LF is a new line
139 ++lex->line;
140 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100141 } else if (lex->chr0 == '\r') {
142 // CR is a new line
143 ++lex->line;
144 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100145 if (lex->chr1 == '\n') {
146 // CR LF is a single new line
147 advance = 2;
148 }
149 } else if (lex->chr0 == '\t') {
150 // a tab
151 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
152 } else {
153 // a character worth one column
154 ++lex->column;
155 }
156
157 for (; advance > 0; advance--) {
158 lex->chr0 = lex->chr1;
159 lex->chr1 = lex->chr2;
Damien George94fbe972014-07-30 11:46:05 +0100160 lex->chr2 = lex->stream_next_byte(lex->stream_data);
161 if (lex->chr2 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100162 // EOF
Damien George94fbe972014-07-30 11:46:05 +0100163 if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100164 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100165 }
166 }
167 }
168}
169
Damien Georgea4c52c52014-12-05 19:35:18 +0000170STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
Damien429d7192013-10-04 19:53:11 +0100171 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100172 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
Damien George58ebde42014-05-21 20:32:59 +0100173 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
174 lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100175 }
176 lex->indent_level[lex->num_indent_level++] = indent;
177}
178
Damien Georgea4c52c52014-12-05 19:35:18 +0000179STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100180 return lex->indent_level[lex->num_indent_level - 1];
181}
182
Damien Georgea4c52c52014-12-05 19:35:18 +0000183STATIC void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100184 lex->num_indent_level -= 1;
185}
186
// Encoded table of all operators and delimiters (except '.' and '...', which
// the lexer handles separately). The position of each operator in this string
// corresponds to its entry in tok_enc_kind.
//
// some tricky operator encoding:
//     <op>  = begin with <op>, if this opchar matches then begin here
//     e<op> = end with <op>, if this opchar matches then end
//     E<op> = mandatory end with <op>, this opchar must match, then end
//     c<op> = continue with <op>, if this opchar matches then continue matching
// this means if the start of two ops are the same then they are equal til the last char
//
// Both the characters and the pointer are const: nothing reassigns the
// pointer, and this lets the whole object live in read-only storage.
STATIC const char *const tok_enc =
    "()[]{},:;@~" // singles
    "<e=c<e="     // < <= << <<=
    ">e=c>e="     // > >= >> >>=
    "*e=c*e="     // * *= ** **=
    "+e="         // + +=
    "-e=e>"       // - -= ->
    "&e="         // & &=
    "|e="         // | |=
    "/e=c/e="     // / /= // //=
    "%e="         // % %=
    "^e="         // ^ ^=
    "=e="         // = ==
    "!E=";        // !=
Damien429d7192013-10-04 19:53:11 +0100208
209// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200210STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000211 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
212 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
213 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
214 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100215
Damiend99b0522013-12-21 18:17:45 +0000216 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
217 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
218 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
219 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
220 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
221 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
222 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
223 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
224 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
225 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
226 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
227 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100228};
229
// Keyword spellings, indexed so that entry i corresponds to token kind
// MP_TOKEN_KW_FALSE + i.
// must have the same order as enum in lexer.h
// The last entry, "__debug__", is special: the lexer replaces it by True or
// False instead of mapping it to a keyword token of its own.
// The pointers are const as well as the strings, so the whole table is
// read-only.
STATIC const char *const tok_kw[] = {
    "False",
    "None",
    "True",
    "and",
    "as",
    "assert",
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
    "__debug__",
};
267
// Convert a hexadecimal digit character to its numeric value (0..15).
// c is assumed to be hex digit; no validation is performed here.
STATIC mp_uint_t hex_digit(unichar c) {
    mp_uint_t value = c - '0';
    if (value <= 9) {
        // decimal digit
        return value;
    }

    // letter: clear the ASCII case bit so 'a'..'f' fold onto 'A'..'F', then
    // remove the gap between '9' and 'A'
    value &= ~('a' - 'A');
    return value - ('A' - ('9' + 1));
}
277
278// This is called with CUR_CHAR() before first hex digit, and should return with
279// it pointing to last hex digit
Damien George54eb4e72014-07-03 13:47:47 +0100280// num_digits must be greater than zero
281STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
282 mp_uint_t num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200283 while (num_digits-- != 0) {
284 next_char(lex);
285 unichar c = CUR_CHAR(lex);
286 if (!unichar_isxdigit(c)) {
287 return false;
288 }
289 num = (num << 4) + hex_digit(c);
290 }
291 *result = num;
292 return true;
293}
294
Damien Georgea4c52c52014-12-05 19:35:18 +0000295STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
296 // start new token text
297 vstr_reset(&lex->vstr);
298
Damiena5185f42013-10-20 14:41:27 +0100299 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100300 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100301 while (!is_end(lex)) {
302 if (is_physical_newline(lex)) {
303 had_physical_newline = true;
304 next_char(lex);
305 } else if (is_whitespace(lex)) {
306 next_char(lex);
307 } else if (is_char(lex, '#')) {
308 next_char(lex);
309 while (!is_end(lex) && !is_physical_newline(lex)) {
310 next_char(lex);
311 }
312 // had_physical_newline will be set on next loop
313 } else if (is_char(lex, '\\')) {
314 // backslash (outside string literals) must appear just before a physical newline
315 next_char(lex);
316 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000317 // SyntaxError: unexpected character after line continuation character
Damien Georgea4c52c52014-12-05 19:35:18 +0000318 lex->tok_line = lex->line;
319 lex->tok_column = lex->column;
320 lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
Damien George69a818d2014-01-12 13:55:24 +0000321 return;
Damien429d7192013-10-04 19:53:11 +0100322 } else {
323 next_char(lex);
324 }
325 } else {
326 break;
327 }
328 }
329
Damiena5185f42013-10-20 14:41:27 +0100330 // set token source information
Damien Georgea4c52c52014-12-05 19:35:18 +0000331 lex->tok_line = lex->line;
332 lex->tok_column = lex->column;
Damiena5185f42013-10-20 14:41:27 +0100333
334 if (first_token && lex->line == 1 && lex->column != 1) {
335 // check that the first token is in the first column
336 // if first token is not on first line, we get a physical newline and
337 // this check is done as part of normal indent/dedent checking below
338 // (done to get equivalence with CPython)
Damien Georgea4c52c52014-12-05 19:35:18 +0000339 lex->tok_kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100340
341 } else if (lex->emit_dent < 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000342 lex->tok_kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100343 lex->emit_dent += 1;
344
345 } else if (lex->emit_dent > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000346 lex->tok_kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100347 lex->emit_dent -= 1;
348
Damien91d387d2013-10-09 15:09:52 +0100349 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000350 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100351
Damien George54eb4e72014-07-03 13:47:47 +0100352 mp_uint_t num_spaces = lex->column - 1;
Damien429d7192013-10-04 19:53:11 +0100353 lex->emit_dent = 0;
354 if (num_spaces == indent_top(lex)) {
355 } else if (num_spaces > indent_top(lex)) {
356 indent_push(lex, num_spaces);
357 lex->emit_dent += 1;
358 } else {
359 while (num_spaces < indent_top(lex)) {
360 indent_pop(lex);
361 lex->emit_dent -= 1;
362 }
363 if (num_spaces != indent_top(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000364 lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100365 }
366 }
367
368 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100369 if (indent_top(lex) > 0) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000370 lex->tok_kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100371 lex->emit_dent = 0;
372 while (indent_top(lex) > 0) {
373 indent_pop(lex);
374 lex->emit_dent -= 1;
375 }
376 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000377 lex->tok_kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100378 }
379
380 } else if (is_char_or(lex, '\'', '\"')
381 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
382 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
383 // a string or bytes literal
384
385 // parse type codes
386 bool is_raw = false;
387 bool is_bytes = false;
388 if (is_char(lex, 'u')) {
389 next_char(lex);
390 } else if (is_char(lex, 'b')) {
391 is_bytes = true;
392 next_char(lex);
393 if (is_char(lex, 'r')) {
394 is_raw = true;
395 next_char(lex);
396 }
397 } else if (is_char(lex, 'r')) {
398 is_raw = true;
399 next_char(lex);
400 if (is_char(lex, 'b')) {
401 is_bytes = true;
402 next_char(lex);
403 }
404 }
405
406 // set token kind
407 if (is_bytes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000408 lex->tok_kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100409 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000410 lex->tok_kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100411 }
412
413 // get first quoting character
414 char quote_char = '\'';
415 if (is_char(lex, '\"')) {
416 quote_char = '\"';
417 }
418 next_char(lex);
419
420 // work out if it's a single or triple quoted literal
Damien George54eb4e72014-07-03 13:47:47 +0100421 mp_uint_t num_quotes;
Damien429d7192013-10-04 19:53:11 +0100422 if (is_char_and(lex, quote_char, quote_char)) {
423 // triple quotes
424 next_char(lex);
425 next_char(lex);
426 num_quotes = 3;
427 } else {
428 // single quotes
429 num_quotes = 1;
430 }
431
Damien429d7192013-10-04 19:53:11 +0100432 // parse the literal
Damien George54eb4e72014-07-03 13:47:47 +0100433 mp_uint_t n_closing = 0;
Damien429d7192013-10-04 19:53:11 +0100434 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
435 if (is_char(lex, quote_char)) {
436 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100437 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100438 } else {
439 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100440 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100441 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100442 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100443 if (is_raw) {
444 // raw strings allow escaping of quotes, but the backslash is also emitted
445 vstr_add_char(&lex->vstr, '\\');
446 } else {
447 switch (c) {
Damien George94fbe972014-07-30 11:46:05 +0100448 case MP_LEXER_EOF: break; // TODO a proper error message?
449 case '\n': c = MP_LEXER_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damien Georgea91f4142014-04-10 11:30:55 +0100450 case '\\': break;
451 case '\'': break;
452 case '"': break;
453 case 'a': c = 0x07; break;
454 case 'b': c = 0x08; break;
455 case 't': c = 0x09; break;
456 case 'n': c = 0x0a; break;
457 case 'v': c = 0x0b; break;
458 case 'f': c = 0x0c; break;
459 case 'r': c = 0x0d; break;
Chris Angelico2ba22992014-06-04 05:28:12 +1000460 case 'u':
461 case 'U':
462 if (is_bytes) {
463 // b'\u1234' == b'\\u1234'
464 vstr_add_char(&lex->vstr, '\\');
465 break;
466 }
467 // Otherwise fall through.
Damien Georgea91f4142014-04-10 11:30:55 +0100468 case 'x':
469 {
Damien George54eb4e72014-07-03 13:47:47 +0100470 mp_uint_t num = 0;
Chris Angelico2ba22992014-06-04 05:28:12 +1000471 if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
Damien Georgea91f4142014-04-10 11:30:55 +0100472 // TODO error message
473 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200474 }
475 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100476 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200477 }
Chris Angelico2ba22992014-06-04 05:28:12 +1000478 case 'N':
479 // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
480 // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
481 // 3MB of text; even gzip-compressed and with minimal structure, it'll take
482 // roughly half a meg of storage. This form of Unicode escape may be added
483 // later on, but it's definitely not a priority right now. -- CJA 20140607
484 assert(!"Unicode name escapes not supported");
485 break;
Damien Georgea91f4142014-04-10 11:30:55 +0100486 default:
487 if (c >= '0' && c <= '7') {
488 // Octal sequence, 1-3 chars
Damien George54eb4e72014-07-03 13:47:47 +0100489 mp_uint_t digits = 3;
490 mp_uint_t num = c - '0';
Damien Georgea91f4142014-04-10 11:30:55 +0100491 while (is_following_odigit(lex) && --digits != 0) {
492 next_char(lex);
493 num = num * 8 + (CUR_CHAR(lex) - '0');
494 }
495 c = num;
496 } else {
497 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
498 vstr_add_char(&lex->vstr, '\\');
499 }
500 break;
501 }
Damiena5185f42013-10-20 14:41:27 +0100502 }
Damien George94fbe972014-07-30 11:46:05 +0100503 if (c != MP_LEXER_EOF) {
Chris Angelico2ba22992014-06-04 05:28:12 +1000504 if (c < 0x110000 && !is_bytes) {
505 vstr_add_char(&lex->vstr, c);
506 } else if (c < 0x100 && is_bytes) {
507 vstr_add_byte(&lex->vstr, c);
508 } else {
509 assert(!"TODO: Throw an error, invalid escape code probably");
510 }
Damiena5185f42013-10-20 14:41:27 +0100511 }
512 } else {
Damien George94fbe972014-07-30 11:46:05 +0100513 // Add the "character" as a byte so that we remain 8-bit clean.
514 // This way, strings are parsed correctly whether or not they contain utf-8 chars.
515 vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100516 }
517 }
518 next_char(lex);
519 }
520
521 // check we got the required end quotes
522 if (n_closing < num_quotes) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000523 lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100524 }
525
Damiena5185f42013-10-20 14:41:27 +0100526 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000527 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100528
529 } else if (is_head_of_identifier(lex)) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000530 lex->tok_kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100531
Damiena5185f42013-10-20 14:41:27 +0100532 // get first char
533 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100534 next_char(lex);
535
Damiena5185f42013-10-20 14:41:27 +0100536 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100537 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100538 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100539 next_char(lex);
540 }
541
542 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000543 lex->tok_kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100544
Damiena5185f42013-10-20 14:41:27 +0100545 // get first char
546 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100547 next_char(lex);
548
Damiena5185f42013-10-20 14:41:27 +0100549 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100550 while (!is_end(lex)) {
551 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100552 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100553 next_char(lex);
554 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100555 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100556 next_char(lex);
557 }
558 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100559 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100560 next_char(lex);
561 } else {
562 break;
563 }
564 }
565
Damien George2e9eb2d2014-04-10 12:19:33 +0100566 } else if (is_char(lex, '.')) {
567 // special handling for . and ... operators, because .. is not a valid operator
568
569 // get first char
570 vstr_add_char(&lex->vstr, '.');
571 next_char(lex);
572
573 if (is_char_and(lex, '.', '.')) {
574 vstr_add_char(&lex->vstr, '.');
575 vstr_add_char(&lex->vstr, '.');
576 next_char(lex);
577 next_char(lex);
Damien Georgea4c52c52014-12-05 19:35:18 +0000578 lex->tok_kind = MP_TOKEN_ELLIPSIS;
Damien George2e9eb2d2014-04-10 12:19:33 +0100579 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000580 lex->tok_kind = MP_TOKEN_DEL_PERIOD;
Damien George2e9eb2d2014-04-10 12:19:33 +0100581 }
582
Damien429d7192013-10-04 19:53:11 +0100583 } else {
584 // search for encoded delimiter or operator
585
586 const char *t = tok_enc;
Damien George54eb4e72014-07-03 13:47:47 +0100587 mp_uint_t tok_enc_index = 0;
Damien429d7192013-10-04 19:53:11 +0100588 for (; *t != 0 && !is_char(lex, *t); t += 1) {
589 if (*t == 'e' || *t == 'c') {
590 t += 1;
591 } else if (*t == 'E') {
592 tok_enc_index -= 1;
593 t += 1;
594 }
595 tok_enc_index += 1;
596 }
597
598 next_char(lex);
599
600 if (*t == 0) {
601 // didn't match any delimiter or operator characters
Damien Georgea4c52c52014-12-05 19:35:18 +0000602 lex->tok_kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100603
604 } else {
605 // matched a delimiter or operator character
606
607 // get the maximum characters for a valid token
608 t += 1;
Damien George54eb4e72014-07-03 13:47:47 +0100609 mp_uint_t t_index = tok_enc_index;
Damien429d7192013-10-04 19:53:11 +0100610 for (;;) {
611 for (; *t == 'e'; t += 1) {
612 t += 1;
613 t_index += 1;
614 if (is_char(lex, *t)) {
615 next_char(lex);
616 tok_enc_index = t_index;
617 break;
618 }
619 }
620
621 if (*t == 'E') {
622 t += 1;
623 if (is_char(lex, *t)) {
624 next_char(lex);
625 tok_enc_index = t_index;
626 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000627 lex->tok_kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100628 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100629 }
630 break;
631 }
632
633 if (*t == 'c') {
634 t += 1;
635 t_index += 1;
636 if (is_char(lex, *t)) {
637 next_char(lex);
638 tok_enc_index = t_index;
639 t += 1;
640 } else {
641 break;
642 }
643 } else {
644 break;
645 }
646 }
647
648 // set token kind
Damien Georgea4c52c52014-12-05 19:35:18 +0000649 lex->tok_kind = tok_enc_kind[tok_enc_index];
Damien429d7192013-10-04 19:53:11 +0100650
Damien George2e9eb2d2014-04-10 12:19:33 +0100651 tok_enc_no_match:
652
Damien429d7192013-10-04 19:53:11 +0100653 // compute bracket level for implicit line joining
Damien Georgea4c52c52014-12-05 19:35:18 +0000654 if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100655 lex->nested_bracket_level += 1;
Damien Georgea4c52c52014-12-05 19:35:18 +0000656 } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100657 lex->nested_bracket_level -= 1;
658 }
659 }
660 }
661
Damiena5185f42013-10-20 14:41:27 +0100662 // check for keywords
Damien Georgea4c52c52014-12-05 19:35:18 +0000663 if (lex->tok_kind == MP_TOKEN_NAME) {
Damien George97f9a282014-05-12 23:07:34 +0100664 // We check for __debug__ here and convert it to its value. This is so
665 // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
666 // need to check for this special token in many places in the compiler.
667 // TODO improve speed of these string comparisons
Damien George54eb4e72014-07-03 13:47:47 +0100668 //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
669 for (mp_int_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000670 if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
Emmanuel Blotf6932d62014-06-19 18:54:34 +0200671 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
672 // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
Damien Georgea4c52c52014-12-05 19:35:18 +0000673 lex->tok_kind = (mp_optimise_value == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
Damien George97f9a282014-05-12 23:07:34 +0100674 } else {
Damien Georgea4c52c52014-12-05 19:35:18 +0000675 lex->tok_kind = MP_TOKEN_KW_FALSE + i;
Damien George97f9a282014-05-12 23:07:34 +0100676 }
Damien429d7192013-10-04 19:53:11 +0100677 break;
678 }
679 }
680 }
681}
682
Damien George94fbe972014-07-30 11:46:05 +0100683mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close) {
Damien George9bf5f282014-10-09 16:53:37 +0100684 mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
Damien Georgee1199ec2014-05-10 17:48:01 +0100685
686 // check for memory allocation error
687 if (lex == NULL) {
688 if (stream_close) {
689 stream_close(stream_data);
690 }
691 return NULL;
692 }
Damien429d7192013-10-04 19:53:11 +0100693
Damien Georgeb829b5c2014-01-25 13:51:19 +0000694 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100695 lex->stream_data = stream_data;
Damien George94fbe972014-07-30 11:46:05 +0100696 lex->stream_next_byte = stream_next_byte;
Damienfa2162b2013-10-20 17:42:00 +0100697 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100698 lex->line = 1;
699 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100700 lex->emit_dent = 0;
701 lex->nested_bracket_level = 0;
Damien George58ebde42014-05-21 20:32:59 +0100702 lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100703 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100704 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200705 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100706
Damien Georgee1199ec2014-05-10 17:48:01 +0100707 // check for memory allocation error
708 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
709 mp_lexer_free(lex);
710 return NULL;
711 }
712
713 // store sentinel for first indentation level
714 lex->indent_level[0] = 0;
715
Damien429d7192013-10-04 19:53:11 +0100716 // preload characters
Damien George94fbe972014-07-30 11:46:05 +0100717 lex->chr0 = stream_next_byte(stream_data);
718 lex->chr1 = stream_next_byte(stream_data);
719 lex->chr2 = stream_next_byte(stream_data);
Damiena5185f42013-10-20 14:41:27 +0100720
721 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damien George94fbe972014-07-30 11:46:05 +0100722 if (lex->chr0 == MP_LEXER_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100723 lex->chr0 = '\n';
Damien George94fbe972014-07-30 11:46:05 +0100724 } else if (lex->chr1 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100725 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100726 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100727 }
Damien George94fbe972014-07-30 11:46:05 +0100728 } else if (lex->chr2 == MP_LEXER_EOF) {
Damien429d7192013-10-04 19:53:11 +0100729 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100730 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100731 }
Damien429d7192013-10-04 19:53:11 +0100732 }
733
Damiena5185f42013-10-20 14:41:27 +0100734 // preload first token
Damien Georgea4c52c52014-12-05 19:35:18 +0000735 mp_lexer_next_token_into(lex, true);
Damien429d7192013-10-04 19:53:11 +0100736
737 return lex;
738}
739
Damiend99b0522013-12-21 18:17:45 +0000740void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100741 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100742 if (lex->stream_close) {
743 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100744 }
Damienbb5316b2013-10-22 21:12:29 +0100745 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200746 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000747 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100748 }
Damien429d7192013-10-04 19:53:11 +0100749}
750
Damiend99b0522013-12-21 18:17:45 +0000751void mp_lexer_to_next(mp_lexer_t *lex) {
Damien Georgea4c52c52014-12-05 19:35:18 +0000752 mp_lexer_next_token_into(lex, false);
Damien429d7192013-10-04 19:53:11 +0100753}
754
#if MICROPY_DEBUG_PRINTERS
// Debug helper: print the current token of lex to stdout on one line, as
// "(line:column) kind:K str:PTR len:N" followed by the token text. Characters
// of the text that unichar_isprint() rejects are shown as '?'.
void mp_lexer_show_token(const mp_lexer_t *lex) {
    // The casts give each argument exactly the type its conversion specifier
    // requires (%u -> unsigned int, %p -> void *); passing any other type to
    // printf is undefined behaviour.
    printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%u", lex->tok_line, lex->tok_column,
        (unsigned int)lex->tok_kind, (void *)lex->vstr.buf, (unsigned int)lex->vstr.len);
    if (lex->vstr.len > 0) {
        const byte *i = (const byte *)lex->vstr.buf;
        const byte *j = i + lex->vstr.len;
        printf(" ");
        // walk the token text one UTF-8 encoded character at a time
        while (i < j) {
            unichar c = utf8_get_char(i);
            i = utf8_next_char(i);
            if (unichar_isprint(c)) {
                // %c takes an int
                printf("%c", (int)c);
            } else {
                printf("?");
            }
        }
    }
    printf("\n");
}
#endif