blob: 373a8d7231849af798fa0c4e5b54f0a0379f7bda [file] [log] [blame]
Damien George04b91472014-05-03 23:27:38 +01001/*
2 * This file is part of the Micro Python project, http://micropython.org/
3 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26
Damien429d7192013-10-04 19:53:11 +010027/* lexer.c -- simple tokeniser for Python implementation
28 */
29
xbeefe34222014-03-16 00:14:26 -070030#include <stdbool.h>
Damien429d7192013-10-04 19:53:11 +010031#include <stdint.h>
32#include <stdio.h>
33#include <assert.h>
34
35#include "misc.h"
Damien George55baff42014-01-21 21:40:13 +000036#include "mpconfig.h"
37#include "qstr.h"
Damien429d7192013-10-04 19:53:11 +010038#include "lexer.h"
39
40#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010041
Damien92c06562013-10-22 22:32:27 +010042// TODO seems that CPython allows NULL byte in the input stream
43// don't know if that's intentional or not, but we don't allow it
44
Damiend99b0522013-12-21 18:17:45 +000045struct _mp_lexer_t {
Damien George08335002014-01-18 23:24:36 +000046 qstr source_name; // name of source
Damiena5185f42013-10-20 14:41:27 +010047 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000048 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
49 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010050
Damiena5185f42013-10-20 14:41:27 +010051 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010052
53 uint line; // source line
54 uint column; // source column
55
Damiena5185f42013-10-20 14:41:27 +010056 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
57 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010058
59 uint alloc_indent_level;
60 uint num_indent_level;
61 uint16_t *indent_level;
62
Damiena5185f42013-10-20 14:41:27 +010063 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000064 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010065};
66
Damien George9528cd62014-01-15 21:23:31 +000067// TODO replace with a call to a standard function
// Return true when the NUL-terminated string `str` consists of exactly the
// `len` bytes starting at `strn`; `strn` itself need not be NUL-terminated.
// A negative `len` never matches.
bool str_strn_equal(const char *str, const char *strn, int len) {
    // signed index, so that a negative len is not converted to a huge unsigned bound
    int i = 0;

    // stop at the terminator of str: an embedded NUL in strn must not let the
    // comparison run past the end of str
    while (i < len && *str != 0 && *str == *strn) {
        ++i;
        ++str;
        ++strn;
    }

    // equal only if all len bytes matched and str ends exactly there
    return i == len && *str == 0;
}
79
Damien Georgec5966122014-02-15 16:10:44 +000080#ifdef MICROPY_DEBUG_PRINTERS
Damiend99b0522013-12-21 18:17:45 +000081void mp_token_show(const mp_token_t *tok) {
Damien George08335002014-01-18 23:24:36 +000082 printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
Damien429d7192013-10-04 19:53:11 +010083 if (tok->str != NULL && tok->len > 0) {
84 const char *i = tok->str;
85 const char *j = i + tok->len;
86 printf(" ");
87 while (i < j) {
Damien George8cc96a32013-12-30 18:23:50 +000088 unichar c = utf8_get_char(i);
89 i = utf8_next_char(i);
90 if (unichar_isprint(c)) {
Damien429d7192013-10-04 19:53:11 +010091 printf("%c", c);
92 } else {
93 printf("?");
94 }
95 }
96 }
97 printf("\n");
98}
Damien Georgec5966122014-02-15 16:10:44 +000099#endif
Damien429d7192013-10-04 19:53:11 +0100100
Damiena5185f42013-10-20 14:41:27 +0100101#define CUR_CHAR(lex) ((lex)->chr0)
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_end(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +0000104 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +0100105}
106
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200107STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100108 return lex->chr0 == '\n' || lex->chr0 == '\r';
109}
110
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200111STATIC bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +0100112 return lex->chr0 == c;
113}
114
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200115STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100116 return lex->chr0 == c1 || lex->chr0 == c2;
117}
118
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200119STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +0100120 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
121}
122
123/*
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200124STATIC bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +0100125 return lex->chr1 == c;
126}
127*/
128
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200129STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100130 return lex->chr1 == c1 || lex->chr1 == c2;
131}
132
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200133STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100134 return lex->chr2 == c1 || lex->chr2 == c2;
135}
136
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200137STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100138 return lex->chr0 == c1 && lex->chr1 == c2;
139}
140
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200141STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000142 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100143}
144
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200145STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000146 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100147}
148
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200149STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000150 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100151}
152
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200153STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000154 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100155}
156
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200157STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200158 return lex->chr1 >= '0' && lex->chr1 <= '7';
159}
160
Damien429d7192013-10-04 19:53:11 +0100161// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200162STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100163 return is_letter(lex) || lex->chr0 == '_';
164}
165
166// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200167STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100168 return is_head_of_identifier(lex) || is_digit(lex);
169}
170
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200171STATIC void next_char(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +0000172 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100173 return;
174 }
175
176 int advance = 1;
177
178 if (lex->chr0 == '\n') {
179 // LF is a new line
180 ++lex->line;
181 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100182 } else if (lex->chr0 == '\r') {
183 // CR is a new line
184 ++lex->line;
185 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100186 if (lex->chr1 == '\n') {
187 // CR LF is a single new line
188 advance = 2;
189 }
190 } else if (lex->chr0 == '\t') {
191 // a tab
192 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
193 } else {
194 // a character worth one column
195 ++lex->column;
196 }
197
198 for (; advance > 0; advance--) {
199 lex->chr0 = lex->chr1;
200 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100201 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000202 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100203 // EOF
Damiend99b0522013-12-21 18:17:45 +0000204 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100205 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100206 }
207 }
208 }
209}
210
Damiend99b0522013-12-21 18:17:45 +0000211void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100212 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien732407f2013-12-29 19:33:23 +0000213 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level * 2);
Damien429d7192013-10-04 19:53:11 +0100214 lex->alloc_indent_level *= 2;
Damien429d7192013-10-04 19:53:11 +0100215 }
216 lex->indent_level[lex->num_indent_level++] = indent;
217}
218
Damiend99b0522013-12-21 18:17:45 +0000219uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100220 return lex->indent_level[lex->num_indent_level - 1];
221}
222
Damiend99b0522013-12-21 18:17:45 +0000223void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100224 lex->num_indent_level -= 1;
225}
226
// some tricky operator encoding:
//     <op>  = begin with <op>, if this opchar matches then begin here
//     e<op> = end with <op>, if this opchar matches then end
//     E<op> = mandatory end with <op>, this opchar must match, then end
//     c<op> = continue with <op>, if this opchar matches then continue matching
// this means if the start of two ops are the same then they are equal until the last char
233
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200234STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100235 "()[]{},:;@~" // singles
236 "<e=c<e=" // < <= << <<=
237 ">e=c>e=" // > >= >> >>=
238 "*e=c*e=" // * *= ** **=
239 "+e=" // + +=
240 "-e=e>" // - -= ->
241 "&e=" // & &=
242 "|e=" // | |=
243 "/e=c/e=" // / /= // //=
244 "%e=" // % %=
245 "^e=" // ^ ^=
246 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100247 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100248
249// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200250STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000251 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
252 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
253 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
254 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100255
Damiend99b0522013-12-21 18:17:45 +0000256 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
257 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
258 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
259 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
260 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
261 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
262 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
263 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
264 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
265 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
266 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
267 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100268};
269
270// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200271STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100272 "False",
273 "None",
274 "True",
275 "and",
276 "as",
277 "assert",
278 "break",
279 "class",
280 "continue",
281 "def",
282 "del",
283 "elif",
284 "else",
285 "except",
286 "finally",
287 "for",
288 "from",
289 "global",
290 "if",
291 "import",
292 "in",
293 "is",
294 "lambda",
295 "nonlocal",
296 "not",
297 "or",
298 "pass",
299 "raise",
300 "return",
301 "try",
302 "while",
303 "with",
304 "yield",
305 NULL,
306};
307
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200308STATIC int hex_digit(unichar c) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200309 // c is assumed to be hex digit
310 int n = c - '0';
311 if (n > 9) {
312 n &= ~('a' - 'A');
313 n -= ('A' - ('9' + 1));
314 }
315 return n;
316}
317
318// This is called with CUR_CHAR() before first hex digit, and should return with
319// it pointing to last hex digit
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200320STATIC bool get_hex(mp_lexer_t *lex, int num_digits, uint *result) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200321 uint num = 0;
322 while (num_digits-- != 0) {
323 next_char(lex);
324 unichar c = CUR_CHAR(lex);
325 if (!unichar_isxdigit(c)) {
326 return false;
327 }
328 num = (num << 4) + hex_digit(c);
329 }
330 *result = num;
331 return true;
332}
333
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200334STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100335 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100336 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100337 while (!is_end(lex)) {
338 if (is_physical_newline(lex)) {
339 had_physical_newline = true;
340 next_char(lex);
341 } else if (is_whitespace(lex)) {
342 next_char(lex);
343 } else if (is_char(lex, '#')) {
344 next_char(lex);
345 while (!is_end(lex) && !is_physical_newline(lex)) {
346 next_char(lex);
347 }
348 // had_physical_newline will be set on next loop
349 } else if (is_char(lex, '\\')) {
350 // backslash (outside string literals) must appear just before a physical newline
351 next_char(lex);
352 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000353 // SyntaxError: unexpected character after line continuation character
Damien George69a818d2014-01-12 13:55:24 +0000354 tok->src_line = lex->line;
355 tok->src_column = lex->column;
356 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
357 vstr_reset(&lex->vstr);
358 tok->str = vstr_str(&lex->vstr);
359 tok->len = 0;
360 return;
Damien429d7192013-10-04 19:53:11 +0100361 } else {
362 next_char(lex);
363 }
364 } else {
365 break;
366 }
367 }
368
Damiena5185f42013-10-20 14:41:27 +0100369 // set token source information
Damien429d7192013-10-04 19:53:11 +0100370 tok->src_line = lex->line;
371 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100372
Damiena5185f42013-10-20 14:41:27 +0100373 // start new token text
374 vstr_reset(&lex->vstr);
375
376 if (first_token && lex->line == 1 && lex->column != 1) {
377 // check that the first token is in the first column
378 // if first token is not on first line, we get a physical newline and
379 // this check is done as part of normal indent/dedent checking below
380 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000381 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100382
383 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000384 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100385 lex->emit_dent += 1;
386
387 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000388 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100389 lex->emit_dent -= 1;
390
Damien91d387d2013-10-09 15:09:52 +0100391 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000392 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100393
394 uint num_spaces = lex->column - 1;
395 lex->emit_dent = 0;
396 if (num_spaces == indent_top(lex)) {
397 } else if (num_spaces > indent_top(lex)) {
398 indent_push(lex, num_spaces);
399 lex->emit_dent += 1;
400 } else {
401 while (num_spaces < indent_top(lex)) {
402 indent_pop(lex);
403 lex->emit_dent -= 1;
404 }
405 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000406 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100407 }
408 }
409
410 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100411 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000412 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100413 lex->emit_dent = 0;
414 while (indent_top(lex) > 0) {
415 indent_pop(lex);
416 lex->emit_dent -= 1;
417 }
418 } else {
Damiend99b0522013-12-21 18:17:45 +0000419 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100420 }
421
422 } else if (is_char_or(lex, '\'', '\"')
423 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
424 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
425 // a string or bytes literal
426
427 // parse type codes
428 bool is_raw = false;
429 bool is_bytes = false;
430 if (is_char(lex, 'u')) {
431 next_char(lex);
432 } else if (is_char(lex, 'b')) {
433 is_bytes = true;
434 next_char(lex);
435 if (is_char(lex, 'r')) {
436 is_raw = true;
437 next_char(lex);
438 }
439 } else if (is_char(lex, 'r')) {
440 is_raw = true;
441 next_char(lex);
442 if (is_char(lex, 'b')) {
443 is_bytes = true;
444 next_char(lex);
445 }
446 }
447
448 // set token kind
449 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000450 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100451 } else {
Damiend99b0522013-12-21 18:17:45 +0000452 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100453 }
454
455 // get first quoting character
456 char quote_char = '\'';
457 if (is_char(lex, '\"')) {
458 quote_char = '\"';
459 }
460 next_char(lex);
461
462 // work out if it's a single or triple quoted literal
463 int num_quotes;
464 if (is_char_and(lex, quote_char, quote_char)) {
465 // triple quotes
466 next_char(lex);
467 next_char(lex);
468 num_quotes = 3;
469 } else {
470 // single quotes
471 num_quotes = 1;
472 }
473
Damien429d7192013-10-04 19:53:11 +0100474 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100475 int n_closing = 0;
476 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
477 if (is_char(lex, quote_char)) {
478 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100479 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100480 } else {
481 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100482 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100483 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100484 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100485 if (is_raw) {
486 // raw strings allow escaping of quotes, but the backslash is also emitted
487 vstr_add_char(&lex->vstr, '\\');
488 } else {
489 switch (c) {
490 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
491 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
492 case '\\': break;
493 case '\'': break;
494 case '"': break;
495 case 'a': c = 0x07; break;
496 case 'b': c = 0x08; break;
497 case 't': c = 0x09; break;
498 case 'n': c = 0x0a; break;
499 case 'v': c = 0x0b; break;
500 case 'f': c = 0x0c; break;
501 case 'r': c = 0x0d; break;
502 case 'x':
503 {
504 uint num = 0;
505 if (!get_hex(lex, 2, &num)) {
506 // TODO error message
507 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200508 }
509 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100510 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200511 }
Damien Georgea91f4142014-04-10 11:30:55 +0100512 case 'N': break; // TODO \N{name} only in strings
513 case 'u': break; // TODO \uxxxx only in strings
514 case 'U': break; // TODO \Uxxxxxxxx only in strings
515 default:
516 if (c >= '0' && c <= '7') {
517 // Octal sequence, 1-3 chars
518 int digits = 3;
519 int num = c - '0';
520 while (is_following_odigit(lex) && --digits != 0) {
521 next_char(lex);
522 num = num * 8 + (CUR_CHAR(lex) - '0');
523 }
524 c = num;
525 } else {
526 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
527 vstr_add_char(&lex->vstr, '\\');
528 }
529 break;
530 }
Damiena5185f42013-10-20 14:41:27 +0100531 }
Damiend99b0522013-12-21 18:17:45 +0000532 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100533 vstr_add_char(&lex->vstr, c);
534 }
535 } else {
536 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100537 }
538 }
539 next_char(lex);
540 }
541
542 // check we got the required end quotes
543 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000544 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100545 }
546
Damiena5185f42013-10-20 14:41:27 +0100547 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000548 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100549
550 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000551 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100552
Damiena5185f42013-10-20 14:41:27 +0100553 // get first char
554 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100555 next_char(lex);
556
Damiena5185f42013-10-20 14:41:27 +0100557 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100558 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100559 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100560 next_char(lex);
561 }
562
563 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000564 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100565
Damiena5185f42013-10-20 14:41:27 +0100566 // get first char
567 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100568 next_char(lex);
569
Damiena5185f42013-10-20 14:41:27 +0100570 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100571 while (!is_end(lex)) {
572 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100573 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100574 next_char(lex);
575 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100576 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100577 next_char(lex);
578 }
579 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100580 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100581 next_char(lex);
582 } else {
583 break;
584 }
585 }
586
Damien George2e9eb2d2014-04-10 12:19:33 +0100587 } else if (is_char(lex, '.')) {
588 // special handling for . and ... operators, because .. is not a valid operator
589
590 // get first char
591 vstr_add_char(&lex->vstr, '.');
592 next_char(lex);
593
594 if (is_char_and(lex, '.', '.')) {
595 vstr_add_char(&lex->vstr, '.');
596 vstr_add_char(&lex->vstr, '.');
597 next_char(lex);
598 next_char(lex);
599 tok->kind = MP_TOKEN_ELLIPSIS;
600 } else {
601 tok->kind = MP_TOKEN_DEL_PERIOD;
602 }
603
Damien429d7192013-10-04 19:53:11 +0100604 } else {
605 // search for encoded delimiter or operator
606
607 const char *t = tok_enc;
608 uint tok_enc_index = 0;
609 for (; *t != 0 && !is_char(lex, *t); t += 1) {
610 if (*t == 'e' || *t == 'c') {
611 t += 1;
612 } else if (*t == 'E') {
613 tok_enc_index -= 1;
614 t += 1;
615 }
616 tok_enc_index += 1;
617 }
618
619 next_char(lex);
620
621 if (*t == 0) {
622 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000623 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100624
625 } else {
626 // matched a delimiter or operator character
627
628 // get the maximum characters for a valid token
629 t += 1;
630 uint t_index = tok_enc_index;
631 for (;;) {
632 for (; *t == 'e'; t += 1) {
633 t += 1;
634 t_index += 1;
635 if (is_char(lex, *t)) {
636 next_char(lex);
637 tok_enc_index = t_index;
638 break;
639 }
640 }
641
642 if (*t == 'E') {
643 t += 1;
644 if (is_char(lex, *t)) {
645 next_char(lex);
646 tok_enc_index = t_index;
647 } else {
Damiend99b0522013-12-21 18:17:45 +0000648 tok->kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100649 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100650 }
651 break;
652 }
653
654 if (*t == 'c') {
655 t += 1;
656 t_index += 1;
657 if (is_char(lex, *t)) {
658 next_char(lex);
659 tok_enc_index = t_index;
660 t += 1;
661 } else {
662 break;
663 }
664 } else {
665 break;
666 }
667 }
668
669 // set token kind
670 tok->kind = tok_enc_kind[tok_enc_index];
671
Damien George2e9eb2d2014-04-10 12:19:33 +0100672 tok_enc_no_match:
673
Damien429d7192013-10-04 19:53:11 +0100674 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000675 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100676 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000677 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100678 lex->nested_bracket_level -= 1;
679 }
680 }
681 }
682
Damiena5185f42013-10-20 14:41:27 +0100683 // point token text to vstr buffer
684 tok->str = vstr_str(&lex->vstr);
685 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100686
Damiena5185f42013-10-20 14:41:27 +0100687 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000688 if (tok->kind == MP_TOKEN_NAME) {
Damien429d7192013-10-04 19:53:11 +0100689 for (int i = 0; tok_kw[i] != NULL; i++) {
Damiena5185f42013-10-20 14:41:27 +0100690 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Damiend99b0522013-12-21 18:17:45 +0000691 tok->kind = MP_TOKEN_KW_FALSE + i;
Damien429d7192013-10-04 19:53:11 +0100692 break;
693 }
694 }
695 }
696}
697
Damien Georgeb829b5c2014-01-25 13:51:19 +0000698mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
Damiend99b0522013-12-21 18:17:45 +0000699 mp_lexer_t *lex = m_new(mp_lexer_t, 1);
Damien429d7192013-10-04 19:53:11 +0100700
Damien Georgeb829b5c2014-01-25 13:51:19 +0000701 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100702 lex->stream_data = stream_data;
703 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100704 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100705 lex->line = 1;
706 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100707 lex->emit_dent = 0;
708 lex->nested_bracket_level = 0;
709 lex->alloc_indent_level = 16;
710 lex->num_indent_level = 1;
711 lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
712 lex->indent_level[0] = 0;
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200713 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100714
715 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100716 lex->chr0 = stream_next_char(stream_data);
717 lex->chr1 = stream_next_char(stream_data);
718 lex->chr2 = stream_next_char(stream_data);
719
720 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000721 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100722 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000723 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100724 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100725 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100726 }
Damiend99b0522013-12-21 18:17:45 +0000727 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100728 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100729 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100730 }
Damien429d7192013-10-04 19:53:11 +0100731 }
732
Damiena5185f42013-10-20 14:41:27 +0100733 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000734 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100735
736 return lex;
737}
738
Damiend99b0522013-12-21 18:17:45 +0000739void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100740 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100741 if (lex->stream_close) {
742 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100743 }
Damienbb5316b2013-10-22 21:12:29 +0100744 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200745 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000746 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100747 }
Damien429d7192013-10-04 19:53:11 +0100748}
749
Damien George08335002014-01-18 23:24:36 +0000750qstr mp_lexer_source_name(mp_lexer_t *lex) {
751 return lex->source_name;
752}
753
Damiend99b0522013-12-21 18:17:45 +0000754void mp_lexer_to_next(mp_lexer_t *lex) {
755 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100756}
757
Damiend99b0522013-12-21 18:17:45 +0000758const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100759 return &lex->tok_cur;
760}
761
Damiend99b0522013-12-21 18:17:45 +0000762bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100763 return lex->tok_cur.kind == kind;
764}