blob: f736ef3030d996caaa2683f2fd00a2851b42d209 [file] [log] [blame]
/*
 * This file is part of the Micro Python project, http://micropython.org/
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2013, 2014 Damien P. George
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* lexer.c -- simple tokeniser for Python implementation
 */
29
xbeefe34222014-03-16 00:14:26 -070030#include <stdbool.h>
Damien429d7192013-10-04 19:53:11 +010031#include <stdint.h>
32#include <stdio.h>
33#include <assert.h>
34
35#include "misc.h"
Damien George55baff42014-01-21 21:40:13 +000036#include "mpconfig.h"
37#include "qstr.h"
Damien429d7192013-10-04 19:53:11 +010038#include "lexer.h"
39
// Width of a tab stop in columns.  When next_char() consumes a '\t' it moves
// the column to the position just after the next multiple of this value.
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010041
// TODO it seems that CPython allows a NUL byte in the input stream;
// it is unclear whether that is intentional, but we do not allow it
44
Damiend99b0522013-12-21 18:17:45 +000045struct _mp_lexer_t {
Damien George08335002014-01-18 23:24:36 +000046 qstr source_name; // name of source
Damiena5185f42013-10-20 14:41:27 +010047 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000048 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
49 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010050
Damiena5185f42013-10-20 14:41:27 +010051 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010052
53 uint line; // source line
54 uint column; // source column
55
Damiena5185f42013-10-20 14:41:27 +010056 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
57 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010058
59 uint alloc_indent_level;
60 uint num_indent_level;
61 uint16_t *indent_level;
62
Damiena5185f42013-10-20 14:41:27 +010063 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000064 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010065};
66
// Compare a NUL-terminated string against a counted string.
//
//   str  - NUL-terminated string
//   strn - buffer with at least len readable bytes (need not be terminated)
//   len  - number of bytes of strn taking part in the comparison
//
// Returns true only when str is exactly len bytes long and those bytes equal
// the first len bytes of strn.  A negative len never matches.
//
// TODO replace with a call to a standard function
bool str_strn_equal(const char *str, const char *strn, int len) {
    if (len < 0) {
        return false;
    }

    for (int i = 0; i < len; i++) {
        // Stop at the terminator of str so that we never read past its end,
        // even if strn happens to hold a NUL byte at the same position.
        if (str[i] == '\0' || str[i] != strn[i]) {
            return false;
        }
    }

    // all len bytes matched; str must end exactly here
    return str[len] == '\0';
}
79
#ifdef MICROPY_DEBUG_PRINTERS
// Debug helper: print a token's source position, kind, text pointer and
// length on one line, followed by the token text (if any) with every
// unprintable character shown as '?'.
void mp_token_show(const mp_token_t *tok) {
    printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);

    const bool has_text = tok->str != NULL && tok->len > 0;
    if (has_text) {
        printf(" ");
        const char *end = tok->str + tok->len;
        // walk the text one UTF-8 encoded character at a time
        for (const char *p = tok->str; p < end; p = utf8_next_char(p)) {
            unichar c = utf8_get_char(p);
            if (!unichar_isprint(c)) {
                printf("?");
            } else {
                printf("%c", c);
            }
        }
    }

    printf("\n");
}
#endif
Damien429d7192013-10-04 19:53:11 +0100100
// The character the lexer is currently positioned on (first slot of the
// look-ahead window).
#define CUR_CHAR(lex) ((lex)->chr0)
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_end(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +0000104 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +0100105}
106
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200107STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100108 return lex->chr0 == '\n' || lex->chr0 == '\r';
109}
110
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200111STATIC bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +0100112 return lex->chr0 == c;
113}
114
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200115STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100116 return lex->chr0 == c1 || lex->chr0 == c2;
117}
118
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200119STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +0100120 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
121}
122
/*
STATIC bool is_char_following(mp_lexer_t *lex, char c) {
    return lex->chr1 == c;
}
*/
128
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200129STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100130 return lex->chr1 == c1 || lex->chr1 == c2;
131}
132
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200133STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100134 return lex->chr2 == c1 || lex->chr2 == c2;
135}
136
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200137STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100138 return lex->chr0 == c1 && lex->chr1 == c2;
139}
140
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200141STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000142 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100143}
144
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200145STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000146 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100147}
148
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200149STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000150 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100151}
152
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200153STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000154 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100155}
156
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200157STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200158 return lex->chr1 >= '0' && lex->chr1 <= '7';
159}
160
Damien429d7192013-10-04 19:53:11 +0100161// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200162STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100163 return is_letter(lex) || lex->chr0 == '_';
164}
165
166// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200167STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100168 return is_head_of_identifier(lex) || is_digit(lex);
169}
170
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200171STATIC void next_char(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +0000172 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100173 return;
174 }
175
176 int advance = 1;
177
178 if (lex->chr0 == '\n') {
179 // LF is a new line
180 ++lex->line;
181 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100182 } else if (lex->chr0 == '\r') {
183 // CR is a new line
184 ++lex->line;
185 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100186 if (lex->chr1 == '\n') {
187 // CR LF is a single new line
188 advance = 2;
189 }
190 } else if (lex->chr0 == '\t') {
191 // a tab
192 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
193 } else {
194 // a character worth one column
195 ++lex->column;
196 }
197
198 for (; advance > 0; advance--) {
199 lex->chr0 = lex->chr1;
200 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100201 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000202 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100203 // EOF
Damiend99b0522013-12-21 18:17:45 +0000204 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100205 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100206 }
207 }
208 }
209}
210
Damiend99b0522013-12-21 18:17:45 +0000211void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100212 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100213 // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
214 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MP_ALLOC_LEXEL_INDENT_INC);
215 lex->alloc_indent_level += MP_ALLOC_LEXEL_INDENT_INC;
Damien429d7192013-10-04 19:53:11 +0100216 }
217 lex->indent_level[lex->num_indent_level++] = indent;
218}
219
Damiend99b0522013-12-21 18:17:45 +0000220uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100221 return lex->indent_level[lex->num_indent_level - 1];
222}
223
Damiend99b0522013-12-21 18:17:45 +0000224void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100225 lex->num_indent_level -= 1;
226}
227
228// some tricky operator encoding:
229// <op> = begin with <op>, if this opchar matches then begin here
230// e<op> = end with <op>, if this opchar matches then end
231// E<op> = mandatory end with <op>, this opchar must match, then end
232// c<op> = continue with <op>, if this opchar matches then continue matching
233// this means if the start of two ops are the same then they are equal til the last char
234
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200235STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100236 "()[]{},:;@~" // singles
237 "<e=c<e=" // < <= << <<=
238 ">e=c>e=" // > >= >> >>=
239 "*e=c*e=" // * *= ** **=
240 "+e=" // + +=
241 "-e=e>" // - -= ->
242 "&e=" // & &=
243 "|e=" // | |=
244 "/e=c/e=" // / /= // //=
245 "%e=" // % %=
246 "^e=" // ^ ^=
247 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100248 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100249
250// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200251STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000252 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
253 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
254 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
255 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100256
Damiend99b0522013-12-21 18:17:45 +0000257 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
258 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
259 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
260 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
261 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
262 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
263 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
264 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
265 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
266 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
267 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
268 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100269};
270
271// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200272STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100273 "False",
274 "None",
275 "True",
276 "and",
277 "as",
278 "assert",
279 "break",
280 "class",
281 "continue",
282 "def",
283 "del",
284 "elif",
285 "else",
286 "except",
287 "finally",
288 "for",
289 "from",
290 "global",
291 "if",
292 "import",
293 "in",
294 "is",
295 "lambda",
296 "nonlocal",
297 "not",
298 "or",
299 "pass",
300 "raise",
301 "return",
302 "try",
303 "while",
304 "with",
305 "yield",
306 NULL,
307};
308
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200309STATIC int hex_digit(unichar c) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200310 // c is assumed to be hex digit
311 int n = c - '0';
312 if (n > 9) {
313 n &= ~('a' - 'A');
314 n -= ('A' - ('9' + 1));
315 }
316 return n;
317}
318
319// This is called with CUR_CHAR() before first hex digit, and should return with
320// it pointing to last hex digit
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200321STATIC bool get_hex(mp_lexer_t *lex, int num_digits, uint *result) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200322 uint num = 0;
323 while (num_digits-- != 0) {
324 next_char(lex);
325 unichar c = CUR_CHAR(lex);
326 if (!unichar_isxdigit(c)) {
327 return false;
328 }
329 num = (num << 4) + hex_digit(c);
330 }
331 *result = num;
332 return true;
333}
334
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200335STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100336 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100337 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100338 while (!is_end(lex)) {
339 if (is_physical_newline(lex)) {
340 had_physical_newline = true;
341 next_char(lex);
342 } else if (is_whitespace(lex)) {
343 next_char(lex);
344 } else if (is_char(lex, '#')) {
345 next_char(lex);
346 while (!is_end(lex) && !is_physical_newline(lex)) {
347 next_char(lex);
348 }
349 // had_physical_newline will be set on next loop
350 } else if (is_char(lex, '\\')) {
351 // backslash (outside string literals) must appear just before a physical newline
352 next_char(lex);
353 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000354 // SyntaxError: unexpected character after line continuation character
Damien George69a818d2014-01-12 13:55:24 +0000355 tok->src_line = lex->line;
356 tok->src_column = lex->column;
357 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
358 vstr_reset(&lex->vstr);
359 tok->str = vstr_str(&lex->vstr);
360 tok->len = 0;
361 return;
Damien429d7192013-10-04 19:53:11 +0100362 } else {
363 next_char(lex);
364 }
365 } else {
366 break;
367 }
368 }
369
Damiena5185f42013-10-20 14:41:27 +0100370 // set token source information
Damien429d7192013-10-04 19:53:11 +0100371 tok->src_line = lex->line;
372 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100373
Damiena5185f42013-10-20 14:41:27 +0100374 // start new token text
375 vstr_reset(&lex->vstr);
376
377 if (first_token && lex->line == 1 && lex->column != 1) {
378 // check that the first token is in the first column
379 // if first token is not on first line, we get a physical newline and
380 // this check is done as part of normal indent/dedent checking below
381 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000382 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100383
384 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000385 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100386 lex->emit_dent += 1;
387
388 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000389 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100390 lex->emit_dent -= 1;
391
Damien91d387d2013-10-09 15:09:52 +0100392 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000393 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100394
395 uint num_spaces = lex->column - 1;
396 lex->emit_dent = 0;
397 if (num_spaces == indent_top(lex)) {
398 } else if (num_spaces > indent_top(lex)) {
399 indent_push(lex, num_spaces);
400 lex->emit_dent += 1;
401 } else {
402 while (num_spaces < indent_top(lex)) {
403 indent_pop(lex);
404 lex->emit_dent -= 1;
405 }
406 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000407 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100408 }
409 }
410
411 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100412 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000413 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100414 lex->emit_dent = 0;
415 while (indent_top(lex) > 0) {
416 indent_pop(lex);
417 lex->emit_dent -= 1;
418 }
419 } else {
Damiend99b0522013-12-21 18:17:45 +0000420 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100421 }
422
423 } else if (is_char_or(lex, '\'', '\"')
424 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
425 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
426 // a string or bytes literal
427
428 // parse type codes
429 bool is_raw = false;
430 bool is_bytes = false;
431 if (is_char(lex, 'u')) {
432 next_char(lex);
433 } else if (is_char(lex, 'b')) {
434 is_bytes = true;
435 next_char(lex);
436 if (is_char(lex, 'r')) {
437 is_raw = true;
438 next_char(lex);
439 }
440 } else if (is_char(lex, 'r')) {
441 is_raw = true;
442 next_char(lex);
443 if (is_char(lex, 'b')) {
444 is_bytes = true;
445 next_char(lex);
446 }
447 }
448
449 // set token kind
450 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000451 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100452 } else {
Damiend99b0522013-12-21 18:17:45 +0000453 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100454 }
455
456 // get first quoting character
457 char quote_char = '\'';
458 if (is_char(lex, '\"')) {
459 quote_char = '\"';
460 }
461 next_char(lex);
462
463 // work out if it's a single or triple quoted literal
464 int num_quotes;
465 if (is_char_and(lex, quote_char, quote_char)) {
466 // triple quotes
467 next_char(lex);
468 next_char(lex);
469 num_quotes = 3;
470 } else {
471 // single quotes
472 num_quotes = 1;
473 }
474
Damien429d7192013-10-04 19:53:11 +0100475 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100476 int n_closing = 0;
477 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
478 if (is_char(lex, quote_char)) {
479 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100480 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100481 } else {
482 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100483 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100484 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100485 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100486 if (is_raw) {
487 // raw strings allow escaping of quotes, but the backslash is also emitted
488 vstr_add_char(&lex->vstr, '\\');
489 } else {
490 switch (c) {
491 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
492 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
493 case '\\': break;
494 case '\'': break;
495 case '"': break;
496 case 'a': c = 0x07; break;
497 case 'b': c = 0x08; break;
498 case 't': c = 0x09; break;
499 case 'n': c = 0x0a; break;
500 case 'v': c = 0x0b; break;
501 case 'f': c = 0x0c; break;
502 case 'r': c = 0x0d; break;
503 case 'x':
504 {
505 uint num = 0;
506 if (!get_hex(lex, 2, &num)) {
507 // TODO error message
508 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200509 }
510 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100511 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200512 }
Damien Georgea91f4142014-04-10 11:30:55 +0100513 case 'N': break; // TODO \N{name} only in strings
514 case 'u': break; // TODO \uxxxx only in strings
515 case 'U': break; // TODO \Uxxxxxxxx only in strings
516 default:
517 if (c >= '0' && c <= '7') {
518 // Octal sequence, 1-3 chars
519 int digits = 3;
520 int num = c - '0';
521 while (is_following_odigit(lex) && --digits != 0) {
522 next_char(lex);
523 num = num * 8 + (CUR_CHAR(lex) - '0');
524 }
525 c = num;
526 } else {
527 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
528 vstr_add_char(&lex->vstr, '\\');
529 }
530 break;
531 }
Damiena5185f42013-10-20 14:41:27 +0100532 }
Damiend99b0522013-12-21 18:17:45 +0000533 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100534 vstr_add_char(&lex->vstr, c);
535 }
536 } else {
537 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100538 }
539 }
540 next_char(lex);
541 }
542
543 // check we got the required end quotes
544 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000545 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100546 }
547
Damiena5185f42013-10-20 14:41:27 +0100548 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000549 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100550
551 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000552 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100553
Damiena5185f42013-10-20 14:41:27 +0100554 // get first char
555 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100556 next_char(lex);
557
Damiena5185f42013-10-20 14:41:27 +0100558 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100559 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100560 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100561 next_char(lex);
562 }
563
564 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000565 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100566
Damiena5185f42013-10-20 14:41:27 +0100567 // get first char
568 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100569 next_char(lex);
570
Damiena5185f42013-10-20 14:41:27 +0100571 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100572 while (!is_end(lex)) {
573 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100574 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100575 next_char(lex);
576 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100577 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100578 next_char(lex);
579 }
580 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100581 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100582 next_char(lex);
583 } else {
584 break;
585 }
586 }
587
Damien George2e9eb2d2014-04-10 12:19:33 +0100588 } else if (is_char(lex, '.')) {
589 // special handling for . and ... operators, because .. is not a valid operator
590
591 // get first char
592 vstr_add_char(&lex->vstr, '.');
593 next_char(lex);
594
595 if (is_char_and(lex, '.', '.')) {
596 vstr_add_char(&lex->vstr, '.');
597 vstr_add_char(&lex->vstr, '.');
598 next_char(lex);
599 next_char(lex);
600 tok->kind = MP_TOKEN_ELLIPSIS;
601 } else {
602 tok->kind = MP_TOKEN_DEL_PERIOD;
603 }
604
Damien429d7192013-10-04 19:53:11 +0100605 } else {
606 // search for encoded delimiter or operator
607
608 const char *t = tok_enc;
609 uint tok_enc_index = 0;
610 for (; *t != 0 && !is_char(lex, *t); t += 1) {
611 if (*t == 'e' || *t == 'c') {
612 t += 1;
613 } else if (*t == 'E') {
614 tok_enc_index -= 1;
615 t += 1;
616 }
617 tok_enc_index += 1;
618 }
619
620 next_char(lex);
621
622 if (*t == 0) {
623 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000624 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100625
626 } else {
627 // matched a delimiter or operator character
628
629 // get the maximum characters for a valid token
630 t += 1;
631 uint t_index = tok_enc_index;
632 for (;;) {
633 for (; *t == 'e'; t += 1) {
634 t += 1;
635 t_index += 1;
636 if (is_char(lex, *t)) {
637 next_char(lex);
638 tok_enc_index = t_index;
639 break;
640 }
641 }
642
643 if (*t == 'E') {
644 t += 1;
645 if (is_char(lex, *t)) {
646 next_char(lex);
647 tok_enc_index = t_index;
648 } else {
Damiend99b0522013-12-21 18:17:45 +0000649 tok->kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100650 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100651 }
652 break;
653 }
654
655 if (*t == 'c') {
656 t += 1;
657 t_index += 1;
658 if (is_char(lex, *t)) {
659 next_char(lex);
660 tok_enc_index = t_index;
661 t += 1;
662 } else {
663 break;
664 }
665 } else {
666 break;
667 }
668 }
669
670 // set token kind
671 tok->kind = tok_enc_kind[tok_enc_index];
672
Damien George2e9eb2d2014-04-10 12:19:33 +0100673 tok_enc_no_match:
674
Damien429d7192013-10-04 19:53:11 +0100675 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000676 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100677 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000678 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100679 lex->nested_bracket_level -= 1;
680 }
681 }
682 }
683
Damiena5185f42013-10-20 14:41:27 +0100684 // point token text to vstr buffer
685 tok->str = vstr_str(&lex->vstr);
686 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100687
Damiena5185f42013-10-20 14:41:27 +0100688 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000689 if (tok->kind == MP_TOKEN_NAME) {
Damien429d7192013-10-04 19:53:11 +0100690 for (int i = 0; tok_kw[i] != NULL; i++) {
Damiena5185f42013-10-20 14:41:27 +0100691 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Damiend99b0522013-12-21 18:17:45 +0000692 tok->kind = MP_TOKEN_KW_FALSE + i;
Damien429d7192013-10-04 19:53:11 +0100693 break;
694 }
695 }
696 }
697}
698
Damien Georgeb829b5c2014-01-25 13:51:19 +0000699mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
Damien Georgee1199ec2014-05-10 17:48:01 +0100700 mp_lexer_t *lex = m_new_maybe(mp_lexer_t, 1);
701
702 // check for memory allocation error
703 if (lex == NULL) {
704 if (stream_close) {
705 stream_close(stream_data);
706 }
707 return NULL;
708 }
Damien429d7192013-10-04 19:53:11 +0100709
Damien Georgeb829b5c2014-01-25 13:51:19 +0000710 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100711 lex->stream_data = stream_data;
712 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100713 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100714 lex->line = 1;
715 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100716 lex->emit_dent = 0;
717 lex->nested_bracket_level = 0;
Damien Georgee1199ec2014-05-10 17:48:01 +0100718 lex->alloc_indent_level = MP_ALLOC_LEXER_INDENT_INIT;
Damien429d7192013-10-04 19:53:11 +0100719 lex->num_indent_level = 1;
Damien Georgee1199ec2014-05-10 17:48:01 +0100720 lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200721 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100722
Damien Georgee1199ec2014-05-10 17:48:01 +0100723 // check for memory allocation error
724 if (lex->indent_level == NULL || vstr_had_error(&lex->vstr)) {
725 mp_lexer_free(lex);
726 return NULL;
727 }
728
729 // store sentinel for first indentation level
730 lex->indent_level[0] = 0;
731
Damien429d7192013-10-04 19:53:11 +0100732 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100733 lex->chr0 = stream_next_char(stream_data);
734 lex->chr1 = stream_next_char(stream_data);
735 lex->chr2 = stream_next_char(stream_data);
736
737 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000738 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100739 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000740 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100741 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100742 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100743 }
Damiend99b0522013-12-21 18:17:45 +0000744 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100745 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100746 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100747 }
Damien429d7192013-10-04 19:53:11 +0100748 }
749
Damiena5185f42013-10-20 14:41:27 +0100750 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000751 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100752
753 return lex;
754}
755
Damiend99b0522013-12-21 18:17:45 +0000756void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100757 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100758 if (lex->stream_close) {
759 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100760 }
Damienbb5316b2013-10-22 21:12:29 +0100761 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200762 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000763 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100764 }
Damien429d7192013-10-04 19:53:11 +0100765}
766
Damien George08335002014-01-18 23:24:36 +0000767qstr mp_lexer_source_name(mp_lexer_t *lex) {
768 return lex->source_name;
769}
770
Damiend99b0522013-12-21 18:17:45 +0000771void mp_lexer_to_next(mp_lexer_t *lex) {
772 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100773}
774
Damiend99b0522013-12-21 18:17:45 +0000775const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100776 return &lex->tok_cur;
777}
778
Damiend99b0522013-12-21 18:17:45 +0000779bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100780 return lex->tok_cur.kind == kind;
781}