/* lexer.c -- simple tokeniser for Python implementation
 */
3
xbeefe34222014-03-16 00:14:26 -07004#include <stdbool.h>
Damien429d7192013-10-04 19:53:11 +01005#include <stdint.h>
6#include <stdio.h>
7#include <assert.h>
8
9#include "misc.h"
Damien George55baff42014-01-21 21:40:13 +000010#include "mpconfig.h"
11#include "qstr.h"
Damien429d7192013-10-04 19:53:11 +010012#include "lexer.h"
13
// width of a tab stop, in columns, used when advancing the column counter over '\t'
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010015
// TODO seems that CPython allows a NUL byte in the input stream
// don't know if that's intentional or not, but we don't allow it
18
Damiend99b0522013-12-21 18:17:45 +000019struct _mp_lexer_t {
Damien George08335002014-01-18 23:24:36 +000020 qstr source_name; // name of source
Damiena5185f42013-10-20 14:41:27 +010021 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000022 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
23 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010024
Damiena5185f42013-10-20 14:41:27 +010025 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010026
27 uint line; // source line
28 uint column; // source column
29
Damiena5185f42013-10-20 14:41:27 +010030 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
31 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010032
33 uint alloc_indent_level;
34 uint num_indent_level;
35 uint16_t *indent_level;
36
Damiena5185f42013-10-20 14:41:27 +010037 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000038 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010039};
40
// Compare a NUL-terminated string against a length-delimited one.
//
//   str  - NUL-terminated string
//   strn - character buffer of which exactly len characters are considered
//          (it need not be NUL-terminated)
//   len  - number of characters of strn to compare
//
// Returns true only when str is exactly len characters long and those
// characters equal the first len characters of strn.  A negative len never
// matches.
//
// The scan stops as soon as str's terminator is reached, so str is never read
// past its end even if strn contains a NUL byte at the same position.
bool str_strn_equal(const char *str, const char *strn, int len) {
    if (len < 0) {
        return false;
    }

    for (int i = 0; i < len; i++) {
        if (str[i] == '\0' || str[i] != strn[i]) {
            // str is shorter than len, or the characters differ
            return false;
        }
    }

    // the first len characters match; str must also end here
    return str[len] == '\0';
}
53
#ifdef MICROPY_DEBUG_PRINTERS
// Debug printer: writes one line to stdout describing tok -- its source
// position, kind, text pointer and length -- followed by the token text when
// there is any.  Characters of the text that are not printable are shown
// as '?'.
void mp_token_show(const mp_token_t *tok) {
    printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
    if (tok->str != NULL && tok->len > 0) {
        const char *text_end = tok->str + tok->len;
        printf(" ");
        // walk the text one UTF-8 encoded character at a time
        for (const char *p = tok->str; p < text_end; p = utf8_next_char(p)) {
            unichar c = utf8_get_char(p);
            if (!unichar_isprint(c)) {
                printf("?");
            } else {
                printf("%c", c);
            }
        }
    }
    printf("\n");
}
#endif
Damien429d7192013-10-04 19:53:11 +010074
// the current character of the lexer, i.e. the front of its lookahead window
#define CUR_CHAR(lex) ((lex)->chr0)
76
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020077STATIC bool is_end(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +000078 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +010079}
80
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020081STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +010082 return lex->chr0 == '\n' || lex->chr0 == '\r';
83}
84
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020085STATIC bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010086 return lex->chr0 == c;
87}
88
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020089STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010090 return lex->chr0 == c1 || lex->chr0 == c2;
91}
92
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020093STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010094 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
95}
96
97/*
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020098STATIC bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010099 return lex->chr1 == c;
100}
101*/
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100104 return lex->chr1 == c1 || lex->chr1 == c2;
105}
106
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200107STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100108 return lex->chr2 == c1 || lex->chr2 == c2;
109}
110
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200111STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100112 return lex->chr0 == c1 && lex->chr1 == c2;
113}
114
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200115STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000116 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100117}
118
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200119STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000120 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100121}
122
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200123STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000124 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100125}
126
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200127STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000128 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100129}
130
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200131STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200132 return lex->chr1 >= '0' && lex->chr1 <= '7';
133}
134
Damien429d7192013-10-04 19:53:11 +0100135// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200136STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100137 return is_letter(lex) || lex->chr0 == '_';
138}
139
140// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200141STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100142 return is_head_of_identifier(lex) || is_digit(lex);
143}
144
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200145STATIC void next_char(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +0000146 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100147 return;
148 }
149
150 int advance = 1;
151
152 if (lex->chr0 == '\n') {
153 // LF is a new line
154 ++lex->line;
155 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100156 } else if (lex->chr0 == '\r') {
157 // CR is a new line
158 ++lex->line;
159 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100160 if (lex->chr1 == '\n') {
161 // CR LF is a single new line
162 advance = 2;
163 }
164 } else if (lex->chr0 == '\t') {
165 // a tab
166 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
167 } else {
168 // a character worth one column
169 ++lex->column;
170 }
171
172 for (; advance > 0; advance--) {
173 lex->chr0 = lex->chr1;
174 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100175 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000176 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100177 // EOF
Damiend99b0522013-12-21 18:17:45 +0000178 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100179 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100180 }
181 }
182 }
183}
184
Damiend99b0522013-12-21 18:17:45 +0000185void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100186 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien732407f2013-12-29 19:33:23 +0000187 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level * 2);
Damien429d7192013-10-04 19:53:11 +0100188 lex->alloc_indent_level *= 2;
Damien429d7192013-10-04 19:53:11 +0100189 }
190 lex->indent_level[lex->num_indent_level++] = indent;
191}
192
Damiend99b0522013-12-21 18:17:45 +0000193uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100194 return lex->indent_level[lex->num_indent_level - 1];
195}
196
Damiend99b0522013-12-21 18:17:45 +0000197void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100198 lex->num_indent_level -= 1;
199}
200
201// some tricky operator encoding:
202// <op> = begin with <op>, if this opchar matches then begin here
203// e<op> = end with <op>, if this opchar matches then end
204// E<op> = mandatory end with <op>, this opchar must match, then end
205// c<op> = continue with <op>, if this opchar matches then continue matching
206// this means if the start of two ops are the same then they are equal til the last char
207
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200208STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100209 "()[]{},:;@~" // singles
210 "<e=c<e=" // < <= << <<=
211 ">e=c>e=" // > >= >> >>=
212 "*e=c*e=" // * *= ** **=
213 "+e=" // + +=
214 "-e=e>" // - -= ->
215 "&e=" // & &=
216 "|e=" // | |=
217 "/e=c/e=" // / /= // //=
218 "%e=" // % %=
219 "^e=" // ^ ^=
220 "=e=" // = ==
221 "!E=" // !=
222 ".c.E."; // . ...
223
224// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200225STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000226 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
227 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
228 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
229 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100230
Damiend99b0522013-12-21 18:17:45 +0000231 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
232 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
233 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
234 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
235 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
236 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
237 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
238 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
239 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
240 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
241 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
242 MP_TOKEN_OP_NOT_EQUAL,
Damien Georgee9906ac2014-01-04 18:44:46 +0000243 MP_TOKEN_DEL_PERIOD, MP_TOKEN_ELLIPSIS,
Damien429d7192013-10-04 19:53:11 +0100244};
245
246// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200247STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100248 "False",
249 "None",
250 "True",
251 "and",
252 "as",
253 "assert",
254 "break",
255 "class",
256 "continue",
257 "def",
258 "del",
259 "elif",
260 "else",
261 "except",
262 "finally",
263 "for",
264 "from",
265 "global",
266 "if",
267 "import",
268 "in",
269 "is",
270 "lambda",
271 "nonlocal",
272 "not",
273 "or",
274 "pass",
275 "raise",
276 "return",
277 "try",
278 "while",
279 "with",
280 "yield",
281 NULL,
282};
283
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200284STATIC int hex_digit(unichar c) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200285 // c is assumed to be hex digit
286 int n = c - '0';
287 if (n > 9) {
288 n &= ~('a' - 'A');
289 n -= ('A' - ('9' + 1));
290 }
291 return n;
292}
293
294// This is called with CUR_CHAR() before first hex digit, and should return with
295// it pointing to last hex digit
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200296STATIC bool get_hex(mp_lexer_t *lex, int num_digits, uint *result) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200297 uint num = 0;
298 while (num_digits-- != 0) {
299 next_char(lex);
300 unichar c = CUR_CHAR(lex);
301 if (!unichar_isxdigit(c)) {
302 return false;
303 }
304 num = (num << 4) + hex_digit(c);
305 }
306 *result = num;
307 return true;
308}
309
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200310STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100311 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100312 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100313 while (!is_end(lex)) {
314 if (is_physical_newline(lex)) {
315 had_physical_newline = true;
316 next_char(lex);
317 } else if (is_whitespace(lex)) {
318 next_char(lex);
319 } else if (is_char(lex, '#')) {
320 next_char(lex);
321 while (!is_end(lex) && !is_physical_newline(lex)) {
322 next_char(lex);
323 }
324 // had_physical_newline will be set on next loop
325 } else if (is_char(lex, '\\')) {
326 // backslash (outside string literals) must appear just before a physical newline
327 next_char(lex);
328 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000329 // SyntaxError: unexpected character after line continuation character
Damien George69a818d2014-01-12 13:55:24 +0000330 tok->src_line = lex->line;
331 tok->src_column = lex->column;
332 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
333 vstr_reset(&lex->vstr);
334 tok->str = vstr_str(&lex->vstr);
335 tok->len = 0;
336 return;
Damien429d7192013-10-04 19:53:11 +0100337 } else {
338 next_char(lex);
339 }
340 } else {
341 break;
342 }
343 }
344
Damiena5185f42013-10-20 14:41:27 +0100345 // set token source information
Damien429d7192013-10-04 19:53:11 +0100346 tok->src_line = lex->line;
347 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100348
Damiena5185f42013-10-20 14:41:27 +0100349 // start new token text
350 vstr_reset(&lex->vstr);
351
352 if (first_token && lex->line == 1 && lex->column != 1) {
353 // check that the first token is in the first column
354 // if first token is not on first line, we get a physical newline and
355 // this check is done as part of normal indent/dedent checking below
356 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000357 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100358
359 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000360 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100361 lex->emit_dent += 1;
362
363 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000364 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100365 lex->emit_dent -= 1;
366
Damien91d387d2013-10-09 15:09:52 +0100367 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000368 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100369
370 uint num_spaces = lex->column - 1;
371 lex->emit_dent = 0;
372 if (num_spaces == indent_top(lex)) {
373 } else if (num_spaces > indent_top(lex)) {
374 indent_push(lex, num_spaces);
375 lex->emit_dent += 1;
376 } else {
377 while (num_spaces < indent_top(lex)) {
378 indent_pop(lex);
379 lex->emit_dent -= 1;
380 }
381 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000382 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100383 }
384 }
385
386 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100387 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000388 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100389 lex->emit_dent = 0;
390 while (indent_top(lex) > 0) {
391 indent_pop(lex);
392 lex->emit_dent -= 1;
393 }
394 } else {
Damiend99b0522013-12-21 18:17:45 +0000395 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100396 }
397
398 } else if (is_char_or(lex, '\'', '\"')
399 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
400 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
401 // a string or bytes literal
402
403 // parse type codes
404 bool is_raw = false;
405 bool is_bytes = false;
406 if (is_char(lex, 'u')) {
407 next_char(lex);
408 } else if (is_char(lex, 'b')) {
409 is_bytes = true;
410 next_char(lex);
411 if (is_char(lex, 'r')) {
412 is_raw = true;
413 next_char(lex);
414 }
415 } else if (is_char(lex, 'r')) {
416 is_raw = true;
417 next_char(lex);
418 if (is_char(lex, 'b')) {
419 is_bytes = true;
420 next_char(lex);
421 }
422 }
423
424 // set token kind
425 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000426 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100427 } else {
Damiend99b0522013-12-21 18:17:45 +0000428 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100429 }
430
431 // get first quoting character
432 char quote_char = '\'';
433 if (is_char(lex, '\"')) {
434 quote_char = '\"';
435 }
436 next_char(lex);
437
438 // work out if it's a single or triple quoted literal
439 int num_quotes;
440 if (is_char_and(lex, quote_char, quote_char)) {
441 // triple quotes
442 next_char(lex);
443 next_char(lex);
444 num_quotes = 3;
445 } else {
446 // single quotes
447 num_quotes = 1;
448 }
449
Damien429d7192013-10-04 19:53:11 +0100450 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100451 int n_closing = 0;
452 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
453 if (is_char(lex, quote_char)) {
454 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100455 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100456 } else {
457 n_closing = 0;
458 if (!is_raw && is_char(lex, '\\')) {
459 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100460 unichar c = CUR_CHAR(lex);
461 switch (c) {
Damiend99b0522013-12-21 18:17:45 +0000462 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
463 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damiena5185f42013-10-20 14:41:27 +0100464 case '\\': break;
465 case '\'': break;
466 case '"': break;
467 case 'a': c = 0x07; break;
468 case 'b': c = 0x08; break;
469 case 't': c = 0x09; break;
470 case 'n': c = 0x0a; break;
471 case 'v': c = 0x0b; break;
472 case 'f': c = 0x0c; break;
473 case 'r': c = 0x0d; break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200474 case 'x':
475 {
Damien Georgef64086f2014-01-22 23:18:50 +0000476 uint num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200477 if (!get_hex(lex, 2, &num)) {
478 // TODO error message
479 assert(0);
480 }
481 c = num;
482 break;
483 }
484 case 'N': break; // TODO \N{name} only in strings
485 case 'u': break; // TODO \uxxxx only in strings
486 case 'U': break; // TODO \Uxxxxxxxx only in strings
487 default:
488 if (c >= '0' && c <= '7') {
489 // Octal sequence, 1-3 chars
490 int digits = 3;
491 int num = c - '0';
492 while (is_following_odigit(lex) && --digits != 0) {
493 next_char(lex);
494 num = num * 8 + (CUR_CHAR(lex) - '0');
495 }
496 c = num;
497 } else {
Damien Georgeb829b5c2014-01-25 13:51:19 +0000498 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
499 vstr_add_char(&lex->vstr, '\\');
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200500 }
501 break;
Damiena5185f42013-10-20 14:41:27 +0100502 }
Damiend99b0522013-12-21 18:17:45 +0000503 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100504 vstr_add_char(&lex->vstr, c);
505 }
506 } else {
507 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100508 }
509 }
510 next_char(lex);
511 }
512
513 // check we got the required end quotes
514 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000515 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100516 }
517
Damiena5185f42013-10-20 14:41:27 +0100518 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000519 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100520
521 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000522 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100523
Damiena5185f42013-10-20 14:41:27 +0100524 // get first char
525 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100526 next_char(lex);
527
Damiena5185f42013-10-20 14:41:27 +0100528 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100529 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100530 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100531 next_char(lex);
532 }
533
534 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000535 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100536
Damiena5185f42013-10-20 14:41:27 +0100537 // get first char
538 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100539 next_char(lex);
540
Damiena5185f42013-10-20 14:41:27 +0100541 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100542 while (!is_end(lex)) {
543 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100544 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100545 next_char(lex);
546 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100547 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100548 next_char(lex);
549 }
550 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100551 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100552 next_char(lex);
553 } else {
554 break;
555 }
556 }
557
558 } else {
559 // search for encoded delimiter or operator
560
561 const char *t = tok_enc;
562 uint tok_enc_index = 0;
563 for (; *t != 0 && !is_char(lex, *t); t += 1) {
564 if (*t == 'e' || *t == 'c') {
565 t += 1;
566 } else if (*t == 'E') {
567 tok_enc_index -= 1;
568 t += 1;
569 }
570 tok_enc_index += 1;
571 }
572
573 next_char(lex);
574
575 if (*t == 0) {
576 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000577 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100578
579 } else {
580 // matched a delimiter or operator character
581
582 // get the maximum characters for a valid token
583 t += 1;
584 uint t_index = tok_enc_index;
585 for (;;) {
586 for (; *t == 'e'; t += 1) {
587 t += 1;
588 t_index += 1;
589 if (is_char(lex, *t)) {
590 next_char(lex);
591 tok_enc_index = t_index;
592 break;
593 }
594 }
595
596 if (*t == 'E') {
597 t += 1;
598 if (is_char(lex, *t)) {
599 next_char(lex);
600 tok_enc_index = t_index;
601 } else {
Damiend99b0522013-12-21 18:17:45 +0000602 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100603 }
604 break;
605 }
606
607 if (*t == 'c') {
608 t += 1;
609 t_index += 1;
610 if (is_char(lex, *t)) {
611 next_char(lex);
612 tok_enc_index = t_index;
613 t += 1;
614 } else {
615 break;
616 }
617 } else {
618 break;
619 }
620 }
621
622 // set token kind
623 tok->kind = tok_enc_kind[tok_enc_index];
624
625 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000626 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100627 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000628 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100629 lex->nested_bracket_level -= 1;
630 }
631 }
632 }
633
Damiena5185f42013-10-20 14:41:27 +0100634 // point token text to vstr buffer
635 tok->str = vstr_str(&lex->vstr);
636 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100637
Damiena5185f42013-10-20 14:41:27 +0100638 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000639 if (tok->kind == MP_TOKEN_NAME) {
Damien429d7192013-10-04 19:53:11 +0100640 for (int i = 0; tok_kw[i] != NULL; i++) {
Damiena5185f42013-10-20 14:41:27 +0100641 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Damiend99b0522013-12-21 18:17:45 +0000642 tok->kind = MP_TOKEN_KW_FALSE + i;
Damien429d7192013-10-04 19:53:11 +0100643 break;
644 }
645 }
646 }
647}
648
Damien Georgeb829b5c2014-01-25 13:51:19 +0000649mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
Damiend99b0522013-12-21 18:17:45 +0000650 mp_lexer_t *lex = m_new(mp_lexer_t, 1);
Damien429d7192013-10-04 19:53:11 +0100651
Damien Georgeb829b5c2014-01-25 13:51:19 +0000652 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100653 lex->stream_data = stream_data;
654 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100655 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100656 lex->line = 1;
657 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100658 lex->emit_dent = 0;
659 lex->nested_bracket_level = 0;
660 lex->alloc_indent_level = 16;
661 lex->num_indent_level = 1;
662 lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
663 lex->indent_level[0] = 0;
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200664 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100665
666 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100667 lex->chr0 = stream_next_char(stream_data);
668 lex->chr1 = stream_next_char(stream_data);
669 lex->chr2 = stream_next_char(stream_data);
670
671 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000672 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100673 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000674 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100675 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100676 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100677 }
Damiend99b0522013-12-21 18:17:45 +0000678 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100679 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100680 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100681 }
Damien429d7192013-10-04 19:53:11 +0100682 }
683
Damiena5185f42013-10-20 14:41:27 +0100684 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000685 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100686
687 return lex;
688}
689
Damiend99b0522013-12-21 18:17:45 +0000690void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100691 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100692 if (lex->stream_close) {
693 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100694 }
Damienbb5316b2013-10-22 21:12:29 +0100695 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200696 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000697 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100698 }
Damien429d7192013-10-04 19:53:11 +0100699}
700
Damien George08335002014-01-18 23:24:36 +0000701qstr mp_lexer_source_name(mp_lexer_t *lex) {
702 return lex->source_name;
703}
704
Damiend99b0522013-12-21 18:17:45 +0000705void mp_lexer_to_next(mp_lexer_t *lex) {
706 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100707}
708
Damiend99b0522013-12-21 18:17:45 +0000709const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100710 return &lex->tok_cur;
711}
712
Damiend99b0522013-12-21 18:17:45 +0000713bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100714 return lex->tok_cur.kind == kind;
715}