blob: f71e355476982f88418b994d3b53e90683bdb398 [file] [log] [blame]
Damien429d7192013-10-04 19:53:11 +01001/* lexer.c -- simple tokeniser for Python implementation
2 */
3
4#include <stdint.h>
5#include <stdio.h>
Damien George08335002014-01-18 23:24:36 +00006#include <string.h>
Damien429d7192013-10-04 19:53:11 +01007#include <assert.h>
8
9#include "misc.h"
Damien George55baff42014-01-21 21:40:13 +000010#include "mpconfig.h"
11#include "qstr.h"
Damien429d7192013-10-04 19:53:11 +010012#include "lexer.h"
13
// distance between tab stops, in columns; used when a '\t' advances the column counter
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010015
// TODO it seems that CPython allows a NUL byte in the input stream;
// we don't know whether that is intentional, but we don't allow it
18
Damiend99b0522013-12-21 18:17:45 +000019struct _mp_lexer_t {
Damien George08335002014-01-18 23:24:36 +000020 qstr source_name; // name of source
Damiena5185f42013-10-20 14:41:27 +010021 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000022 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
23 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010024
Damiena5185f42013-10-20 14:41:27 +010025 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010026
27 uint line; // source line
28 uint column; // source column
29
Damiena5185f42013-10-20 14:41:27 +010030 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
31 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010032
33 uint alloc_indent_level;
34 uint num_indent_level;
35 uint16_t *indent_level;
36
Damiena5185f42013-10-20 14:41:27 +010037 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000038 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010039};
40
// Returns true when the NUL-terminated string `str` is exactly `len` bytes
// long and those bytes equal the first `len` bytes of `strn` (`strn` does not
// need to be NUL-terminated).  A negative `len` never matches.
bool str_strn_equal(const char *str, const char *strn, int len) {
    if (len < 0) {
        return false;
    }
    // checking the length first guarantees memcmp never reads beyond str's terminator
    return strlen(str) == (size_t)len && memcmp(str, strn, (size_t)len) == 0;
}
53
Damiend99b0522013-12-21 18:17:45 +000054void mp_token_show(const mp_token_t *tok) {
Damien George08335002014-01-18 23:24:36 +000055 printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
Damien429d7192013-10-04 19:53:11 +010056 if (tok->str != NULL && tok->len > 0) {
57 const char *i = tok->str;
58 const char *j = i + tok->len;
59 printf(" ");
60 while (i < j) {
Damien George8cc96a32013-12-30 18:23:50 +000061 unichar c = utf8_get_char(i);
62 i = utf8_next_char(i);
63 if (unichar_isprint(c)) {
Damien429d7192013-10-04 19:53:11 +010064 printf("%c", c);
65 } else {
66 printf("?");
67 }
68 }
69 }
70 printf("\n");
71}
72
// the character the lexer is currently positioned on (first slot of the look-ahead window)
#define CUR_CHAR(lex) ((lex)->chr0)
74
Damiend99b0522013-12-21 18:17:45 +000075static bool is_end(mp_lexer_t *lex) {
76 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +010077}
78
Damiend99b0522013-12-21 18:17:45 +000079static bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +010080 return lex->chr0 == '\n' || lex->chr0 == '\r';
81}
82
Damiend99b0522013-12-21 18:17:45 +000083static bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010084 return lex->chr0 == c;
85}
86
Damiend99b0522013-12-21 18:17:45 +000087static bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010088 return lex->chr0 == c1 || lex->chr0 == c2;
89}
90
Damiend99b0522013-12-21 18:17:45 +000091static bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010092 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
93}
94
95/*
Damiend99b0522013-12-21 18:17:45 +000096static bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010097 return lex->chr1 == c;
98}
99*/
100
Damiend99b0522013-12-21 18:17:45 +0000101static bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100102 return lex->chr1 == c1 || lex->chr1 == c2;
103}
104
Damiend99b0522013-12-21 18:17:45 +0000105static bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100106 return lex->chr2 == c1 || lex->chr2 == c2;
107}
108
Damiend99b0522013-12-21 18:17:45 +0000109static bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100110 return lex->chr0 == c1 && lex->chr1 == c2;
111}
112
Damiend99b0522013-12-21 18:17:45 +0000113static bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000114 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100115}
116
Damiend99b0522013-12-21 18:17:45 +0000117static bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000118 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100119}
120
Damiend99b0522013-12-21 18:17:45 +0000121static bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000122 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100123}
124
Damiend99b0522013-12-21 18:17:45 +0000125static bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000126 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100127}
128
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200129static bool is_following_odigit(mp_lexer_t *lex) {
130 return lex->chr1 >= '0' && lex->chr1 <= '7';
131}
132
Damien429d7192013-10-04 19:53:11 +0100133// TODO UNICODE include unicode characters in definition of identifiers
Damiend99b0522013-12-21 18:17:45 +0000134static bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100135 return is_letter(lex) || lex->chr0 == '_';
136}
137
138// TODO UNICODE include unicode characters in definition of identifiers
Damiend99b0522013-12-21 18:17:45 +0000139static bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100140 return is_head_of_identifier(lex) || is_digit(lex);
141}
142
Damiend99b0522013-12-21 18:17:45 +0000143static void next_char(mp_lexer_t *lex) {
144 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100145 return;
146 }
147
148 int advance = 1;
149
150 if (lex->chr0 == '\n') {
151 // LF is a new line
152 ++lex->line;
153 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100154 } else if (lex->chr0 == '\r') {
155 // CR is a new line
156 ++lex->line;
157 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100158 if (lex->chr1 == '\n') {
159 // CR LF is a single new line
160 advance = 2;
161 }
162 } else if (lex->chr0 == '\t') {
163 // a tab
164 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
165 } else {
166 // a character worth one column
167 ++lex->column;
168 }
169
170 for (; advance > 0; advance--) {
171 lex->chr0 = lex->chr1;
172 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100173 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000174 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100175 // EOF
Damiend99b0522013-12-21 18:17:45 +0000176 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100177 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100178 }
179 }
180 }
181}
182
Damiend99b0522013-12-21 18:17:45 +0000183void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100184 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien732407f2013-12-29 19:33:23 +0000185 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level * 2);
Damien429d7192013-10-04 19:53:11 +0100186 lex->alloc_indent_level *= 2;
Damien429d7192013-10-04 19:53:11 +0100187 }
188 lex->indent_level[lex->num_indent_level++] = indent;
189}
190
Damiend99b0522013-12-21 18:17:45 +0000191uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100192 return lex->indent_level[lex->num_indent_level - 1];
193}
194
Damiend99b0522013-12-21 18:17:45 +0000195void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100196 lex->num_indent_level -= 1;
197}
198
// Compact encoding of every delimiter and operator token.  The string is a
// sequence of groups; a group starts with a plain character (the first
// character of the operator) and may be followed by prefixed characters:
//   e<ch> = optional end: if the next input character is <ch>, consume it and stop
//   E<ch> = mandatory end: the next input character must be <ch>, then stop
//   c<ch> = continue: if the next input character is <ch>, consume it and keep matching
// Consequently two operators that start the same way share everything up to
// their last character.  Each plain, 'e' and 'c' entry corresponds, in order,
// to one entry of tok_enc_kind; an 'E' entry has no entry of its own.

static const char *tok_enc =
    "()[]{},:;@~"   // single-character tokens
    "<e=c<e="       // <  <=  <<  <<=
    ">e=c>e="       // >  >=  >>  >>=
    "*e=c*e="       // *  *=  **  **=
    "+e="           // +  +=
    "-e=e>"         // -  -=  ->
    "&e="           // &  &=
    "|e="           // |  |=
    "/e=c/e="       // /  /=  //  //=
    "%e="           // %  %=
    "^e="           // ^  ^=
    "=e="           // =  ==
    "!E="           // !=
    ".c.E.";        // .  ...
221
222// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
223static const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000224 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
225 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
226 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
227 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100228
Damiend99b0522013-12-21 18:17:45 +0000229 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
230 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
231 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
232 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
233 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
234 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
235 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
236 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
237 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
238 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
239 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
240 MP_TOKEN_OP_NOT_EQUAL,
Damien Georgee9906ac2014-01-04 18:44:46 +0000241 MP_TOKEN_DEL_PERIOD, MP_TOKEN_ELLIPSIS,
Damien429d7192013-10-04 19:53:11 +0100242};
243
// Keyword spellings, terminated by NULL.  The index of a keyword is added to
// MP_TOKEN_KW_FALSE to get its token kind, so the order here
// must be the same as the order of the enum in lexer.h.
static const char *tok_kw[] = {
    "False", "None", "True",
    "and", "as", "assert",
    "break",
    "class", "continue",
    "def", "del",
    "elif", "else", "except",
    "finally", "for", "from",
    "global",
    "if", "import", "in", "is",
    "lambda",
    "nonlocal", "not",
    "or",
    "pass",
    "raise", "return",
    "try",
    "while", "with",
    "yield",
    NULL,
};
281
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200282static int hex_digit(unichar c) {
283 // c is assumed to be hex digit
284 int n = c - '0';
285 if (n > 9) {
286 n &= ~('a' - 'A');
287 n -= ('A' - ('9' + 1));
288 }
289 return n;
290}
291
292// This is called with CUR_CHAR() before first hex digit, and should return with
293// it pointing to last hex digit
294static bool get_hex(mp_lexer_t *lex, int num_digits, uint *result) {
295 uint num = 0;
296 while (num_digits-- != 0) {
297 next_char(lex);
298 unichar c = CUR_CHAR(lex);
299 if (!unichar_isxdigit(c)) {
300 return false;
301 }
302 num = (num << 4) + hex_digit(c);
303 }
304 *result = num;
305 return true;
306}
307
Damiend99b0522013-12-21 18:17:45 +0000308static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100309 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100310 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100311 while (!is_end(lex)) {
312 if (is_physical_newline(lex)) {
313 had_physical_newline = true;
314 next_char(lex);
315 } else if (is_whitespace(lex)) {
316 next_char(lex);
317 } else if (is_char(lex, '#')) {
318 next_char(lex);
319 while (!is_end(lex) && !is_physical_newline(lex)) {
320 next_char(lex);
321 }
322 // had_physical_newline will be set on next loop
323 } else if (is_char(lex, '\\')) {
324 // backslash (outside string literals) must appear just before a physical newline
325 next_char(lex);
326 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000327 // SyntaxError: unexpected character after line continuation character
Damien George69a818d2014-01-12 13:55:24 +0000328 tok->src_line = lex->line;
329 tok->src_column = lex->column;
330 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
331 vstr_reset(&lex->vstr);
332 tok->str = vstr_str(&lex->vstr);
333 tok->len = 0;
334 return;
Damien429d7192013-10-04 19:53:11 +0100335 } else {
336 next_char(lex);
337 }
338 } else {
339 break;
340 }
341 }
342
Damiena5185f42013-10-20 14:41:27 +0100343 // set token source information
Damien429d7192013-10-04 19:53:11 +0100344 tok->src_line = lex->line;
345 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100346
Damiena5185f42013-10-20 14:41:27 +0100347 // start new token text
348 vstr_reset(&lex->vstr);
349
350 if (first_token && lex->line == 1 && lex->column != 1) {
351 // check that the first token is in the first column
352 // if first token is not on first line, we get a physical newline and
353 // this check is done as part of normal indent/dedent checking below
354 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000355 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100356
357 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000358 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100359 lex->emit_dent += 1;
360
361 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000362 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100363 lex->emit_dent -= 1;
364
Damien91d387d2013-10-09 15:09:52 +0100365 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000366 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100367
368 uint num_spaces = lex->column - 1;
369 lex->emit_dent = 0;
370 if (num_spaces == indent_top(lex)) {
371 } else if (num_spaces > indent_top(lex)) {
372 indent_push(lex, num_spaces);
373 lex->emit_dent += 1;
374 } else {
375 while (num_spaces < indent_top(lex)) {
376 indent_pop(lex);
377 lex->emit_dent -= 1;
378 }
379 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000380 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100381 }
382 }
383
384 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100385 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000386 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100387 lex->emit_dent = 0;
388 while (indent_top(lex) > 0) {
389 indent_pop(lex);
390 lex->emit_dent -= 1;
391 }
392 } else {
Damiend99b0522013-12-21 18:17:45 +0000393 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100394 }
395
396 } else if (is_char_or(lex, '\'', '\"')
397 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
398 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
399 // a string or bytes literal
400
401 // parse type codes
402 bool is_raw = false;
403 bool is_bytes = false;
404 if (is_char(lex, 'u')) {
405 next_char(lex);
406 } else if (is_char(lex, 'b')) {
407 is_bytes = true;
408 next_char(lex);
409 if (is_char(lex, 'r')) {
410 is_raw = true;
411 next_char(lex);
412 }
413 } else if (is_char(lex, 'r')) {
414 is_raw = true;
415 next_char(lex);
416 if (is_char(lex, 'b')) {
417 is_bytes = true;
418 next_char(lex);
419 }
420 }
421
422 // set token kind
423 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000424 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100425 } else {
Damiend99b0522013-12-21 18:17:45 +0000426 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100427 }
428
429 // get first quoting character
430 char quote_char = '\'';
431 if (is_char(lex, '\"')) {
432 quote_char = '\"';
433 }
434 next_char(lex);
435
436 // work out if it's a single or triple quoted literal
437 int num_quotes;
438 if (is_char_and(lex, quote_char, quote_char)) {
439 // triple quotes
440 next_char(lex);
441 next_char(lex);
442 num_quotes = 3;
443 } else {
444 // single quotes
445 num_quotes = 1;
446 }
447
Damien429d7192013-10-04 19:53:11 +0100448 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100449 int n_closing = 0;
450 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
451 if (is_char(lex, quote_char)) {
452 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100453 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100454 } else {
455 n_closing = 0;
456 if (!is_raw && is_char(lex, '\\')) {
457 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100458 unichar c = CUR_CHAR(lex);
459 switch (c) {
Damiend99b0522013-12-21 18:17:45 +0000460 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
461 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damiena5185f42013-10-20 14:41:27 +0100462 case '\\': break;
463 case '\'': break;
464 case '"': break;
465 case 'a': c = 0x07; break;
466 case 'b': c = 0x08; break;
467 case 't': c = 0x09; break;
468 case 'n': c = 0x0a; break;
469 case 'v': c = 0x0b; break;
470 case 'f': c = 0x0c; break;
471 case 'r': c = 0x0d; break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200472 case 'x':
473 {
Damien Georgef64086f2014-01-22 23:18:50 +0000474 uint num = 0;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200475 if (!get_hex(lex, 2, &num)) {
476 // TODO error message
477 assert(0);
478 }
479 c = num;
480 break;
481 }
482 case 'N': break; // TODO \N{name} only in strings
483 case 'u': break; // TODO \uxxxx only in strings
484 case 'U': break; // TODO \Uxxxxxxxx only in strings
485 default:
486 if (c >= '0' && c <= '7') {
487 // Octal sequence, 1-3 chars
488 int digits = 3;
489 int num = c - '0';
490 while (is_following_odigit(lex) && --digits != 0) {
491 next_char(lex);
492 num = num * 8 + (CUR_CHAR(lex) - '0');
493 }
494 c = num;
495 } else {
Damien Georgeb829b5c2014-01-25 13:51:19 +0000496 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
497 vstr_add_char(&lex->vstr, '\\');
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200498 }
499 break;
Damiena5185f42013-10-20 14:41:27 +0100500 }
Damiend99b0522013-12-21 18:17:45 +0000501 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100502 vstr_add_char(&lex->vstr, c);
503 }
504 } else {
505 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100506 }
507 }
508 next_char(lex);
509 }
510
511 // check we got the required end quotes
512 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000513 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100514 }
515
Damiena5185f42013-10-20 14:41:27 +0100516 // cut off the end quotes from the token text
517 vstr_cut_tail(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100518
519 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000520 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100521
Damiena5185f42013-10-20 14:41:27 +0100522 // get first char
523 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100524 next_char(lex);
525
Damiena5185f42013-10-20 14:41:27 +0100526 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100527 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100528 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100529 next_char(lex);
530 }
531
532 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000533 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100534
Damiena5185f42013-10-20 14:41:27 +0100535 // get first char
536 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100537 next_char(lex);
538
Damiena5185f42013-10-20 14:41:27 +0100539 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100540 while (!is_end(lex)) {
541 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100542 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100543 next_char(lex);
544 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100545 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100546 next_char(lex);
547 }
548 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100549 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100550 next_char(lex);
551 } else {
552 break;
553 }
554 }
555
556 } else {
557 // search for encoded delimiter or operator
558
559 const char *t = tok_enc;
560 uint tok_enc_index = 0;
561 for (; *t != 0 && !is_char(lex, *t); t += 1) {
562 if (*t == 'e' || *t == 'c') {
563 t += 1;
564 } else if (*t == 'E') {
565 tok_enc_index -= 1;
566 t += 1;
567 }
568 tok_enc_index += 1;
569 }
570
571 next_char(lex);
572
573 if (*t == 0) {
574 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000575 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100576
577 } else {
578 // matched a delimiter or operator character
579
580 // get the maximum characters for a valid token
581 t += 1;
582 uint t_index = tok_enc_index;
583 for (;;) {
584 for (; *t == 'e'; t += 1) {
585 t += 1;
586 t_index += 1;
587 if (is_char(lex, *t)) {
588 next_char(lex);
589 tok_enc_index = t_index;
590 break;
591 }
592 }
593
594 if (*t == 'E') {
595 t += 1;
596 if (is_char(lex, *t)) {
597 next_char(lex);
598 tok_enc_index = t_index;
599 } else {
Damiend99b0522013-12-21 18:17:45 +0000600 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100601 }
602 break;
603 }
604
605 if (*t == 'c') {
606 t += 1;
607 t_index += 1;
608 if (is_char(lex, *t)) {
609 next_char(lex);
610 tok_enc_index = t_index;
611 t += 1;
612 } else {
613 break;
614 }
615 } else {
616 break;
617 }
618 }
619
620 // set token kind
621 tok->kind = tok_enc_kind[tok_enc_index];
622
623 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000624 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100625 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000626 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100627 lex->nested_bracket_level -= 1;
628 }
629 }
630 }
631
Damiena5185f42013-10-20 14:41:27 +0100632 // point token text to vstr buffer
633 tok->str = vstr_str(&lex->vstr);
634 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100635
Damiena5185f42013-10-20 14:41:27 +0100636 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000637 if (tok->kind == MP_TOKEN_NAME) {
Damien429d7192013-10-04 19:53:11 +0100638 for (int i = 0; tok_kw[i] != NULL; i++) {
Damiena5185f42013-10-20 14:41:27 +0100639 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Damiend99b0522013-12-21 18:17:45 +0000640 tok->kind = MP_TOKEN_KW_FALSE + i;
Damien429d7192013-10-04 19:53:11 +0100641 break;
642 }
643 }
644 }
645}
646
Damien Georgeb829b5c2014-01-25 13:51:19 +0000647mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
Damiend99b0522013-12-21 18:17:45 +0000648 mp_lexer_t *lex = m_new(mp_lexer_t, 1);
Damien429d7192013-10-04 19:53:11 +0100649
Damien Georgeb829b5c2014-01-25 13:51:19 +0000650 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100651 lex->stream_data = stream_data;
652 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100653 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100654 lex->line = 1;
655 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100656 lex->emit_dent = 0;
657 lex->nested_bracket_level = 0;
658 lex->alloc_indent_level = 16;
659 lex->num_indent_level = 1;
660 lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
661 lex->indent_level[0] = 0;
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200662 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100663
664 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100665 lex->chr0 = stream_next_char(stream_data);
666 lex->chr1 = stream_next_char(stream_data);
667 lex->chr2 = stream_next_char(stream_data);
668
669 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000670 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100671 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000672 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100673 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100674 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100675 }
Damiend99b0522013-12-21 18:17:45 +0000676 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100677 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100678 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100679 }
Damien429d7192013-10-04 19:53:11 +0100680 }
681
Damiena5185f42013-10-20 14:41:27 +0100682 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000683 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100684
685 return lex;
686}
687
Damiend99b0522013-12-21 18:17:45 +0000688void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100689 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100690 if (lex->stream_close) {
691 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100692 }
Damienbb5316b2013-10-22 21:12:29 +0100693 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200694 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000695 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100696 }
Damien429d7192013-10-04 19:53:11 +0100697}
698
Damien George08335002014-01-18 23:24:36 +0000699qstr mp_lexer_source_name(mp_lexer_t *lex) {
700 return lex->source_name;
701}
702
Damiend99b0522013-12-21 18:17:45 +0000703void mp_lexer_to_next(mp_lexer_t *lex) {
704 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100705}
706
Damiend99b0522013-12-21 18:17:45 +0000707const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100708 return &lex->tok_cur;
709}
710
Damiend99b0522013-12-21 18:17:45 +0000711bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100712 return lex->tok_cur.kind == kind;
713}
714
715/*
Damiend99b0522013-12-21 18:17:45 +0000716bool mp_lexer_is_str(mp_lexer_t *lex, const char *str) {
717 return mp_token_is_str(&lex->tok_cur, str);
Damien429d7192013-10-04 19:53:11 +0100718}
719
Damiend99b0522013-12-21 18:17:45 +0000720bool mp_lexer_opt_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
721 if (mp_lexer_is_kind(lex, kind)) {
722 mp_lexer_to_next(lex);
Damien429d7192013-10-04 19:53:11 +0100723 return true;
724 }
725 return false;
726}
727
Damiend99b0522013-12-21 18:17:45 +0000728bool mp_lexer_opt_str(mp_lexer_t *lex, const char *str) {
729 if (mp_lexer_is_str(lex, str)) {
730 mp_lexer_to_next(lex);
Damien429d7192013-10-04 19:53:11 +0100731 return true;
732 }
733 return false;
734}
735*/
736
Damien George9528cd62014-01-15 21:23:31 +0000737bool mp_lexer_show_error_pythonic_prefix(mp_lexer_t *lex) {
Damien George08335002014-01-18 23:24:36 +0000738 printf(" File \"%s\", line %d column %d\n", qstr_str(lex->source_name), lex->tok_cur.src_line, lex->tok_cur.src_column);
Damien George9528cd62014-01-15 21:23:31 +0000739 return false;
Damien429d7192013-10-04 19:53:11 +0100740}
Damien91d387d2013-10-09 15:09:52 +0100741
Damiend99b0522013-12-21 18:17:45 +0000742bool mp_lexer_show_error_pythonic(mp_lexer_t *lex, const char *msg) {
Damien George08335002014-01-18 23:24:36 +0000743 printf(" File \"%s\", line %d column %d\n%s\n", qstr_str(lex->source_name), lex->tok_cur.src_line, lex->tok_cur.src_column, msg);
Damien91d387d2013-10-09 15:09:52 +0100744 return false;
745}