/* lexer.c -- simple tokeniser for Python implementation
 */
3
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "misc.h"
#include "mpconfig.h"
#include "qstr.h"
#include "lexer.h"
13
// Width of a tab stop, in columns; used when advancing the column counter over '\t'.
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010015
// TODO: CPython appears to allow a NUL byte in the input stream. It is unclear
// whether that is intentional; this lexer does not allow it.
18
Damiend99b0522013-12-21 18:17:45 +000019struct _mp_lexer_t {
Damien George08335002014-01-18 23:24:36 +000020 qstr source_name; // name of source
Damiena5185f42013-10-20 14:41:27 +010021 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000022 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
23 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010024
Damiena5185f42013-10-20 14:41:27 +010025 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010026
27 uint line; // source line
28 uint column; // source column
29
Damiena5185f42013-10-20 14:41:27 +010030 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
31 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010032
33 uint alloc_indent_level;
34 uint num_indent_level;
35 uint16_t *indent_level;
36
Damiena5185f42013-10-20 14:41:27 +010037 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000038 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010039};
40
// Compare a NUL-terminated string against a length-delimited string.
//
//   str  - NUL-terminated string (e.g. a keyword)
//   strn - character data that need not be NUL-terminated
//   len  - number of characters of strn to compare
//
// Returns true only when str is exactly len characters long and those
// characters equal the first len characters of strn.  A negative len never
// matches.  Checking the length of str first guarantees that no byte beyond
// the terminator of str is read, even if strn contains embedded NUL bytes.
bool str_strn_equal(const char *str, const char *strn, int len) {
    if (len < 0) {
        return false;
    }

    size_t n = (size_t)len;
    return strlen(str) == n && memcmp(str, strn, n) == 0;
}
53
#ifdef MICROPY_DEBUG_PRINTERS
// Debugging aid: print one line describing a token (source position, kind,
// text pointer and length), followed by the token text itself when there is
// any.  Characters of the text that are not printable are shown as '?'.
void mp_token_show(const mp_token_t *tok) {
    printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);

    if (tok->str != NULL && tok->len > 0) {
        const char *end = tok->str + tok->len;
        printf(" ");
        for (const char *p = tok->str; p < end; p = utf8_next_char(p)) {
            unichar c = utf8_get_char(p);
            if (!unichar_isprint(c)) {
                printf("?");
            } else {
                printf("%c", c);
            }
        }
    }

    printf("\n");
}
#endif
Damien429d7192013-10-04 19:53:11 +010074
// The character the lexer is currently positioned on.
#define CUR_CHAR(lex) ((lex)->chr0)
76
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020077STATIC bool is_end(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +000078 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +010079}
80
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020081STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +010082 return lex->chr0 == '\n' || lex->chr0 == '\r';
83}
84
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020085STATIC bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010086 return lex->chr0 == c;
87}
88
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020089STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010090 return lex->chr0 == c1 || lex->chr0 == c2;
91}
92
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020093STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010094 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
95}
96
/*
STATIC bool is_char_following(mp_lexer_t *lex, char c) {
    return lex->chr1 == c;
}
*/
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100104 return lex->chr1 == c1 || lex->chr1 == c2;
105}
106
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200107STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100108 return lex->chr2 == c1 || lex->chr2 == c2;
109}
110
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200111STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100112 return lex->chr0 == c1 && lex->chr1 == c2;
113}
114
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200115STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000116 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100117}
118
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200119STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000120 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100121}
122
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200123STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000124 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100125}
126
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200127STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000128 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100129}
130
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200131STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200132 return lex->chr1 >= '0' && lex->chr1 <= '7';
133}
134
Damien429d7192013-10-04 19:53:11 +0100135// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200136STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100137 return is_letter(lex) || lex->chr0 == '_';
138}
139
140// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200141STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100142 return is_head_of_identifier(lex) || is_digit(lex);
143}
144
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200145STATIC void next_char(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +0000146 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100147 return;
148 }
149
150 int advance = 1;
151
152 if (lex->chr0 == '\n') {
153 // LF is a new line
154 ++lex->line;
155 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100156 } else if (lex->chr0 == '\r') {
157 // CR is a new line
158 ++lex->line;
159 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100160 if (lex->chr1 == '\n') {
161 // CR LF is a single new line
162 advance = 2;
163 }
164 } else if (lex->chr0 == '\t') {
165 // a tab
166 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
167 } else {
168 // a character worth one column
169 ++lex->column;
170 }
171
172 for (; advance > 0; advance--) {
173 lex->chr0 = lex->chr1;
174 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100175 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000176 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100177 // EOF
Damiend99b0522013-12-21 18:17:45 +0000178 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100179 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100180 }
181 }
182 }
183}
184
Damiend99b0522013-12-21 18:17:45 +0000185void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100186 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien732407f2013-12-29 19:33:23 +0000187 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level * 2);
Damien429d7192013-10-04 19:53:11 +0100188 lex->alloc_indent_level *= 2;
Damien429d7192013-10-04 19:53:11 +0100189 }
190 lex->indent_level[lex->num_indent_level++] = indent;
191}
192
Damiend99b0522013-12-21 18:17:45 +0000193uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100194 return lex->indent_level[lex->num_indent_level - 1];
195}
196
Damiend99b0522013-12-21 18:17:45 +0000197void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100198 lex->num_indent_level -= 1;
199}
200
201// some tricky operator encoding:
202// <op> = begin with <op>, if this opchar matches then begin here
203// e<op> = end with <op>, if this opchar matches then end
204// E<op> = mandatory end with <op>, this opchar must match, then end
205// c<op> = continue with <op>, if this opchar matches then continue matching
206// this means if the start of two ops are the same then they are equal til the last char
207
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200208STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100209 "()[]{},:;@~" // singles
210 "<e=c<e=" // < <= << <<=
211 ">e=c>e=" // > >= >> >>=
212 "*e=c*e=" // * *= ** **=
213 "+e=" // + +=
214 "-e=e>" // - -= ->
215 "&e=" // & &=
216 "|e=" // | |=
217 "/e=c/e=" // / /= // //=
218 "%e=" // % %=
219 "^e=" // ^ ^=
220 "=e=" // = ==
221 "!E=" // !=
222 ".c.E."; // . ...
223
224// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200225STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000226 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
227 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
228 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
229 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100230
Damiend99b0522013-12-21 18:17:45 +0000231 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
232 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
233 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
234 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
235 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
236 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
237 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
238 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
239 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
240 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
241 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
242 MP_TOKEN_OP_NOT_EQUAL,
Damien Georgee9906ac2014-01-04 18:44:46 +0000243 MP_TOKEN_DEL_PERIOD, MP_TOKEN_ELLIPSIS,
Damien429d7192013-10-04 19:53:11 +0100244};
245
246// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200247STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100248 "False",
249 "None",
250 "True",
251 "and",
252 "as",
253 "assert",
254 "break",
255 "class",
256 "continue",
257 "def",
258 "del",
259 "elif",
260 "else",
261 "except",
262 "finally",
263 "for",
264 "from",
265 "global",
266 "if",
267 "import",
268 "in",
269 "is",
270 "lambda",
271 "nonlocal",
272 "not",
273 "or",
274 "pass",
275 "raise",
276 "return",
277 "try",
278 "while",
279 "with",
280 "yield",
281 NULL,
282};
283
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200284STATIC int hex_digit(unichar c) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200285 // c is assumed to be hex digit
286 int n = c - '0';
287 if (n > 9) {
288 n &= ~('a' - 'A');
289 n -= ('A' - ('9' + 1));
290 }
291 return n;
292}
293
294// This is called with CUR_CHAR() before first hex digit, and should return with
295// it pointing to last hex digit
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200296STATIC bool get_hex(mp_lexer_t *lex, int num_digits, uint *result) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200297 uint num = 0;
298 while (num_digits-- != 0) {
299 next_char(lex);
300 unichar c = CUR_CHAR(lex);
301 if (!unichar_isxdigit(c)) {
302 return false;
303 }
304 num = (num << 4) + hex_digit(c);
305 }
306 *result = num;
307 return true;
308}
309
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200310STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100311 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100312 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100313 while (!is_end(lex)) {
314 if (is_physical_newline(lex)) {
315 had_physical_newline = true;
316 next_char(lex);
317 } else if (is_whitespace(lex)) {
318 next_char(lex);
319 } else if (is_char(lex, '#')) {
320 next_char(lex);
321 while (!is_end(lex) && !is_physical_newline(lex)) {
322 next_char(lex);
323 }
324 // had_physical_newline will be set on next loop
325 } else if (is_char(lex, '\\')) {
326 // backslash (outside string literals) must appear just before a physical newline
327 next_char(lex);
328 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000329 // SyntaxError: unexpected character after line continuation character
Damien George69a818d2014-01-12 13:55:24 +0000330 tok->src_line = lex->line;
331 tok->src_column = lex->column;
332 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
333 vstr_reset(&lex->vstr);
334 tok->str = vstr_str(&lex->vstr);
335 tok->len = 0;
336 return;
Damien429d7192013-10-04 19:53:11 +0100337 } else {
338 next_char(lex);
339 }
340 } else {
341 break;
342 }
343 }
344
Damiena5185f42013-10-20 14:41:27 +0100345 // set token source information
Damien429d7192013-10-04 19:53:11 +0100346 tok->src_line = lex->line;
347 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100348
Damiena5185f42013-10-20 14:41:27 +0100349 // start new token text
350 vstr_reset(&lex->vstr);
351
352 if (first_token && lex->line == 1 && lex->column != 1) {
353 // check that the first token is in the first column
354 // if first token is not on first line, we get a physical newline and
355 // this check is done as part of normal indent/dedent checking below
356 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000357 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100358
359 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000360 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100361 lex->emit_dent += 1;
362
363 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000364 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100365 lex->emit_dent -= 1;
366
Damien91d387d2013-10-09 15:09:52 +0100367 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000368 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100369
370 uint num_spaces = lex->column - 1;
371 lex->emit_dent = 0;
372 if (num_spaces == indent_top(lex)) {
373 } else if (num_spaces > indent_top(lex)) {
374 indent_push(lex, num_spaces);
375 lex->emit_dent += 1;
376 } else {
377 while (num_spaces < indent_top(lex)) {
378 indent_pop(lex);
379 lex->emit_dent -= 1;
380 }
381 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000382 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100383 }
384 }
385
386 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100387 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000388 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100389 lex->emit_dent = 0;
390 while (indent_top(lex) > 0) {
391 indent_pop(lex);
392 lex->emit_dent -= 1;
393 }
394 } else {
Damiend99b0522013-12-21 18:17:45 +0000395 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100396 }
397
398 } else if (is_char_or(lex, '\'', '\"')
399 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
400 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
401 // a string or bytes literal
402
403 // parse type codes
404 bool is_raw = false;
405 bool is_bytes = false;
406 if (is_char(lex, 'u')) {
407 next_char(lex);
408 } else if (is_char(lex, 'b')) {
409 is_bytes = true;
410 next_char(lex);
411 if (is_char(lex, 'r')) {
412 is_raw = true;
413 next_char(lex);
414 }
415 } else if (is_char(lex, 'r')) {
416 is_raw = true;
417 next_char(lex);
418 if (is_char(lex, 'b')) {
419 is_bytes = true;
420 next_char(lex);
421 }
422 }
423
424 // set token kind
425 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000426 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100427 } else {
Damiend99b0522013-12-21 18:17:45 +0000428 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100429 }
430
431 // get first quoting character
432 char quote_char = '\'';
433 if (is_char(lex, '\"')) {
434 quote_char = '\"';
435 }
436 next_char(lex);
437
438 // work out if it's a single or triple quoted literal
439 int num_quotes;
440 if (is_char_and(lex, quote_char, quote_char)) {
441 // triple quotes
442 next_char(lex);
443 next_char(lex);
444 num_quotes = 3;
445 } else {
446 // single quotes
447 num_quotes = 1;
448 }
449
Damien429d7192013-10-04 19:53:11 +0100450 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100451 int n_closing = 0;
452 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
453 if (is_char(lex, quote_char)) {
454 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100455 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100456 } else {
457 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100458 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100459 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100460 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100461 if (is_raw) {
462 // raw strings allow escaping of quotes, but the backslash is also emitted
463 vstr_add_char(&lex->vstr, '\\');
464 } else {
465 switch (c) {
466 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
467 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
468 case '\\': break;
469 case '\'': break;
470 case '"': break;
471 case 'a': c = 0x07; break;
472 case 'b': c = 0x08; break;
473 case 't': c = 0x09; break;
474 case 'n': c = 0x0a; break;
475 case 'v': c = 0x0b; break;
476 case 'f': c = 0x0c; break;
477 case 'r': c = 0x0d; break;
478 case 'x':
479 {
480 uint num = 0;
481 if (!get_hex(lex, 2, &num)) {
482 // TODO error message
483 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200484 }
485 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100486 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200487 }
Damien Georgea91f4142014-04-10 11:30:55 +0100488 case 'N': break; // TODO \N{name} only in strings
489 case 'u': break; // TODO \uxxxx only in strings
490 case 'U': break; // TODO \Uxxxxxxxx only in strings
491 default:
492 if (c >= '0' && c <= '7') {
493 // Octal sequence, 1-3 chars
494 int digits = 3;
495 int num = c - '0';
496 while (is_following_odigit(lex) && --digits != 0) {
497 next_char(lex);
498 num = num * 8 + (CUR_CHAR(lex) - '0');
499 }
500 c = num;
501 } else {
502 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
503 vstr_add_char(&lex->vstr, '\\');
504 }
505 break;
506 }
Damiena5185f42013-10-20 14:41:27 +0100507 }
Damiend99b0522013-12-21 18:17:45 +0000508 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100509 vstr_add_char(&lex->vstr, c);
510 }
511 } else {
512 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100513 }
514 }
515 next_char(lex);
516 }
517
518 // check we got the required end quotes
519 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000520 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100521 }
522
Damiena5185f42013-10-20 14:41:27 +0100523 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000524 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100525
526 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000527 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100528
Damiena5185f42013-10-20 14:41:27 +0100529 // get first char
530 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100531 next_char(lex);
532
Damiena5185f42013-10-20 14:41:27 +0100533 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100534 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100535 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100536 next_char(lex);
537 }
538
539 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000540 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100541
Damiena5185f42013-10-20 14:41:27 +0100542 // get first char
543 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100544 next_char(lex);
545
Damiena5185f42013-10-20 14:41:27 +0100546 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100547 while (!is_end(lex)) {
548 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100549 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100550 next_char(lex);
551 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100552 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100553 next_char(lex);
554 }
555 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100556 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100557 next_char(lex);
558 } else {
559 break;
560 }
561 }
562
563 } else {
564 // search for encoded delimiter or operator
565
566 const char *t = tok_enc;
567 uint tok_enc_index = 0;
568 for (; *t != 0 && !is_char(lex, *t); t += 1) {
569 if (*t == 'e' || *t == 'c') {
570 t += 1;
571 } else if (*t == 'E') {
572 tok_enc_index -= 1;
573 t += 1;
574 }
575 tok_enc_index += 1;
576 }
577
578 next_char(lex);
579
580 if (*t == 0) {
581 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000582 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100583
584 } else {
585 // matched a delimiter or operator character
586
587 // get the maximum characters for a valid token
588 t += 1;
589 uint t_index = tok_enc_index;
590 for (;;) {
591 for (; *t == 'e'; t += 1) {
592 t += 1;
593 t_index += 1;
594 if (is_char(lex, *t)) {
595 next_char(lex);
596 tok_enc_index = t_index;
597 break;
598 }
599 }
600
601 if (*t == 'E') {
602 t += 1;
603 if (is_char(lex, *t)) {
604 next_char(lex);
605 tok_enc_index = t_index;
606 } else {
Damiend99b0522013-12-21 18:17:45 +0000607 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100608 }
609 break;
610 }
611
612 if (*t == 'c') {
613 t += 1;
614 t_index += 1;
615 if (is_char(lex, *t)) {
616 next_char(lex);
617 tok_enc_index = t_index;
618 t += 1;
619 } else {
620 break;
621 }
622 } else {
623 break;
624 }
625 }
626
627 // set token kind
628 tok->kind = tok_enc_kind[tok_enc_index];
629
630 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000631 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100632 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000633 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100634 lex->nested_bracket_level -= 1;
635 }
636 }
637 }
638
Damiena5185f42013-10-20 14:41:27 +0100639 // point token text to vstr buffer
640 tok->str = vstr_str(&lex->vstr);
641 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100642
Damiena5185f42013-10-20 14:41:27 +0100643 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000644 if (tok->kind == MP_TOKEN_NAME) {
Damien429d7192013-10-04 19:53:11 +0100645 for (int i = 0; tok_kw[i] != NULL; i++) {
Damiena5185f42013-10-20 14:41:27 +0100646 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Damiend99b0522013-12-21 18:17:45 +0000647 tok->kind = MP_TOKEN_KW_FALSE + i;
Damien429d7192013-10-04 19:53:11 +0100648 break;
649 }
650 }
651 }
652}
653
Damien Georgeb829b5c2014-01-25 13:51:19 +0000654mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
Damiend99b0522013-12-21 18:17:45 +0000655 mp_lexer_t *lex = m_new(mp_lexer_t, 1);
Damien429d7192013-10-04 19:53:11 +0100656
Damien Georgeb829b5c2014-01-25 13:51:19 +0000657 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100658 lex->stream_data = stream_data;
659 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100660 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100661 lex->line = 1;
662 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100663 lex->emit_dent = 0;
664 lex->nested_bracket_level = 0;
665 lex->alloc_indent_level = 16;
666 lex->num_indent_level = 1;
667 lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
668 lex->indent_level[0] = 0;
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200669 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100670
671 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100672 lex->chr0 = stream_next_char(stream_data);
673 lex->chr1 = stream_next_char(stream_data);
674 lex->chr2 = stream_next_char(stream_data);
675
676 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000677 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100678 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000679 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100680 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100681 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100682 }
Damiend99b0522013-12-21 18:17:45 +0000683 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100684 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100685 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100686 }
Damien429d7192013-10-04 19:53:11 +0100687 }
688
Damiena5185f42013-10-20 14:41:27 +0100689 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000690 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100691
692 return lex;
693}
694
Damiend99b0522013-12-21 18:17:45 +0000695void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100696 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100697 if (lex->stream_close) {
698 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100699 }
Damienbb5316b2013-10-22 21:12:29 +0100700 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200701 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000702 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100703 }
Damien429d7192013-10-04 19:53:11 +0100704}
705
Damien George08335002014-01-18 23:24:36 +0000706qstr mp_lexer_source_name(mp_lexer_t *lex) {
707 return lex->source_name;
708}
709
Damiend99b0522013-12-21 18:17:45 +0000710void mp_lexer_to_next(mp_lexer_t *lex) {
711 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100712}
713
Damiend99b0522013-12-21 18:17:45 +0000714const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100715 return &lex->tok_cur;
716}
717
Damiend99b0522013-12-21 18:17:45 +0000718bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100719 return lex->tok_cur.kind == kind;
720}