blob: 03605373d08840048fb40300b05201902e707b2a [file] [log] [blame]
Damien429d7192013-10-04 19:53:11 +01001/* lexer.c -- simple tokeniser for Python implementation
2 */
3
xbeefe34222014-03-16 00:14:26 -07004#include <stdbool.h>
Damien429d7192013-10-04 19:53:11 +01005#include <stdint.h>
6#include <stdio.h>
7#include <assert.h>
8
9#include "misc.h"
Damien George55baff42014-01-21 21:40:13 +000010#include "mpconfig.h"
11#include "qstr.h"
Damien429d7192013-10-04 19:53:11 +010012#include "lexer.h"
13
// Number of columns between tab stops; used when advancing the column counter past a '\t'.
#define TAB_SIZE 8
Damien429d7192013-10-04 19:53:11 +010015
// TODO: CPython appears to allow a NUL byte in the input stream; it is unclear
// whether that is intentional, but this lexer does not allow it.
18
Damiend99b0522013-12-21 18:17:45 +000019struct _mp_lexer_t {
Damien George08335002014-01-18 23:24:36 +000020 qstr source_name; // name of source
Damiena5185f42013-10-20 14:41:27 +010021 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000022 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
23 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010024
Damiena5185f42013-10-20 14:41:27 +010025 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010026
27 uint line; // source line
28 uint column; // source column
29
Damiena5185f42013-10-20 14:41:27 +010030 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
31 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010032
33 uint alloc_indent_level;
34 uint num_indent_level;
35 uint16_t *indent_level;
36
Damiena5185f42013-10-20 14:41:27 +010037 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000038 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010039};
40
// Compare a NUL-terminated string against a length-delimited one.
//
//   str  - NUL-terminated string
//   strn - character sequence of exactly len bytes (need not be terminated)
//   len  - number of bytes of strn to compare
//
// Returns true only when str is exactly len characters long and those
// characters equal the first len bytes of strn.  A negative len never matches.
//
// The scan stops at the terminator of str, so str is never read beyond its
// end even if strn contains an embedded NUL byte at the same offset.
//
// TODO replace with a call to a standard function
bool str_strn_equal(const char *str, const char *strn, int len) {
    if (len < 0) {
        return false;
    }

    int i = 0;
    while (i < len && str[i] != '\0' && str[i] == strn[i]) {
        ++i;
    }

    // equal only if all of strn was consumed and str ends at the same place
    return i == len && str[i] == '\0';
}
53
#ifdef MICROPY_DEBUG_PRINTERS
// Debug helper: print one line describing a token (position, kind, text
// pointer and length).  When the token has text it is appended, decoded
// character by character, with '?' standing in for anything that
// unichar_isprint rejects.
void mp_token_show(const mp_token_t *tok) {
    printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);

    bool has_text = tok->str != NULL && tok->len > 0;
    if (has_text) {
        printf(" ");
        const char *end = tok->str + tok->len;
        for (const char *p = tok->str; p < end; p = utf8_next_char(p)) {
            unichar ch = utf8_get_char(p);
            if (!unichar_isprint(ch)) {
                printf("?");
            } else {
                printf("%c", ch);
            }
        }
    }

    printf("\n");
}
#endif
Damien429d7192013-10-04 19:53:11 +010074
// Current character of the lexer, i.e. the head of its look-ahead window.
#define CUR_CHAR(lexer) ((lexer)->chr0)
76
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020077STATIC bool is_end(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +000078 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +010079}
80
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020081STATIC bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +010082 return lex->chr0 == '\n' || lex->chr0 == '\r';
83}
84
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020085STATIC bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010086 return lex->chr0 == c;
87}
88
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020089STATIC bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010090 return lex->chr0 == c1 || lex->chr0 == c2;
91}
92
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020093STATIC bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010094 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
95}
96
97/*
Paul Sokolovsky520e2f52014-02-12 18:31:30 +020098STATIC bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010099 return lex->chr1 == c;
100}
101*/
102
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200103STATIC bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100104 return lex->chr1 == c1 || lex->chr1 == c2;
105}
106
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200107STATIC bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100108 return lex->chr2 == c1 || lex->chr2 == c2;
109}
110
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200111STATIC bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100112 return lex->chr0 == c1 && lex->chr1 == c2;
113}
114
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200115STATIC bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000116 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100117}
118
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200119STATIC bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000120 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100121}
122
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200123STATIC bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000124 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100125}
126
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200127STATIC bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000128 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100129}
130
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200131STATIC bool is_following_odigit(mp_lexer_t *lex) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200132 return lex->chr1 >= '0' && lex->chr1 <= '7';
133}
134
Damien429d7192013-10-04 19:53:11 +0100135// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200136STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100137 return is_letter(lex) || lex->chr0 == '_';
138}
139
140// TODO UNICODE include unicode characters in definition of identifiers
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200141STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100142 return is_head_of_identifier(lex) || is_digit(lex);
143}
144
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200145STATIC void next_char(mp_lexer_t *lex) {
Damiend99b0522013-12-21 18:17:45 +0000146 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100147 return;
148 }
149
150 int advance = 1;
151
152 if (lex->chr0 == '\n') {
153 // LF is a new line
154 ++lex->line;
155 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100156 } else if (lex->chr0 == '\r') {
157 // CR is a new line
158 ++lex->line;
159 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100160 if (lex->chr1 == '\n') {
161 // CR LF is a single new line
162 advance = 2;
163 }
164 } else if (lex->chr0 == '\t') {
165 // a tab
166 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
167 } else {
168 // a character worth one column
169 ++lex->column;
170 }
171
172 for (; advance > 0; advance--) {
173 lex->chr0 = lex->chr1;
174 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100175 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000176 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100177 // EOF
Damiend99b0522013-12-21 18:17:45 +0000178 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100179 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100180 }
181 }
182 }
183}
184
Damiend99b0522013-12-21 18:17:45 +0000185void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100186 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien732407f2013-12-29 19:33:23 +0000187 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level * 2);
Damien429d7192013-10-04 19:53:11 +0100188 lex->alloc_indent_level *= 2;
Damien429d7192013-10-04 19:53:11 +0100189 }
190 lex->indent_level[lex->num_indent_level++] = indent;
191}
192
Damiend99b0522013-12-21 18:17:45 +0000193uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100194 return lex->indent_level[lex->num_indent_level - 1];
195}
196
Damiend99b0522013-12-21 18:17:45 +0000197void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100198 lex->num_indent_level -= 1;
199}
200
201// some tricky operator encoding:
202// <op> = begin with <op>, if this opchar matches then begin here
203// e<op> = end with <op>, if this opchar matches then end
204// E<op> = mandatory end with <op>, this opchar must match, then end
205// c<op> = continue with <op>, if this opchar matches then continue matching
206// this means if the start of two ops are the same then they are equal til the last char
207
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200208STATIC const char *tok_enc =
Damien429d7192013-10-04 19:53:11 +0100209 "()[]{},:;@~" // singles
210 "<e=c<e=" // < <= << <<=
211 ">e=c>e=" // > >= >> >>=
212 "*e=c*e=" // * *= ** **=
213 "+e=" // + +=
214 "-e=e>" // - -= ->
215 "&e=" // & &=
216 "|e=" // | |=
217 "/e=c/e=" // / /= // //=
218 "%e=" // % %=
219 "^e=" // ^ ^=
220 "=e=" // = ==
Damien George2e9eb2d2014-04-10 12:19:33 +0100221 "!E="; // !=
Damien429d7192013-10-04 19:53:11 +0100222
223// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200224STATIC const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000225 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
226 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
227 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
228 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100229
Damiend99b0522013-12-21 18:17:45 +0000230 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
231 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
232 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
233 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
234 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
235 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
236 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
237 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
238 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
239 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
240 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
241 MP_TOKEN_OP_NOT_EQUAL,
Damien429d7192013-10-04 19:53:11 +0100242};
243
244// must have the same order as enum in lexer.h
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200245STATIC const char *tok_kw[] = {
Damien429d7192013-10-04 19:53:11 +0100246 "False",
247 "None",
248 "True",
249 "and",
250 "as",
251 "assert",
252 "break",
253 "class",
254 "continue",
255 "def",
256 "del",
257 "elif",
258 "else",
259 "except",
260 "finally",
261 "for",
262 "from",
263 "global",
264 "if",
265 "import",
266 "in",
267 "is",
268 "lambda",
269 "nonlocal",
270 "not",
271 "or",
272 "pass",
273 "raise",
274 "return",
275 "try",
276 "while",
277 "with",
278 "yield",
279 NULL,
280};
281
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200282STATIC int hex_digit(unichar c) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200283 // c is assumed to be hex digit
284 int n = c - '0';
285 if (n > 9) {
286 n &= ~('a' - 'A');
287 n -= ('A' - ('9' + 1));
288 }
289 return n;
290}
291
292// This is called with CUR_CHAR() before first hex digit, and should return with
293// it pointing to last hex digit
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200294STATIC bool get_hex(mp_lexer_t *lex, int num_digits, uint *result) {
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200295 uint num = 0;
296 while (num_digits-- != 0) {
297 next_char(lex);
298 unichar c = CUR_CHAR(lex);
299 if (!unichar_isxdigit(c)) {
300 return false;
301 }
302 num = (num << 4) + hex_digit(c);
303 }
304 *result = num;
305 return true;
306}
307
Paul Sokolovsky520e2f52014-02-12 18:31:30 +0200308STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100309 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100310 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100311 while (!is_end(lex)) {
312 if (is_physical_newline(lex)) {
313 had_physical_newline = true;
314 next_char(lex);
315 } else if (is_whitespace(lex)) {
316 next_char(lex);
317 } else if (is_char(lex, '#')) {
318 next_char(lex);
319 while (!is_end(lex) && !is_physical_newline(lex)) {
320 next_char(lex);
321 }
322 // had_physical_newline will be set on next loop
323 } else if (is_char(lex, '\\')) {
324 // backslash (outside string literals) must appear just before a physical newline
325 next_char(lex);
326 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000327 // SyntaxError: unexpected character after line continuation character
Damien George69a818d2014-01-12 13:55:24 +0000328 tok->src_line = lex->line;
329 tok->src_column = lex->column;
330 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
331 vstr_reset(&lex->vstr);
332 tok->str = vstr_str(&lex->vstr);
333 tok->len = 0;
334 return;
Damien429d7192013-10-04 19:53:11 +0100335 } else {
336 next_char(lex);
337 }
338 } else {
339 break;
340 }
341 }
342
Damiena5185f42013-10-20 14:41:27 +0100343 // set token source information
Damien429d7192013-10-04 19:53:11 +0100344 tok->src_line = lex->line;
345 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100346
Damiena5185f42013-10-20 14:41:27 +0100347 // start new token text
348 vstr_reset(&lex->vstr);
349
350 if (first_token && lex->line == 1 && lex->column != 1) {
351 // check that the first token is in the first column
352 // if first token is not on first line, we get a physical newline and
353 // this check is done as part of normal indent/dedent checking below
354 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000355 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100356
357 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000358 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100359 lex->emit_dent += 1;
360
361 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000362 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100363 lex->emit_dent -= 1;
364
Damien91d387d2013-10-09 15:09:52 +0100365 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000366 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100367
368 uint num_spaces = lex->column - 1;
369 lex->emit_dent = 0;
370 if (num_spaces == indent_top(lex)) {
371 } else if (num_spaces > indent_top(lex)) {
372 indent_push(lex, num_spaces);
373 lex->emit_dent += 1;
374 } else {
375 while (num_spaces < indent_top(lex)) {
376 indent_pop(lex);
377 lex->emit_dent -= 1;
378 }
379 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000380 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100381 }
382 }
383
384 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100385 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000386 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100387 lex->emit_dent = 0;
388 while (indent_top(lex) > 0) {
389 indent_pop(lex);
390 lex->emit_dent -= 1;
391 }
392 } else {
Damiend99b0522013-12-21 18:17:45 +0000393 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100394 }
395
396 } else if (is_char_or(lex, '\'', '\"')
397 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
398 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
399 // a string or bytes literal
400
401 // parse type codes
402 bool is_raw = false;
403 bool is_bytes = false;
404 if (is_char(lex, 'u')) {
405 next_char(lex);
406 } else if (is_char(lex, 'b')) {
407 is_bytes = true;
408 next_char(lex);
409 if (is_char(lex, 'r')) {
410 is_raw = true;
411 next_char(lex);
412 }
413 } else if (is_char(lex, 'r')) {
414 is_raw = true;
415 next_char(lex);
416 if (is_char(lex, 'b')) {
417 is_bytes = true;
418 next_char(lex);
419 }
420 }
421
422 // set token kind
423 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000424 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100425 } else {
Damiend99b0522013-12-21 18:17:45 +0000426 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100427 }
428
429 // get first quoting character
430 char quote_char = '\'';
431 if (is_char(lex, '\"')) {
432 quote_char = '\"';
433 }
434 next_char(lex);
435
436 // work out if it's a single or triple quoted literal
437 int num_quotes;
438 if (is_char_and(lex, quote_char, quote_char)) {
439 // triple quotes
440 next_char(lex);
441 next_char(lex);
442 num_quotes = 3;
443 } else {
444 // single quotes
445 num_quotes = 1;
446 }
447
Damien429d7192013-10-04 19:53:11 +0100448 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100449 int n_closing = 0;
450 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
451 if (is_char(lex, quote_char)) {
452 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100453 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100454 } else {
455 n_closing = 0;
Damien Georgea91f4142014-04-10 11:30:55 +0100456 if (is_char(lex, '\\')) {
Damien429d7192013-10-04 19:53:11 +0100457 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100458 unichar c = CUR_CHAR(lex);
Damien Georgea91f4142014-04-10 11:30:55 +0100459 if (is_raw) {
460 // raw strings allow escaping of quotes, but the backslash is also emitted
461 vstr_add_char(&lex->vstr, '\\');
462 } else {
463 switch (c) {
464 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
465 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
466 case '\\': break;
467 case '\'': break;
468 case '"': break;
469 case 'a': c = 0x07; break;
470 case 'b': c = 0x08; break;
471 case 't': c = 0x09; break;
472 case 'n': c = 0x0a; break;
473 case 'v': c = 0x0b; break;
474 case 'f': c = 0x0c; break;
475 case 'r': c = 0x0d; break;
476 case 'x':
477 {
478 uint num = 0;
479 if (!get_hex(lex, 2, &num)) {
480 // TODO error message
481 assert(0);
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200482 }
483 c = num;
Damien Georgea91f4142014-04-10 11:30:55 +0100484 break;
Paul Sokolovsky0b7184d2014-01-22 22:40:02 +0200485 }
Damien Georgea91f4142014-04-10 11:30:55 +0100486 case 'N': break; // TODO \N{name} only in strings
487 case 'u': break; // TODO \uxxxx only in strings
488 case 'U': break; // TODO \Uxxxxxxxx only in strings
489 default:
490 if (c >= '0' && c <= '7') {
491 // Octal sequence, 1-3 chars
492 int digits = 3;
493 int num = c - '0';
494 while (is_following_odigit(lex) && --digits != 0) {
495 next_char(lex);
496 num = num * 8 + (CUR_CHAR(lex) - '0');
497 }
498 c = num;
499 } else {
500 // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
501 vstr_add_char(&lex->vstr, '\\');
502 }
503 break;
504 }
Damiena5185f42013-10-20 14:41:27 +0100505 }
Damiend99b0522013-12-21 18:17:45 +0000506 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100507 vstr_add_char(&lex->vstr, c);
508 }
509 } else {
510 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100511 }
512 }
513 next_char(lex);
514 }
515
516 // check we got the required end quotes
517 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000518 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100519 }
520
Damiena5185f42013-10-20 14:41:27 +0100521 // cut off the end quotes from the token text
Damien George280e7202014-03-15 14:33:09 +0000522 vstr_cut_tail_bytes(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100523
524 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000525 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100526
Damiena5185f42013-10-20 14:41:27 +0100527 // get first char
528 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100529 next_char(lex);
530
Damiena5185f42013-10-20 14:41:27 +0100531 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100532 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100533 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100534 next_char(lex);
535 }
536
537 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000538 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100539
Damiena5185f42013-10-20 14:41:27 +0100540 // get first char
541 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100542 next_char(lex);
543
Damiena5185f42013-10-20 14:41:27 +0100544 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100545 while (!is_end(lex)) {
546 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100547 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100548 next_char(lex);
549 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100550 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100551 next_char(lex);
552 }
553 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100554 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100555 next_char(lex);
556 } else {
557 break;
558 }
559 }
560
Damien George2e9eb2d2014-04-10 12:19:33 +0100561 } else if (is_char(lex, '.')) {
562 // special handling for . and ... operators, because .. is not a valid operator
563
564 // get first char
565 vstr_add_char(&lex->vstr, '.');
566 next_char(lex);
567
568 if (is_char_and(lex, '.', '.')) {
569 vstr_add_char(&lex->vstr, '.');
570 vstr_add_char(&lex->vstr, '.');
571 next_char(lex);
572 next_char(lex);
573 tok->kind = MP_TOKEN_ELLIPSIS;
574 } else {
575 tok->kind = MP_TOKEN_DEL_PERIOD;
576 }
577
Damien429d7192013-10-04 19:53:11 +0100578 } else {
579 // search for encoded delimiter or operator
580
581 const char *t = tok_enc;
582 uint tok_enc_index = 0;
583 for (; *t != 0 && !is_char(lex, *t); t += 1) {
584 if (*t == 'e' || *t == 'c') {
585 t += 1;
586 } else if (*t == 'E') {
587 tok_enc_index -= 1;
588 t += 1;
589 }
590 tok_enc_index += 1;
591 }
592
593 next_char(lex);
594
595 if (*t == 0) {
596 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000597 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100598
599 } else {
600 // matched a delimiter or operator character
601
602 // get the maximum characters for a valid token
603 t += 1;
604 uint t_index = tok_enc_index;
605 for (;;) {
606 for (; *t == 'e'; t += 1) {
607 t += 1;
608 t_index += 1;
609 if (is_char(lex, *t)) {
610 next_char(lex);
611 tok_enc_index = t_index;
612 break;
613 }
614 }
615
616 if (*t == 'E') {
617 t += 1;
618 if (is_char(lex, *t)) {
619 next_char(lex);
620 tok_enc_index = t_index;
621 } else {
Damiend99b0522013-12-21 18:17:45 +0000622 tok->kind = MP_TOKEN_INVALID;
Damien George2e9eb2d2014-04-10 12:19:33 +0100623 goto tok_enc_no_match;
Damien429d7192013-10-04 19:53:11 +0100624 }
625 break;
626 }
627
628 if (*t == 'c') {
629 t += 1;
630 t_index += 1;
631 if (is_char(lex, *t)) {
632 next_char(lex);
633 tok_enc_index = t_index;
634 t += 1;
635 } else {
636 break;
637 }
638 } else {
639 break;
640 }
641 }
642
643 // set token kind
644 tok->kind = tok_enc_kind[tok_enc_index];
645
Damien George2e9eb2d2014-04-10 12:19:33 +0100646 tok_enc_no_match:
647
Damien429d7192013-10-04 19:53:11 +0100648 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000649 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100650 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000651 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100652 lex->nested_bracket_level -= 1;
653 }
654 }
655 }
656
Damiena5185f42013-10-20 14:41:27 +0100657 // point token text to vstr buffer
658 tok->str = vstr_str(&lex->vstr);
659 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100660
Damiena5185f42013-10-20 14:41:27 +0100661 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000662 if (tok->kind == MP_TOKEN_NAME) {
Damien429d7192013-10-04 19:53:11 +0100663 for (int i = 0; tok_kw[i] != NULL; i++) {
Damiena5185f42013-10-20 14:41:27 +0100664 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Damiend99b0522013-12-21 18:17:45 +0000665 tok->kind = MP_TOKEN_KW_FALSE + i;
Damien429d7192013-10-04 19:53:11 +0100666 break;
667 }
668 }
669 }
670}
671
Damien Georgeb829b5c2014-01-25 13:51:19 +0000672mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
Damiend99b0522013-12-21 18:17:45 +0000673 mp_lexer_t *lex = m_new(mp_lexer_t, 1);
Damien429d7192013-10-04 19:53:11 +0100674
Damien Georgeb829b5c2014-01-25 13:51:19 +0000675 lex->source_name = src_name;
Damiena5185f42013-10-20 14:41:27 +0100676 lex->stream_data = stream_data;
677 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100678 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100679 lex->line = 1;
680 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100681 lex->emit_dent = 0;
682 lex->nested_bracket_level = 0;
683 lex->alloc_indent_level = 16;
684 lex->num_indent_level = 1;
685 lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
686 lex->indent_level[0] = 0;
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200687 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100688
689 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100690 lex->chr0 = stream_next_char(stream_data);
691 lex->chr1 = stream_next_char(stream_data);
692 lex->chr2 = stream_next_char(stream_data);
693
694 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000695 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100696 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000697 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100698 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100699 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100700 }
Damiend99b0522013-12-21 18:17:45 +0000701 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100702 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100703 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100704 }
Damien429d7192013-10-04 19:53:11 +0100705 }
706
Damiena5185f42013-10-20 14:41:27 +0100707 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000708 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100709
710 return lex;
711}
712
Damiend99b0522013-12-21 18:17:45 +0000713void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100714 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100715 if (lex->stream_close) {
716 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100717 }
Damienbb5316b2013-10-22 21:12:29 +0100718 vstr_clear(&lex->vstr);
Paul Sokolovsky624ed5d2014-01-23 22:25:57 +0200719 m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
Damien732407f2013-12-29 19:33:23 +0000720 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100721 }
Damien429d7192013-10-04 19:53:11 +0100722}
723
Damien George08335002014-01-18 23:24:36 +0000724qstr mp_lexer_source_name(mp_lexer_t *lex) {
725 return lex->source_name;
726}
727
Damiend99b0522013-12-21 18:17:45 +0000728void mp_lexer_to_next(mp_lexer_t *lex) {
729 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100730}
731
Damiend99b0522013-12-21 18:17:45 +0000732const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100733 return &lex->tok_cur;
734}
735
Damiend99b0522013-12-21 18:17:45 +0000736bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100737 return lex->tok_cur.kind == kind;
738}