blob: daaeebf5118ec11499c606e4759465611992b836 [file] [log] [blame]
Damien429d7192013-10-04 19:53:11 +01001/* lexer.c -- simple tokeniser for Python implementation
2 */
3
4#include <stdint.h>
5#include <stdio.h>
Damien George08335002014-01-18 23:24:36 +00006#include <string.h>
Damien429d7192013-10-04 19:53:11 +01007#include <assert.h>
8
9#include "misc.h"
Damien George55baff42014-01-21 21:40:13 +000010#include "mpconfig.h"
11#include "qstr.h"
Damien429d7192013-10-04 19:53:11 +010012#include "lexer.h"
13
// distance between tab stops, in columns; used by next_char to advance the column on '\t'
#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010015
Damien92c06562013-10-22 22:32:27 +010016// TODO seems that CPython allows NULL byte in the input stream
17// don't know if that's intentional or not, but we don't allow it
18
Damiend99b0522013-12-21 18:17:45 +000019struct _mp_lexer_t {
Damien George08335002014-01-18 23:24:36 +000020 qstr source_name; // name of source
Damiena5185f42013-10-20 14:41:27 +010021 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000022 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
23 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010024
Damiena5185f42013-10-20 14:41:27 +010025 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010026
27 uint line; // source line
28 uint column; // source column
29
Damiena5185f42013-10-20 14:41:27 +010030 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
31 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010032
33 uint alloc_indent_level;
34 uint num_indent_level;
35 uint16_t *indent_level;
36
Damiena5185f42013-10-20 14:41:27 +010037 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000038 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010039};
40
// Compare the NUL-terminated string `str` with the first `len` bytes of `strn`
// (`strn` does not need to be NUL-terminated).
// Returns true only if `str` is exactly `len` bytes long and all bytes match.
// A negative `len` never matches.
bool str_strn_equal(const char *str, const char *strn, int len) {
    if (len < 0) {
        return false;
    }

    size_t n = (size_t)len;

    // check the length first so that `str` is never read beyond its terminator,
    // even when `strn` contains an embedded NUL byte
    return strlen(str) == n && memcmp(str, strn, n) == 0;
}
53
Damiend99b0522013-12-21 18:17:45 +000054void mp_token_show(const mp_token_t *tok) {
Damien George08335002014-01-18 23:24:36 +000055 printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
Damien429d7192013-10-04 19:53:11 +010056 if (tok->str != NULL && tok->len > 0) {
57 const char *i = tok->str;
58 const char *j = i + tok->len;
59 printf(" ");
60 while (i < j) {
Damien George8cc96a32013-12-30 18:23:50 +000061 unichar c = utf8_get_char(i);
62 i = utf8_next_char(i);
63 if (unichar_isprint(c)) {
Damien429d7192013-10-04 19:53:11 +010064 printf("%c", c);
65 } else {
66 printf("?");
67 }
68 }
69 }
70 printf("\n");
71}
72
// the character the lexer is currently positioned on
#define CUR_CHAR(lex) ((lex)->chr0)
74
Damiend99b0522013-12-21 18:17:45 +000075static bool is_end(mp_lexer_t *lex) {
76 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +010077}
78
Damiend99b0522013-12-21 18:17:45 +000079static bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +010080 return lex->chr0 == '\n' || lex->chr0 == '\r';
81}
82
Damiend99b0522013-12-21 18:17:45 +000083static bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010084 return lex->chr0 == c;
85}
86
Damiend99b0522013-12-21 18:17:45 +000087static bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010088 return lex->chr0 == c1 || lex->chr0 == c2;
89}
90
Damiend99b0522013-12-21 18:17:45 +000091static bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010092 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
93}
94
95/*
Damiend99b0522013-12-21 18:17:45 +000096static bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010097 return lex->chr1 == c;
98}
99*/
100
Damiend99b0522013-12-21 18:17:45 +0000101static bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100102 return lex->chr1 == c1 || lex->chr1 == c2;
103}
104
Damiend99b0522013-12-21 18:17:45 +0000105static bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100106 return lex->chr2 == c1 || lex->chr2 == c2;
107}
108
Damiend99b0522013-12-21 18:17:45 +0000109static bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100110 return lex->chr0 == c1 && lex->chr1 == c2;
111}
112
Damiend99b0522013-12-21 18:17:45 +0000113static bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000114 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100115}
116
Damiend99b0522013-12-21 18:17:45 +0000117static bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000118 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100119}
120
Damiend99b0522013-12-21 18:17:45 +0000121static bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000122 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100123}
124
Damiend99b0522013-12-21 18:17:45 +0000125static bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000126 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100127}
128
129// TODO UNICODE include unicode characters in definition of identifiers
Damiend99b0522013-12-21 18:17:45 +0000130static bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100131 return is_letter(lex) || lex->chr0 == '_';
132}
133
134// TODO UNICODE include unicode characters in definition of identifiers
Damiend99b0522013-12-21 18:17:45 +0000135static bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100136 return is_head_of_identifier(lex) || is_digit(lex);
137}
138
Damiend99b0522013-12-21 18:17:45 +0000139static void next_char(mp_lexer_t *lex) {
140 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100141 return;
142 }
143
144 int advance = 1;
145
146 if (lex->chr0 == '\n') {
147 // LF is a new line
148 ++lex->line;
149 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100150 } else if (lex->chr0 == '\r') {
151 // CR is a new line
152 ++lex->line;
153 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100154 if (lex->chr1 == '\n') {
155 // CR LF is a single new line
156 advance = 2;
157 }
158 } else if (lex->chr0 == '\t') {
159 // a tab
160 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
161 } else {
162 // a character worth one column
163 ++lex->column;
164 }
165
166 for (; advance > 0; advance--) {
167 lex->chr0 = lex->chr1;
168 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100169 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000170 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100171 // EOF
Damiend99b0522013-12-21 18:17:45 +0000172 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100173 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100174 }
175 }
176 }
177}
178
Damiend99b0522013-12-21 18:17:45 +0000179void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100180 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien732407f2013-12-29 19:33:23 +0000181 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level * 2);
Damien429d7192013-10-04 19:53:11 +0100182 lex->alloc_indent_level *= 2;
Damien429d7192013-10-04 19:53:11 +0100183 }
184 lex->indent_level[lex->num_indent_level++] = indent;
185}
186
Damiend99b0522013-12-21 18:17:45 +0000187uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100188 return lex->indent_level[lex->num_indent_level - 1];
189}
190
Damiend99b0522013-12-21 18:17:45 +0000191void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100192 lex->num_indent_level -= 1;
193}
194
195// some tricky operator encoding:
196// <op> = begin with <op>, if this opchar matches then begin here
197// e<op> = end with <op>, if this opchar matches then end
198// E<op> = mandatory end with <op>, this opchar must match, then end
199// c<op> = continue with <op>, if this opchar matches then continue matching
// this means that if two ops start with the same characters then they are identical up to the last char
201
// Encoded table of all delimiters and operators. It is walked in parallel
// with tok_enc_kind: every token spelled out in the comments on the right
// corresponds, in order, to one entry of tok_enc_kind.
static const char *tok_enc =
    "()[]{}"        // singles: ( ) [ ] { }
    ",:;@~"         // singles: , : ; @ ~
    "<e=c<e="       // <  <=  <<  <<=
    ">e=c>e="       // >  >=  >>  >>=
    "*e=c*e="       // *  *=  **  **=
    "+e="           // +  +=
    "-e=e>"         // -  -=  ->
    "&e="           // &  &=
    "|e="           // |  |=
    "/e=c/e="       // /  /=  //  //=
    "%e="           // %  %=
    "^e="           // ^  ^=
    "=e="           // =  ==
    "!E="           // !=
    ".c.E.";        // .  ...
217
218// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
219static const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000220 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
221 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
222 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
223 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100224
Damiend99b0522013-12-21 18:17:45 +0000225 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
226 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
227 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
228 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
229 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
230 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
231 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
232 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
233 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
234 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
235 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
236 MP_TOKEN_OP_NOT_EQUAL,
Damien Georgee9906ac2014-01-04 18:44:46 +0000237 MP_TOKEN_DEL_PERIOD, MP_TOKEN_ELLIPSIS,
Damien429d7192013-10-04 19:53:11 +0100238};
239
240// must have the same order as enum in lexer.h
// Keyword spellings, terminated by NULL. The index of a keyword is added to
// MP_TOKEN_KW_FALSE to obtain its token kind, so the order must not change.
static const char *tok_kw[] = {
    "False", "None", "True",
    "and", "as", "assert",
    "break",
    "class", "continue",
    "def", "del",
    "elif", "else", "except",
    "finally", "for", "from",
    "global",
    "if", "import", "in", "is",
    "lambda",
    "nonlocal", "not",
    "or",
    "pass",
    "raise", "return",
    "try",
    "while", "with",
    "yield",
    NULL,
};
277
Damiend99b0522013-12-21 18:17:45 +0000278static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100279 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100280 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100281 while (!is_end(lex)) {
282 if (is_physical_newline(lex)) {
283 had_physical_newline = true;
284 next_char(lex);
285 } else if (is_whitespace(lex)) {
286 next_char(lex);
287 } else if (is_char(lex, '#')) {
288 next_char(lex);
289 while (!is_end(lex) && !is_physical_newline(lex)) {
290 next_char(lex);
291 }
292 // had_physical_newline will be set on next loop
293 } else if (is_char(lex, '\\')) {
294 // backslash (outside string literals) must appear just before a physical newline
295 next_char(lex);
296 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000297 // SyntaxError: unexpected character after line continuation character
Damien George69a818d2014-01-12 13:55:24 +0000298 tok->src_line = lex->line;
299 tok->src_column = lex->column;
300 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
301 vstr_reset(&lex->vstr);
302 tok->str = vstr_str(&lex->vstr);
303 tok->len = 0;
304 return;
Damien429d7192013-10-04 19:53:11 +0100305 } else {
306 next_char(lex);
307 }
308 } else {
309 break;
310 }
311 }
312
Damiena5185f42013-10-20 14:41:27 +0100313 // set token source information
Damien429d7192013-10-04 19:53:11 +0100314 tok->src_line = lex->line;
315 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100316
Damiena5185f42013-10-20 14:41:27 +0100317 // start new token text
318 vstr_reset(&lex->vstr);
319
320 if (first_token && lex->line == 1 && lex->column != 1) {
321 // check that the first token is in the first column
322 // if first token is not on first line, we get a physical newline and
323 // this check is done as part of normal indent/dedent checking below
324 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000325 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100326
327 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000328 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100329 lex->emit_dent += 1;
330
331 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000332 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100333 lex->emit_dent -= 1;
334
Damien91d387d2013-10-09 15:09:52 +0100335 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000336 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100337
338 uint num_spaces = lex->column - 1;
339 lex->emit_dent = 0;
340 if (num_spaces == indent_top(lex)) {
341 } else if (num_spaces > indent_top(lex)) {
342 indent_push(lex, num_spaces);
343 lex->emit_dent += 1;
344 } else {
345 while (num_spaces < indent_top(lex)) {
346 indent_pop(lex);
347 lex->emit_dent -= 1;
348 }
349 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000350 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100351 }
352 }
353
354 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100355 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000356 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100357 lex->emit_dent = 0;
358 while (indent_top(lex) > 0) {
359 indent_pop(lex);
360 lex->emit_dent -= 1;
361 }
362 } else {
Damiend99b0522013-12-21 18:17:45 +0000363 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100364 }
365
366 } else if (is_char_or(lex, '\'', '\"')
367 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
368 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
369 // a string or bytes literal
370
371 // parse type codes
372 bool is_raw = false;
373 bool is_bytes = false;
374 if (is_char(lex, 'u')) {
375 next_char(lex);
376 } else if (is_char(lex, 'b')) {
377 is_bytes = true;
378 next_char(lex);
379 if (is_char(lex, 'r')) {
380 is_raw = true;
381 next_char(lex);
382 }
383 } else if (is_char(lex, 'r')) {
384 is_raw = true;
385 next_char(lex);
386 if (is_char(lex, 'b')) {
387 is_bytes = true;
388 next_char(lex);
389 }
390 }
391
392 // set token kind
393 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000394 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100395 } else {
Damiend99b0522013-12-21 18:17:45 +0000396 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100397 }
398
399 // get first quoting character
400 char quote_char = '\'';
401 if (is_char(lex, '\"')) {
402 quote_char = '\"';
403 }
404 next_char(lex);
405
406 // work out if it's a single or triple quoted literal
407 int num_quotes;
408 if (is_char_and(lex, quote_char, quote_char)) {
409 // triple quotes
410 next_char(lex);
411 next_char(lex);
412 num_quotes = 3;
413 } else {
414 // single quotes
415 num_quotes = 1;
416 }
417
Damien429d7192013-10-04 19:53:11 +0100418 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100419 int n_closing = 0;
420 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
421 if (is_char(lex, quote_char)) {
422 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100423 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100424 } else {
425 n_closing = 0;
426 if (!is_raw && is_char(lex, '\\')) {
427 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100428 unichar c = CUR_CHAR(lex);
429 switch (c) {
Damiend99b0522013-12-21 18:17:45 +0000430 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
431 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damiena5185f42013-10-20 14:41:27 +0100432 case '\\': break;
433 case '\'': break;
434 case '"': break;
435 case 'a': c = 0x07; break;
436 case 'b': c = 0x08; break;
437 case 't': c = 0x09; break;
438 case 'n': c = 0x0a; break;
439 case 'v': c = 0x0b; break;
440 case 'f': c = 0x0c; break;
441 case 'r': c = 0x0d; break;
442 // TODO \ooo octal
443 case 'x': // TODO \xhh
444 case 'N': // TODO \N{name} only in strings
445 case 'u': // TODO \uxxxx only in strings
446 case 'U': // TODO \Uxxxxxxxx only in strings
447 default: break; // TODO error message
448 }
Damiend99b0522013-12-21 18:17:45 +0000449 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100450 vstr_add_char(&lex->vstr, c);
451 }
452 } else {
453 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100454 }
455 }
456 next_char(lex);
457 }
458
459 // check we got the required end quotes
460 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000461 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100462 }
463
Damiena5185f42013-10-20 14:41:27 +0100464 // cut off the end quotes from the token text
465 vstr_cut_tail(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100466
467 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000468 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100469
Damiena5185f42013-10-20 14:41:27 +0100470 // get first char
471 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100472 next_char(lex);
473
Damiena5185f42013-10-20 14:41:27 +0100474 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100475 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100476 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100477 next_char(lex);
478 }
479
480 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000481 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100482
Damiena5185f42013-10-20 14:41:27 +0100483 // get first char
484 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100485 next_char(lex);
486
Damiena5185f42013-10-20 14:41:27 +0100487 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100488 while (!is_end(lex)) {
489 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100490 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100491 next_char(lex);
492 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100493 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100494 next_char(lex);
495 }
496 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100497 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100498 next_char(lex);
499 } else {
500 break;
501 }
502 }
503
504 } else {
505 // search for encoded delimiter or operator
506
507 const char *t = tok_enc;
508 uint tok_enc_index = 0;
509 for (; *t != 0 && !is_char(lex, *t); t += 1) {
510 if (*t == 'e' || *t == 'c') {
511 t += 1;
512 } else if (*t == 'E') {
513 tok_enc_index -= 1;
514 t += 1;
515 }
516 tok_enc_index += 1;
517 }
518
519 next_char(lex);
520
521 if (*t == 0) {
522 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000523 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100524
525 } else {
526 // matched a delimiter or operator character
527
528 // get the maximum characters for a valid token
529 t += 1;
530 uint t_index = tok_enc_index;
531 for (;;) {
532 for (; *t == 'e'; t += 1) {
533 t += 1;
534 t_index += 1;
535 if (is_char(lex, *t)) {
536 next_char(lex);
537 tok_enc_index = t_index;
538 break;
539 }
540 }
541
542 if (*t == 'E') {
543 t += 1;
544 if (is_char(lex, *t)) {
545 next_char(lex);
546 tok_enc_index = t_index;
547 } else {
Damiend99b0522013-12-21 18:17:45 +0000548 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100549 }
550 break;
551 }
552
553 if (*t == 'c') {
554 t += 1;
555 t_index += 1;
556 if (is_char(lex, *t)) {
557 next_char(lex);
558 tok_enc_index = t_index;
559 t += 1;
560 } else {
561 break;
562 }
563 } else {
564 break;
565 }
566 }
567
568 // set token kind
569 tok->kind = tok_enc_kind[tok_enc_index];
570
571 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000572 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100573 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000574 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100575 lex->nested_bracket_level -= 1;
576 }
577 }
578 }
579
Damiena5185f42013-10-20 14:41:27 +0100580 // point token text to vstr buffer
581 tok->str = vstr_str(&lex->vstr);
582 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100583
Damiena5185f42013-10-20 14:41:27 +0100584 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000585 if (tok->kind == MP_TOKEN_NAME) {
Damien429d7192013-10-04 19:53:11 +0100586 for (int i = 0; tok_kw[i] != NULL; i++) {
Damiena5185f42013-10-20 14:41:27 +0100587 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Damiend99b0522013-12-21 18:17:45 +0000588 tok->kind = MP_TOKEN_KW_FALSE + i;
Damien429d7192013-10-04 19:53:11 +0100589 break;
590 }
591 }
592 }
593}
594
Damiend99b0522013-12-21 18:17:45 +0000595mp_lexer_t *mp_lexer_new(const char *src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
596 mp_lexer_t *lex = m_new(mp_lexer_t, 1);
Damien429d7192013-10-04 19:53:11 +0100597
Damien George55baff42014-01-21 21:40:13 +0000598 lex->source_name = qstr_from_str(src_name);
Damiena5185f42013-10-20 14:41:27 +0100599 lex->stream_data = stream_data;
600 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100601 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100602 lex->line = 1;
603 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100604 lex->emit_dent = 0;
605 lex->nested_bracket_level = 0;
606 lex->alloc_indent_level = 16;
607 lex->num_indent_level = 1;
608 lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
609 lex->indent_level[0] = 0;
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200610 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100611
612 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100613 lex->chr0 = stream_next_char(stream_data);
614 lex->chr1 = stream_next_char(stream_data);
615 lex->chr2 = stream_next_char(stream_data);
616
617 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000618 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100619 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000620 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100621 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100622 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100623 }
Damiend99b0522013-12-21 18:17:45 +0000624 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100625 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100626 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100627 }
Damien429d7192013-10-04 19:53:11 +0100628 }
629
Damiena5185f42013-10-20 14:41:27 +0100630 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000631 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100632
633 return lex;
634}
635
Damiend99b0522013-12-21 18:17:45 +0000636void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100637 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100638 if (lex->stream_close) {
639 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100640 }
Damienbb5316b2013-10-22 21:12:29 +0100641 vstr_clear(&lex->vstr);
Damien732407f2013-12-29 19:33:23 +0000642 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100643 }
Damien429d7192013-10-04 19:53:11 +0100644}
645
Damien George08335002014-01-18 23:24:36 +0000646qstr mp_lexer_source_name(mp_lexer_t *lex) {
647 return lex->source_name;
648}
649
Damiend99b0522013-12-21 18:17:45 +0000650void mp_lexer_to_next(mp_lexer_t *lex) {
651 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100652}
653
Damiend99b0522013-12-21 18:17:45 +0000654const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100655 return &lex->tok_cur;
656}
657
Damiend99b0522013-12-21 18:17:45 +0000658bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100659 return lex->tok_cur.kind == kind;
660}
661
662/*
Damiend99b0522013-12-21 18:17:45 +0000663bool mp_lexer_is_str(mp_lexer_t *lex, const char *str) {
664 return mp_token_is_str(&lex->tok_cur, str);
Damien429d7192013-10-04 19:53:11 +0100665}
666
Damiend99b0522013-12-21 18:17:45 +0000667bool mp_lexer_opt_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
668 if (mp_lexer_is_kind(lex, kind)) {
669 mp_lexer_to_next(lex);
Damien429d7192013-10-04 19:53:11 +0100670 return true;
671 }
672 return false;
673}
674
Damiend99b0522013-12-21 18:17:45 +0000675bool mp_lexer_opt_str(mp_lexer_t *lex, const char *str) {
676 if (mp_lexer_is_str(lex, str)) {
677 mp_lexer_to_next(lex);
Damien429d7192013-10-04 19:53:11 +0100678 return true;
679 }
680 return false;
681}
682*/
683
Damien George9528cd62014-01-15 21:23:31 +0000684bool mp_lexer_show_error_pythonic_prefix(mp_lexer_t *lex) {
Damien George08335002014-01-18 23:24:36 +0000685 printf(" File \"%s\", line %d column %d\n", qstr_str(lex->source_name), lex->tok_cur.src_line, lex->tok_cur.src_column);
Damien George9528cd62014-01-15 21:23:31 +0000686 return false;
Damien429d7192013-10-04 19:53:11 +0100687}
Damien91d387d2013-10-09 15:09:52 +0100688
Damiend99b0522013-12-21 18:17:45 +0000689bool mp_lexer_show_error_pythonic(mp_lexer_t *lex, const char *msg) {
Damien George08335002014-01-18 23:24:36 +0000690 printf(" File \"%s\", line %d column %d\n%s\n", qstr_str(lex->source_name), lex->tok_cur.src_line, lex->tok_cur.src_column, msg);
Damien91d387d2013-10-09 15:09:52 +0100691 return false;
692}