/* lexer.c -- simple tokeniser for Python implementation
 */
3
4#include <stdint.h>
5#include <stdio.h>
6#include <assert.h>
7
8#include "misc.h"
9#include "lexer.h"
10
11#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010012
// TODO it seems that CPython allows a NUL byte in the input stream;
// don't know if that's intentional or not, but we don't allow it
15
Damiend99b0522013-12-21 18:17:45 +000016struct _mp_lexer_t {
Damiena5185f42013-10-20 14:41:27 +010017 const char *name; // name of source
18 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000019 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
20 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010021
Damiena5185f42013-10-20 14:41:27 +010022 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010023
24 uint line; // source line
25 uint column; // source column
26
Damiena5185f42013-10-20 14:41:27 +010027 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
28 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010029
30 uint alloc_indent_level;
31 uint num_indent_level;
32 uint16_t *indent_level;
33
Damiena5185f42013-10-20 14:41:27 +010034 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000035 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010036};
37
// Compare a NUL-terminated string against a counted (not necessarily
// NUL-terminated) string.
//
//   str   NUL-terminated string
//   strn  buffer holding at least len readable characters
//   len   number of characters of strn to compare
//
// Returns true only when str is exactly len characters long and those
// characters equal the first len characters of strn.  A negative len never
// matches.
bool str_strn_equal(const char *str, const char *strn, int len) {
    if (len < 0) {
        // a negative length would otherwise be turned into a huge bound
        return false;
    }

    int i = 0;

    // stop at the end of str so we never walk past its terminator, even if
    // strn happens to contain a NUL at the same position
    while (i < len && *str != '\0' && *str == *strn) {
        ++i;
        ++str;
        ++strn;
    }

    return i == len && *str == '\0';
}
49
Damiend99b0522013-12-21 18:17:45 +000050void mp_token_show(const mp_token_t *tok) {
Damiena5185f42013-10-20 14:41:27 +010051 printf("(%s:%d:%d) kind:%d str:%p len:%d", tok->src_name, tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
Damien429d7192013-10-04 19:53:11 +010052 if (tok->str != NULL && tok->len > 0) {
53 const char *i = tok->str;
54 const char *j = i + tok->len;
55 printf(" ");
56 while (i < j) {
57 unichar c = g_utf8_get_char(i);
58 i = g_utf8_next_char(i);
59 if (g_unichar_isprint(c)) {
60 printf("%c", c);
61 } else {
62 printf("?");
63 }
64 }
65 }
66 printf("\n");
67}
68
Damiend99b0522013-12-21 18:17:45 +000069void mp_token_show_error_prefix(const mp_token_t *tok) {
Damien429d7192013-10-04 19:53:11 +010070 printf("(%s:%d:%d) ", tok->src_name, tok->src_line, tok->src_column);
71}
72
Damiend99b0522013-12-21 18:17:45 +000073bool mp_token_show_error(const mp_token_t *tok, const char *msg) {
Damien429d7192013-10-04 19:53:11 +010074 printf("(%s:%d:%d) %s\n", tok->src_name, tok->src_line, tok->src_column, msg);
75 return false;
76}
77
// The character the lexer is currently positioned on.
#define CUR_CHAR(lexer) ((lexer)->chr0)
79
Damiend99b0522013-12-21 18:17:45 +000080static bool is_end(mp_lexer_t *lex) {
81 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +010082}
83
Damiend99b0522013-12-21 18:17:45 +000084static bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +010085 return lex->chr0 == '\n' || lex->chr0 == '\r';
86}
87
Damiend99b0522013-12-21 18:17:45 +000088static bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010089 return lex->chr0 == c;
90}
91
Damiend99b0522013-12-21 18:17:45 +000092static bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010093 return lex->chr0 == c1 || lex->chr0 == c2;
94}
95
Damiend99b0522013-12-21 18:17:45 +000096static bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010097 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
98}
99
100/*
Damiend99b0522013-12-21 18:17:45 +0000101static bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +0100102 return lex->chr1 == c;
103}
104*/
105
Damiend99b0522013-12-21 18:17:45 +0000106static bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100107 return lex->chr1 == c1 || lex->chr1 == c2;
108}
109
Damiend99b0522013-12-21 18:17:45 +0000110static bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100111 return lex->chr2 == c1 || lex->chr2 == c2;
112}
113
Damiend99b0522013-12-21 18:17:45 +0000114static bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100115 return lex->chr0 == c1 && lex->chr1 == c2;
116}
117
Damiend99b0522013-12-21 18:17:45 +0000118static bool is_whitespace(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100119 return g_unichar_isspace(lex->chr0);
120}
121
Damiend99b0522013-12-21 18:17:45 +0000122static bool is_letter(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100123 return g_unichar_isalpha(lex->chr0);
124}
125
Damiend99b0522013-12-21 18:17:45 +0000126static bool is_digit(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100127 return g_unichar_isdigit(lex->chr0);
128}
129
Damiend99b0522013-12-21 18:17:45 +0000130static bool is_following_digit(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100131 return g_unichar_isdigit(lex->chr1);
132}
133
134// TODO UNICODE include unicode characters in definition of identifiers
Damiend99b0522013-12-21 18:17:45 +0000135static bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100136 return is_letter(lex) || lex->chr0 == '_';
137}
138
139// TODO UNICODE include unicode characters in definition of identifiers
Damiend99b0522013-12-21 18:17:45 +0000140static bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100141 return is_head_of_identifier(lex) || is_digit(lex);
142}
143
Damiend99b0522013-12-21 18:17:45 +0000144static void next_char(mp_lexer_t *lex) {
145 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100146 return;
147 }
148
149 int advance = 1;
150
151 if (lex->chr0 == '\n') {
152 // LF is a new line
153 ++lex->line;
154 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100155 } else if (lex->chr0 == '\r') {
156 // CR is a new line
157 ++lex->line;
158 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100159 if (lex->chr1 == '\n') {
160 // CR LF is a single new line
161 advance = 2;
162 }
163 } else if (lex->chr0 == '\t') {
164 // a tab
165 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
166 } else {
167 // a character worth one column
168 ++lex->column;
169 }
170
171 for (; advance > 0; advance--) {
172 lex->chr0 = lex->chr1;
173 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100174 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000175 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100176 // EOF
Damiend99b0522013-12-21 18:17:45 +0000177 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100178 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100179 }
180 }
181 }
182}
183
Damiend99b0522013-12-21 18:17:45 +0000184void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100185 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien732407f2013-12-29 19:33:23 +0000186 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level * 2);
Damien429d7192013-10-04 19:53:11 +0100187 lex->alloc_indent_level *= 2;
Damien429d7192013-10-04 19:53:11 +0100188 }
189 lex->indent_level[lex->num_indent_level++] = indent;
190}
191
Damiend99b0522013-12-21 18:17:45 +0000192uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100193 return lex->indent_level[lex->num_indent_level - 1];
194}
195
Damiend99b0522013-12-21 18:17:45 +0000196void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100197 lex->num_indent_level -= 1;
198}
199
// Compact encoding of all delimiters and operators.  Each entry starts with
// the character that begins a token, optionally followed by marker/character
// pairs describing how that token may be extended:
//   <op>  = begin with <op>: if the input matches, a token starts here
//   e<op> = end with <op>: if the input matches, the token ends with it
//   E<op> = mandatory end with <op>: the input must match, then the token ends
//   c<op> = continue with <op>: if the input matches, keep matching
// This means that if two operators start the same way then they are equal
// up to the last character.
//
// The table is read-only, so both the characters and the pointer are const.
static const char *const tok_enc =
    "()[]{},:;@~"   // singles
    "<e=c<e="       // < <= << <<=
    ">e=c>e="       // > >= >> >>=
    "*e=c*e="       // * *= ** **=
    "+e="           // + +=
    "-e=e>"         // - -= ->
    "&e="           // & &=
    "|e="           // | |=
    "/e=c/e="       // / /= // //=
    "%e="           // % %=
    "^e="           // ^ ^=
    "=e="           // = ==
    "!E="           // !=
    ".c.E.";        // . ...
222
223// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
224static const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000225 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
226 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
227 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
228 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100229
Damiend99b0522013-12-21 18:17:45 +0000230 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
231 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
232 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
233 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
234 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
235 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
236 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
237 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
238 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
239 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
240 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
241 MP_TOKEN_OP_NOT_EQUAL,
242 MP_TOKEN_DEL_PERIOD, MP_TOKEN_ELLIPSES,
Damien429d7192013-10-04 19:53:11 +0100243};
244
// Keyword spellings, terminated by NULL.
// Must have the same order as the keyword enum in lexer.h: the index of a
// keyword in this table is added to MP_TOKEN_KW_FALSE to get its token kind.
// The table is never modified, so the array itself is const as well as the
// strings it points to.
static const char *const tok_kw[] = {
    "False",
    "None",
    "True",
    "and",
    "as",
    "assert",
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
    NULL,
};
282
Damiend99b0522013-12-21 18:17:45 +0000283static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100284 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100285 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100286 while (!is_end(lex)) {
287 if (is_physical_newline(lex)) {
288 had_physical_newline = true;
289 next_char(lex);
290 } else if (is_whitespace(lex)) {
291 next_char(lex);
292 } else if (is_char(lex, '#')) {
293 next_char(lex);
294 while (!is_end(lex) && !is_physical_newline(lex)) {
295 next_char(lex);
296 }
297 // had_physical_newline will be set on next loop
298 } else if (is_char(lex, '\\')) {
299 // backslash (outside string literals) must appear just before a physical newline
300 next_char(lex);
301 if (!is_physical_newline(lex)) {
302 // TODO SyntaxError
303 assert(0);
304 } else {
305 next_char(lex);
306 }
307 } else {
308 break;
309 }
310 }
311
Damiena5185f42013-10-20 14:41:27 +0100312 // set token source information
Damien429d7192013-10-04 19:53:11 +0100313 tok->src_name = lex->name;
314 tok->src_line = lex->line;
315 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100316
Damiena5185f42013-10-20 14:41:27 +0100317 // start new token text
318 vstr_reset(&lex->vstr);
319
320 if (first_token && lex->line == 1 && lex->column != 1) {
321 // check that the first token is in the first column
322 // if first token is not on first line, we get a physical newline and
323 // this check is done as part of normal indent/dedent checking below
324 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000325 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100326
327 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000328 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100329 lex->emit_dent += 1;
330
331 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000332 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100333 lex->emit_dent -= 1;
334
Damien91d387d2013-10-09 15:09:52 +0100335 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000336 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100337
338 uint num_spaces = lex->column - 1;
339 lex->emit_dent = 0;
340 if (num_spaces == indent_top(lex)) {
341 } else if (num_spaces > indent_top(lex)) {
342 indent_push(lex, num_spaces);
343 lex->emit_dent += 1;
344 } else {
345 while (num_spaces < indent_top(lex)) {
346 indent_pop(lex);
347 lex->emit_dent -= 1;
348 }
349 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000350 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100351 }
352 }
353
354 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100355 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000356 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100357 lex->emit_dent = 0;
358 while (indent_top(lex) > 0) {
359 indent_pop(lex);
360 lex->emit_dent -= 1;
361 }
362 } else {
Damiend99b0522013-12-21 18:17:45 +0000363 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100364 }
365
366 } else if (is_char_or(lex, '\'', '\"')
367 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
368 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
369 // a string or bytes literal
370
371 // parse type codes
372 bool is_raw = false;
373 bool is_bytes = false;
374 if (is_char(lex, 'u')) {
375 next_char(lex);
376 } else if (is_char(lex, 'b')) {
377 is_bytes = true;
378 next_char(lex);
379 if (is_char(lex, 'r')) {
380 is_raw = true;
381 next_char(lex);
382 }
383 } else if (is_char(lex, 'r')) {
384 is_raw = true;
385 next_char(lex);
386 if (is_char(lex, 'b')) {
387 is_bytes = true;
388 next_char(lex);
389 }
390 }
391
392 // set token kind
393 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000394 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100395 } else {
Damiend99b0522013-12-21 18:17:45 +0000396 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100397 }
398
399 // get first quoting character
400 char quote_char = '\'';
401 if (is_char(lex, '\"')) {
402 quote_char = '\"';
403 }
404 next_char(lex);
405
406 // work out if it's a single or triple quoted literal
407 int num_quotes;
408 if (is_char_and(lex, quote_char, quote_char)) {
409 // triple quotes
410 next_char(lex);
411 next_char(lex);
412 num_quotes = 3;
413 } else {
414 // single quotes
415 num_quotes = 1;
416 }
417
Damien429d7192013-10-04 19:53:11 +0100418 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100419 int n_closing = 0;
420 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
421 if (is_char(lex, quote_char)) {
422 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100423 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100424 } else {
425 n_closing = 0;
426 if (!is_raw && is_char(lex, '\\')) {
427 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100428 unichar c = CUR_CHAR(lex);
429 switch (c) {
Damiend99b0522013-12-21 18:17:45 +0000430 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
431 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damiena5185f42013-10-20 14:41:27 +0100432 case '\\': break;
433 case '\'': break;
434 case '"': break;
435 case 'a': c = 0x07; break;
436 case 'b': c = 0x08; break;
437 case 't': c = 0x09; break;
438 case 'n': c = 0x0a; break;
439 case 'v': c = 0x0b; break;
440 case 'f': c = 0x0c; break;
441 case 'r': c = 0x0d; break;
442 // TODO \ooo octal
443 case 'x': // TODO \xhh
444 case 'N': // TODO \N{name} only in strings
445 case 'u': // TODO \uxxxx only in strings
446 case 'U': // TODO \Uxxxxxxxx only in strings
447 default: break; // TODO error message
448 }
Damiend99b0522013-12-21 18:17:45 +0000449 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100450 vstr_add_char(&lex->vstr, c);
451 }
452 } else {
453 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100454 }
455 }
456 next_char(lex);
457 }
458
459 // check we got the required end quotes
460 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000461 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100462 }
463
Damiena5185f42013-10-20 14:41:27 +0100464 // cut off the end quotes from the token text
465 vstr_cut_tail(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100466
467 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000468 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100469
Damiena5185f42013-10-20 14:41:27 +0100470 // get first char
471 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100472 next_char(lex);
473
Damiena5185f42013-10-20 14:41:27 +0100474 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100475 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100476 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100477 next_char(lex);
478 }
479
480 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000481 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100482
Damiena5185f42013-10-20 14:41:27 +0100483 // get first char
484 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100485 next_char(lex);
486
Damiena5185f42013-10-20 14:41:27 +0100487 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100488 while (!is_end(lex)) {
489 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100490 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100491 next_char(lex);
492 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100493 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100494 next_char(lex);
495 }
496 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100497 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100498 next_char(lex);
499 } else {
500 break;
501 }
502 }
503
504 } else {
505 // search for encoded delimiter or operator
506
507 const char *t = tok_enc;
508 uint tok_enc_index = 0;
509 for (; *t != 0 && !is_char(lex, *t); t += 1) {
510 if (*t == 'e' || *t == 'c') {
511 t += 1;
512 } else if (*t == 'E') {
513 tok_enc_index -= 1;
514 t += 1;
515 }
516 tok_enc_index += 1;
517 }
518
519 next_char(lex);
520
521 if (*t == 0) {
522 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000523 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100524
525 } else {
526 // matched a delimiter or operator character
527
528 // get the maximum characters for a valid token
529 t += 1;
530 uint t_index = tok_enc_index;
531 for (;;) {
532 for (; *t == 'e'; t += 1) {
533 t += 1;
534 t_index += 1;
535 if (is_char(lex, *t)) {
536 next_char(lex);
537 tok_enc_index = t_index;
538 break;
539 }
540 }
541
542 if (*t == 'E') {
543 t += 1;
544 if (is_char(lex, *t)) {
545 next_char(lex);
546 tok_enc_index = t_index;
547 } else {
Damiend99b0522013-12-21 18:17:45 +0000548 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100549 }
550 break;
551 }
552
553 if (*t == 'c') {
554 t += 1;
555 t_index += 1;
556 if (is_char(lex, *t)) {
557 next_char(lex);
558 tok_enc_index = t_index;
559 t += 1;
560 } else {
561 break;
562 }
563 } else {
564 break;
565 }
566 }
567
568 // set token kind
569 tok->kind = tok_enc_kind[tok_enc_index];
570
571 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000572 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100573 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000574 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100575 lex->nested_bracket_level -= 1;
576 }
577 }
578 }
579
Damiena5185f42013-10-20 14:41:27 +0100580 // point token text to vstr buffer
581 tok->str = vstr_str(&lex->vstr);
582 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100583
Damiena5185f42013-10-20 14:41:27 +0100584 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000585 if (tok->kind == MP_TOKEN_NAME) {
Damien429d7192013-10-04 19:53:11 +0100586 for (int i = 0; tok_kw[i] != NULL; i++) {
Damiena5185f42013-10-20 14:41:27 +0100587 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Damiend99b0522013-12-21 18:17:45 +0000588 tok->kind = MP_TOKEN_KW_FALSE + i;
Damien429d7192013-10-04 19:53:11 +0100589 break;
590 }
591 }
592 }
593}
594
Damiend99b0522013-12-21 18:17:45 +0000595mp_lexer_t *mp_lexer_new(const char *src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
596 mp_lexer_t *lex = m_new(mp_lexer_t, 1);
Damien429d7192013-10-04 19:53:11 +0100597
Damiena5185f42013-10-20 14:41:27 +0100598 lex->name = src_name; // TODO do we need to strdup this?
599 lex->stream_data = stream_data;
600 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100601 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100602 lex->line = 1;
603 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100604 lex->emit_dent = 0;
605 lex->nested_bracket_level = 0;
606 lex->alloc_indent_level = 16;
607 lex->num_indent_level = 1;
608 lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
609 lex->indent_level[0] = 0;
Damiena5185f42013-10-20 14:41:27 +0100610 vstr_init(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100611
612 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100613 lex->chr0 = stream_next_char(stream_data);
614 lex->chr1 = stream_next_char(stream_data);
615 lex->chr2 = stream_next_char(stream_data);
616
617 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000618 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100619 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000620 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100621 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100622 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100623 }
Damiend99b0522013-12-21 18:17:45 +0000624 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100625 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100626 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100627 }
Damien429d7192013-10-04 19:53:11 +0100628 }
629
Damiena5185f42013-10-20 14:41:27 +0100630 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000631 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100632
633 return lex;
634}
635
Damiend99b0522013-12-21 18:17:45 +0000636void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100637 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100638 if (lex->stream_close) {
639 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100640 }
Damienbb5316b2013-10-22 21:12:29 +0100641 vstr_clear(&lex->vstr);
Damien732407f2013-12-29 19:33:23 +0000642 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100643 }
Damien429d7192013-10-04 19:53:11 +0100644}
645
Damiend99b0522013-12-21 18:17:45 +0000646void mp_lexer_to_next(mp_lexer_t *lex) {
647 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100648}
649
Damiend99b0522013-12-21 18:17:45 +0000650const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100651 return &lex->tok_cur;
652}
653
Damiend99b0522013-12-21 18:17:45 +0000654bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100655 return lex->tok_cur.kind == kind;
656}
657
658/*
Damiend99b0522013-12-21 18:17:45 +0000659bool mp_lexer_is_str(mp_lexer_t *lex, const char *str) {
660 return mp_token_is_str(&lex->tok_cur, str);
Damien429d7192013-10-04 19:53:11 +0100661}
662
Damiend99b0522013-12-21 18:17:45 +0000663bool mp_lexer_opt_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
664 if (mp_lexer_is_kind(lex, kind)) {
665 mp_lexer_to_next(lex);
Damien429d7192013-10-04 19:53:11 +0100666 return true;
667 }
668 return false;
669}
670
Damiend99b0522013-12-21 18:17:45 +0000671bool mp_lexer_opt_str(mp_lexer_t *lex, const char *str) {
672 if (mp_lexer_is_str(lex, str)) {
673 mp_lexer_to_next(lex);
Damien429d7192013-10-04 19:53:11 +0100674 return true;
675 }
676 return false;
677}
678*/
679
Damiend99b0522013-12-21 18:17:45 +0000680bool mp_lexer_show_error(mp_lexer_t *lex, const char *msg) {
681 return mp_token_show_error(&lex->tok_cur, msg);
Damien429d7192013-10-04 19:53:11 +0100682}
Damien91d387d2013-10-09 15:09:52 +0100683
Damiend99b0522013-12-21 18:17:45 +0000684bool mp_lexer_show_error_pythonic(mp_lexer_t *lex, const char *msg) {
Damien91d387d2013-10-09 15:09:52 +0100685 printf(" File \"%s\", line %d column %d\n%s\n", lex->tok_cur.src_name, lex->tok_cur.src_line, lex->tok_cur.src_column, msg);
686 return false;
687}