blob: da8967b163ef726d3592c3df014a5378135ef45d [file] [log] [blame]
/* lexer.c -- simple tokeniser for Python implementation
 */
3
4#include <stdint.h>
5#include <stdio.h>
6#include <assert.h>
7
8#include "misc.h"
9#include "lexer.h"
10
11#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010012
// TODO: CPython appears to allow a NUL byte in the input stream; it is unclear
// whether that is intentional.  This lexer does not allow it.
15
Damiend99b0522013-12-21 18:17:45 +000016struct _mp_lexer_t {
Damiena5185f42013-10-20 14:41:27 +010017 const char *name; // name of source
18 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000019 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
20 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010021
Damiena5185f42013-10-20 14:41:27 +010022 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010023
24 uint line; // source line
25 uint column; // source column
26
Damiena5185f42013-10-20 14:41:27 +010027 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
28 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010029
30 uint alloc_indent_level;
31 uint num_indent_level;
32 uint16_t *indent_level;
33
Damiena5185f42013-10-20 14:41:27 +010034 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000035 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010036};
37
// Compare a NUL-terminated string against a length-delimited one.
//
//   str  - NUL-terminated string
//   strn - character buffer of which exactly len characters are considered
//   len  - number of characters of strn to compare
//
// Returns true only when str is exactly len characters long and those
// characters equal the first len characters of strn.  A negative len never
// matches.
bool str_strn_equal(const char *str, const char *strn, int len) {
    if (len < 0) {
        return false;
    }

    int i = 0;

    // Stop at the terminator of str so that a NUL inside strn can never walk
    // the comparison past the end of str.
    while (i < len && str[i] != '\0' && str[i] == strn[i]) {
        ++i;
    }

    // equal only if all of strn was consumed and str ends at the same place
    return i == len && str[i] == '\0';
}
49
Damiend99b0522013-12-21 18:17:45 +000050void mp_token_show(const mp_token_t *tok) {
Damiena5185f42013-10-20 14:41:27 +010051 printf("(%s:%d:%d) kind:%d str:%p len:%d", tok->src_name, tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
Damien429d7192013-10-04 19:53:11 +010052 if (tok->str != NULL && tok->len > 0) {
53 const char *i = tok->str;
54 const char *j = i + tok->len;
55 printf(" ");
56 while (i < j) {
Damien George8cc96a32013-12-30 18:23:50 +000057 unichar c = utf8_get_char(i);
58 i = utf8_next_char(i);
59 if (unichar_isprint(c)) {
Damien429d7192013-10-04 19:53:11 +010060 printf("%c", c);
61 } else {
62 printf("?");
63 }
64 }
65 }
66 printf("\n");
67}
68
Damiend99b0522013-12-21 18:17:45 +000069void mp_token_show_error_prefix(const mp_token_t *tok) {
Damien429d7192013-10-04 19:53:11 +010070 printf("(%s:%d:%d) ", tok->src_name, tok->src_line, tok->src_column);
71}
72
Damiend99b0522013-12-21 18:17:45 +000073bool mp_token_show_error(const mp_token_t *tok, const char *msg) {
Damien429d7192013-10-04 19:53:11 +010074 printf("(%s:%d:%d) %s\n", tok->src_name, tok->src_line, tok->src_column, msg);
75 return false;
76}
77
// the character the lexer is currently positioned on
#define CUR_CHAR(lex) ((lex)->chr0)
79
Damiend99b0522013-12-21 18:17:45 +000080static bool is_end(mp_lexer_t *lex) {
81 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +010082}
83
Damiend99b0522013-12-21 18:17:45 +000084static bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +010085 return lex->chr0 == '\n' || lex->chr0 == '\r';
86}
87
Damiend99b0522013-12-21 18:17:45 +000088static bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010089 return lex->chr0 == c;
90}
91
Damiend99b0522013-12-21 18:17:45 +000092static bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010093 return lex->chr0 == c1 || lex->chr0 == c2;
94}
95
Damiend99b0522013-12-21 18:17:45 +000096static bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010097 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
98}
99
/*
static bool is_char_following(mp_lexer_t *lex, char c) {
    return lex->chr1 == c;
}
*/
105
Damiend99b0522013-12-21 18:17:45 +0000106static bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100107 return lex->chr1 == c1 || lex->chr1 == c2;
108}
109
Damiend99b0522013-12-21 18:17:45 +0000110static bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100111 return lex->chr2 == c1 || lex->chr2 == c2;
112}
113
Damiend99b0522013-12-21 18:17:45 +0000114static bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100115 return lex->chr0 == c1 && lex->chr1 == c2;
116}
117
Damiend99b0522013-12-21 18:17:45 +0000118static bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000119 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100120}
121
Damiend99b0522013-12-21 18:17:45 +0000122static bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000123 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100124}
125
Damiend99b0522013-12-21 18:17:45 +0000126static bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000127 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100128}
129
Damiend99b0522013-12-21 18:17:45 +0000130static bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000131 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100132}
133
134// TODO UNICODE include unicode characters in definition of identifiers
Damiend99b0522013-12-21 18:17:45 +0000135static bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100136 return is_letter(lex) || lex->chr0 == '_';
137}
138
139// TODO UNICODE include unicode characters in definition of identifiers
Damiend99b0522013-12-21 18:17:45 +0000140static bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100141 return is_head_of_identifier(lex) || is_digit(lex);
142}
143
Damiend99b0522013-12-21 18:17:45 +0000144static void next_char(mp_lexer_t *lex) {
145 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100146 return;
147 }
148
149 int advance = 1;
150
151 if (lex->chr0 == '\n') {
152 // LF is a new line
153 ++lex->line;
154 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100155 } else if (lex->chr0 == '\r') {
156 // CR is a new line
157 ++lex->line;
158 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100159 if (lex->chr1 == '\n') {
160 // CR LF is a single new line
161 advance = 2;
162 }
163 } else if (lex->chr0 == '\t') {
164 // a tab
165 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
166 } else {
167 // a character worth one column
168 ++lex->column;
169 }
170
171 for (; advance > 0; advance--) {
172 lex->chr0 = lex->chr1;
173 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100174 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000175 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100176 // EOF
Damiend99b0522013-12-21 18:17:45 +0000177 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100178 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100179 }
180 }
181 }
182}
183
Damiend99b0522013-12-21 18:17:45 +0000184void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100185 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien732407f2013-12-29 19:33:23 +0000186 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level * 2);
Damien429d7192013-10-04 19:53:11 +0100187 lex->alloc_indent_level *= 2;
Damien429d7192013-10-04 19:53:11 +0100188 }
189 lex->indent_level[lex->num_indent_level++] = indent;
190}
191
Damiend99b0522013-12-21 18:17:45 +0000192uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100193 return lex->indent_level[lex->num_indent_level - 1];
194}
195
Damiend99b0522013-12-21 18:17:45 +0000196void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100197 lex->num_indent_level -= 1;
198}
199
// Compact encoding of all delimiters and operators.  Each plain character
// begins a token; the prefixed characters that follow it describe how that
// token may be extended:
//
//     <op>  = begin with <op>, if this opchar matches then begin here
//     e<op> = end with <op>, if this opchar matches then end
//     E<op> = mandatory end with <op>, this opchar must match, then end
//     c<op> = continue with <op>, if this opchar matches then continue matching
//
// This means that if the starts of two ops are the same then they are equal
// until the last char.  Every begin, 'e' and 'c' entry corresponds, in order,
// to one entry of tok_enc_kind; 'E' entries do not get an entry of their own.

static const char *tok_enc =
    // singles
    "()[]{},:;@~"
    // comparison and shift families
    "<e=c<e="           // < <= << <<=
    ">e=c>e="           // > >= >> >>=
    // arithmetic families
    "*e=c*e="           // * *= ** **=
    "+e="               // + +=
    "-e=e>"             // - -= ->
    // bitwise and / or
    "&e="               // & &=
    "|e="               // | |=
    // division, modulo, xor
    "/e=c/e="           // / /= // //=
    "%e="               // % %=
    "^e="               // ^ ^=
    // assignment / equality
    "=e="               // = ==
    "!E="               // !=
    // period and ellipsis
    ".c.E.";            // . ...
222
223// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
224static const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000225 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
226 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
227 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
228 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100229
Damiend99b0522013-12-21 18:17:45 +0000230 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
231 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
232 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
233 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
234 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
235 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
236 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
237 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
238 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
239 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
240 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
241 MP_TOKEN_OP_NOT_EQUAL,
Damien Georgee9906ac2014-01-04 18:44:46 +0000242 MP_TOKEN_DEL_PERIOD, MP_TOKEN_ELLIPSIS,
Damien429d7192013-10-04 19:53:11 +0100243};
244
// Keyword spellings, terminated by NULL.  The index of a keyword is added to
// MP_TOKEN_KW_FALSE to obtain its token kind, so this table
// must have the same order as enum in lexer.h
static const char *tok_kw[] = {
    "False", "None", "True",
    "and", "as", "assert",
    "break",
    "class", "continue",
    "def", "del",
    "elif", "else", "except",
    "finally", "for", "from",
    "global",
    "if", "import", "in", "is",
    "lambda",
    "nonlocal", "not",
    "or",
    "pass",
    "raise", "return",
    "try",
    "while", "with",
    "yield",
    NULL,
};
282
Damiend99b0522013-12-21 18:17:45 +0000283static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100284 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100285 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100286 while (!is_end(lex)) {
287 if (is_physical_newline(lex)) {
288 had_physical_newline = true;
289 next_char(lex);
290 } else if (is_whitespace(lex)) {
291 next_char(lex);
292 } else if (is_char(lex, '#')) {
293 next_char(lex);
294 while (!is_end(lex) && !is_physical_newline(lex)) {
295 next_char(lex);
296 }
297 // had_physical_newline will be set on next loop
298 } else if (is_char(lex, '\\')) {
299 // backslash (outside string literals) must appear just before a physical newline
300 next_char(lex);
301 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000302 // SyntaxError: unexpected character after line continuation character
303 tok->src_name = lex->name;
304 tok->src_line = lex->line;
305 tok->src_column = lex->column;
306 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
307 vstr_reset(&lex->vstr);
308 tok->str = vstr_str(&lex->vstr);
309 tok->len = 0;
310 return;
Damien429d7192013-10-04 19:53:11 +0100311 } else {
312 next_char(lex);
313 }
314 } else {
315 break;
316 }
317 }
318
Damiena5185f42013-10-20 14:41:27 +0100319 // set token source information
Damien429d7192013-10-04 19:53:11 +0100320 tok->src_name = lex->name;
321 tok->src_line = lex->line;
322 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100323
Damiena5185f42013-10-20 14:41:27 +0100324 // start new token text
325 vstr_reset(&lex->vstr);
326
327 if (first_token && lex->line == 1 && lex->column != 1) {
328 // check that the first token is in the first column
329 // if first token is not on first line, we get a physical newline and
330 // this check is done as part of normal indent/dedent checking below
331 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000332 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100333
334 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000335 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100336 lex->emit_dent += 1;
337
338 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000339 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100340 lex->emit_dent -= 1;
341
Damien91d387d2013-10-09 15:09:52 +0100342 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000343 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100344
345 uint num_spaces = lex->column - 1;
346 lex->emit_dent = 0;
347 if (num_spaces == indent_top(lex)) {
348 } else if (num_spaces > indent_top(lex)) {
349 indent_push(lex, num_spaces);
350 lex->emit_dent += 1;
351 } else {
352 while (num_spaces < indent_top(lex)) {
353 indent_pop(lex);
354 lex->emit_dent -= 1;
355 }
356 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000357 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100358 }
359 }
360
361 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100362 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000363 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100364 lex->emit_dent = 0;
365 while (indent_top(lex) > 0) {
366 indent_pop(lex);
367 lex->emit_dent -= 1;
368 }
369 } else {
Damiend99b0522013-12-21 18:17:45 +0000370 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100371 }
372
373 } else if (is_char_or(lex, '\'', '\"')
374 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
375 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
376 // a string or bytes literal
377
378 // parse type codes
379 bool is_raw = false;
380 bool is_bytes = false;
381 if (is_char(lex, 'u')) {
382 next_char(lex);
383 } else if (is_char(lex, 'b')) {
384 is_bytes = true;
385 next_char(lex);
386 if (is_char(lex, 'r')) {
387 is_raw = true;
388 next_char(lex);
389 }
390 } else if (is_char(lex, 'r')) {
391 is_raw = true;
392 next_char(lex);
393 if (is_char(lex, 'b')) {
394 is_bytes = true;
395 next_char(lex);
396 }
397 }
398
399 // set token kind
400 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000401 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100402 } else {
Damiend99b0522013-12-21 18:17:45 +0000403 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100404 }
405
406 // get first quoting character
407 char quote_char = '\'';
408 if (is_char(lex, '\"')) {
409 quote_char = '\"';
410 }
411 next_char(lex);
412
413 // work out if it's a single or triple quoted literal
414 int num_quotes;
415 if (is_char_and(lex, quote_char, quote_char)) {
416 // triple quotes
417 next_char(lex);
418 next_char(lex);
419 num_quotes = 3;
420 } else {
421 // single quotes
422 num_quotes = 1;
423 }
424
Damien429d7192013-10-04 19:53:11 +0100425 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100426 int n_closing = 0;
427 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
428 if (is_char(lex, quote_char)) {
429 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100430 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100431 } else {
432 n_closing = 0;
433 if (!is_raw && is_char(lex, '\\')) {
434 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100435 unichar c = CUR_CHAR(lex);
436 switch (c) {
Damiend99b0522013-12-21 18:17:45 +0000437 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
438 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damiena5185f42013-10-20 14:41:27 +0100439 case '\\': break;
440 case '\'': break;
441 case '"': break;
442 case 'a': c = 0x07; break;
443 case 'b': c = 0x08; break;
444 case 't': c = 0x09; break;
445 case 'n': c = 0x0a; break;
446 case 'v': c = 0x0b; break;
447 case 'f': c = 0x0c; break;
448 case 'r': c = 0x0d; break;
449 // TODO \ooo octal
450 case 'x': // TODO \xhh
451 case 'N': // TODO \N{name} only in strings
452 case 'u': // TODO \uxxxx only in strings
453 case 'U': // TODO \Uxxxxxxxx only in strings
454 default: break; // TODO error message
455 }
Damiend99b0522013-12-21 18:17:45 +0000456 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100457 vstr_add_char(&lex->vstr, c);
458 }
459 } else {
460 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100461 }
462 }
463 next_char(lex);
464 }
465
466 // check we got the required end quotes
467 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000468 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100469 }
470
Damiena5185f42013-10-20 14:41:27 +0100471 // cut off the end quotes from the token text
472 vstr_cut_tail(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100473
474 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000475 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100476
Damiena5185f42013-10-20 14:41:27 +0100477 // get first char
478 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100479 next_char(lex);
480
Damiena5185f42013-10-20 14:41:27 +0100481 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100482 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100483 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100484 next_char(lex);
485 }
486
487 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000488 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100489
Damiena5185f42013-10-20 14:41:27 +0100490 // get first char
491 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100492 next_char(lex);
493
Damiena5185f42013-10-20 14:41:27 +0100494 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100495 while (!is_end(lex)) {
496 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100497 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100498 next_char(lex);
499 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100500 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100501 next_char(lex);
502 }
503 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100504 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100505 next_char(lex);
506 } else {
507 break;
508 }
509 }
510
511 } else {
512 // search for encoded delimiter or operator
513
514 const char *t = tok_enc;
515 uint tok_enc_index = 0;
516 for (; *t != 0 && !is_char(lex, *t); t += 1) {
517 if (*t == 'e' || *t == 'c') {
518 t += 1;
519 } else if (*t == 'E') {
520 tok_enc_index -= 1;
521 t += 1;
522 }
523 tok_enc_index += 1;
524 }
525
526 next_char(lex);
527
528 if (*t == 0) {
529 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000530 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100531
532 } else {
533 // matched a delimiter or operator character
534
535 // get the maximum characters for a valid token
536 t += 1;
537 uint t_index = tok_enc_index;
538 for (;;) {
539 for (; *t == 'e'; t += 1) {
540 t += 1;
541 t_index += 1;
542 if (is_char(lex, *t)) {
543 next_char(lex);
544 tok_enc_index = t_index;
545 break;
546 }
547 }
548
549 if (*t == 'E') {
550 t += 1;
551 if (is_char(lex, *t)) {
552 next_char(lex);
553 tok_enc_index = t_index;
554 } else {
Damiend99b0522013-12-21 18:17:45 +0000555 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100556 }
557 break;
558 }
559
560 if (*t == 'c') {
561 t += 1;
562 t_index += 1;
563 if (is_char(lex, *t)) {
564 next_char(lex);
565 tok_enc_index = t_index;
566 t += 1;
567 } else {
568 break;
569 }
570 } else {
571 break;
572 }
573 }
574
575 // set token kind
576 tok->kind = tok_enc_kind[tok_enc_index];
577
578 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000579 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100580 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000581 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100582 lex->nested_bracket_level -= 1;
583 }
584 }
585 }
586
Damiena5185f42013-10-20 14:41:27 +0100587 // point token text to vstr buffer
588 tok->str = vstr_str(&lex->vstr);
589 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100590
Damiena5185f42013-10-20 14:41:27 +0100591 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000592 if (tok->kind == MP_TOKEN_NAME) {
Damien429d7192013-10-04 19:53:11 +0100593 for (int i = 0; tok_kw[i] != NULL; i++) {
Damiena5185f42013-10-20 14:41:27 +0100594 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Damiend99b0522013-12-21 18:17:45 +0000595 tok->kind = MP_TOKEN_KW_FALSE + i;
Damien429d7192013-10-04 19:53:11 +0100596 break;
597 }
598 }
599 }
600}
601
Damiend99b0522013-12-21 18:17:45 +0000602mp_lexer_t *mp_lexer_new(const char *src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
603 mp_lexer_t *lex = m_new(mp_lexer_t, 1);
Damien429d7192013-10-04 19:53:11 +0100604
Damiena5185f42013-10-20 14:41:27 +0100605 lex->name = src_name; // TODO do we need to strdup this?
606 lex->stream_data = stream_data;
607 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100608 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100609 lex->line = 1;
610 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100611 lex->emit_dent = 0;
612 lex->nested_bracket_level = 0;
613 lex->alloc_indent_level = 16;
614 lex->num_indent_level = 1;
615 lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
616 lex->indent_level[0] = 0;
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200617 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100618
619 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100620 lex->chr0 = stream_next_char(stream_data);
621 lex->chr1 = stream_next_char(stream_data);
622 lex->chr2 = stream_next_char(stream_data);
623
624 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000625 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100626 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000627 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100628 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100629 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100630 }
Damiend99b0522013-12-21 18:17:45 +0000631 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100632 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100633 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100634 }
Damien429d7192013-10-04 19:53:11 +0100635 }
636
Damiena5185f42013-10-20 14:41:27 +0100637 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000638 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100639
640 return lex;
641}
642
Damiend99b0522013-12-21 18:17:45 +0000643void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100644 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100645 if (lex->stream_close) {
646 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100647 }
Damienbb5316b2013-10-22 21:12:29 +0100648 vstr_clear(&lex->vstr);
Damien732407f2013-12-29 19:33:23 +0000649 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100650 }
Damien429d7192013-10-04 19:53:11 +0100651}
652
Damiend99b0522013-12-21 18:17:45 +0000653void mp_lexer_to_next(mp_lexer_t *lex) {
654 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100655}
656
Damiend99b0522013-12-21 18:17:45 +0000657const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100658 return &lex->tok_cur;
659}
660
Damiend99b0522013-12-21 18:17:45 +0000661bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100662 return lex->tok_cur.kind == kind;
663}
664
/*
bool mp_lexer_is_str(mp_lexer_t *lex, const char *str) {
    return mp_token_is_str(&lex->tok_cur, str);
}

bool mp_lexer_opt_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
    if (mp_lexer_is_kind(lex, kind)) {
        mp_lexer_to_next(lex);
        return true;
    }
    return false;
}

bool mp_lexer_opt_str(mp_lexer_t *lex, const char *str) {
    if (mp_lexer_is_str(lex, str)) {
        mp_lexer_to_next(lex);
        return true;
    }
    return false;
}
*/
686
Damiend99b0522013-12-21 18:17:45 +0000687bool mp_lexer_show_error(mp_lexer_t *lex, const char *msg) {
688 return mp_token_show_error(&lex->tok_cur, msg);
Damien429d7192013-10-04 19:53:11 +0100689}
Damien91d387d2013-10-09 15:09:52 +0100690
Damiend99b0522013-12-21 18:17:45 +0000691bool mp_lexer_show_error_pythonic(mp_lexer_t *lex, const char *msg) {
Damien91d387d2013-10-09 15:09:52 +0100692 printf(" File \"%s\", line %d column %d\n%s\n", lex->tok_cur.src_name, lex->tok_cur.src_line, lex->tok_cur.src_column, msg);
693 return false;
694}