blob: e8c6bc3082e3695560689e2aa87ca2c90be34e8b [file] [log] [blame]
Damien429d7192013-10-04 19:53:11 +01001/* lexer.c -- simple tokeniser for Python implementation
2 */
3
4#include <stdint.h>
5#include <stdio.h>
Damien George08335002014-01-18 23:24:36 +00006#include <string.h>
Damien429d7192013-10-04 19:53:11 +01007#include <assert.h>
8
9#include "misc.h"
10#include "lexer.h"
11
12#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010013
Damien92c06562013-10-22 22:32:27 +010014// TODO seems that CPython allows NULL byte in the input stream
15// don't know if that's intentional or not, but we don't allow it
16
Damiend99b0522013-12-21 18:17:45 +000017struct _mp_lexer_t {
Damien George08335002014-01-18 23:24:36 +000018 qstr source_name; // name of source
Damiena5185f42013-10-20 14:41:27 +010019 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000020 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
21 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010022
Damiena5185f42013-10-20 14:41:27 +010023 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010024
25 uint line; // source line
26 uint column; // source column
27
Damiena5185f42013-10-20 14:41:27 +010028 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
29 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010030
31 uint alloc_indent_level;
32 uint num_indent_level;
33 uint16_t *indent_level;
34
Damiena5185f42013-10-20 14:41:27 +010035 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000036 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010037};
38
// Compare a NUL-terminated string against a length-delimited one.
//
//   str  - NUL-terminated string
//   strn - buffer of at least len bytes (need not be NUL-terminated)
//   len  - number of bytes of strn to compare
//
// Returns true only when str is exactly len bytes long and those bytes are
// equal to the first len bytes of strn.  A negative len never matches.
//
// The length of str is checked first so that str is never read beyond its
// terminator, even if strn contains NUL bytes within its first len bytes.
bool str_strn_equal(const char *str, const char *strn, int len) {
    if (len < 0) {
        return false;
    }

    size_t n = (size_t)len;
    if (strlen(str) != n) {
        return false;
    }

    return memcmp(str, strn, n) == 0;
}
51
Damiend99b0522013-12-21 18:17:45 +000052void mp_token_show(const mp_token_t *tok) {
Damien George08335002014-01-18 23:24:36 +000053 printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
Damien429d7192013-10-04 19:53:11 +010054 if (tok->str != NULL && tok->len > 0) {
55 const char *i = tok->str;
56 const char *j = i + tok->len;
57 printf(" ");
58 while (i < j) {
Damien George8cc96a32013-12-30 18:23:50 +000059 unichar c = utf8_get_char(i);
60 i = utf8_next_char(i);
61 if (unichar_isprint(c)) {
Damien429d7192013-10-04 19:53:11 +010062 printf("%c", c);
63 } else {
64 printf("?");
65 }
66 }
67 }
68 printf("\n");
69}
70
Damiena5185f42013-10-20 14:41:27 +010071#define CUR_CHAR(lex) ((lex)->chr0)
72
Damiend99b0522013-12-21 18:17:45 +000073static bool is_end(mp_lexer_t *lex) {
74 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +010075}
76
Damiend99b0522013-12-21 18:17:45 +000077static bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +010078 return lex->chr0 == '\n' || lex->chr0 == '\r';
79}
80
Damiend99b0522013-12-21 18:17:45 +000081static bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010082 return lex->chr0 == c;
83}
84
Damiend99b0522013-12-21 18:17:45 +000085static bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010086 return lex->chr0 == c1 || lex->chr0 == c2;
87}
88
Damiend99b0522013-12-21 18:17:45 +000089static bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010090 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
91}
92
93/*
Damiend99b0522013-12-21 18:17:45 +000094static bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010095 return lex->chr1 == c;
96}
97*/
98
Damiend99b0522013-12-21 18:17:45 +000099static bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100100 return lex->chr1 == c1 || lex->chr1 == c2;
101}
102
Damiend99b0522013-12-21 18:17:45 +0000103static bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100104 return lex->chr2 == c1 || lex->chr2 == c2;
105}
106
Damiend99b0522013-12-21 18:17:45 +0000107static bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100108 return lex->chr0 == c1 && lex->chr1 == c2;
109}
110
Damiend99b0522013-12-21 18:17:45 +0000111static bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000112 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100113}
114
Damiend99b0522013-12-21 18:17:45 +0000115static bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000116 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100117}
118
Damiend99b0522013-12-21 18:17:45 +0000119static bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000120 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100121}
122
Damiend99b0522013-12-21 18:17:45 +0000123static bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000124 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100125}
126
127// TODO UNICODE include unicode characters in definition of identifiers
Damiend99b0522013-12-21 18:17:45 +0000128static bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100129 return is_letter(lex) || lex->chr0 == '_';
130}
131
132// TODO UNICODE include unicode characters in definition of identifiers
Damiend99b0522013-12-21 18:17:45 +0000133static bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100134 return is_head_of_identifier(lex) || is_digit(lex);
135}
136
Damiend99b0522013-12-21 18:17:45 +0000137static void next_char(mp_lexer_t *lex) {
138 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100139 return;
140 }
141
142 int advance = 1;
143
144 if (lex->chr0 == '\n') {
145 // LF is a new line
146 ++lex->line;
147 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100148 } else if (lex->chr0 == '\r') {
149 // CR is a new line
150 ++lex->line;
151 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100152 if (lex->chr1 == '\n') {
153 // CR LF is a single new line
154 advance = 2;
155 }
156 } else if (lex->chr0 == '\t') {
157 // a tab
158 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
159 } else {
160 // a character worth one column
161 ++lex->column;
162 }
163
164 for (; advance > 0; advance--) {
165 lex->chr0 = lex->chr1;
166 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100167 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000168 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100169 // EOF
Damiend99b0522013-12-21 18:17:45 +0000170 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100171 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100172 }
173 }
174 }
175}
176
Damiend99b0522013-12-21 18:17:45 +0000177void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100178 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien732407f2013-12-29 19:33:23 +0000179 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level * 2);
Damien429d7192013-10-04 19:53:11 +0100180 lex->alloc_indent_level *= 2;
Damien429d7192013-10-04 19:53:11 +0100181 }
182 lex->indent_level[lex->num_indent_level++] = indent;
183}
184
Damiend99b0522013-12-21 18:17:45 +0000185uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100186 return lex->indent_level[lex->num_indent_level - 1];
187}
188
Damiend99b0522013-12-21 18:17:45 +0000189void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100190 lex->num_indent_level -= 1;
191}
192
// Compact encoding of all delimiters and operators.
//
// The string is a sequence of entries, each introduced by a marker:
//     <op>  = begin with <op>: a bare character starts a new token here
//     e<op> = end with <op>: if the next input character is <op> the token
//             ends with it, otherwise the following entry is tried
//     E<op> = mandatory end with <op>: the next input character must be
//             <op>, and the token then ends
//     c<op> = continue with <op>: if the next input character is <op>, keep
//             matching the entries that follow, otherwise stop
// This means that if the start of two ops are the same then they are equal
// until the last char.
//
// Every begin, e and c entry corresponds to one element of tok_enc_kind, in
// order; an E entry shares the element of the entry before it.
//
// Both the characters and the pointer are const: the table is never
// modified at run time.
static const char *const tok_enc =
    "()[]{},:;@~"   // singles
    "<e=c<e="       // < <= << <<=
    ">e=c>e="       // > >= >> >>=
    "*e=c*e="       // * *= ** **=
    "+e="           // + +=
    "-e=e>"         // - -= ->
    "&e="           // & &=
    "|e="           // | |=
    "/e=c/e="       // / /= // //=
    "%e="           // % %=
    "^e="           // ^ ^=
    "=e="           // = ==
    "!E="           // !=
    ".c.E.";        // . ...
215
216// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
217static const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000218 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
219 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
220 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
221 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100222
Damiend99b0522013-12-21 18:17:45 +0000223 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
224 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
225 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
226 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
227 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
228 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
229 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
230 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
231 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
232 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
233 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
234 MP_TOKEN_OP_NOT_EQUAL,
Damien Georgee9906ac2014-01-04 18:44:46 +0000235 MP_TOKEN_DEL_PERIOD, MP_TOKEN_ELLIPSIS,
Damien429d7192013-10-04 19:53:11 +0100236};
237
238// must have the same order as enum in lexer.h
// Keyword spellings, terminated by NULL.  The position of a keyword in this
// table is added to MP_TOKEN_KW_FALSE to obtain its token kind, so the
// order here must not change independently of that enum.
//
// The array elements are themselves const: the table is read-only.
static const char *const tok_kw[] = {
    "False",
    "None",
    "True",
    "and",
    "as",
    "assert",
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
    NULL,
};
275
Damiend99b0522013-12-21 18:17:45 +0000276static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100277 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100278 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100279 while (!is_end(lex)) {
280 if (is_physical_newline(lex)) {
281 had_physical_newline = true;
282 next_char(lex);
283 } else if (is_whitespace(lex)) {
284 next_char(lex);
285 } else if (is_char(lex, '#')) {
286 next_char(lex);
287 while (!is_end(lex) && !is_physical_newline(lex)) {
288 next_char(lex);
289 }
290 // had_physical_newline will be set on next loop
291 } else if (is_char(lex, '\\')) {
292 // backslash (outside string literals) must appear just before a physical newline
293 next_char(lex);
294 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000295 // SyntaxError: unexpected character after line continuation character
Damien George69a818d2014-01-12 13:55:24 +0000296 tok->src_line = lex->line;
297 tok->src_column = lex->column;
298 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
299 vstr_reset(&lex->vstr);
300 tok->str = vstr_str(&lex->vstr);
301 tok->len = 0;
302 return;
Damien429d7192013-10-04 19:53:11 +0100303 } else {
304 next_char(lex);
305 }
306 } else {
307 break;
308 }
309 }
310
Damiena5185f42013-10-20 14:41:27 +0100311 // set token source information
Damien429d7192013-10-04 19:53:11 +0100312 tok->src_line = lex->line;
313 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100314
Damiena5185f42013-10-20 14:41:27 +0100315 // start new token text
316 vstr_reset(&lex->vstr);
317
318 if (first_token && lex->line == 1 && lex->column != 1) {
319 // check that the first token is in the first column
320 // if first token is not on first line, we get a physical newline and
321 // this check is done as part of normal indent/dedent checking below
322 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000323 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100324
325 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000326 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100327 lex->emit_dent += 1;
328
329 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000330 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100331 lex->emit_dent -= 1;
332
Damien91d387d2013-10-09 15:09:52 +0100333 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000334 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100335
336 uint num_spaces = lex->column - 1;
337 lex->emit_dent = 0;
338 if (num_spaces == indent_top(lex)) {
339 } else if (num_spaces > indent_top(lex)) {
340 indent_push(lex, num_spaces);
341 lex->emit_dent += 1;
342 } else {
343 while (num_spaces < indent_top(lex)) {
344 indent_pop(lex);
345 lex->emit_dent -= 1;
346 }
347 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000348 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100349 }
350 }
351
352 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100353 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000354 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100355 lex->emit_dent = 0;
356 while (indent_top(lex) > 0) {
357 indent_pop(lex);
358 lex->emit_dent -= 1;
359 }
360 } else {
Damiend99b0522013-12-21 18:17:45 +0000361 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100362 }
363
364 } else if (is_char_or(lex, '\'', '\"')
365 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
366 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
367 // a string or bytes literal
368
369 // parse type codes
370 bool is_raw = false;
371 bool is_bytes = false;
372 if (is_char(lex, 'u')) {
373 next_char(lex);
374 } else if (is_char(lex, 'b')) {
375 is_bytes = true;
376 next_char(lex);
377 if (is_char(lex, 'r')) {
378 is_raw = true;
379 next_char(lex);
380 }
381 } else if (is_char(lex, 'r')) {
382 is_raw = true;
383 next_char(lex);
384 if (is_char(lex, 'b')) {
385 is_bytes = true;
386 next_char(lex);
387 }
388 }
389
390 // set token kind
391 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000392 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100393 } else {
Damiend99b0522013-12-21 18:17:45 +0000394 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100395 }
396
397 // get first quoting character
398 char quote_char = '\'';
399 if (is_char(lex, '\"')) {
400 quote_char = '\"';
401 }
402 next_char(lex);
403
404 // work out if it's a single or triple quoted literal
405 int num_quotes;
406 if (is_char_and(lex, quote_char, quote_char)) {
407 // triple quotes
408 next_char(lex);
409 next_char(lex);
410 num_quotes = 3;
411 } else {
412 // single quotes
413 num_quotes = 1;
414 }
415
Damien429d7192013-10-04 19:53:11 +0100416 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100417 int n_closing = 0;
418 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
419 if (is_char(lex, quote_char)) {
420 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100421 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100422 } else {
423 n_closing = 0;
424 if (!is_raw && is_char(lex, '\\')) {
425 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100426 unichar c = CUR_CHAR(lex);
427 switch (c) {
Damiend99b0522013-12-21 18:17:45 +0000428 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
429 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damiena5185f42013-10-20 14:41:27 +0100430 case '\\': break;
431 case '\'': break;
432 case '"': break;
433 case 'a': c = 0x07; break;
434 case 'b': c = 0x08; break;
435 case 't': c = 0x09; break;
436 case 'n': c = 0x0a; break;
437 case 'v': c = 0x0b; break;
438 case 'f': c = 0x0c; break;
439 case 'r': c = 0x0d; break;
440 // TODO \ooo octal
441 case 'x': // TODO \xhh
442 case 'N': // TODO \N{name} only in strings
443 case 'u': // TODO \uxxxx only in strings
444 case 'U': // TODO \Uxxxxxxxx only in strings
445 default: break; // TODO error message
446 }
Damiend99b0522013-12-21 18:17:45 +0000447 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100448 vstr_add_char(&lex->vstr, c);
449 }
450 } else {
451 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100452 }
453 }
454 next_char(lex);
455 }
456
457 // check we got the required end quotes
458 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000459 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100460 }
461
Damiena5185f42013-10-20 14:41:27 +0100462 // cut off the end quotes from the token text
463 vstr_cut_tail(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100464
465 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000466 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100467
Damiena5185f42013-10-20 14:41:27 +0100468 // get first char
469 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100470 next_char(lex);
471
Damiena5185f42013-10-20 14:41:27 +0100472 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100473 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100474 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100475 next_char(lex);
476 }
477
478 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000479 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100480
Damiena5185f42013-10-20 14:41:27 +0100481 // get first char
482 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100483 next_char(lex);
484
Damiena5185f42013-10-20 14:41:27 +0100485 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100486 while (!is_end(lex)) {
487 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100488 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100489 next_char(lex);
490 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100491 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100492 next_char(lex);
493 }
494 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100495 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100496 next_char(lex);
497 } else {
498 break;
499 }
500 }
501
502 } else {
503 // search for encoded delimiter or operator
504
505 const char *t = tok_enc;
506 uint tok_enc_index = 0;
507 for (; *t != 0 && !is_char(lex, *t); t += 1) {
508 if (*t == 'e' || *t == 'c') {
509 t += 1;
510 } else if (*t == 'E') {
511 tok_enc_index -= 1;
512 t += 1;
513 }
514 tok_enc_index += 1;
515 }
516
517 next_char(lex);
518
519 if (*t == 0) {
520 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000521 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100522
523 } else {
524 // matched a delimiter or operator character
525
526 // get the maximum characters for a valid token
527 t += 1;
528 uint t_index = tok_enc_index;
529 for (;;) {
530 for (; *t == 'e'; t += 1) {
531 t += 1;
532 t_index += 1;
533 if (is_char(lex, *t)) {
534 next_char(lex);
535 tok_enc_index = t_index;
536 break;
537 }
538 }
539
540 if (*t == 'E') {
541 t += 1;
542 if (is_char(lex, *t)) {
543 next_char(lex);
544 tok_enc_index = t_index;
545 } else {
Damiend99b0522013-12-21 18:17:45 +0000546 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100547 }
548 break;
549 }
550
551 if (*t == 'c') {
552 t += 1;
553 t_index += 1;
554 if (is_char(lex, *t)) {
555 next_char(lex);
556 tok_enc_index = t_index;
557 t += 1;
558 } else {
559 break;
560 }
561 } else {
562 break;
563 }
564 }
565
566 // set token kind
567 tok->kind = tok_enc_kind[tok_enc_index];
568
569 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000570 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100571 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000572 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100573 lex->nested_bracket_level -= 1;
574 }
575 }
576 }
577
Damiena5185f42013-10-20 14:41:27 +0100578 // point token text to vstr buffer
579 tok->str = vstr_str(&lex->vstr);
580 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100581
Damiena5185f42013-10-20 14:41:27 +0100582 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000583 if (tok->kind == MP_TOKEN_NAME) {
Damien429d7192013-10-04 19:53:11 +0100584 for (int i = 0; tok_kw[i] != NULL; i++) {
Damiena5185f42013-10-20 14:41:27 +0100585 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Damiend99b0522013-12-21 18:17:45 +0000586 tok->kind = MP_TOKEN_KW_FALSE + i;
Damien429d7192013-10-04 19:53:11 +0100587 break;
588 }
589 }
590 }
591}
592
Damiend99b0522013-12-21 18:17:45 +0000593mp_lexer_t *mp_lexer_new(const char *src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
594 mp_lexer_t *lex = m_new(mp_lexer_t, 1);
Damien429d7192013-10-04 19:53:11 +0100595
Damien George08335002014-01-18 23:24:36 +0000596 lex->source_name = qstr_from_strn_copy(src_name, strlen(src_name));
Damiena5185f42013-10-20 14:41:27 +0100597 lex->stream_data = stream_data;
598 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100599 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100600 lex->line = 1;
601 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100602 lex->emit_dent = 0;
603 lex->nested_bracket_level = 0;
604 lex->alloc_indent_level = 16;
605 lex->num_indent_level = 1;
606 lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
607 lex->indent_level[0] = 0;
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200608 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100609
610 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100611 lex->chr0 = stream_next_char(stream_data);
612 lex->chr1 = stream_next_char(stream_data);
613 lex->chr2 = stream_next_char(stream_data);
614
615 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000616 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100617 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000618 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100619 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100620 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100621 }
Damiend99b0522013-12-21 18:17:45 +0000622 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100623 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100624 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100625 }
Damien429d7192013-10-04 19:53:11 +0100626 }
627
Damiena5185f42013-10-20 14:41:27 +0100628 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000629 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100630
631 return lex;
632}
633
Damiend99b0522013-12-21 18:17:45 +0000634void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100635 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100636 if (lex->stream_close) {
637 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100638 }
Damienbb5316b2013-10-22 21:12:29 +0100639 vstr_clear(&lex->vstr);
Damien732407f2013-12-29 19:33:23 +0000640 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100641 }
Damien429d7192013-10-04 19:53:11 +0100642}
643
Damien George08335002014-01-18 23:24:36 +0000644qstr mp_lexer_source_name(mp_lexer_t *lex) {
645 return lex->source_name;
646}
647
Damiend99b0522013-12-21 18:17:45 +0000648void mp_lexer_to_next(mp_lexer_t *lex) {
649 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100650}
651
Damiend99b0522013-12-21 18:17:45 +0000652const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100653 return &lex->tok_cur;
654}
655
Damiend99b0522013-12-21 18:17:45 +0000656bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100657 return lex->tok_cur.kind == kind;
658}
659
660/*
Damiend99b0522013-12-21 18:17:45 +0000661bool mp_lexer_is_str(mp_lexer_t *lex, const char *str) {
662 return mp_token_is_str(&lex->tok_cur, str);
Damien429d7192013-10-04 19:53:11 +0100663}
664
Damiend99b0522013-12-21 18:17:45 +0000665bool mp_lexer_opt_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
666 if (mp_lexer_is_kind(lex, kind)) {
667 mp_lexer_to_next(lex);
Damien429d7192013-10-04 19:53:11 +0100668 return true;
669 }
670 return false;
671}
672
Damiend99b0522013-12-21 18:17:45 +0000673bool mp_lexer_opt_str(mp_lexer_t *lex, const char *str) {
674 if (mp_lexer_is_str(lex, str)) {
675 mp_lexer_to_next(lex);
Damien429d7192013-10-04 19:53:11 +0100676 return true;
677 }
678 return false;
679}
680*/
681
Damien George9528cd62014-01-15 21:23:31 +0000682bool mp_lexer_show_error_pythonic_prefix(mp_lexer_t *lex) {
Damien George08335002014-01-18 23:24:36 +0000683 printf(" File \"%s\", line %d column %d\n", qstr_str(lex->source_name), lex->tok_cur.src_line, lex->tok_cur.src_column);
Damien George9528cd62014-01-15 21:23:31 +0000684 return false;
Damien429d7192013-10-04 19:53:11 +0100685}
Damien91d387d2013-10-09 15:09:52 +0100686
Damiend99b0522013-12-21 18:17:45 +0000687bool mp_lexer_show_error_pythonic(mp_lexer_t *lex, const char *msg) {
Damien George08335002014-01-18 23:24:36 +0000688 printf(" File \"%s\", line %d column %d\n%s\n", qstr_str(lex->source_name), lex->tok_cur.src_line, lex->tok_cur.src_column, msg);
Damien91d387d2013-10-09 15:09:52 +0100689 return false;
690}