/* lexer.c -- simple tokeniser for Python implementation
 */
3
4#include <stdint.h>
5#include <stdio.h>
6#include <assert.h>
7
8#include "misc.h"
9#include "lexer.h"
10
11#define TAB_SIZE (8)
Damien429d7192013-10-04 19:53:11 +010012
// TODO it seems that CPython allows a NUL byte in the input stream;
// it is unclear whether that is intentional, but we do not allow it
15
Damiend99b0522013-12-21 18:17:45 +000016struct _mp_lexer_t {
Damiena5185f42013-10-20 14:41:27 +010017 const char *name; // name of source
18 void *stream_data; // data for stream
Damiend99b0522013-12-21 18:17:45 +000019 mp_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
20 mp_lexer_stream_close_t stream_close; // stream callback to free
Damien429d7192013-10-04 19:53:11 +010021
Damiena5185f42013-10-20 14:41:27 +010022 unichar chr0, chr1, chr2; // current cached characters from source
Damien429d7192013-10-04 19:53:11 +010023
24 uint line; // source line
25 uint column; // source column
26
Damiena5185f42013-10-20 14:41:27 +010027 int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
28 int nested_bracket_level; // >0 when there are nested brackets over multiple lines
Damien429d7192013-10-04 19:53:11 +010029
30 uint alloc_indent_level;
31 uint num_indent_level;
32 uint16_t *indent_level;
33
Damiena5185f42013-10-20 14:41:27 +010034 vstr_t vstr;
Damiend99b0522013-12-21 18:17:45 +000035 mp_token_t tok_cur;
Damien429d7192013-10-04 19:53:11 +010036};
37
// Compare a NUL-terminated string against a length-delimited one.
//
//   str  - NUL-terminated string
//   strn - character buffer, of which only the first len characters are read
//   len  - number of characters of strn to compare
//
// Returns true only when str is exactly len characters long and those
// characters equal the first len characters of strn.  A negative len never
// matches.
//
// TODO replace with a call to a standard function
bool str_strn_equal(const char *str, const char *strn, int len) {
    if (len < 0) {
        return false;
    }

    int i = 0;

    // stop at the end of str so that we never read past its terminator, even
    // when strn happens to contain a NUL at the same position
    while (i < len && *str != '\0' && *str == *strn) {
        ++i;
        ++str;
        ++strn;
    }

    return i == len && *str == '\0';
}
50
Damiend99b0522013-12-21 18:17:45 +000051void mp_token_show(const mp_token_t *tok) {
Damiena5185f42013-10-20 14:41:27 +010052 printf("(%s:%d:%d) kind:%d str:%p len:%d", tok->src_name, tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
Damien429d7192013-10-04 19:53:11 +010053 if (tok->str != NULL && tok->len > 0) {
54 const char *i = tok->str;
55 const char *j = i + tok->len;
56 printf(" ");
57 while (i < j) {
Damien George8cc96a32013-12-30 18:23:50 +000058 unichar c = utf8_get_char(i);
59 i = utf8_next_char(i);
60 if (unichar_isprint(c)) {
Damien429d7192013-10-04 19:53:11 +010061 printf("%c", c);
62 } else {
63 printf("?");
64 }
65 }
66 }
67 printf("\n");
68}
69
Damiena5185f42013-10-20 14:41:27 +010070#define CUR_CHAR(lex) ((lex)->chr0)
71
Damiend99b0522013-12-21 18:17:45 +000072static bool is_end(mp_lexer_t *lex) {
73 return lex->chr0 == MP_LEXER_CHAR_EOF;
Damien429d7192013-10-04 19:53:11 +010074}
75
Damiend99b0522013-12-21 18:17:45 +000076static bool is_physical_newline(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +010077 return lex->chr0 == '\n' || lex->chr0 == '\r';
78}
79
Damiend99b0522013-12-21 18:17:45 +000080static bool is_char(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010081 return lex->chr0 == c;
82}
83
Damiend99b0522013-12-21 18:17:45 +000084static bool is_char_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010085 return lex->chr0 == c1 || lex->chr0 == c2;
86}
87
Damiend99b0522013-12-21 18:17:45 +000088static bool is_char_or3(mp_lexer_t *lex, char c1, char c2, char c3) {
Damien429d7192013-10-04 19:53:11 +010089 return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
90}
91
92/*
Damiend99b0522013-12-21 18:17:45 +000093static bool is_char_following(mp_lexer_t *lex, char c) {
Damien429d7192013-10-04 19:53:11 +010094 return lex->chr1 == c;
95}
96*/
97
Damiend99b0522013-12-21 18:17:45 +000098static bool is_char_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +010099 return lex->chr1 == c1 || lex->chr1 == c2;
100}
101
Damiend99b0522013-12-21 18:17:45 +0000102static bool is_char_following_following_or(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100103 return lex->chr2 == c1 || lex->chr2 == c2;
104}
105
Damiend99b0522013-12-21 18:17:45 +0000106static bool is_char_and(mp_lexer_t *lex, char c1, char c2) {
Damien429d7192013-10-04 19:53:11 +0100107 return lex->chr0 == c1 && lex->chr1 == c2;
108}
109
Damiend99b0522013-12-21 18:17:45 +0000110static bool is_whitespace(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000111 return unichar_isspace(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100112}
113
Damiend99b0522013-12-21 18:17:45 +0000114static bool is_letter(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000115 return unichar_isalpha(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100116}
117
Damiend99b0522013-12-21 18:17:45 +0000118static bool is_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000119 return unichar_isdigit(lex->chr0);
Damien429d7192013-10-04 19:53:11 +0100120}
121
Damiend99b0522013-12-21 18:17:45 +0000122static bool is_following_digit(mp_lexer_t *lex) {
Damien George8cc96a32013-12-30 18:23:50 +0000123 return unichar_isdigit(lex->chr1);
Damien429d7192013-10-04 19:53:11 +0100124}
125
126// TODO UNICODE include unicode characters in definition of identifiers
Damiend99b0522013-12-21 18:17:45 +0000127static bool is_head_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100128 return is_letter(lex) || lex->chr0 == '_';
129}
130
131// TODO UNICODE include unicode characters in definition of identifiers
Damiend99b0522013-12-21 18:17:45 +0000132static bool is_tail_of_identifier(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100133 return is_head_of_identifier(lex) || is_digit(lex);
134}
135
Damiend99b0522013-12-21 18:17:45 +0000136static void next_char(mp_lexer_t *lex) {
137 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100138 return;
139 }
140
141 int advance = 1;
142
143 if (lex->chr0 == '\n') {
144 // LF is a new line
145 ++lex->line;
146 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100147 } else if (lex->chr0 == '\r') {
148 // CR is a new line
149 ++lex->line;
150 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100151 if (lex->chr1 == '\n') {
152 // CR LF is a single new line
153 advance = 2;
154 }
155 } else if (lex->chr0 == '\t') {
156 // a tab
157 lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
158 } else {
159 // a character worth one column
160 ++lex->column;
161 }
162
163 for (; advance > 0; advance--) {
164 lex->chr0 = lex->chr1;
165 lex->chr1 = lex->chr2;
Damiena5185f42013-10-20 14:41:27 +0100166 lex->chr2 = lex->stream_next_char(lex->stream_data);
Damiend99b0522013-12-21 18:17:45 +0000167 if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100168 // EOF
Damiend99b0522013-12-21 18:17:45 +0000169 if (lex->chr1 != MP_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
Damien429d7192013-10-04 19:53:11 +0100170 lex->chr2 = '\n'; // insert newline at end of file
Damien429d7192013-10-04 19:53:11 +0100171 }
172 }
173 }
174}
175
Damiend99b0522013-12-21 18:17:45 +0000176void indent_push(mp_lexer_t *lex, uint indent) {
Damien429d7192013-10-04 19:53:11 +0100177 if (lex->num_indent_level >= lex->alloc_indent_level) {
Damien732407f2013-12-29 19:33:23 +0000178 lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level * 2);
Damien429d7192013-10-04 19:53:11 +0100179 lex->alloc_indent_level *= 2;
Damien429d7192013-10-04 19:53:11 +0100180 }
181 lex->indent_level[lex->num_indent_level++] = indent;
182}
183
Damiend99b0522013-12-21 18:17:45 +0000184uint indent_top(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100185 return lex->indent_level[lex->num_indent_level - 1];
186}
187
Damiend99b0522013-12-21 18:17:45 +0000188void indent_pop(mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100189 lex->num_indent_level -= 1;
190}
191
// Encoded table of delimiters and operators.  Each operator starts with its
// first character written plainly; further characters are introduced by a
// marker letter:
//   <op>   begin with <op>: if this character matches then begin here
//   e<op>  end with <op>: if this character matches then end
//   E<op>  mandatory end with <op>: this character must match, then end
//   c<op>  continue with <op>: if this character matches then continue matching
// This means that if the starts of two operators are the same then they are
// equal until the last character.

static const char *tok_enc =
    "()[]{},:;@~" // singles
    "<e=c<e="     // < <= << <<=
    ">e=c>e="     // > >= >> >>=
    "*e=c*e="     // * *= ** **=
    "+e="         // + +=
    "-e=e>"       // - -= ->
    "&e="         // & &=
    "|e="         // | |=
    "/e=c/e="     // / /= // //=
    "%e="         // % %=
    "^e="         // ^ ^=
    "=e="         // = ==
    "!E="         // !=
    ".c.E.";      // . ...
214
215// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
216static const uint8_t tok_enc_kind[] = {
Damiend99b0522013-12-21 18:17:45 +0000217 MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,
218 MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,
219 MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,
220 MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,
Damien429d7192013-10-04 19:53:11 +0100221
Damiend99b0522013-12-21 18:17:45 +0000222 MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,
223 MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,
224 MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,
225 MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,
226 MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,
227 MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,
228 MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,
229 MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,
230 MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
231 MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
232 MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
233 MP_TOKEN_OP_NOT_EQUAL,
Damien Georgee9906ac2014-01-04 18:44:46 +0000234 MP_TOKEN_DEL_PERIOD, MP_TOKEN_ELLIPSIS,
Damien429d7192013-10-04 19:53:11 +0100235};
236
// Keyword spellings, terminated by NULL.  The index of a keyword is added to
// MP_TOKEN_KW_FALSE to obtain its token kind, so this list
// must have the same order as enum in lexer.h
static const char *tok_kw[] = {
    "False", "None", "True",
    "and", "as", "assert",
    "break",
    "class", "continue",
    "def", "del",
    "elif", "else", "except",
    "finally", "for", "from",
    "global",
    "if", "import", "in", "is",
    "lambda",
    "nonlocal", "not",
    "or",
    "pass",
    "raise", "return",
    "try",
    "while", "with",
    "yield",
    NULL,
};
274
Damiend99b0522013-12-21 18:17:45 +0000275static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
Damiena5185f42013-10-20 14:41:27 +0100276 // skip white space and comments
Damien429d7192013-10-04 19:53:11 +0100277 bool had_physical_newline = false;
Damien429d7192013-10-04 19:53:11 +0100278 while (!is_end(lex)) {
279 if (is_physical_newline(lex)) {
280 had_physical_newline = true;
281 next_char(lex);
282 } else if (is_whitespace(lex)) {
283 next_char(lex);
284 } else if (is_char(lex, '#')) {
285 next_char(lex);
286 while (!is_end(lex) && !is_physical_newline(lex)) {
287 next_char(lex);
288 }
289 // had_physical_newline will be set on next loop
290 } else if (is_char(lex, '\\')) {
291 // backslash (outside string literals) must appear just before a physical newline
292 next_char(lex);
293 if (!is_physical_newline(lex)) {
Damien George69a818d2014-01-12 13:55:24 +0000294 // SyntaxError: unexpected character after line continuation character
295 tok->src_name = lex->name;
296 tok->src_line = lex->line;
297 tok->src_column = lex->column;
298 tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
299 vstr_reset(&lex->vstr);
300 tok->str = vstr_str(&lex->vstr);
301 tok->len = 0;
302 return;
Damien429d7192013-10-04 19:53:11 +0100303 } else {
304 next_char(lex);
305 }
306 } else {
307 break;
308 }
309 }
310
Damiena5185f42013-10-20 14:41:27 +0100311 // set token source information
Damien429d7192013-10-04 19:53:11 +0100312 tok->src_name = lex->name;
313 tok->src_line = lex->line;
314 tok->src_column = lex->column;
Damien429d7192013-10-04 19:53:11 +0100315
Damiena5185f42013-10-20 14:41:27 +0100316 // start new token text
317 vstr_reset(&lex->vstr);
318
319 if (first_token && lex->line == 1 && lex->column != 1) {
320 // check that the first token is in the first column
321 // if first token is not on first line, we get a physical newline and
322 // this check is done as part of normal indent/dedent checking below
323 // (done to get equivalence with CPython)
Damiend99b0522013-12-21 18:17:45 +0000324 tok->kind = MP_TOKEN_INDENT;
Damiena5185f42013-10-20 14:41:27 +0100325
326 } else if (lex->emit_dent < 0) {
Damiend99b0522013-12-21 18:17:45 +0000327 tok->kind = MP_TOKEN_DEDENT;
Damien429d7192013-10-04 19:53:11 +0100328 lex->emit_dent += 1;
329
330 } else if (lex->emit_dent > 0) {
Damiend99b0522013-12-21 18:17:45 +0000331 tok->kind = MP_TOKEN_INDENT;
Damien429d7192013-10-04 19:53:11 +0100332 lex->emit_dent -= 1;
333
Damien91d387d2013-10-09 15:09:52 +0100334 } else if (had_physical_newline && lex->nested_bracket_level == 0) {
Damiend99b0522013-12-21 18:17:45 +0000335 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100336
337 uint num_spaces = lex->column - 1;
338 lex->emit_dent = 0;
339 if (num_spaces == indent_top(lex)) {
340 } else if (num_spaces > indent_top(lex)) {
341 indent_push(lex, num_spaces);
342 lex->emit_dent += 1;
343 } else {
344 while (num_spaces < indent_top(lex)) {
345 indent_pop(lex);
346 lex->emit_dent -= 1;
347 }
348 if (num_spaces != indent_top(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000349 tok->kind = MP_TOKEN_DEDENT_MISMATCH;
Damien429d7192013-10-04 19:53:11 +0100350 }
351 }
352
353 } else if (is_end(lex)) {
Damien429d7192013-10-04 19:53:11 +0100354 if (indent_top(lex) > 0) {
Damiend99b0522013-12-21 18:17:45 +0000355 tok->kind = MP_TOKEN_NEWLINE;
Damien429d7192013-10-04 19:53:11 +0100356 lex->emit_dent = 0;
357 while (indent_top(lex) > 0) {
358 indent_pop(lex);
359 lex->emit_dent -= 1;
360 }
361 } else {
Damiend99b0522013-12-21 18:17:45 +0000362 tok->kind = MP_TOKEN_END;
Damien429d7192013-10-04 19:53:11 +0100363 }
364
365 } else if (is_char_or(lex, '\'', '\"')
366 || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
367 || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
368 // a string or bytes literal
369
370 // parse type codes
371 bool is_raw = false;
372 bool is_bytes = false;
373 if (is_char(lex, 'u')) {
374 next_char(lex);
375 } else if (is_char(lex, 'b')) {
376 is_bytes = true;
377 next_char(lex);
378 if (is_char(lex, 'r')) {
379 is_raw = true;
380 next_char(lex);
381 }
382 } else if (is_char(lex, 'r')) {
383 is_raw = true;
384 next_char(lex);
385 if (is_char(lex, 'b')) {
386 is_bytes = true;
387 next_char(lex);
388 }
389 }
390
391 // set token kind
392 if (is_bytes) {
Damiend99b0522013-12-21 18:17:45 +0000393 tok->kind = MP_TOKEN_BYTES;
Damien429d7192013-10-04 19:53:11 +0100394 } else {
Damiend99b0522013-12-21 18:17:45 +0000395 tok->kind = MP_TOKEN_STRING;
Damien429d7192013-10-04 19:53:11 +0100396 }
397
398 // get first quoting character
399 char quote_char = '\'';
400 if (is_char(lex, '\"')) {
401 quote_char = '\"';
402 }
403 next_char(lex);
404
405 // work out if it's a single or triple quoted literal
406 int num_quotes;
407 if (is_char_and(lex, quote_char, quote_char)) {
408 // triple quotes
409 next_char(lex);
410 next_char(lex);
411 num_quotes = 3;
412 } else {
413 // single quotes
414 num_quotes = 1;
415 }
416
Damien429d7192013-10-04 19:53:11 +0100417 // parse the literal
Damien429d7192013-10-04 19:53:11 +0100418 int n_closing = 0;
419 while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
420 if (is_char(lex, quote_char)) {
421 n_closing += 1;
Damiena5185f42013-10-20 14:41:27 +0100422 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100423 } else {
424 n_closing = 0;
425 if (!is_raw && is_char(lex, '\\')) {
426 next_char(lex);
Damiena5185f42013-10-20 14:41:27 +0100427 unichar c = CUR_CHAR(lex);
428 switch (c) {
Damiend99b0522013-12-21 18:17:45 +0000429 case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
430 case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
Damiena5185f42013-10-20 14:41:27 +0100431 case '\\': break;
432 case '\'': break;
433 case '"': break;
434 case 'a': c = 0x07; break;
435 case 'b': c = 0x08; break;
436 case 't': c = 0x09; break;
437 case 'n': c = 0x0a; break;
438 case 'v': c = 0x0b; break;
439 case 'f': c = 0x0c; break;
440 case 'r': c = 0x0d; break;
441 // TODO \ooo octal
442 case 'x': // TODO \xhh
443 case 'N': // TODO \N{name} only in strings
444 case 'u': // TODO \uxxxx only in strings
445 case 'U': // TODO \Uxxxxxxxx only in strings
446 default: break; // TODO error message
447 }
Damiend99b0522013-12-21 18:17:45 +0000448 if (c != MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100449 vstr_add_char(&lex->vstr, c);
450 }
451 } else {
452 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100453 }
454 }
455 next_char(lex);
456 }
457
458 // check we got the required end quotes
459 if (n_closing < num_quotes) {
Damiend99b0522013-12-21 18:17:45 +0000460 tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
Damien429d7192013-10-04 19:53:11 +0100461 }
462
Damiena5185f42013-10-20 14:41:27 +0100463 // cut off the end quotes from the token text
464 vstr_cut_tail(&lex->vstr, n_closing);
Damien429d7192013-10-04 19:53:11 +0100465
466 } else if (is_head_of_identifier(lex)) {
Damiend99b0522013-12-21 18:17:45 +0000467 tok->kind = MP_TOKEN_NAME;
Damien429d7192013-10-04 19:53:11 +0100468
Damiena5185f42013-10-20 14:41:27 +0100469 // get first char
470 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100471 next_char(lex);
472
Damiena5185f42013-10-20 14:41:27 +0100473 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100474 while (!is_end(lex) && is_tail_of_identifier(lex)) {
Damiena5185f42013-10-20 14:41:27 +0100475 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100476 next_char(lex);
477 }
478
479 } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
Damiend99b0522013-12-21 18:17:45 +0000480 tok->kind = MP_TOKEN_NUMBER;
Damien429d7192013-10-04 19:53:11 +0100481
Damiena5185f42013-10-20 14:41:27 +0100482 // get first char
483 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100484 next_char(lex);
485
Damiena5185f42013-10-20 14:41:27 +0100486 // get tail chars
Damien429d7192013-10-04 19:53:11 +0100487 while (!is_end(lex)) {
488 if (is_char_or(lex, 'e', 'E')) {
Damiena5185f42013-10-20 14:41:27 +0100489 vstr_add_char(&lex->vstr, 'e');
Damien429d7192013-10-04 19:53:11 +0100490 next_char(lex);
491 if (is_char(lex, '+') || is_char(lex, '-')) {
Damiena5185f42013-10-20 14:41:27 +0100492 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100493 next_char(lex);
494 }
495 } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
Damiena5185f42013-10-20 14:41:27 +0100496 vstr_add_char(&lex->vstr, CUR_CHAR(lex));
Damien429d7192013-10-04 19:53:11 +0100497 next_char(lex);
498 } else {
499 break;
500 }
501 }
502
503 } else {
504 // search for encoded delimiter or operator
505
506 const char *t = tok_enc;
507 uint tok_enc_index = 0;
508 for (; *t != 0 && !is_char(lex, *t); t += 1) {
509 if (*t == 'e' || *t == 'c') {
510 t += 1;
511 } else if (*t == 'E') {
512 tok_enc_index -= 1;
513 t += 1;
514 }
515 tok_enc_index += 1;
516 }
517
518 next_char(lex);
519
520 if (*t == 0) {
521 // didn't match any delimiter or operator characters
Damiend99b0522013-12-21 18:17:45 +0000522 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100523
524 } else {
525 // matched a delimiter or operator character
526
527 // get the maximum characters for a valid token
528 t += 1;
529 uint t_index = tok_enc_index;
530 for (;;) {
531 for (; *t == 'e'; t += 1) {
532 t += 1;
533 t_index += 1;
534 if (is_char(lex, *t)) {
535 next_char(lex);
536 tok_enc_index = t_index;
537 break;
538 }
539 }
540
541 if (*t == 'E') {
542 t += 1;
543 if (is_char(lex, *t)) {
544 next_char(lex);
545 tok_enc_index = t_index;
546 } else {
Damiend99b0522013-12-21 18:17:45 +0000547 tok->kind = MP_TOKEN_INVALID;
Damien429d7192013-10-04 19:53:11 +0100548 }
549 break;
550 }
551
552 if (*t == 'c') {
553 t += 1;
554 t_index += 1;
555 if (is_char(lex, *t)) {
556 next_char(lex);
557 tok_enc_index = t_index;
558 t += 1;
559 } else {
560 break;
561 }
562 } else {
563 break;
564 }
565 }
566
567 // set token kind
568 tok->kind = tok_enc_kind[tok_enc_index];
569
570 // compute bracket level for implicit line joining
Damiend99b0522013-12-21 18:17:45 +0000571 if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
Damien429d7192013-10-04 19:53:11 +0100572 lex->nested_bracket_level += 1;
Damiend99b0522013-12-21 18:17:45 +0000573 } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
Damien429d7192013-10-04 19:53:11 +0100574 lex->nested_bracket_level -= 1;
575 }
576 }
577 }
578
Damiena5185f42013-10-20 14:41:27 +0100579 // point token text to vstr buffer
580 tok->str = vstr_str(&lex->vstr);
581 tok->len = vstr_len(&lex->vstr);
Damien429d7192013-10-04 19:53:11 +0100582
Damiena5185f42013-10-20 14:41:27 +0100583 // check for keywords
Damiend99b0522013-12-21 18:17:45 +0000584 if (tok->kind == MP_TOKEN_NAME) {
Damien429d7192013-10-04 19:53:11 +0100585 for (int i = 0; tok_kw[i] != NULL; i++) {
Damiena5185f42013-10-20 14:41:27 +0100586 if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
Damiend99b0522013-12-21 18:17:45 +0000587 tok->kind = MP_TOKEN_KW_FALSE + i;
Damien429d7192013-10-04 19:53:11 +0100588 break;
589 }
590 }
591 }
592}
593
Damiend99b0522013-12-21 18:17:45 +0000594mp_lexer_t *mp_lexer_new(const char *src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
595 mp_lexer_t *lex = m_new(mp_lexer_t, 1);
Damien429d7192013-10-04 19:53:11 +0100596
Damiena5185f42013-10-20 14:41:27 +0100597 lex->name = src_name; // TODO do we need to strdup this?
598 lex->stream_data = stream_data;
599 lex->stream_next_char = stream_next_char;
Damienfa2162b2013-10-20 17:42:00 +0100600 lex->stream_close = stream_close;
Damien429d7192013-10-04 19:53:11 +0100601 lex->line = 1;
602 lex->column = 1;
Damien429d7192013-10-04 19:53:11 +0100603 lex->emit_dent = 0;
604 lex->nested_bracket_level = 0;
605 lex->alloc_indent_level = 16;
606 lex->num_indent_level = 1;
607 lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
608 lex->indent_level[0] = 0;
Paul Sokolovsky5d2499c2014-01-13 23:15:23 +0200609 vstr_init(&lex->vstr, 32);
Damien429d7192013-10-04 19:53:11 +0100610
611 // preload characters
Damiena5185f42013-10-20 14:41:27 +0100612 lex->chr0 = stream_next_char(stream_data);
613 lex->chr1 = stream_next_char(stream_data);
614 lex->chr2 = stream_next_char(stream_data);
615
616 // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
Damiend99b0522013-12-21 18:17:45 +0000617 if (lex->chr0 == MP_LEXER_CHAR_EOF) {
Damiena5185f42013-10-20 14:41:27 +0100618 lex->chr0 = '\n';
Damiend99b0522013-12-21 18:17:45 +0000619 } else if (lex->chr1 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100620 if (lex->chr0 != '\n' && lex->chr0 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100621 lex->chr1 = '\n';
Damien429d7192013-10-04 19:53:11 +0100622 }
Damiend99b0522013-12-21 18:17:45 +0000623 } else if (lex->chr2 == MP_LEXER_CHAR_EOF) {
Damien429d7192013-10-04 19:53:11 +0100624 if (lex->chr1 != '\n' && lex->chr1 != '\r') {
Damiena5185f42013-10-20 14:41:27 +0100625 lex->chr2 = '\n';
Damien429d7192013-10-04 19:53:11 +0100626 }
Damien429d7192013-10-04 19:53:11 +0100627 }
628
Damiena5185f42013-10-20 14:41:27 +0100629 // preload first token
Damiend99b0522013-12-21 18:17:45 +0000630 mp_lexer_next_token_into(lex, &lex->tok_cur, true);
Damien429d7192013-10-04 19:53:11 +0100631
632 return lex;
633}
634
Damiend99b0522013-12-21 18:17:45 +0000635void mp_lexer_free(mp_lexer_t *lex) {
Damiena5185f42013-10-20 14:41:27 +0100636 if (lex) {
Damienfa2162b2013-10-20 17:42:00 +0100637 if (lex->stream_close) {
638 lex->stream_close(lex->stream_data);
Damiena5185f42013-10-20 14:41:27 +0100639 }
Damienbb5316b2013-10-22 21:12:29 +0100640 vstr_clear(&lex->vstr);
Damien732407f2013-12-29 19:33:23 +0000641 m_del_obj(mp_lexer_t, lex);
Damien429d7192013-10-04 19:53:11 +0100642 }
Damien429d7192013-10-04 19:53:11 +0100643}
644
Damiend99b0522013-12-21 18:17:45 +0000645void mp_lexer_to_next(mp_lexer_t *lex) {
646 mp_lexer_next_token_into(lex, &lex->tok_cur, false);
Damien429d7192013-10-04 19:53:11 +0100647}
648
Damiend99b0522013-12-21 18:17:45 +0000649const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
Damien429d7192013-10-04 19:53:11 +0100650 return &lex->tok_cur;
651}
652
Damiend99b0522013-12-21 18:17:45 +0000653bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
Damien429d7192013-10-04 19:53:11 +0100654 return lex->tok_cur.kind == kind;
655}
656
657/*
Damiend99b0522013-12-21 18:17:45 +0000658bool mp_lexer_is_str(mp_lexer_t *lex, const char *str) {
659 return mp_token_is_str(&lex->tok_cur, str);
Damien429d7192013-10-04 19:53:11 +0100660}
661
Damiend99b0522013-12-21 18:17:45 +0000662bool mp_lexer_opt_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
663 if (mp_lexer_is_kind(lex, kind)) {
664 mp_lexer_to_next(lex);
Damien429d7192013-10-04 19:53:11 +0100665 return true;
666 }
667 return false;
668}
669
Damiend99b0522013-12-21 18:17:45 +0000670bool mp_lexer_opt_str(mp_lexer_t *lex, const char *str) {
671 if (mp_lexer_is_str(lex, str)) {
672 mp_lexer_to_next(lex);
Damien429d7192013-10-04 19:53:11 +0100673 return true;
674 }
675 return false;
676}
677*/
678
Damien George9528cd62014-01-15 21:23:31 +0000679bool mp_lexer_show_error_pythonic_prefix(mp_lexer_t *lex) {
680 printf(" File \"%s\", line %d column %d\n", lex->tok_cur.src_name, lex->tok_cur.src_line, lex->tok_cur.src_column);
681 return false;
Damien429d7192013-10-04 19:53:11 +0100682}
Damien91d387d2013-10-09 15:09:52 +0100683
Damiend99b0522013-12-21 18:17:45 +0000684bool mp_lexer_show_error_pythonic(mp_lexer_t *lex, const char *msg) {
Damien91d387d2013-10-09 15:09:52 +0100685 printf(" File \"%s\", line %d column %d\n%s\n", lex->tok_cur.src_name, lex->tok_cur.src_line, lex->tok_cur.src_column, msg);
686 return false;
687}