/* lexer.c -- simple tokeniser for Python implementation
 */

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#include "misc.h"
#include "lexer.h"

#define TAB_SIZE (8)
#define CHR_EOF (-1)

struct _py_lexer_t {
    const char *name;           // (file) name of source
    bool free;                  // free source when done with it

    const char *src_beg;        // beginning of source
    const char *src_cur;        // current location in source; points to chr0
    const char *src_end;        // end (exclusive) of source
    unichar chr0, chr1, chr2;   // current characters from source

    uint line;                  // source line
    uint column;                // source column

    uint cont_line;             // continued line

    int emit_dent;
    int nested_bracket_level;

    uint alloc_indent_level;
    uint num_indent_level;
    uint16_t *indent_level;

    py_token_t tok_cur;
    py_token_t tok_next;
};

static bool py_token_is_str(const py_token_t *tok, const char *str) {
    uint i = 0;
    const char *tstr = tok->str;

    while (i < tok->len && *tstr == *str) {
        ++i;
        ++tstr;
        ++str;
    }

    return i == tok->len && *str == 0;
}
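
// Worked example of the comparison above (illustrative, not from the
// original file): tok->str points into the source buffer and is NOT
// nul-terminated, so strcmp cannot be used; both the length and the
// terminator of str must match.  For a 2-byte token "in" inside "int x":
//     py_token_is_str(tok, "in")  -> true  (all bytes and lengths match)
//     py_token_is_str(tok, "int") -> false (token is only 2 bytes long)
//     py_token_is_str(tok, "i")   -> false (token has a trailing 'n')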

void py_token_show(const py_token_t *tok) {
    printf("(%s:%d:%d) kind:%d cont_line:%d str:%p len:%d", tok->src_name, tok->src_line, tok->src_column, tok->kind, tok->cont_line, tok->str, tok->len);
    if (tok->str != NULL && tok->len > 0) {
        const char *i = tok->str;
        const char *j = i + tok->len;
        printf(" ");
        while (i < j) {
            unichar c = g_utf8_get_char(i);
            i = g_utf8_next_char(i);
            if (g_unichar_isprint(c)) {
                printf("%c", c);
            } else {
                printf("?");
            }
        }
    }
    printf("\n");
}

void py_token_show_error_prefix(const py_token_t *tok) {
    printf("(%s:%d:%d) ", tok->src_name, tok->src_line, tok->src_column);
}

bool py_token_show_error(const py_token_t *tok, const char *msg) {
    printf("(%s:%d:%d) %s\n", tok->src_name, tok->src_line, tok->src_column, msg);
    return false;
}

static bool is_end(py_lexer_t *lex) {
    return lex->chr0 == CHR_EOF;
}

static bool is_physical_newline(py_lexer_t *lex) {
    return lex->chr0 == '\n' || lex->chr0 == '\r';
}

static bool is_char(py_lexer_t *lex, char c) {
    return lex->chr0 == c;
}

static bool is_char_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr0 == c1 || lex->chr0 == c2;
}

static bool is_char_or3(py_lexer_t *lex, char c1, char c2, char c3) {
    return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}

/*
static bool is_char_following(py_lexer_t *lex, char c) {
    return lex->chr1 == c;
}
*/

static bool is_char_following_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr1 == c1 || lex->chr1 == c2;
}

static bool is_char_following_following_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr2 == c1 || lex->chr2 == c2;
}

static bool is_char_and(py_lexer_t *lex, char c1, char c2) {
    return lex->chr0 == c1 && lex->chr1 == c2;
}

static bool is_whitespace(py_lexer_t *lex) {
    return g_unichar_isspace(lex->chr0);
}

static bool is_letter(py_lexer_t *lex) {
    return g_unichar_isalpha(lex->chr0);
}

static bool is_digit(py_lexer_t *lex) {
    return g_unichar_isdigit(lex->chr0);
}

static bool is_following_digit(py_lexer_t *lex) {
    return g_unichar_isdigit(lex->chr1);
}

// TODO UNICODE include unicode characters in definition of identifiers
static bool is_head_of_identifier(py_lexer_t *lex) {
    return is_letter(lex) || lex->chr0 == '_';
}

// TODO UNICODE include unicode characters in definition of identifiers
static bool is_tail_of_identifier(py_lexer_t *lex) {
    return is_head_of_identifier(lex) || is_digit(lex);
}

static void next_char(py_lexer_t *lex) {
    if (lex->chr0 == CHR_EOF) {
        return;
    }

    int advance = 1;

    if (lex->chr0 == '\n') {
        // LF is a new line
        ++lex->line;
        lex->column = 1;
        lex->cont_line = lex->line;
    } else if (lex->chr0 == '\r') {
        // CR is a new line
        ++lex->line;
        lex->column = 1;
        lex->cont_line = lex->line;
        if (lex->chr1 == '\n') {
            // CR LF is a single new line
            advance = 2;
        }
    } else if (lex->chr0 == '\t') {
        // a tab
        lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
    } else {
        // a character worth one column
        ++lex->column;
    }

    for (; advance > 0; advance--) {
        lex->chr0 = lex->chr1;
        lex->chr1 = lex->chr2;
        lex->src_cur++;
        if (lex->src_cur + 2 < lex->src_end) {
            lex->chr2 = lex->src_cur[2];
        } else {
            // EOF
            if (lex->chr1 != CHR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
                lex->chr2 = '\n'; // insert newline at end of file
            } else {
                lex->chr2 = CHR_EOF;
            }
        }
    }
}
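
// Worked example of the tab rule above (illustrative, not from the original
// file): with TAB_SIZE 8, a tab moves the 1-based column to the next
// multiple of TAB_SIZE, plus 1:
//     columns 1..8  -> column 9
//     columns 9..16 -> column 17
// e.g. column 3: ((3 - 1 + 8) / 8) * 8 + 1 = (10 / 8) * 8 + 1 = 8 + 1 = 9.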

void indent_push(py_lexer_t *lex, uint indent) {
    if (lex->num_indent_level >= lex->alloc_indent_level) {
        lex->alloc_indent_level *= 2;
        lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level);
    }
    lex->indent_level[lex->num_indent_level++] = indent;
}

uint indent_top(py_lexer_t *lex) {
    return lex->indent_level[lex->num_indent_level - 1];
}

void indent_pop(py_lexer_t *lex) {
    lex->num_indent_level -= 1;
}
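
// Sketch of how the indent stack drives INDENT/DEDENT (illustrative,
// reconstructed from the tokeniser below): the stack starts as {0}; a line
// indented deeper than indent_top() pushes its indent and queues one INDENT,
// a shallower line pops until the levels match, queuing one DEDENT per pop.
// For the source
//     if x:
//             y    # indented to column 9: push 8, emit one INDENT
//     z            # back to column 1: pop 8, emit one DEDENT
// the stack goes {0} -> {0, 8} -> {0}.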

// some tricky operator encoding:
//     <op>  = begin with <op>, if this opchar matches then begin here
//     e<op> = end with <op>, if this opchar matches then end
//     E<op> = mandatory end with <op>, this opchar must match, then end
//     c<op> = continue with <op>, if this opchar matches then continue matching
// this means that if two ops start with the same characters, they must be
// identical all the way up to their last char

static const char *tok_enc =
    "()[]{},:;@~" // singles
    "<e=c<e="     // < <= << <<=
    ">e=c>e="     // > >= >> >>=
    "*e=c*e="     // * *= ** **=
    "+e="         // + +=
    "-e=e>"       // - -= ->
    "&e="         // & &=
    "|e="         // | |=
    "/e=c/e="     // / /= // //=
    "%e="         // % %=
    "^e="         // ^ ^=
    "=e="         // = ==
    "!E="         // !=
    ".c.E.";      // . ...

// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
static const uint8_t tok_enc_kind[] = {
    PY_TOKEN_DEL_PAREN_OPEN, PY_TOKEN_DEL_PAREN_CLOSE,
    PY_TOKEN_DEL_BRACKET_OPEN, PY_TOKEN_DEL_BRACKET_CLOSE,
    PY_TOKEN_DEL_BRACE_OPEN, PY_TOKEN_DEL_BRACE_CLOSE,
    PY_TOKEN_DEL_COMMA, PY_TOKEN_DEL_COLON, PY_TOKEN_DEL_SEMICOLON, PY_TOKEN_DEL_AT, PY_TOKEN_OP_TILDE,

    PY_TOKEN_OP_LESS, PY_TOKEN_OP_LESS_EQUAL, PY_TOKEN_OP_DBL_LESS, PY_TOKEN_DEL_DBL_LESS_EQUAL,
    PY_TOKEN_OP_MORE, PY_TOKEN_OP_MORE_EQUAL, PY_TOKEN_OP_DBL_MORE, PY_TOKEN_DEL_DBL_MORE_EQUAL,
    PY_TOKEN_OP_STAR, PY_TOKEN_DEL_STAR_EQUAL, PY_TOKEN_OP_DBL_STAR, PY_TOKEN_DEL_DBL_STAR_EQUAL,
    PY_TOKEN_OP_PLUS, PY_TOKEN_DEL_PLUS_EQUAL,
    PY_TOKEN_OP_MINUS, PY_TOKEN_DEL_MINUS_EQUAL, PY_TOKEN_DEL_MINUS_MORE,
    PY_TOKEN_OP_AMPERSAND, PY_TOKEN_DEL_AMPERSAND_EQUAL,
    PY_TOKEN_OP_PIPE, PY_TOKEN_DEL_PIPE_EQUAL,
    PY_TOKEN_OP_SLASH, PY_TOKEN_DEL_SLASH_EQUAL, PY_TOKEN_OP_DBL_SLASH, PY_TOKEN_DEL_DBL_SLASH_EQUAL,
    PY_TOKEN_OP_PERCENT, PY_TOKEN_DEL_PERCENT_EQUAL,
    PY_TOKEN_OP_CARET, PY_TOKEN_DEL_CARET_EQUAL,
    PY_TOKEN_DEL_EQUAL, PY_TOKEN_OP_DBL_EQUAL,
    PY_TOKEN_OP_NOT_EQUAL,
    PY_TOKEN_DEL_PERIOD, PY_TOKEN_ELLIPSES,
};

// must have the same order as enum in lexer.h
static const char *tok_kw[] = {
    "False",
    "None",
    "True",
    "and",
    "as",
    "assert",
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
    NULL,
};
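
// Keyword lookup sketch (illustrative): because this table has the same
// order as the token enum, a name matching the keyword at index i is turned
// into token kind PY_TOKEN_KW_FALSE + i at the end of the tokeniser below.
// E.g. "def" sits at index 9, so it becomes PY_TOKEN_KW_FALSE + 9 (that is,
// PY_TOKEN_KW_DEF, assuming the enum in lexer.h follows this naming pattern).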

static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok) {
    bool had_physical_newline = false;

    while (!is_end(lex)) {
        if (is_physical_newline(lex)) {
            had_physical_newline = true;
            next_char(lex);
        } else if (is_whitespace(lex)) {
            next_char(lex);
        } else if (is_char(lex, '#')) {
            next_char(lex);
            while (!is_end(lex) && !is_physical_newline(lex)) {
                next_char(lex);
            }
            // had_physical_newline will be set on next loop
        } else if (is_char(lex, '\\')) {
            // backslash (outside string literals) must appear just before a physical newline
            next_char(lex);
            if (!is_physical_newline(lex)) {
                // TODO SyntaxError
                assert(0);
            } else {
                next_char(lex);
            }
        } else {
            break;
        }
    }

    tok->src_name = lex->name;
    tok->src_line = lex->line;
    tok->src_column = lex->column;
    tok->kind = PY_TOKEN_INVALID;
    tok->cont_line = lex->cont_line;
    tok->str = lex->src_cur;
    tok->len = 0;

    if (lex->emit_dent < 0) {
        tok->kind = PY_TOKEN_DEDENT;
        lex->emit_dent += 1;

    } else if (lex->emit_dent > 0) {
        tok->kind = PY_TOKEN_INDENT;
        lex->emit_dent -= 1;

    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
        tok->kind = PY_TOKEN_NEWLINE;

        uint num_spaces = lex->column - 1;
        lex->emit_dent = 0;
        if (num_spaces == indent_top(lex)) {
        } else if (num_spaces > indent_top(lex)) {
            indent_push(lex, num_spaces);
            lex->emit_dent += 1;
        } else {
            while (num_spaces < indent_top(lex)) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
            if (num_spaces != indent_top(lex)) {
                tok->kind = PY_TOKEN_DEDENT_MISMATCH;
            }
        }

    } else if (is_end(lex)) {
        if (indent_top(lex) > 0) {
            tok->kind = PY_TOKEN_NEWLINE;
            lex->emit_dent = 0;
            while (indent_top(lex) > 0) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
        } else {
            tok->kind = PY_TOKEN_END;
        }

    } else if (is_char_or(lex, '\'', '\"')
               || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
               || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
        // a string or bytes literal

        // parse type codes
        bool is_raw = false;
        bool is_bytes = false;
        if (is_char(lex, 'u')) {
            next_char(lex);
        } else if (is_char(lex, 'b')) {
            is_bytes = true;
            next_char(lex);
            if (is_char(lex, 'r')) {
                is_raw = true;
                next_char(lex);
            }
        } else if (is_char(lex, 'r')) {
            is_raw = true;
            next_char(lex);
            if (is_char(lex, 'b')) {
                is_bytes = true;
                next_char(lex);
            }
        }

        // set token kind
        if (is_bytes) {
            tok->kind = PY_TOKEN_BYTES;
        } else {
            tok->kind = PY_TOKEN_STRING;
        }

        // get first quoting character
        char quote_char = '\'';
        if (is_char(lex, '\"')) {
            quote_char = '\"';
        }
        next_char(lex);

        // work out if it's a single or triple quoted literal
        int num_quotes;
        if (is_char_and(lex, quote_char, quote_char)) {
            // triple quotes
            next_char(lex);
            next_char(lex);
            num_quotes = 3;
        } else {
            // single quotes
            num_quotes = 1;
        }

        // set start of token
        tok->str = lex->src_cur;

        // parse the literal
        // TODO proper escaping
        int n_closing = 0;
        while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
            if (is_char(lex, quote_char)) {
                n_closing += 1;
            } else {
                n_closing = 0;
                if (!is_raw && is_char(lex, '\\')) {
                    next_char(lex);
                }
            }
            next_char(lex);
        }

        // check we got the required end quotes
        if (n_closing < num_quotes) {
            tok->kind = PY_TOKEN_LONELY_STRING_OPEN;
        }

        // set token string (byte) length
        tok->len = lex->src_cur - tok->str - n_closing;

        // we set the length, return now so it's not set incorrectly below
        return;

    } else if (is_head_of_identifier(lex)) {
        tok->kind = PY_TOKEN_NAME;

        next_char(lex);

        while (!is_end(lex) && is_tail_of_identifier(lex)) {
            next_char(lex);
        }

    } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
        tok->kind = PY_TOKEN_NUMBER;

        next_char(lex);

        while (!is_end(lex)) {
            if (is_char_or(lex, 'e', 'E')) {
                next_char(lex);
                if (is_char(lex, '+') || is_char(lex, '-')) {
                    next_char(lex);
                }
            } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
                next_char(lex);
            } else {
                break;
            }
        }
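
        // Note (illustrative, not from the original file): this scan is
        // deliberately permissive.  It consumes "1.5e-3" and "0x1f", but
        // also malformed spellings such as "1.2.3", presumably leaving
        // numeric validation to a later stage.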

    } else {
        // search for encoded delimiter or operator

        const char *t = tok_enc;
        uint tok_enc_index = 0;
        for (; *t != 0 && !is_char(lex, *t); t += 1) {
            if (*t == 'e' || *t == 'c') {
                t += 1;
            } else if (*t == 'E') {
                tok_enc_index -= 1;
                t += 1;
            }
            tok_enc_index += 1;
        }

        next_char(lex);

        if (*t == 0) {
            // didn't match any delimiter or operator characters
            tok->kind = PY_TOKEN_INVALID;

        } else {
            // matched a delimiter or operator character

            // get the maximum characters for a valid token
            t += 1;
            uint t_index = tok_enc_index;
            for (;;) {
                for (; *t == 'e'; t += 1) {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        break;
                    }
                }

                if (*t == 'E') {
                    t += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                    } else {
                        tok->kind = PY_TOKEN_INVALID;
                    }
                    break;
                }

                if (*t == 'c') {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        t += 1;
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }

            // set token kind
            tok->kind = tok_enc_kind[tok_enc_index];

            // compute bracket level for implicit line joining
            if (tok->kind == PY_TOKEN_DEL_PAREN_OPEN || tok->kind == PY_TOKEN_DEL_BRACKET_OPEN || tok->kind == PY_TOKEN_DEL_BRACE_OPEN) {
                lex->nested_bracket_level += 1;
            } else if (tok->kind == PY_TOKEN_DEL_PAREN_CLOSE || tok->kind == PY_TOKEN_DEL_BRACKET_CLOSE || tok->kind == PY_TOKEN_DEL_BRACE_CLOSE) {
                lex->nested_bracket_level -= 1;
            }
        }
    }

    // set token string (byte) length
    tok->len = lex->src_cur - tok->str;

    // check for keywords (must be done after setting token string length)
    if (tok->kind == PY_TOKEN_NAME) {
        for (int i = 0; tok_kw[i] != NULL; i++) {
            if (py_token_is_str(tok, tok_kw[i])) {
                tok->kind = PY_TOKEN_KW_FALSE + i;
                break;
            }
        }
    }
}

py_lexer_t *py_lexer_from_str_len(const char *src_name, const char *str, uint len, bool free_str) {
    py_lexer_t *lex;

    lex = m_new(py_lexer_t, 1);

    //lex->name = g_strdup(src_name); // TODO
    lex->name = src_name;
    lex->free = free_str;
    lex->src_beg = str;
    lex->src_cur = str;
    lex->src_end = str + len;
    lex->line = 1;
    lex->column = 1;
    lex->cont_line = lex->line;
    lex->emit_dent = 0;
    lex->nested_bracket_level = 0;
    lex->alloc_indent_level = 16;
    lex->num_indent_level = 1;
    lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
    lex->indent_level[0] = 0;

    // preload characters
    // TODO unicode
    if (len == 0) {
        lex->chr0 = '\n'; // insert newline at end of file
        lex->chr1 = CHR_EOF;
        lex->chr2 = CHR_EOF;
    } else if (len == 1) {
        lex->chr0 = str[0];
        if (lex->chr0 != '\n' && lex->chr0 != '\r') {
            lex->chr1 = '\n'; // insert newline at end of file
        } else {
            lex->chr1 = CHR_EOF;
        }
        lex->chr2 = CHR_EOF;
    } else if (len == 2) {
        lex->chr0 = str[0];
        lex->chr1 = str[1];
        if (lex->chr1 != '\n' && lex->chr1 != '\r') {
            lex->chr2 = '\n'; // insert newline at end of file
        } else {
            lex->chr2 = CHR_EOF;
        }
    } else {
        lex->chr0 = str[0];
        lex->chr1 = str[1];
        lex->chr2 = str[2];
    }

    py_lexer_next_token_into(lex, &lex->tok_cur);

    // check that the first token is in the first column
    // (done to get equivalence with CPython)
    if (lex->tok_cur.src_line == 1 && lex->tok_cur.src_column != 1) {
        lex->tok_next = lex->tok_cur;
        lex->tok_cur.kind = PY_TOKEN_INDENT;
    } else {
        py_lexer_next_token_into(lex, &lex->tok_next);
    }

    return lex;
}
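
// A minimal driver sketch (assumed usage, not part of the original file):
// tokenise a string and print each token until the end marker is reached.
//
//     py_lexer_t *lex = py_lexer_from_str_len("<stdin>", "x = 1\n", 6, false);
//     while (!py_lexer_is_kind(lex, PY_TOKEN_END)) {
//         py_token_show(py_lexer_cur(lex));
//         py_lexer_to_next(lex);
//     }
//     py_lexer_free(lex);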

void py_lexer_free(py_lexer_t *lex) {
    if (lex == NULL) {
        return;
    }
    //m_free(lex->name);
    if (lex->free) {
        m_free((char*)lex->src_beg);
    }
    m_free(lex);
}

void py_lexer_to_next(py_lexer_t *lex) {
    lex->tok_cur = lex->tok_next;
    py_lexer_next_token_into(lex, &lex->tok_next);
}

const py_token_t *py_lexer_cur(const py_lexer_t *lex) {
    return &lex->tok_cur;
}

bool py_lexer_is_kind(py_lexer_t *lex, py_token_kind_t kind) {
    return lex->tok_cur.kind == kind;
}

/*
bool py_lexer_is_str(py_lexer_t *lex, const char *str) {
    return py_token_is_str(&lex->tok_cur, str);
}

bool py_lexer_is_next_kind(py_lexer_t *lex, py_token_kind_t kind) {
    return lex->tok_next.kind == kind;
}

bool py_lexer_is_next_str(py_lexer_t *lex, const char *str) {
    return py_token_is_str(&lex->tok_next, str);
}

bool py_lexer_opt_kind(py_lexer_t *lex, py_token_kind_t kind) {
    if (py_lexer_is_kind(lex, kind)) {
        py_lexer_to_next(lex);
        return true;
    }
    return false;
}

bool py_lexer_opt_str(py_lexer_t *lex, const char *str) {
    if (py_lexer_is_str(lex, str)) {
        py_lexer_to_next(lex);
        return true;
    }
    return false;
}
*/

bool py_lexer_show_error(py_lexer_t *lex, const char *msg) {
    return py_token_show_error(&lex->tok_cur, msg);
}

bool py_lexer_show_error_pythonic(py_lexer_t *lex, const char *msg) {
    printf("  File \"%s\", line %d column %d\n%s\n", lex->tok_cur.src_name, lex->tok_cur.src_line, lex->tok_cur.src_column, msg);
    return false;
}