/* lexer.c -- simple tokeniser for Python implementation
 */

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#include "misc.h"
#include "lexer.h"

#define TAB_SIZE (8)
#define CHR_EOF (-1)

struct _py_lexer_t {
    const char *name;           // (file) name of source
    bool free;                  // free source when done with it

    const char *src_beg;        // beginning of source
    const char *src_cur;        // current location in source; points to chr0
    const char *src_end;        // end (exclusive) of source
    unichar chr0, chr1, chr2;   // current characters from source

    uint line;                  // source line
    uint column;                // source column

    uint cont_line;             // continued line

    int emit_dent;
    int nested_bracket_level;

    uint alloc_indent_level;
    uint num_indent_level;
    uint16_t *indent_level;

    py_token_t tok_cur;
    py_token_t tok_next;
};

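// compare a token's (not NUL-terminated) text with a C string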
static bool py_token_is_str(const py_token_t *tok, const char *str) {
    uint i = 0;
    const char *tstr = tok->str;

    while (i < tok->len && *tstr == *str) {
        ++i;
        ++tstr;
        ++str;
    }

    return i == tok->len && *str == 0;
}

void py_token_show(const py_token_t *tok) {
    printf("(%s:%d:%d) kind:%d cont_line:%d str:%p len:%d", tok->src_name, tok->src_line, tok->src_column, tok->kind, tok->cont_line, tok->str, tok->len);
    if (tok->str != NULL && tok->len > 0) {
        const char *i = tok->str;
        const char *j = i + tok->len;
        printf(" ");
        while (i < j) {
            unichar c = g_utf8_get_char(i);
            i = g_utf8_next_char(i);
            if (g_unichar_isprint(c)) {
                printf("%c", c);
            } else {
                printf("?");
            }
        }
    }
    printf("\n");
}

void py_token_show_error_prefix(const py_token_t *tok) {
    printf("(%s:%d:%d) ", tok->src_name, tok->src_line, tok->src_column);
}

bool py_token_show_error(const py_token_t *tok, const char *msg) {
    printf("(%s:%d:%d) %s\n", tok->src_name, tok->src_line, tok->src_column, msg);
    return false;
}

static bool is_end(py_lexer_t *lex) {
    return lex->chr0 == CHR_EOF;
}

static bool is_physical_newline(py_lexer_t *lex) {
    return lex->chr0 == '\n' || lex->chr0 == '\r';
}

static bool is_char(py_lexer_t *lex, char c) {
    return lex->chr0 == c;
}

static bool is_char_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr0 == c1 || lex->chr0 == c2;
}

static bool is_char_or3(py_lexer_t *lex, char c1, char c2, char c3) {
    return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}

/*
static bool is_char_following(py_lexer_t *lex, char c) {
    return lex->chr1 == c;
}
*/

static bool is_char_following_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr1 == c1 || lex->chr1 == c2;
}

static bool is_char_following_following_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr2 == c1 || lex->chr2 == c2;
}

static bool is_char_and(py_lexer_t *lex, char c1, char c2) {
    return lex->chr0 == c1 && lex->chr1 == c2;
}

static bool is_whitespace(py_lexer_t *lex) {
    return g_unichar_isspace(lex->chr0);
}

static bool is_letter(py_lexer_t *lex) {
    return g_unichar_isalpha(lex->chr0);
}

static bool is_digit(py_lexer_t *lex) {
    return g_unichar_isdigit(lex->chr0);
}

static bool is_following_digit(py_lexer_t *lex) {
    return g_unichar_isdigit(lex->chr1);
}

// TODO UNICODE include unicode characters in definition of identifiers
static bool is_head_of_identifier(py_lexer_t *lex) {
    return is_letter(lex) || lex->chr0 == '_';
}

// TODO UNICODE include unicode characters in definition of identifiers
static bool is_tail_of_identifier(py_lexer_t *lex) {
    return is_head_of_identifier(lex) || is_digit(lex);
}

static void next_char(py_lexer_t *lex) {
    if (lex->chr0 == CHR_EOF) {
        return;
    }

    int advance = 1;

    if (lex->chr0 == '\n') {
        // LF is a new line
        ++lex->line;
        lex->column = 1;
        lex->cont_line = lex->line;
    } else if (lex->chr0 == '\r') {
        // CR is a new line
        ++lex->line;
        lex->column = 1;
        lex->cont_line = lex->line;
        if (lex->chr1 == '\n') {
            // CR LF is a single new line
            advance = 2;
        }
    } else if (lex->chr0 == '\t') {
        // a tab
        lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
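        // i.e. advance to the next tab stop: with TAB_SIZE 8, columns 1..8 -> 9, columns 9..16 -> 17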
    } else {
        // a character worth one column
        ++lex->column;
    }

    for (; advance > 0; advance--) {
        lex->chr0 = lex->chr1;
        lex->chr1 = lex->chr2;
        lex->src_cur++;
        if (lex->src_cur + 2 < lex->src_end) {
            lex->chr2 = lex->src_cur[2];
        } else {
            // EOF
            if (lex->chr1 != '\n' && lex->chr1 != '\r') {
                lex->chr2 = '\n'; // insert newline at end of file
            } else {
                lex->chr2 = CHR_EOF;
            }
        }
    }
}

void indent_push(py_lexer_t *lex, uint indent) {
    if (lex->num_indent_level >= lex->alloc_indent_level) {
        lex->alloc_indent_level *= 2;
        lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level);
    }
    lex->indent_level[lex->num_indent_level++] = indent;
}

uint indent_top(py_lexer_t *lex) {
    return lex->indent_level[lex->num_indent_level - 1];
}

void indent_pop(py_lexer_t *lex) {
    lex->num_indent_level -= 1;
}

// some tricky operator encoding:
//     <op>  = begin with <op>; if this opchar matches then begin here
//     e<op> = end with <op>; if this opchar matches then end
//     E<op> = mandatory end with <op>; this opchar must match, then end
//     c<op> = continue with <op>; if this opchar matches then continue matching
// this means that if two ops start with the same char, they must be equal up to the last char
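//
// e.g. the fragment "<e=c<e=" encodes, in order: "<", "<=", "<<", "<<="
//     '<'  starts the match
//     'e=' ends it as "<=" if the next char is '='
//     'c<' otherwise continues as "<<" if the next char is '<'
//     'e=' which in turn ends as "<<=" if a '=' follows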

static const char *tok_enc =
    "()[]{},:;@~" // singles
    "<e=c<e="     // < <= << <<=
    ">e=c>e="     // > >= >> >>=
    "*e=c*e="     // * *= ** **=
    "+e="         // + +=
    "-e=e>"       // - -= ->
    "&e="         // & &=
    "|e="         // | |=
    "/e=c/e="     // / /= // //=
    "%e="         // % %=
    "^e="         // ^ ^=
    "=e="         // = ==
    "!E="         // !=
    ".c.E.";      // . ...

// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
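// (with C11 this could be e.g. _Static_assert(PY_TOKEN_NUMBER_OF <= 256, "token kinds must fit in a byte"),
// assuming lexer.h provides such a count; the name PY_TOKEN_NUMBER_OF is hypothetical)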
static const uint8_t tok_enc_kind[] = {
    PY_TOKEN_DEL_PAREN_OPEN, PY_TOKEN_DEL_PAREN_CLOSE,
    PY_TOKEN_DEL_BRACKET_OPEN, PY_TOKEN_DEL_BRACKET_CLOSE,
    PY_TOKEN_DEL_BRACE_OPEN, PY_TOKEN_DEL_BRACE_CLOSE,
    PY_TOKEN_DEL_COMMA, PY_TOKEN_DEL_COLON, PY_TOKEN_DEL_SEMICOLON, PY_TOKEN_DEL_AT, PY_TOKEN_OP_TILDE,

    PY_TOKEN_OP_LESS, PY_TOKEN_OP_LESS_EQUAL, PY_TOKEN_OP_DBL_LESS, PY_TOKEN_DEL_DBL_LESS_EQUAL,
    PY_TOKEN_OP_MORE, PY_TOKEN_OP_MORE_EQUAL, PY_TOKEN_OP_DBL_MORE, PY_TOKEN_DEL_DBL_MORE_EQUAL,
    PY_TOKEN_OP_STAR, PY_TOKEN_DEL_STAR_EQUAL, PY_TOKEN_OP_DBL_STAR, PY_TOKEN_DEL_DBL_STAR_EQUAL,
    PY_TOKEN_OP_PLUS, PY_TOKEN_DEL_PLUS_EQUAL,
    PY_TOKEN_OP_MINUS, PY_TOKEN_DEL_MINUS_EQUAL, PY_TOKEN_DEL_MINUS_MORE,
    PY_TOKEN_OP_AMPERSAND, PY_TOKEN_DEL_AMPERSAND_EQUAL,
    PY_TOKEN_OP_PIPE, PY_TOKEN_DEL_PIPE_EQUAL,
    PY_TOKEN_OP_SLASH, PY_TOKEN_DEL_SLASH_EQUAL, PY_TOKEN_OP_DBL_SLASH, PY_TOKEN_DEL_DBL_SLASH_EQUAL,
    PY_TOKEN_OP_PERCENT, PY_TOKEN_DEL_PERCENT_EQUAL,
    PY_TOKEN_OP_CARET, PY_TOKEN_DEL_CARET_EQUAL,
    PY_TOKEN_DEL_EQUAL, PY_TOKEN_OP_DBL_EQUAL,
    PY_TOKEN_OP_NOT_EQUAL,
    PY_TOKEN_DEL_PERIOD, PY_TOKEN_ELLIPSES,
};

// must have the same order as enum in lexer.h
static const char *tok_kw[] = {
    "False",
    "None",
    "True",
    "and",
    "as",
    "assert",
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
    NULL,
};

static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok) {
    bool had_physical_newline = false;

    while (!is_end(lex)) {
        if (is_physical_newline(lex)) {
            had_physical_newline = true;
            next_char(lex);
        } else if (is_whitespace(lex)) {
            next_char(lex);
        } else if (is_char(lex, '#')) {
            next_char(lex);
            while (!is_end(lex) && !is_physical_newline(lex)) {
                next_char(lex);
            }
            // had_physical_newline will be set on next loop
        } else if (is_char(lex, '\\')) {
            // backslash (outside string literals) must appear just before a physical newline
            next_char(lex);
            if (!is_physical_newline(lex)) {
                // TODO SyntaxError
                assert(0);
            } else {
                next_char(lex);
            }
        } else {
            break;
        }
    }

    tok->src_name = lex->name;
    tok->src_line = lex->line;
    tok->src_column = lex->column;
    tok->kind = PY_TOKEN_INVALID;
    tok->cont_line = lex->cont_line;
    tok->str = lex->src_cur;
    tok->len = 0;

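    // emit_dent counts INDENT (positive) or DEDENT (negative) tokens still owed
    // from a previous change of indentation; they are emitted one per call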
    if (lex->emit_dent < 0) {
        tok->kind = PY_TOKEN_DEDENT;
        lex->emit_dent += 1;

    } else if (lex->emit_dent > 0) {
        tok->kind = PY_TOKEN_INDENT;
        lex->emit_dent -= 1;

    } else if (had_physical_newline && lex->nested_bracket_level == 0
        && tok != &lex->tok_cur // so that we don't emit a newline if file starts with a comment
        ) {
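        // e.g. for "if x:\n    y\nz" the line "    y" yields NEWLINE and queues
        // one INDENT (emit_dent = 1), and the line "z" yields NEWLINE and queues
        // one DEDENT (emit_dent = -1); the dents are emitted on subsequent calls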
        tok->kind = PY_TOKEN_NEWLINE;

        uint num_spaces = lex->column - 1;
        lex->emit_dent = 0;
        if (num_spaces == indent_top(lex)) {
        } else if (num_spaces > indent_top(lex)) {
            indent_push(lex, num_spaces);
            lex->emit_dent += 1;
        } else {
            while (num_spaces < indent_top(lex)) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
            if (num_spaces != indent_top(lex)) {
                //SyntaxError
            }
        }

    } else if (is_end(lex)) {
        // TODO emit a newline if file does not end in one
        if (indent_top(lex) > 0) {
            tok->kind = PY_TOKEN_NEWLINE;
            lex->emit_dent = 0;
            while (indent_top(lex) > 0) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
        } else {
            tok->kind = PY_TOKEN_END;
        }

    } else if (is_char_or(lex, '\'', '\"')
        || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
        || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
        // a string or bytes literal

        // parse type codes
        bool is_raw = false;
        bool is_bytes = false;
        if (is_char(lex, 'u')) {
            next_char(lex);
        } else if (is_char(lex, 'b')) {
            is_bytes = true;
            next_char(lex);
            if (is_char(lex, 'r')) {
                is_raw = true;
                next_char(lex);
            }
        } else if (is_char(lex, 'r')) {
            is_raw = true;
            next_char(lex);
            if (is_char(lex, 'b')) {
                is_bytes = true;
                next_char(lex);
            }
        }

        // set token kind
        if (is_bytes) {
            tok->kind = PY_TOKEN_BYTES;
        } else {
            tok->kind = PY_TOKEN_STRING;
        }

        // get first quoting character
        char quote_char = '\'';
        if (is_char(lex, '\"')) {
            quote_char = '\"';
        }
        next_char(lex);

        // work out if it's a single or triple quoted literal
        int num_quotes;
        if (is_char_and(lex, quote_char, quote_char)) {
            // triple quotes
            next_char(lex);
            next_char(lex);
            num_quotes = 3;
        } else {
            // single quotes
            num_quotes = 1;
        }

        // set start of token
        tok->str = lex->src_cur;

        // parse the literal
        // TODO proper escaping
        int n_closing = 0;
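        // n_closing counts consecutive quote chars; in non-raw mode a backslash
        // consumes the following char, so an escaped quote cannot close the literal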
        while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
            if (is_char(lex, quote_char)) {
                n_closing += 1;
            } else {
                n_closing = 0;
                if (!is_raw && is_char(lex, '\\')) {
                    next_char(lex);
                }
            }
            next_char(lex);
        }

        // check we got the required end quotes
        if (n_closing < num_quotes) {
            tok->kind = PY_TOKEN_LONELY_STRING_OPEN;
        }

        // set token string (byte) length
        tok->len = lex->src_cur - tok->str - n_closing;

        // we set the length, return now so it's not set incorrectly below
        return;

    } else if (is_head_of_identifier(lex)) {
        tok->kind = PY_TOKEN_NAME;

        next_char(lex);

        while (!is_end(lex) && is_tail_of_identifier(lex)) {
            next_char(lex);
        }

    } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
        tok->kind = PY_TOKEN_NUMBER;

        next_char(lex);

        while (!is_end(lex)) {
            if (is_char_or(lex, 'e', 'E')) {
                next_char(lex);
                if (is_char(lex, '+') || is_char(lex, '-')) {
                    next_char(lex);
                }
            } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
                next_char(lex);
            } else {
                break;
            }
        }

    } else {
        // search for encoded delimiter or operator
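        // e.g. lexing "<<=": the scan below stops at the '<' entry in tok_enc;
        // the matcher then consumes the second '<' via 'c<' and the '=' via 'e=',
        // leaving tok_enc_index at the PY_TOKEN_DEL_DBL_LESS_EQUAL entry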

        const char *t = tok_enc;
        uint tok_enc_index = 0;
        for (; *t != 0 && !is_char(lex, *t); t += 1) {
            if (*t == 'e' || *t == 'c') {
                t += 1;
            } else if (*t == 'E') {
                tok_enc_index -= 1;
                t += 1;
            }
            tok_enc_index += 1;
        }

        next_char(lex);

        if (*t == 0) {
            // didn't match any delimiter or operator characters
            tok->kind = PY_TOKEN_INVALID;

        } else {
            // matched a delimiter or operator character

            // get the maximum characters for a valid token
            t += 1;
            uint t_index = tok_enc_index;
            for (;;) {
                for (; *t == 'e'; t += 1) {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        break;
                    }
                }

                if (*t == 'E') {
                    t += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                    } else {
                        tok->kind = PY_TOKEN_INVALID;
                    }
                    break;
                }

                if (*t == 'c') {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        t += 1;
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }

            // set token kind
            tok->kind = tok_enc_kind[tok_enc_index];

            // compute bracket level for implicit line joining
            if (tok->kind == PY_TOKEN_DEL_PAREN_OPEN || tok->kind == PY_TOKEN_DEL_BRACKET_OPEN || tok->kind == PY_TOKEN_DEL_BRACE_OPEN) {
                lex->nested_bracket_level += 1;
            } else if (tok->kind == PY_TOKEN_DEL_PAREN_CLOSE || tok->kind == PY_TOKEN_DEL_BRACKET_CLOSE || tok->kind == PY_TOKEN_DEL_BRACE_CLOSE) {
                lex->nested_bracket_level -= 1;
            }
        }
    }

    // set token string (byte) length
    tok->len = lex->src_cur - tok->str;

    // check for keywords (must be done after setting token string length)
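    // e.g. a NAME token "and" matches tok_kw[3], so its kind becomes
    // PY_TOKEN_KW_FALSE + 3, the corresponding keyword entry in the lexer.h enum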
    if (tok->kind == PY_TOKEN_NAME) {
        for (int i = 0; tok_kw[i] != NULL; i++) {
            if (py_token_is_str(tok, tok_kw[i])) {
                tok->kind = PY_TOKEN_KW_FALSE + i;
                break;
            }
        }
    }
}

py_lexer_t *py_lexer_from_str_len(const char *src_name, const char *str, uint len, bool free_str) {
    py_lexer_t *lex;

    lex = m_new(py_lexer_t, 1);

    //lex->name = g_strdup(src_name); // TODO
    lex->name = src_name;
    lex->free = free_str;
    lex->src_beg = str;
    lex->src_cur = str;
    lex->src_end = str + len;
    lex->line = 1;
    lex->column = 1;
    lex->cont_line = lex->line;
    lex->emit_dent = 0;
    lex->nested_bracket_level = 0;
    lex->alloc_indent_level = 16;
    lex->num_indent_level = 1;
    lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
    lex->indent_level[0] = 0;

    // preload characters
    // TODO unicode
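    // if the source does not end in a newline, a synthetic '\n' is placed in the
    // lookahead so the last logical line is always terminated (next_char does the
    // same at EOF)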
    if (len == 0) {
        lex->chr0 = '\n'; // insert newline at end of file
        lex->chr1 = CHR_EOF;
        lex->chr2 = CHR_EOF;
    } else if (len == 1) {
        lex->chr0 = str[0];
        if (lex->chr0 != '\n' && lex->chr0 != '\r') {
            lex->chr1 = '\n'; // insert newline at end of file
        } else {
            lex->chr1 = CHR_EOF;
        }
        lex->chr2 = CHR_EOF;
    } else if (len == 2) {
        lex->chr0 = str[0];
        lex->chr1 = str[1];
        if (lex->chr1 != '\n' && lex->chr1 != '\r') {
            lex->chr2 = '\n'; // insert newline at end of file
        } else {
            lex->chr2 = CHR_EOF;
        }
    } else {
        lex->chr0 = str[0];
        lex->chr1 = str[1];
        lex->chr2 = str[2];
    }

    py_lexer_next_token_into(lex, &lex->tok_cur);
    py_lexer_next_token_into(lex, &lex->tok_next);

    return lex;
}

void py_lexer_free(py_lexer_t *lex) {
    if (lex == NULL) {
        return;
    }
    //m_free(lex->name);
    if (lex->free) {
        m_free((char*)lex->src_beg);
    }
    m_free(lex);
}

void py_lexer_to_next(py_lexer_t *lex) {
    lex->tok_cur = lex->tok_next;
    py_lexer_next_token_into(lex, &lex->tok_next);
}
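
// typical usage: a parser inspects py_lexer_cur(lex) and advances with
// py_lexer_to_next(lex); tok_next provides one token of lookahead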

const py_token_t *py_lexer_cur(const py_lexer_t *lex) {
    return &lex->tok_cur;
}

bool py_lexer_is_kind(py_lexer_t *lex, py_token_kind_t kind) {
    return lex->tok_cur.kind == kind;
}

/*
bool py_lexer_is_str(py_lexer_t *lex, const char *str) {
    return py_token_is_str(&lex->tok_cur, str);
}

bool py_lexer_is_next_kind(py_lexer_t *lex, py_token_kind_t kind) {
    return lex->tok_next.kind == kind;
}

bool py_lexer_is_next_str(py_lexer_t *lex, const char *str) {
    return py_token_is_str(&lex->tok_next, str);
}

bool py_lexer_opt_kind(py_lexer_t *lex, py_token_kind_t kind) {
    if (py_lexer_is_kind(lex, kind)) {
        py_lexer_to_next(lex);
        return true;
    }
    return false;
}

bool py_lexer_opt_str(py_lexer_t *lex, const char *str) {
    if (py_lexer_is_str(lex, str)) {
        py_lexer_to_next(lex);
        return true;
    }
    return false;
}
*/

bool py_lexer_show_error(py_lexer_t *lex, const char *msg) {
    return py_token_show_error(&lex->tok_cur, msg);
}