/* lexer.c -- simple tokeniser for Python implementation
 */

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#include "misc.h"
#include "lexer.h"

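// tab stops are every TAB_SIZE columns (8, the same tab size CPython's tokeniser uses)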
#define TAB_SIZE (8)

struct _py_lexer_t {
    const char *name;           // name of source
    void *stream_data;          // data for stream
    py_lexer_stream_next_char_t stream_next_char;   // stream callback to get next char
    py_lexer_stream_close_t stream_close;           // stream callback to free

    unichar chr0, chr1, chr2;   // current cached characters from source

    uint line;                  // source line
    uint column;                // source column

    int emit_dent;              // non-zero when there are INDENT/DEDENT tokens to emit
    int nested_bracket_level;   // >0 when there are nested brackets over multiple lines

    uint alloc_indent_level;
    uint num_indent_level;
    uint16_t *indent_level;

    vstr_t vstr;
    py_token_t tok_cur;
};

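// Returns true if the null-terminated string str is exactly the first len
// characters of strn (ie they match for len chars and str then terminates).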
bool str_strn_equal(const char *str, const char *strn, int len) {
    uint i = 0;

    while (i < len && *str == *strn) {
        ++i;
        ++str;
        ++strn;
    }

    return i == len && *str == 0;
}

void py_token_show(const py_token_t *tok) {
    printf("(%s:%d:%d) kind:%d str:%p len:%d", tok->src_name, tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
    if (tok->str != NULL && tok->len > 0) {
        const char *i = tok->str;
        const char *j = i + tok->len;
        printf(" ");
        while (i < j) {
            unichar c = g_utf8_get_char(i);
            i = g_utf8_next_char(i);
            if (g_unichar_isprint(c)) {
                printf("%c", c);
            } else {
                printf("?");
            }
        }
    }
    printf("\n");
}

void py_token_show_error_prefix(const py_token_t *tok) {
    printf("(%s:%d:%d) ", tok->src_name, tok->src_line, tok->src_column);
}

bool py_token_show_error(const py_token_t *tok, const char *msg) {
    printf("(%s:%d:%d) %s\n", tok->src_name, tok->src_line, tok->src_column, msg);
    return false;
}

#define CUR_CHAR(lex) ((lex)->chr0)

static bool is_end(py_lexer_t *lex) {
    return lex->chr0 == PY_LEXER_CHAR_EOF;
}

static bool is_physical_newline(py_lexer_t *lex) {
    return lex->chr0 == '\n' || lex->chr0 == '\r';
}

static bool is_char(py_lexer_t *lex, char c) {
    return lex->chr0 == c;
}

static bool is_char_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr0 == c1 || lex->chr0 == c2;
}

static bool is_char_or3(py_lexer_t *lex, char c1, char c2, char c3) {
    return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}

/*
static bool is_char_following(py_lexer_t *lex, char c) {
    return lex->chr1 == c;
}
*/

static bool is_char_following_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr1 == c1 || lex->chr1 == c2;
}

static bool is_char_following_following_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr2 == c1 || lex->chr2 == c2;
}

static bool is_char_and(py_lexer_t *lex, char c1, char c2) {
    return lex->chr0 == c1 && lex->chr1 == c2;
}

static bool is_whitespace(py_lexer_t *lex) {
    return g_unichar_isspace(lex->chr0);
}

static bool is_letter(py_lexer_t *lex) {
    return g_unichar_isalpha(lex->chr0);
}

static bool is_digit(py_lexer_t *lex) {
    return g_unichar_isdigit(lex->chr0);
}

static bool is_following_digit(py_lexer_t *lex) {
    return g_unichar_isdigit(lex->chr1);
}

// TODO UNICODE include unicode characters in definition of identifiers
static bool is_head_of_identifier(py_lexer_t *lex) {
    return is_letter(lex) || lex->chr0 == '_';
}

// TODO UNICODE include unicode characters in definition of identifiers
static bool is_tail_of_identifier(py_lexer_t *lex) {
    return is_head_of_identifier(lex) || is_digit(lex);
}

static void next_char(py_lexer_t *lex) {
    if (lex->chr0 == PY_LEXER_CHAR_EOF) {
        return;
    }

    int advance = 1;

    if (lex->chr0 == '\n') {
        // LF is a new line
        ++lex->line;
        lex->column = 1;
    } else if (lex->chr0 == '\r') {
        // CR is a new line
        ++lex->line;
        lex->column = 1;
        if (lex->chr1 == '\n') {
            // CR LF is a single new line
            advance = 2;
        }
    } else if (lex->chr0 == '\t') {
        // a tab
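        // advance the column to the next multiple of TAB_SIZE, plus 1;
        // eg with TAB_SIZE of 8: column 1 -> 9, column 5 -> 9, column 9 -> 17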
        lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
    } else {
        // a character worth one column
        ++lex->column;
    }

    for (; advance > 0; advance--) {
        lex->chr0 = lex->chr1;
        lex->chr1 = lex->chr2;
        lex->chr2 = lex->stream_next_char(lex->stream_data);
        if (lex->chr2 == PY_LEXER_CHAR_EOF) {
            // EOF
            if (lex->chr1 != PY_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
                lex->chr2 = '\n'; // insert newline at end of file
            }
        }
    }
}

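// the indentation stack: indent_level[0] is always 0, the top entry is the
// indent of the current block, and the array grows by doubling when full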
void indent_push(py_lexer_t *lex, uint indent) {
    if (lex->num_indent_level >= lex->alloc_indent_level) {
        lex->alloc_indent_level *= 2;
        lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level);
    }
    lex->indent_level[lex->num_indent_level++] = indent;
}

uint indent_top(py_lexer_t *lex) {
    return lex->indent_level[lex->num_indent_level - 1];
}

void indent_pop(py_lexer_t *lex) {
    lex->num_indent_level -= 1;
}

// some tricky operator encoding:
//     <op>  = begin with <op>, if this opchar matches then begin here
//     e<op> = end with <op>, if this opchar matches then end
//     E<op> = mandatory end with <op>, this opchar must match, then end
//     c<op> = continue with <op>, if this opchar matches then continue matching
// this means if the start of two ops are the same then they are equal until the last char
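//
// eg lexing "<<=" against the table below: '<' begins the match, the first
// "e=" fails (the next char is '<', not '='), "c<" matches so matching
// continues, and the final "e=" matches '=', ending at the tok_enc_kind
// entry for PY_TOKEN_DEL_DBL_LESS_EQUAL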
203
204static const char *tok_enc =
205 "()[]{},:;@~" // singles
206 "<e=c<e=" // < <= << <<=
207 ">e=c>e=" // > >= >> >>=
208 "*e=c*e=" // * *= ** **=
209 "+e=" // + +=
210 "-e=e>" // - -= ->
211 "&e=" // & &=
212 "|e=" // | |=
213 "/e=c/e=" // / /= // //=
214 "%e=" // % %=
215 "^e=" // ^ ^=
216 "=e=" // = ==
217 "!E=" // !=
218 ".c.E."; // . ...
219
// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
static const uint8_t tok_enc_kind[] = {
    PY_TOKEN_DEL_PAREN_OPEN, PY_TOKEN_DEL_PAREN_CLOSE,
    PY_TOKEN_DEL_BRACKET_OPEN, PY_TOKEN_DEL_BRACKET_CLOSE,
    PY_TOKEN_DEL_BRACE_OPEN, PY_TOKEN_DEL_BRACE_CLOSE,
    PY_TOKEN_DEL_COMMA, PY_TOKEN_DEL_COLON, PY_TOKEN_DEL_SEMICOLON, PY_TOKEN_DEL_AT, PY_TOKEN_OP_TILDE,

    PY_TOKEN_OP_LESS, PY_TOKEN_OP_LESS_EQUAL, PY_TOKEN_OP_DBL_LESS, PY_TOKEN_DEL_DBL_LESS_EQUAL,
    PY_TOKEN_OP_MORE, PY_TOKEN_OP_MORE_EQUAL, PY_TOKEN_OP_DBL_MORE, PY_TOKEN_DEL_DBL_MORE_EQUAL,
    PY_TOKEN_OP_STAR, PY_TOKEN_DEL_STAR_EQUAL, PY_TOKEN_OP_DBL_STAR, PY_TOKEN_DEL_DBL_STAR_EQUAL,
    PY_TOKEN_OP_PLUS, PY_TOKEN_DEL_PLUS_EQUAL,
    PY_TOKEN_OP_MINUS, PY_TOKEN_DEL_MINUS_EQUAL, PY_TOKEN_DEL_MINUS_MORE,
    PY_TOKEN_OP_AMPERSAND, PY_TOKEN_DEL_AMPERSAND_EQUAL,
    PY_TOKEN_OP_PIPE, PY_TOKEN_DEL_PIPE_EQUAL,
    PY_TOKEN_OP_SLASH, PY_TOKEN_DEL_SLASH_EQUAL, PY_TOKEN_OP_DBL_SLASH, PY_TOKEN_DEL_DBL_SLASH_EQUAL,
    PY_TOKEN_OP_PERCENT, PY_TOKEN_DEL_PERCENT_EQUAL,
    PY_TOKEN_OP_CARET, PY_TOKEN_DEL_CARET_EQUAL,
    PY_TOKEN_DEL_EQUAL, PY_TOKEN_OP_DBL_EQUAL,
    PY_TOKEN_OP_NOT_EQUAL,
    PY_TOKEN_DEL_PERIOD, PY_TOKEN_ELLIPSES,
};

// must have the same order as enum in lexer.h
static const char *tok_kw[] = {
    "False",
    "None",
    "True",
    "and",
    "as",
    "assert",
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
    NULL,
};

static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok, bool first_token) {
    // skip white space and comments
    bool had_physical_newline = false;
    while (!is_end(lex)) {
        if (is_physical_newline(lex)) {
            had_physical_newline = true;
            next_char(lex);
        } else if (is_whitespace(lex)) {
            next_char(lex);
        } else if (is_char(lex, '#')) {
            next_char(lex);
            while (!is_end(lex) && !is_physical_newline(lex)) {
                next_char(lex);
            }
            // had_physical_newline will be set on next loop
        } else if (is_char(lex, '\\')) {
            // backslash (outside string literals) must appear just before a physical newline
            next_char(lex);
            if (!is_physical_newline(lex)) {
                // TODO SyntaxError
                assert(0);
            } else {
                next_char(lex);
            }
        } else {
            break;
        }
    }

    // set token source information
    tok->src_name = lex->name;
    tok->src_line = lex->line;
    tok->src_column = lex->column;

    // start new token text
    vstr_reset(&lex->vstr);

    if (first_token && lex->line == 1 && lex->column != 1) {
        // check that the first token is in the first column
        // if first token is not on first line, we get a physical newline and
        // this check is done as part of normal indent/dedent checking below
        // (done to get equivalence with CPython)
        tok->kind = PY_TOKEN_INDENT;

    } else if (lex->emit_dent < 0) {
        tok->kind = PY_TOKEN_DEDENT;
        lex->emit_dent += 1;

    } else if (lex->emit_dent > 0) {
        tok->kind = PY_TOKEN_INDENT;
        lex->emit_dent -= 1;

    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
        tok->kind = PY_TOKEN_NEWLINE;

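        // after the whitespace skip above, column - 1 is this line's
        // indentation; compare it to the top of the indent stack and queue
        // INDENT/DEDENT tokens via emit_dent (eg with a stack of 0/4/8,
        // dropping back to indent 0 pops twice and queues two DEDENTs)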
        uint num_spaces = lex->column - 1;
        lex->emit_dent = 0;
        if (num_spaces == indent_top(lex)) {
        } else if (num_spaces > indent_top(lex)) {
            indent_push(lex, num_spaces);
            lex->emit_dent += 1;
        } else {
            while (num_spaces < indent_top(lex)) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
            if (num_spaces != indent_top(lex)) {
                tok->kind = PY_TOKEN_DEDENT_MISMATCH;
            }
        }

    } else if (is_end(lex)) {
        if (indent_top(lex) > 0) {
            tok->kind = PY_TOKEN_NEWLINE;
            lex->emit_dent = 0;
            while (indent_top(lex) > 0) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
        } else {
            tok->kind = PY_TOKEN_END;
        }

    } else if (is_char_or(lex, '\'', '\"')
        || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
        || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
        // a string or bytes literal

        // parse type codes
        bool is_raw = false;
        bool is_bytes = false;
        if (is_char(lex, 'u')) {
            next_char(lex);
        } else if (is_char(lex, 'b')) {
            is_bytes = true;
            next_char(lex);
            if (is_char(lex, 'r')) {
                is_raw = true;
                next_char(lex);
            }
        } else if (is_char(lex, 'r')) {
            is_raw = true;
            next_char(lex);
            if (is_char(lex, 'b')) {
                is_bytes = true;
                next_char(lex);
            }
        }

        // set token kind
        if (is_bytes) {
            tok->kind = PY_TOKEN_BYTES;
        } else {
            tok->kind = PY_TOKEN_STRING;
        }

        // get first quoting character
        char quote_char = '\'';
        if (is_char(lex, '\"')) {
            quote_char = '\"';
        }
        next_char(lex);

        // work out if it's a single or triple quoted literal
        int num_quotes;
        if (is_char_and(lex, quote_char, quote_char)) {
            // triple quotes
            next_char(lex);
            next_char(lex);
            num_quotes = 3;
        } else {
            // single quotes
            num_quotes = 1;
        }

        // parse the literal
        int n_closing = 0;
        while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
            if (is_char(lex, quote_char)) {
                n_closing += 1;
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
            } else {
                n_closing = 0;
                if (!is_raw && is_char(lex, '\\')) {
                    next_char(lex);
                    unichar c = CUR_CHAR(lex);
                    switch (c) {
                        case PY_LEXER_CHAR_EOF: break; // TODO a proper error message?
                        case '\n': c = PY_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it and emit no character)
                        case '\\': break;
                        case '\'': break;
                        case '"': break;
                        case 'a': c = 0x07; break;
                        case 'b': c = 0x08; break;
                        case 't': c = 0x09; break;
                        case 'n': c = 0x0a; break;
                        case 'v': c = 0x0b; break;
                        case 'f': c = 0x0c; break;
                        case 'r': c = 0x0d; break;
                        // TODO \ooo octal
                        case 'x': // TODO \xhh
                        case 'N': // TODO \N{name} only in strings
                        case 'u': // TODO \uxxxx only in strings
                        case 'U': // TODO \Uxxxxxxxx only in strings
                        default: break; // TODO error message
                    }
                    if (c != PY_LEXER_CHAR_EOF) {
                        vstr_add_char(&lex->vstr, c);
                    }
                } else {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                }
            }
            next_char(lex);
        }

        // check we got the required end quotes
        if (n_closing < num_quotes) {
            tok->kind = PY_TOKEN_LONELY_STRING_OPEN;
        }

        // cut off the end quotes from the token text
        vstr_cut_tail(&lex->vstr, n_closing);

    } else if (is_head_of_identifier(lex)) {
        tok->kind = PY_TOKEN_NAME;

        // get first char
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex) && is_tail_of_identifier(lex)) {
            vstr_add_char(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
        }

    } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
        tok->kind = PY_TOKEN_NUMBER;

        // get first char
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex)) {
            if (is_char_or(lex, 'e', 'E')) {
                vstr_add_char(&lex->vstr, 'e');
                next_char(lex);
                if (is_char(lex, '+') || is_char(lex, '-')) {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                    next_char(lex);
                }
            } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                next_char(lex);
            } else {
                break;
            }
        }

    } else {
        // search for encoded delimiter or operator

        const char *t = tok_enc;
        uint tok_enc_index = 0;
        for (; *t != 0 && !is_char(lex, *t); t += 1) {
            if (*t == 'e' || *t == 'c') {
                t += 1;
            } else if (*t == 'E') {
                tok_enc_index -= 1;
                t += 1;
            }
            tok_enc_index += 1;
        }

        next_char(lex);

        if (*t == 0) {
            // didn't match any delimiter or operator characters
            tok->kind = PY_TOKEN_INVALID;

        } else {
            // matched a delimiter or operator character

            // get the maximum characters for a valid token
            bool tok_enc_no_match = false;
            t += 1;
            uint t_index = tok_enc_index;
            for (;;) {
                for (; *t == 'e'; t += 1) {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        break;
                    }
                }

                if (*t == 'E') {
                    t += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                    } else {
                        // mandatory end char is missing (eg '!' not followed by '=');
                        // record it in a flag so the kind assignment below cannot
                        // overwrite the INVALID kind
                        tok_enc_no_match = true;
                    }
                    break;
                }

                if (*t == 'c') {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        t += 1;
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }

            // set token kind
            if (tok_enc_no_match) {
                tok->kind = PY_TOKEN_INVALID;
            } else {
                tok->kind = tok_enc_kind[tok_enc_index];
            }

            // compute bracket level for implicit line joining
            if (tok->kind == PY_TOKEN_DEL_PAREN_OPEN || tok->kind == PY_TOKEN_DEL_BRACKET_OPEN || tok->kind == PY_TOKEN_DEL_BRACE_OPEN) {
                lex->nested_bracket_level += 1;
            } else if (tok->kind == PY_TOKEN_DEL_PAREN_CLOSE || tok->kind == PY_TOKEN_DEL_BRACKET_CLOSE || tok->kind == PY_TOKEN_DEL_BRACE_CLOSE) {
                lex->nested_bracket_level -= 1;
            }
        }
    }

    // point token text to vstr buffer
    tok->str = vstr_str(&lex->vstr);
    tok->len = vstr_len(&lex->vstr);
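    // note: tok->str points into lex->vstr, so it is only valid until the
    // next token is lexed (vstr_reset above reuses the same buffer)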

    // check for keywords
    if (tok->kind == PY_TOKEN_NAME) {
        for (int i = 0; tok_kw[i] != NULL; i++) {
            if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
                tok->kind = PY_TOKEN_KW_FALSE + i;
                break;
            }
        }
    }
}

py_lexer_t *py_lexer_new(const char *src_name, void *stream_data, py_lexer_stream_next_char_t stream_next_char, py_lexer_stream_close_t stream_close) {
    py_lexer_t *lex = m_new(py_lexer_t, 1);

    lex->name = src_name; // TODO do we need to strdup this?
    lex->stream_data = stream_data;
    lex->stream_next_char = stream_next_char;
    lex->stream_close = stream_close;
    lex->line = 1;
    lex->column = 1;
    lex->emit_dent = 0;
    lex->nested_bracket_level = 0;
    lex->alloc_indent_level = 16;
    lex->num_indent_level = 1;
    lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
    lex->indent_level[0] = 0;
    vstr_init(&lex->vstr);

    // preload characters
    lex->chr0 = stream_next_char(stream_data);
    lex->chr1 = stream_next_char(stream_data);
    lex->chr2 = stream_next_char(stream_data);

    // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
    if (lex->chr0 == PY_LEXER_CHAR_EOF) {
        lex->chr0 = '\n';
    } else if (lex->chr1 == PY_LEXER_CHAR_EOF) {
        if (lex->chr0 != '\n' && lex->chr0 != '\r') {
            lex->chr1 = '\n';
        }
    } else if (lex->chr2 == PY_LEXER_CHAR_EOF) {
        if (lex->chr1 != '\n' && lex->chr1 != '\r') {
            lex->chr2 = '\n';
        }
    }

    // preload first token
    py_lexer_next_token_into(lex, &lex->tok_cur, true);

    return lex;
}

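// Example driver (a sketch only): my_data, my_next_char and my_close are
// hypothetical caller-supplied stream callbacks that yield one character at
// a time and PY_LEXER_CHAR_EOF at the end of input:
//
//     py_lexer_t *lex = py_lexer_new("<stdin>", my_data, my_next_char, my_close);
//     while (!py_lexer_is_kind(lex, PY_TOKEN_END)) {
//         py_token_show(py_lexer_cur(lex));
//         py_lexer_to_next(lex);
//     }
//     py_lexer_free(lex);
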
void py_lexer_free(py_lexer_t *lex) {
    if (lex) {
        if (lex->stream_close) {
            lex->stream_close(lex->stream_data);
        }
        m_free(lex);
    }
}

void py_lexer_to_next(py_lexer_t *lex) {
    py_lexer_next_token_into(lex, &lex->tok_cur, false);
}

const py_token_t *py_lexer_cur(const py_lexer_t *lex) {
    return &lex->tok_cur;
}

bool py_lexer_is_kind(py_lexer_t *lex, py_token_kind_t kind) {
    return lex->tok_cur.kind == kind;
}

/*
bool py_lexer_is_str(py_lexer_t *lex, const char *str) {
    return py_token_is_str(&lex->tok_cur, str);
}

bool py_lexer_opt_kind(py_lexer_t *lex, py_token_kind_t kind) {
    if (py_lexer_is_kind(lex, kind)) {
        py_lexer_to_next(lex);
        return true;
    }
    return false;
}

bool py_lexer_opt_str(py_lexer_t *lex, const char *str) {
    if (py_lexer_is_str(lex, str)) {
        py_lexer_to_next(lex);
        return true;
    }
    return false;
}
*/

bool py_lexer_show_error(py_lexer_t *lex, const char *msg) {
    return py_token_show_error(&lex->tok_cur, msg);
}

bool py_lexer_show_error_pythonic(py_lexer_t *lex, const char *msg) {
    printf("  File \"%s\", line %d column %d\n%s\n", lex->tok_cur.src_name, lex->tok_cur.src_line, lex->tok_cur.src_column, msg);
    return false;
}