/* lexer.c -- simple tokeniser for Python implementation
 */

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#include "misc.h"
#include "lexer.h"

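// tab stops are every TAB_SIZE columns (8, the same tab size CPython's tokeniser uses)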
#define TAB_SIZE (8)

struct _py_lexer_t {
    const char *name;           // name of source
    void *stream_data;          // data for stream
    py_lexer_stream_next_char_t stream_next_char;   // stream callback to get next char
    py_lexer_stream_close_t stream_close;           // stream callback to free

    unichar chr0, chr1, chr2;   // current cached characters from source

    uint line;                  // source line
    uint column;                // source column

    int emit_dent;              // non-zero when there are INDENT/DEDENT tokens to emit
    int nested_bracket_level;   // >0 when there are nested brackets over multiple lines

    uint alloc_indent_level;
    uint num_indent_level;
    uint16_t *indent_level;

    vstr_t vstr;
    py_token_t tok_cur;
};

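// Returns true if the null-terminated string str is exactly the first len
// characters of strn (ie they match for len chars and str then terminates).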
bool str_strn_equal(const char *str, const char *strn, int len) {
    uint i = 0;

    while (i < len && *str == *strn) {
        ++i;
        ++str;
        ++strn;
    }

    return i == len && *str == 0;
}

void py_token_show(const py_token_t *tok) {
    printf("(%s:%d:%d) kind:%d str:%p len:%d", tok->src_name, tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
    if (tok->str != NULL && tok->len > 0) {
        const char *i = tok->str;
        const char *j = i + tok->len;
        printf(" ");
        while (i < j) {
            unichar c = g_utf8_get_char(i);
            i = g_utf8_next_char(i);
            if (g_unichar_isprint(c)) {
                printf("%c", c);
            } else {
                printf("?");
            }
        }
    }
    printf("\n");
}

void py_token_show_error_prefix(const py_token_t *tok) {
    printf("(%s:%d:%d) ", tok->src_name, tok->src_line, tok->src_column);
}

bool py_token_show_error(const py_token_t *tok, const char *msg) {
    printf("(%s:%d:%d) %s\n", tok->src_name, tok->src_line, tok->src_column, msg);
    return false;
}

#define CUR_CHAR(lex) ((lex)->chr0)

static bool is_end(py_lexer_t *lex) {
    return lex->chr0 == PY_LEXER_CHAR_EOF;
}

static bool is_physical_newline(py_lexer_t *lex) {
    return lex->chr0 == '\n' || lex->chr0 == '\r';
}

static bool is_char(py_lexer_t *lex, char c) {
    return lex->chr0 == c;
}

static bool is_char_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr0 == c1 || lex->chr0 == c2;
}

static bool is_char_or3(py_lexer_t *lex, char c1, char c2, char c3) {
    return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}

/*
static bool is_char_following(py_lexer_t *lex, char c) {
    return lex->chr1 == c;
}
*/

static bool is_char_following_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr1 == c1 || lex->chr1 == c2;
}

static bool is_char_following_following_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr2 == c1 || lex->chr2 == c2;
}

static bool is_char_and(py_lexer_t *lex, char c1, char c2) {
    return lex->chr0 == c1 && lex->chr1 == c2;
}

static bool is_whitespace(py_lexer_t *lex) {
    return g_unichar_isspace(lex->chr0);
}

static bool is_letter(py_lexer_t *lex) {
    return g_unichar_isalpha(lex->chr0);
}

static bool is_digit(py_lexer_t *lex) {
    return g_unichar_isdigit(lex->chr0);
}

static bool is_following_digit(py_lexer_t *lex) {
    return g_unichar_isdigit(lex->chr1);
}

// TODO UNICODE include unicode characters in definition of identifiers
static bool is_head_of_identifier(py_lexer_t *lex) {
    return is_letter(lex) || lex->chr0 == '_';
}

// TODO UNICODE include unicode characters in definition of identifiers
static bool is_tail_of_identifier(py_lexer_t *lex) {
    return is_head_of_identifier(lex) || is_digit(lex);
}

static void next_char(py_lexer_t *lex) {
    if (lex->chr0 == PY_LEXER_CHAR_EOF) {
        return;
    }

    int advance = 1;

    if (lex->chr0 == '\n') {
        // LF is a new line
        ++lex->line;
        lex->column = 1;
    } else if (lex->chr0 == '\r') {
        // CR is a new line
        ++lex->line;
        lex->column = 1;
        if (lex->chr1 == '\n') {
            // CR LF is a single new line
            advance = 2;
        }
    } else if (lex->chr0 == '\t') {
        // a tab
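        // advance the column to the next multiple of TAB_SIZE, plus 1;
        // eg with TAB_SIZE of 8: column 1 -> 9, column 5 -> 9, column 9 -> 17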
        lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
    } else {
        // a character worth one column
        ++lex->column;
    }

    for (; advance > 0; advance--) {
        lex->chr0 = lex->chr1;
        lex->chr1 = lex->chr2;
        lex->chr2 = lex->stream_next_char(lex->stream_data);
        if (lex->chr2 == PY_LEXER_CHAR_EOF) {
            // EOF
            if (lex->chr1 != PY_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
                lex->chr2 = '\n'; // insert newline at end of file
            }
        }
    }
}

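// the indentation stack: indent_level[0] is always 0, the top entry is the
// indent of the current block, and the array grows by doubling when full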
void indent_push(py_lexer_t *lex, uint indent) {
    if (lex->num_indent_level >= lex->alloc_indent_level) {
        lex->alloc_indent_level *= 2;
        lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level);
    }
    lex->indent_level[lex->num_indent_level++] = indent;
}

uint indent_top(py_lexer_t *lex) {
    return lex->indent_level[lex->num_indent_level - 1];
}

void indent_pop(py_lexer_t *lex) {
    lex->num_indent_level -= 1;
}

// some tricky operator encoding:
//     <op>  = begin with <op>, if this opchar matches then begin here
//     e<op> = end with <op>, if this opchar matches then end
//     E<op> = mandatory end with <op>, this opchar must match, then end
//     c<op> = continue with <op>, if this opchar matches then continue matching
// this means if the start of two ops are the same then they are equal until the last char
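//
// eg lexing "<<=" against the table below: '<' begins the match, the first
// "e=" fails (the next char is '<', not '='), "c<" matches so matching
// continues, and the final "e=" matches '=', ending at the tok_enc_kind
// entry for PY_TOKEN_DEL_DBL_LESS_EQUAL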
203
204static const char *tok_enc =
205 "()[]{},:;@~" // singles
206 "<e=c<e=" // < <= << <<=
207 ">e=c>e=" // > >= >> >>=
208 "*e=c*e=" // * *= ** **=
209 "+e=" // + +=
210 "-e=e>" // - -= ->
211 "&e=" // & &=
212 "|e=" // | |=
213 "/e=c/e=" // / /= // //=
214 "%e=" // % %=
215 "^e=" // ^ ^=
216 "=e=" // = ==
217 "!E=" // !=
218 ".c.E."; // . ...
219
// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
static const uint8_t tok_enc_kind[] = {
    PY_TOKEN_DEL_PAREN_OPEN, PY_TOKEN_DEL_PAREN_CLOSE,
    PY_TOKEN_DEL_BRACKET_OPEN, PY_TOKEN_DEL_BRACKET_CLOSE,
    PY_TOKEN_DEL_BRACE_OPEN, PY_TOKEN_DEL_BRACE_CLOSE,
    PY_TOKEN_DEL_COMMA, PY_TOKEN_DEL_COLON, PY_TOKEN_DEL_SEMICOLON, PY_TOKEN_DEL_AT, PY_TOKEN_OP_TILDE,

    PY_TOKEN_OP_LESS, PY_TOKEN_OP_LESS_EQUAL, PY_TOKEN_OP_DBL_LESS, PY_TOKEN_DEL_DBL_LESS_EQUAL,
    PY_TOKEN_OP_MORE, PY_TOKEN_OP_MORE_EQUAL, PY_TOKEN_OP_DBL_MORE, PY_TOKEN_DEL_DBL_MORE_EQUAL,
    PY_TOKEN_OP_STAR, PY_TOKEN_DEL_STAR_EQUAL, PY_TOKEN_OP_DBL_STAR, PY_TOKEN_DEL_DBL_STAR_EQUAL,
    PY_TOKEN_OP_PLUS, PY_TOKEN_DEL_PLUS_EQUAL,
    PY_TOKEN_OP_MINUS, PY_TOKEN_DEL_MINUS_EQUAL, PY_TOKEN_DEL_MINUS_MORE,
    PY_TOKEN_OP_AMPERSAND, PY_TOKEN_DEL_AMPERSAND_EQUAL,
    PY_TOKEN_OP_PIPE, PY_TOKEN_DEL_PIPE_EQUAL,
    PY_TOKEN_OP_SLASH, PY_TOKEN_DEL_SLASH_EQUAL, PY_TOKEN_OP_DBL_SLASH, PY_TOKEN_DEL_DBL_SLASH_EQUAL,
    PY_TOKEN_OP_PERCENT, PY_TOKEN_DEL_PERCENT_EQUAL,
    PY_TOKEN_OP_CARET, PY_TOKEN_DEL_CARET_EQUAL,
    PY_TOKEN_DEL_EQUAL, PY_TOKEN_OP_DBL_EQUAL,
    PY_TOKEN_OP_NOT_EQUAL,
    PY_TOKEN_DEL_PERIOD, PY_TOKEN_ELLIPSES,
};

// must have the same order as enum in lexer.h
static const char *tok_kw[] = {
    "False",
    "None",
    "True",
    "and",
    "as",
    "assert",
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
    NULL,
};

static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok, bool first_token) {
    // skip white space and comments
    bool had_physical_newline = false;
    while (!is_end(lex)) {
        if (is_physical_newline(lex)) {
            had_physical_newline = true;
            next_char(lex);
        } else if (is_whitespace(lex)) {
            next_char(lex);
        } else if (is_char(lex, '#')) {
            next_char(lex);
            while (!is_end(lex) && !is_physical_newline(lex)) {
                next_char(lex);
            }
            // had_physical_newline will be set on next loop
        } else if (is_char(lex, '\\')) {
            // backslash (outside string literals) must appear just before a physical newline
            next_char(lex);
            if (!is_physical_newline(lex)) {
                // TODO SyntaxError
                assert(0);
            } else {
                next_char(lex);
            }
        } else {
            break;
        }
    }

    // set token source information
    tok->src_name = lex->name;
    tok->src_line = lex->line;
    tok->src_column = lex->column;

    // start new token text
    vstr_reset(&lex->vstr);

    if (first_token && lex->line == 1 && lex->column != 1) {
        // check that the first token is in the first column
        // if first token is not on first line, we get a physical newline and
        // this check is done as part of normal indent/dedent checking below
        // (done to get equivalence with CPython)
        tok->kind = PY_TOKEN_INDENT;

    } else if (lex->emit_dent < 0) {
        tok->kind = PY_TOKEN_DEDENT;
        lex->emit_dent += 1;

    } else if (lex->emit_dent > 0) {
        tok->kind = PY_TOKEN_INDENT;
        lex->emit_dent -= 1;

    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
        tok->kind = PY_TOKEN_NEWLINE;

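        // after the whitespace skip above, column - 1 is this line's
        // indentation; compare it to the top of the indent stack and queue
        // INDENT/DEDENT tokens via emit_dent (eg with a stack of 0/4/8,
        // dropping back to indent 0 pops twice and queues two DEDENTs)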
        uint num_spaces = lex->column - 1;
        lex->emit_dent = 0;
        if (num_spaces == indent_top(lex)) {
        } else if (num_spaces > indent_top(lex)) {
            indent_push(lex, num_spaces);
            lex->emit_dent += 1;
        } else {
            while (num_spaces < indent_top(lex)) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
            if (num_spaces != indent_top(lex)) {
                tok->kind = PY_TOKEN_DEDENT_MISMATCH;
            }
        }

    } else if (is_end(lex)) {
        if (indent_top(lex) > 0) {
            tok->kind = PY_TOKEN_NEWLINE;
            lex->emit_dent = 0;
            while (indent_top(lex) > 0) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
        } else {
            tok->kind = PY_TOKEN_END;
        }

    } else if (is_char_or(lex, '\'', '\"')
        || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
        || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
        // a string or bytes literal

        // parse type codes
        bool is_raw = false;
        bool is_bytes = false;
        if (is_char(lex, 'u')) {
            next_char(lex);
        } else if (is_char(lex, 'b')) {
            is_bytes = true;
            next_char(lex);
            if (is_char(lex, 'r')) {
                is_raw = true;
                next_char(lex);
            }
        } else if (is_char(lex, 'r')) {
            is_raw = true;
            next_char(lex);
            if (is_char(lex, 'b')) {
                is_bytes = true;
                next_char(lex);
            }
        }

        // set token kind
        if (is_bytes) {
            tok->kind = PY_TOKEN_BYTES;
        } else {
            tok->kind = PY_TOKEN_STRING;
        }

        // get first quoting character
        char quote_char = '\'';
        if (is_char(lex, '\"')) {
            quote_char = '\"';
        }
        next_char(lex);

        // work out if it's a single or triple quoted literal
        int num_quotes;
        if (is_char_and(lex, quote_char, quote_char)) {
            // triple quotes
            next_char(lex);
            next_char(lex);
            num_quotes = 3;
        } else {
            // single quotes
            num_quotes = 1;
        }

        // parse the literal
        int n_closing = 0;
        while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
            if (is_char(lex, quote_char)) {
                n_closing += 1;
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
            } else {
                n_closing = 0;
                if (!is_raw && is_char(lex, '\\')) {
                    next_char(lex);
                    unichar c = CUR_CHAR(lex);
                    switch (c) {
                        case PY_LEXER_CHAR_EOF: break; // TODO a proper error message?
                        case '\n': c = PY_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it and emit no character)
                        case '\\': break;
                        case '\'': break;
                        case '"': break;
                        case 'a': c = 0x07; break;
                        case 'b': c = 0x08; break;
                        case 't': c = 0x09; break;
                        case 'n': c = 0x0a; break;
                        case 'v': c = 0x0b; break;
                        case 'f': c = 0x0c; break;
                        case 'r': c = 0x0d; break;
                        // TODO \ooo octal
                        case 'x': // TODO \xhh
                        case 'N': // TODO \N{name} only in strings
                        case 'u': // TODO \uxxxx only in strings
                        case 'U': // TODO \Uxxxxxxxx only in strings
                        default: break; // TODO error message
                    }
                    if (c != PY_LEXER_CHAR_EOF) {
                        vstr_add_char(&lex->vstr, c);
                    }
                } else {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                }
            }
            next_char(lex);
        }

        // check we got the required end quotes
        if (n_closing < num_quotes) {
            tok->kind = PY_TOKEN_LONELY_STRING_OPEN;
        }

        // cut off the end quotes from the token text
        vstr_cut_tail(&lex->vstr, n_closing);

    } else if (is_head_of_identifier(lex)) {
        tok->kind = PY_TOKEN_NAME;

        // get first char
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex) && is_tail_of_identifier(lex)) {
            vstr_add_char(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
        }

    } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
        tok->kind = PY_TOKEN_NUMBER;

        // get first char
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex)) {
            if (is_char_or(lex, 'e', 'E')) {
                vstr_add_char(&lex->vstr, 'e');
                next_char(lex);
                if (is_char(lex, '+') || is_char(lex, '-')) {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                    next_char(lex);
                }
            } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                next_char(lex);
            } else {
                break;
            }
        }

    } else {
        // search for encoded delimiter or operator

        const char *t = tok_enc;
        uint tok_enc_index = 0;
        for (; *t != 0 && !is_char(lex, *t); t += 1) {
            if (*t == 'e' || *t == 'c') {
                t += 1;
            } else if (*t == 'E') {
                tok_enc_index -= 1;
                t += 1;
            }
            tok_enc_index += 1;
        }

        next_char(lex);

        if (*t == 0) {
            // didn't match any delimiter or operator characters
            tok->kind = PY_TOKEN_INVALID;

        } else {
            // matched a delimiter or operator character

            // get the maximum characters for a valid token
            bool tok_enc_no_match = false;
            t += 1;
            uint t_index = tok_enc_index;
            for (;;) {
                for (; *t == 'e'; t += 1) {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        break;
                    }
                }

                if (*t == 'E') {
                    t += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                    } else {
                        // mandatory end char is missing (eg '!' not followed by '=');
                        // record it in a flag so the kind assignment below cannot
                        // overwrite the INVALID kind
                        tok_enc_no_match = true;
                    }
                    break;
                }

                if (*t == 'c') {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        t += 1;
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }

            // set token kind
            if (tok_enc_no_match) {
                tok->kind = PY_TOKEN_INVALID;
            } else {
                tok->kind = tok_enc_kind[tok_enc_index];
            }

            // compute bracket level for implicit line joining
            if (tok->kind == PY_TOKEN_DEL_PAREN_OPEN || tok->kind == PY_TOKEN_DEL_BRACKET_OPEN || tok->kind == PY_TOKEN_DEL_BRACE_OPEN) {
                lex->nested_bracket_level += 1;
            } else if (tok->kind == PY_TOKEN_DEL_PAREN_CLOSE || tok->kind == PY_TOKEN_DEL_BRACKET_CLOSE || tok->kind == PY_TOKEN_DEL_BRACE_CLOSE) {
                lex->nested_bracket_level -= 1;
            }
        }
    }

    // point token text to vstr buffer
    tok->str = vstr_str(&lex->vstr);
    tok->len = vstr_len(&lex->vstr);
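    // note: tok->str points into lex->vstr, so it is only valid until the
    // next token is lexed (vstr_reset above reuses the same buffer)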

    // check for keywords
    if (tok->kind == PY_TOKEN_NAME) {
        for (int i = 0; tok_kw[i] != NULL; i++) {
            if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
                tok->kind = PY_TOKEN_KW_FALSE + i;
                break;
            }
        }
    }
}

py_lexer_t *py_lexer_new(const char *src_name, void *stream_data, py_lexer_stream_next_char_t stream_next_char, py_lexer_stream_close_t stream_close) {
    py_lexer_t *lex = m_new(py_lexer_t, 1);

    lex->name = src_name; // TODO do we need to strdup this?
    lex->stream_data = stream_data;
    lex->stream_next_char = stream_next_char;
    lex->stream_close = stream_close;
    lex->line = 1;
    lex->column = 1;
    lex->emit_dent = 0;
    lex->nested_bracket_level = 0;
    lex->alloc_indent_level = 16;
    lex->num_indent_level = 1;
    lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
    lex->indent_level[0] = 0;
    vstr_init(&lex->vstr);

    // preload characters
    lex->chr0 = stream_next_char(stream_data);
    lex->chr1 = stream_next_char(stream_data);
    lex->chr2 = stream_next_char(stream_data);

    // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
    if (lex->chr0 == PY_LEXER_CHAR_EOF) {
        lex->chr0 = '\n';
    } else if (lex->chr1 == PY_LEXER_CHAR_EOF) {
        if (lex->chr0 != '\n' && lex->chr0 != '\r') {
            lex->chr1 = '\n';
        }
    } else if (lex->chr2 == PY_LEXER_CHAR_EOF) {
        if (lex->chr1 != '\n' && lex->chr1 != '\r') {
            lex->chr2 = '\n';
        }
    }

    // preload first token
    py_lexer_next_token_into(lex, &lex->tok_cur, true);

    return lex;
}

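// Example driver (a sketch only): my_data, my_next_char and my_close are
// hypothetical caller-supplied stream callbacks that yield one character at
// a time and PY_LEXER_CHAR_EOF at the end of input:
//
//     py_lexer_t *lex = py_lexer_new("<stdin>", my_data, my_next_char, my_close);
//     while (!py_lexer_is_kind(lex, PY_TOKEN_END)) {
//         py_token_show(py_lexer_cur(lex));
//         py_lexer_to_next(lex);
//     }
//     py_lexer_free(lex);
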
void py_lexer_free(py_lexer_t *lex) {
    if (lex) {
        if (lex->stream_close) {
            lex->stream_close(lex->stream_data);
        }
        m_free(lex);
    }
}

void py_lexer_to_next(py_lexer_t *lex) {
    py_lexer_next_token_into(lex, &lex->tok_cur, false);
}

const py_token_t *py_lexer_cur(const py_lexer_t *lex) {
    return &lex->tok_cur;
}

bool py_lexer_is_kind(py_lexer_t *lex, py_token_kind_t kind) {
    return lex->tok_cur.kind == kind;
}

/*
bool py_lexer_is_str(py_lexer_t *lex, const char *str) {
    return py_token_is_str(&lex->tok_cur, str);
}

bool py_lexer_opt_kind(py_lexer_t *lex, py_token_kind_t kind) {
    if (py_lexer_is_kind(lex, kind)) {
        py_lexer_to_next(lex);
        return true;
    }
    return false;
}

bool py_lexer_opt_str(py_lexer_t *lex, const char *str) {
    if (py_lexer_is_str(lex, str)) {
        py_lexer_to_next(lex);
        return true;
    }
    return false;
}
*/

bool py_lexer_show_error(py_lexer_t *lex, const char *msg) {
    return py_token_show_error(&lex->tok_cur, msg);
}

bool py_lexer_show_error_pythonic(py_lexer_t *lex, const char *msg) {
    printf("  File \"%s\", line %d column %d\n%s\n", lex->tok_cur.src_name, lex->tok_cur.src_line, lex->tok_cur.src_column, msg);
    return false;
}