/* lexer.c -- simple tokeniser for Python implementation
 */

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#include "misc.h"
#include "lexer.h"

#define TAB_SIZE (8)
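
// a tab advances the column to the next tab stop: the next multiple of
// TAB_SIZE, plus one; e.g. with TAB_SIZE of 8, a tab at column 1 or at
// column 5 moves to column 9 (see the tab handling in next_char below)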

// TODO seems that CPython allows NULL byte in the input stream
// don't know if that's intentional or not, but we don't allow it

struct _py_lexer_t {
    const char *name;           // name of source
    void *stream_data;          // data for stream
    py_lexer_stream_next_char_t stream_next_char;   // stream callback to get next char
    py_lexer_stream_close_t stream_close;           // stream callback to free

    unichar chr0, chr1, chr2;   // current cached characters from source

    uint line;                  // source line
    uint column;                // source column

    int emit_dent;              // non-zero when there are INDENT/DEDENT tokens to emit
    int nested_bracket_level;   // >0 when there are nested brackets over multiple lines

    uint alloc_indent_level;
    uint num_indent_level;
    uint16_t *indent_level;

    vstr_t vstr;
    py_token_t tok_cur;
};

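// compares a null-terminated string against a length-delimited string;
// returns true only when the contents match exactly, as used for the
// keyword lookup in py_lexer_next_token_into below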
bool str_strn_equal(const char *str, const char *strn, int len) {
    uint i = 0;

    while (i < len && *str == *strn) {
        ++i;
        ++str;
        ++strn;
    }

    return i == len && *str == 0;
}

void py_token_show(const py_token_t *tok) {
    printf("(%s:%d:%d) kind:%d str:%p len:%d", tok->src_name, tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
    if (tok->str != NULL && tok->len > 0) {
        const char *i = tok->str;
        const char *j = i + tok->len;
        printf(" ");
        while (i < j) {
            unichar c = g_utf8_get_char(i);
            i = g_utf8_next_char(i);
            if (g_unichar_isprint(c)) {
                printf("%c", c);
            } else {
                printf("?");
            }
        }
    }
    printf("\n");
}

void py_token_show_error_prefix(const py_token_t *tok) {
    printf("(%s:%d:%d) ", tok->src_name, tok->src_line, tok->src_column);
}

bool py_token_show_error(const py_token_t *tok, const char *msg) {
    printf("(%s:%d:%d) %s\n", tok->src_name, tok->src_line, tok->src_column, msg);
    return false;
}

#define CUR_CHAR(lex) ((lex)->chr0)

static bool is_end(py_lexer_t *lex) {
    return lex->chr0 == PY_LEXER_CHAR_EOF;
}

static bool is_physical_newline(py_lexer_t *lex) {
    return lex->chr0 == '\n' || lex->chr0 == '\r';
}

static bool is_char(py_lexer_t *lex, char c) {
    return lex->chr0 == c;
}

static bool is_char_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr0 == c1 || lex->chr0 == c2;
}

static bool is_char_or3(py_lexer_t *lex, char c1, char c2, char c3) {
    return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}

/*
static bool is_char_following(py_lexer_t *lex, char c) {
    return lex->chr1 == c;
}
*/

static bool is_char_following_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr1 == c1 || lex->chr1 == c2;
}

static bool is_char_following_following_or(py_lexer_t *lex, char c1, char c2) {
    return lex->chr2 == c1 || lex->chr2 == c2;
}

static bool is_char_and(py_lexer_t *lex, char c1, char c2) {
    return lex->chr0 == c1 && lex->chr1 == c2;
}

static bool is_whitespace(py_lexer_t *lex) {
    return g_unichar_isspace(lex->chr0);
}

static bool is_letter(py_lexer_t *lex) {
    return g_unichar_isalpha(lex->chr0);
}

static bool is_digit(py_lexer_t *lex) {
    return g_unichar_isdigit(lex->chr0);
}

static bool is_following_digit(py_lexer_t *lex) {
    return g_unichar_isdigit(lex->chr1);
}

// TODO UNICODE include unicode characters in definition of identifiers
static bool is_head_of_identifier(py_lexer_t *lex) {
    return is_letter(lex) || lex->chr0 == '_';
}

// TODO UNICODE include unicode characters in definition of identifiers
static bool is_tail_of_identifier(py_lexer_t *lex) {
    return is_head_of_identifier(lex) || is_digit(lex);
}

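// advance the lexer by one source character, maintaining the chr0/chr1/chr2
// lookahead window: chr0 is the current character (see CUR_CHAR above), and
// chr1/chr2 give up to two characters of lookahead for multi-character
// tokens such as "**=" and triple-quoted strings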
static void next_char(py_lexer_t *lex) {
    if (lex->chr0 == PY_LEXER_CHAR_EOF) {
        return;
    }

    int advance = 1;

    if (lex->chr0 == '\n') {
        // LF is a new line
        ++lex->line;
        lex->column = 1;
    } else if (lex->chr0 == '\r') {
        // CR is a new line
        ++lex->line;
        lex->column = 1;
        if (lex->chr1 == '\n') {
            // CR LF is a single new line
            advance = 2;
        }
    } else if (lex->chr0 == '\t') {
        // a tab
        lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
    } else {
        // a character worth one column
        ++lex->column;
    }

    for (; advance > 0; advance--) {
        lex->chr0 = lex->chr1;
        lex->chr1 = lex->chr2;
        lex->chr2 = lex->stream_next_char(lex->stream_data);
        if (lex->chr2 == PY_LEXER_CHAR_EOF) {
            // EOF
            if (lex->chr1 != PY_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
                lex->chr2 = '\n'; // insert newline at end of file
            }
        }
    }
}

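// the indent_level array is a stack of indentation amounts, one entry per
// open indentation block; indent_level[0] is always 0 for the module level
// (see py_lexer_new below), and the stack grows on demand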
void indent_push(py_lexer_t *lex, uint indent) {
    if (lex->num_indent_level >= lex->alloc_indent_level) {
        lex->alloc_indent_level *= 2;
        lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level);
    }
    lex->indent_level[lex->num_indent_level++] = indent;
}

uint indent_top(py_lexer_t *lex) {
    return lex->indent_level[lex->num_indent_level - 1];
}

void indent_pop(py_lexer_t *lex) {
    lex->num_indent_level -= 1;
}

// some tricky operator encoding:
//     <op>  = begin with <op>, if this opchar matches then begin here
//     e<op> = end with <op>, if this opchar matches then end
//     E<op> = mandatory end with <op>, this opchar must match, then end
//     c<op> = continue with <op>, if this opchar matches then continue matching
// this means that if the start of two ops are the same, then they are equal until the last char
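// for example, the entry "<e=c<e=" below decodes as: begin with '<'; then
// either end with '=' (giving "<="), or continue with '<' and optionally end
// with '=' (giving "<<" or "<<="); if nothing further matches, the token is
// just "<"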

static const char *tok_enc =
    "()[]{},:;@~" // singles
    "<e=c<e="     // < <= << <<=
    ">e=c>e="     // > >= >> >>=
    "*e=c*e="     // * *= ** **=
    "+e="         // + +=
    "-e=e>"       // - -= ->
    "&e="         // & &=
    "|e="         // | |=
    "/e=c/e="     // / /= // //=
    "%e="         // % %=
    "^e="         // ^ ^=
    "=e="         // = ==
    "!E="         // !=
    ".c.E.";      // . ...

// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
static const uint8_t tok_enc_kind[] = {
    PY_TOKEN_DEL_PAREN_OPEN, PY_TOKEN_DEL_PAREN_CLOSE,
    PY_TOKEN_DEL_BRACKET_OPEN, PY_TOKEN_DEL_BRACKET_CLOSE,
    PY_TOKEN_DEL_BRACE_OPEN, PY_TOKEN_DEL_BRACE_CLOSE,
    PY_TOKEN_DEL_COMMA, PY_TOKEN_DEL_COLON, PY_TOKEN_DEL_SEMICOLON, PY_TOKEN_DEL_AT, PY_TOKEN_OP_TILDE,

    PY_TOKEN_OP_LESS, PY_TOKEN_OP_LESS_EQUAL, PY_TOKEN_OP_DBL_LESS, PY_TOKEN_DEL_DBL_LESS_EQUAL,
    PY_TOKEN_OP_MORE, PY_TOKEN_OP_MORE_EQUAL, PY_TOKEN_OP_DBL_MORE, PY_TOKEN_DEL_DBL_MORE_EQUAL,
    PY_TOKEN_OP_STAR, PY_TOKEN_DEL_STAR_EQUAL, PY_TOKEN_OP_DBL_STAR, PY_TOKEN_DEL_DBL_STAR_EQUAL,
    PY_TOKEN_OP_PLUS, PY_TOKEN_DEL_PLUS_EQUAL,
    PY_TOKEN_OP_MINUS, PY_TOKEN_DEL_MINUS_EQUAL, PY_TOKEN_DEL_MINUS_MORE,
    PY_TOKEN_OP_AMPERSAND, PY_TOKEN_DEL_AMPERSAND_EQUAL,
    PY_TOKEN_OP_PIPE, PY_TOKEN_DEL_PIPE_EQUAL,
    PY_TOKEN_OP_SLASH, PY_TOKEN_DEL_SLASH_EQUAL, PY_TOKEN_OP_DBL_SLASH, PY_TOKEN_DEL_DBL_SLASH_EQUAL,
    PY_TOKEN_OP_PERCENT, PY_TOKEN_DEL_PERCENT_EQUAL,
    PY_TOKEN_OP_CARET, PY_TOKEN_DEL_CARET_EQUAL,
    PY_TOKEN_DEL_EQUAL, PY_TOKEN_OP_DBL_EQUAL,
    PY_TOKEN_OP_NOT_EQUAL,
    PY_TOKEN_DEL_PERIOD, PY_TOKEN_ELLIPSES,
};

// must have the same order as enum in lexer.h
static const char *tok_kw[] = {
    "False",
    "None",
    "True",
    "and",
    "as",
    "assert",
    "break",
    "class",
    "continue",
    "def",
    "del",
    "elif",
    "else",
    "except",
    "finally",
    "for",
    "from",
    "global",
    "if",
    "import",
    "in",
    "is",
    "lambda",
    "nonlocal",
    "not",
    "or",
    "pass",
    "raise",
    "return",
    "try",
    "while",
    "with",
    "yield",
    NULL,
};

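// lex the next token into tok; handles INDENT/DEDENT bookkeeping, string and
// bytes literals, names and keywords, numbers, and operators/delimiters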
static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok, bool first_token) {
    // skip white space and comments
    bool had_physical_newline = false;
    while (!is_end(lex)) {
        if (is_physical_newline(lex)) {
            had_physical_newline = true;
            next_char(lex);
        } else if (is_whitespace(lex)) {
            next_char(lex);
        } else if (is_char(lex, '#')) {
            next_char(lex);
            while (!is_end(lex) && !is_physical_newline(lex)) {
                next_char(lex);
            }
            // had_physical_newline will be set on next loop
        } else if (is_char(lex, '\\')) {
            // backslash (outside string literals) must appear just before a physical newline
            next_char(lex);
            if (!is_physical_newline(lex)) {
                // TODO SyntaxError
                assert(0);
            } else {
                next_char(lex);
            }
        } else {
            break;
        }
    }

    // set token source information
    tok->src_name = lex->name;
    tok->src_line = lex->line;
    tok->src_column = lex->column;

    // start new token text
    vstr_reset(&lex->vstr);

    if (first_token && lex->line == 1 && lex->column != 1) {
        // check that the first token is in the first column
        // if first token is not on first line, we get a physical newline and
        // this check is done as part of normal indent/dedent checking below
        // (done to get equivalence with CPython)
        tok->kind = PY_TOKEN_INDENT;

    } else if (lex->emit_dent < 0) {
        tok->kind = PY_TOKEN_DEDENT;
        lex->emit_dent += 1;

    } else if (lex->emit_dent > 0) {
        tok->kind = PY_TOKEN_INDENT;
        lex->emit_dent -= 1;

    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
        tok->kind = PY_TOKEN_NEWLINE;

        uint num_spaces = lex->column - 1;
        lex->emit_dent = 0;
        if (num_spaces == indent_top(lex)) {
        } else if (num_spaces > indent_top(lex)) {
            indent_push(lex, num_spaces);
            lex->emit_dent += 1;
        } else {
            while (num_spaces < indent_top(lex)) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
            if (num_spaces != indent_top(lex)) {
                tok->kind = PY_TOKEN_DEDENT_MISMATCH;
            }
        }

    } else if (is_end(lex)) {
        if (indent_top(lex) > 0) {
            tok->kind = PY_TOKEN_NEWLINE;
            lex->emit_dent = 0;
            while (indent_top(lex) > 0) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
        } else {
            tok->kind = PY_TOKEN_END;
        }

    } else if (is_char_or(lex, '\'', '\"')
               || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
               || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
        // a string or bytes literal

        // parse type codes
        bool is_raw = false;
        bool is_bytes = false;
        if (is_char(lex, 'u')) {
            next_char(lex);
        } else if (is_char(lex, 'b')) {
            is_bytes = true;
            next_char(lex);
            if (is_char(lex, 'r')) {
                is_raw = true;
                next_char(lex);
            }
        } else if (is_char(lex, 'r')) {
            is_raw = true;
            next_char(lex);
            if (is_char(lex, 'b')) {
                is_bytes = true;
                next_char(lex);
            }
        }

        // set token kind
        if (is_bytes) {
            tok->kind = PY_TOKEN_BYTES;
        } else {
            tok->kind = PY_TOKEN_STRING;
        }

        // get first quoting character
        char quote_char = '\'';
        if (is_char(lex, '\"')) {
            quote_char = '\"';
        }
        next_char(lex);

        // work out if it's a single or triple quoted literal
        int num_quotes;
        if (is_char_and(lex, quote_char, quote_char)) {
            // triple quotes
            next_char(lex);
            next_char(lex);
            num_quotes = 3;
        } else {
            // single quotes
            num_quotes = 1;
        }

        // parse the literal
        int n_closing = 0;
        while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
            if (is_char(lex, quote_char)) {
                n_closing += 1;
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
            } else {
                n_closing = 0;
                if (!is_raw && is_char(lex, '\\')) {
                    next_char(lex);
                    unichar c = CUR_CHAR(lex);
                    switch (c) {
                        case PY_LEXER_CHAR_EOF: break; // TODO a proper error message?
                        case '\n': c = PY_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it)
                        case '\\': break;
                        case '\'': break;
                        case '"': break;
                        case 'a': c = 0x07; break;
                        case 'b': c = 0x08; break;
                        case 't': c = 0x09; break;
                        case 'n': c = 0x0a; break;
                        case 'v': c = 0x0b; break;
                        case 'f': c = 0x0c; break;
                        case 'r': c = 0x0d; break;
                        // TODO \ooo octal
                        case 'x': // TODO \xhh
                        case 'N': // TODO \N{name} only in strings
                        case 'u': // TODO \uxxxx only in strings
                        case 'U': // TODO \Uxxxxxxxx only in strings
                        default: break; // TODO error message
                    }
                    if (c != PY_LEXER_CHAR_EOF) {
                        vstr_add_char(&lex->vstr, c);
                    }
                } else {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                }
            }
            next_char(lex);
        }

        // check we got the required end quotes
        if (n_closing < num_quotes) {
            tok->kind = PY_TOKEN_LONELY_STRING_OPEN;
        }

        // cut off the end quotes from the token text
        vstr_cut_tail(&lex->vstr, n_closing);

    } else if (is_head_of_identifier(lex)) {
        tok->kind = PY_TOKEN_NAME;

        // get first char
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex) && is_tail_of_identifier(lex)) {
            vstr_add_char(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
        }

    } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
        tok->kind = PY_TOKEN_NUMBER;

        // get first char
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex)) {
            if (is_char_or(lex, 'e', 'E')) {
                vstr_add_char(&lex->vstr, 'e');
                next_char(lex);
                if (is_char(lex, '+') || is_char(lex, '-')) {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                    next_char(lex);
                }
            } else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                next_char(lex);
            } else {
                break;
            }
        }

    } else {
        // search for encoded delimiter or operator

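        // matching happens in two phases: first scan tok_enc for an entry
        // whose begin character matches the current char, counting skipped
        // token kinds in tok_enc_index; then follow the e/c/E markers to
        // consume the longest operator the input allows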
        const char *t = tok_enc;
        uint tok_enc_index = 0;
        for (; *t != 0 && !is_char(lex, *t); t += 1) {
            if (*t == 'e' || *t == 'c') {
                t += 1;
            } else if (*t == 'E') {
                tok_enc_index -= 1;
                t += 1;
            }
            tok_enc_index += 1;
        }

        next_char(lex);

        if (*t == 0) {
            // didn't match any delimiter or operator characters
            tok->kind = PY_TOKEN_INVALID;

        } else {
            // matched a delimiter or operator character

            // get the maximum characters for a valid token
            t += 1;
            uint t_index = tok_enc_index;
            for (;;) {
                for (; *t == 'e'; t += 1) {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        break;
                    }
                }

                if (*t == 'E') {
                    t += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                    } else {
                        tok->kind = PY_TOKEN_INVALID;
                    }
                    break;
                }

                if (*t == 'c') {
                    t += 1;
                    t_index += 1;
                    if (is_char(lex, *t)) {
                        next_char(lex);
                        tok_enc_index = t_index;
                        t += 1;
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }

            // set token kind
            tok->kind = tok_enc_kind[tok_enc_index];

            // compute bracket level for implicit line joining
            if (tok->kind == PY_TOKEN_DEL_PAREN_OPEN || tok->kind == PY_TOKEN_DEL_BRACKET_OPEN || tok->kind == PY_TOKEN_DEL_BRACE_OPEN) {
                lex->nested_bracket_level += 1;
            } else if (tok->kind == PY_TOKEN_DEL_PAREN_CLOSE || tok->kind == PY_TOKEN_DEL_BRACKET_CLOSE || tok->kind == PY_TOKEN_DEL_BRACE_CLOSE) {
                lex->nested_bracket_level -= 1;
            }
        }
    }

    // point token text to vstr buffer
    tok->str = vstr_str(&lex->vstr);
    tok->len = vstr_len(&lex->vstr);

    // check for keywords
    if (tok->kind == PY_TOKEN_NAME) {
        for (int i = 0; tok_kw[i] != NULL; i++) {
            if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
                tok->kind = PY_TOKEN_KW_FALSE + i;
                break;
            }
        }
    }
}

py_lexer_t *py_lexer_new(const char *src_name, void *stream_data, py_lexer_stream_next_char_t stream_next_char, py_lexer_stream_close_t stream_close) {
    py_lexer_t *lex = m_new(py_lexer_t, 1);

    lex->name = src_name; // TODO do we need to strdup this?
    lex->stream_data = stream_data;
    lex->stream_next_char = stream_next_char;
    lex->stream_close = stream_close;
    lex->line = 1;
    lex->column = 1;
    lex->emit_dent = 0;
    lex->nested_bracket_level = 0;
    lex->alloc_indent_level = 16;
    lex->num_indent_level = 1;
    lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
    lex->indent_level[0] = 0;
    vstr_init(&lex->vstr);

    // preload characters
    lex->chr0 = stream_next_char(stream_data);
    lex->chr1 = stream_next_char(stream_data);
    lex->chr2 = stream_next_char(stream_data);

    // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
    if (lex->chr0 == PY_LEXER_CHAR_EOF) {
        lex->chr0 = '\n';
    } else if (lex->chr1 == PY_LEXER_CHAR_EOF) {
        if (lex->chr0 != '\n' && lex->chr0 != '\r') {
            lex->chr1 = '\n';
        }
    } else if (lex->chr2 == PY_LEXER_CHAR_EOF) {
        if (lex->chr1 != '\n' && lex->chr1 != '\r') {
            lex->chr2 = '\n';
        }
    }

    // preload first token
    py_lexer_next_token_into(lex, &lex->tok_cur, true);

    return lex;
}

void py_lexer_free(py_lexer_t *lex) {
    if (lex) {
        if (lex->stream_close) {
            lex->stream_close(lex->stream_data);
        }
        vstr_clear(&lex->vstr);
        m_free(lex);
    }
}

void py_lexer_to_next(py_lexer_t *lex) {
    py_lexer_next_token_into(lex, &lex->tok_cur, false);
}

const py_token_t *py_lexer_cur(const py_lexer_t *lex) {
    return &lex->tok_cur;
}

bool py_lexer_is_kind(py_lexer_t *lex, py_token_kind_t kind) {
    return lex->tok_cur.kind == kind;
}
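
// a minimal driving loop for this lexer might look like the following (a
// sketch only; the stream data and callbacks are assumed to be set up by
// the caller):
//
//     py_lexer_t *lex = py_lexer_new("<stdin>", data, next_char_fn, close_fn);
//     while (!py_lexer_is_kind(lex, PY_TOKEN_END)) {
//         py_token_show(py_lexer_cur(lex));
//         py_lexer_to_next(lex);
//     }
//     py_lexer_free(lex);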

/*
bool py_lexer_is_str(py_lexer_t *lex, const char *str) {
    return py_token_is_str(&lex->tok_cur, str);
}

bool py_lexer_opt_kind(py_lexer_t *lex, py_token_kind_t kind) {
    if (py_lexer_is_kind(lex, kind)) {
        py_lexer_to_next(lex);
        return true;
    }
    return false;
}

bool py_lexer_opt_str(py_lexer_t *lex, const char *str) {
    if (py_lexer_is_str(lex, str)) {
        py_lexer_to_next(lex);
        return true;
    }
    return false;
}
*/

bool py_lexer_show_error(py_lexer_t *lex, const char *msg) {
    return py_token_show_error(&lex->tok_cur, msg);
}

bool py_lexer_show_error_pythonic(py_lexer_t *lex, const char *msg) {
    printf("  File \"%s\", line %d column %d\n%s\n", lex->tok_cur.src_name, lex->tok_cur.src_line, lex->tok_cur.src_column, msg);
    return false;
}