Blame - py/lexer.c - lite/micropython

2017-02-17 11:10:35 +1100

[diff] [blame]

28

#include <string.h>

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

29

#include <assert.h>

30

Damien George

b4b10fd

2015-01-01 23:30:53 +0000

[diff] [blame]

31

#include "py/mpstate.h"

Damien George

511c083

2016-11-16 16:22:08 +1100

[diff] [blame]

32

#include "py/reader.h"

Damien George

51dfcb4

2015-01-01 20:27:54 +0000

[diff] [blame]

33

#include "py/lexer.h"

Damien George

081f932

2015-09-07 17:08:49 +0100

[diff] [blame]

34

#include "py/runtime.h"

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

35

Damien George

dd5353a

2015-12-18 12:35:44 +0000

[diff] [blame]

36

#if MICROPY_ENABLE_COMPILER

37

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

38

#define TAB_SIZE (8)

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

39

Damien

92c0656

2013-10-22 22:32:27 +0100

[diff] [blame]

40

// TODO: it seems that CPython allows a NUL byte in the input stream;

41

// it is unclear whether that is intentional, but we don't allow it

42

Damien George

2016-11-16 18:27:20 +1100

[diff] [blame]

43

#define MP_LEXER_EOF ((unichar)MP_READER_EOF)

Damien

2013-10-20 14:41:27 +0100

[diff] [blame]

44

#define CUR_CHAR(lex) ((lex)->chr0)

45

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

46

// Report whether the current character is the end-of-input marker.
STATIC bool is_end(mp_lexer_t *lex) {
    const bool at_eof = (CUR_CHAR(lex) == MP_LEXER_EOF);
    return at_eof;
}

49

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

50

// Report whether the current character is a line feed.
STATIC bool is_physical_newline(mp_lexer_t *lex) {
    return CUR_CHAR(lex) == '\n';
}

53

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

54

// Report whether the current character equals c.
STATIC bool is_char(mp_lexer_t *lex, byte c) {
    return CUR_CHAR(lex) == c;
}

57

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

58

// Report whether the current character equals either c1 or c2.
STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
    if (lex->chr0 == c1) {
        return true;
    }
    return lex->chr0 == c2;
}

61

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

62

// Report whether the current character equals any of c1, c2 or c3.
STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
    if (lex->chr0 == c1) {
        return true;
    }
    if (lex->chr0 == c2) {
        return true;
    }
    return lex->chr0 == c3;
}

65

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

66

// Report whether the first look-ahead character (the one after the current
// character) equals c.
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
    const bool next_matches = (lex->chr1 == c);
    return next_matches;
}

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

69

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

70

// Report whether the first look-ahead character equals either c1 or c2.
STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
    if (lex->chr1 == c1) {
        return true;
    }
    return lex->chr1 == c2;
}

73

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

74

// Report whether the second look-ahead character (two past the current
// character) equals either c1 or c2.
STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
    if (lex->chr2 == c1) {
        return true;
    }
    return lex->chr2 == c2;
}

77

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

78

// Report whether the current character is c1 and the character right after
// it is c2.
STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
    if (lex->chr0 != c1) {
        return false;
    }
    return lex->chr1 == c2;
}

81

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

82

// Report whether the current character is whitespace, as decided by
// unichar_isspace().
STATIC bool is_whitespace(mp_lexer_t *lex) {
    return unichar_isspace(CUR_CHAR(lex));
}

85

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

86

// Report whether the current character is alphabetic, as decided by
// unichar_isalpha().
STATIC bool is_letter(mp_lexer_t *lex) {
    return unichar_isalpha(CUR_CHAR(lex));
}

89

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

90

// Report whether the current character is a digit, as decided by
// unichar_isdigit().
STATIC bool is_digit(mp_lexer_t *lex) {
    return unichar_isdigit(CUR_CHAR(lex));
}

93

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

94

// Report whether the first look-ahead character is a digit, as decided by
// unichar_isdigit().
STATIC bool is_following_digit(mp_lexer_t *lex) {
    const bool next_is_digit = unichar_isdigit(lex->chr1);
    return next_is_digit;
}

97

Damien George

2b00047

2015-09-07 17:33:44 +0100

[diff] [blame]

98

// Report whether the first look-ahead character is one of the integer base
// letters b, o or x, in either case.
STATIC bool is_following_base_char(mp_lexer_t *lex) {
    // OR-ing with 0x20 folds the upper-case letters onto their lower-case form
    switch (lex->chr1 | 0x20) {
        case 'b':
        case 'o':
        case 'x':
            return true;
        default:
            return false;
    }
}

102

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

103

// Report whether the first look-ahead character is an octal digit ('0'..'7').
STATIC bool is_following_odigit(mp_lexer_t *lex) {
    if (lex->chr1 < '0') {
        return false;
    }
    return lex->chr1 <= '7';
}

106

Damien George

2017-02-17 12:12:40 +1100

[diff] [blame]

107

// Report whether the upcoming characters start a string or bytes literal:
// a quote, a one-letter prefix (r, u, b) followed by a quote, or one of the
// two-letter prefixes rb / br followed by a quote.
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
    // no prefix: the literal starts directly with a quote
    if (is_char_or(lex, '\'', '\"')) {
        return true;
    }

    // one-letter prefix followed by a quote
    if (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"')) {
        return true;
    }

    // two-letter prefix followed by a quote
    if (is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) {
        return is_char_following_following_or(lex, '\'', '\"');
    }

    return false;
}

113

Damien George

2015-06-09 10:58:07 +0000

[diff] [blame]

114

// to easily parse utf-8 identifiers we allow any raw byte with high bit set

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

115

STATIC bool is_head_of_identifier(mp_lexer_t *lex) {

Damien George

2015-06-09 10:58:07 +0000

[diff] [blame]

116

return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

117

}

118

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

119

// Report whether the current character can continue an identifier: anything
// that can start one, or a digit.
STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
    if (is_head_of_identifier(lex)) {
        return true;
    }
    return is_digit(lex);
}

122

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

123

// Consume the current character: update the line/column position, shift the
// three-character window (chr0, chr1, chr2) by one and refill chr2 from the
// reader. Line endings are normalised to '\n' and a final '\n' is synthesised
// if the input does not end with one.
STATIC void next_char(mp_lexer_t *lex) {
    // account for the position change caused by the character being consumed
    switch (lex->chr0) {
        case '\n':
            // a new line
            lex->line += 1;
            lex->column = 1;
            break;
        case '\t':
            // a tab: advance to the next multiple of TAB_SIZE (columns are 1-based)
            lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
            break;
        default:
            // a character worth one column
            lex->column += 1;
            break;
    }

    // shift the look-ahead window and pull in one new byte
    lex->chr0 = lex->chr1;
    lex->chr1 = lex->chr2;
    lex->chr2 = lex->reader.readbyte(lex->reader.data);

    if (lex->chr1 == '\r') {
        // CR is a new line, converted to LF
        lex->chr1 = '\n';
        if (lex->chr2 == '\n') {
            // CR LF is a single new line, throw out the extra LF
            lex->chr2 = lex->reader.readbyte(lex->reader.data);
        }
    }

    // check if we need to insert a newline at end of file
    if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
        lex->chr2 = '\n';
    }
}

Damien George

2017-02-17 12:44:24 +1100

[diff] [blame]

155

// Push a new indentation level onto the indent stack, growing the backing
// array by MICROPY_ALLOC_LEXEL_INDENT_INC entries when it is full.
// The value is stored in a uint16_t slot.
STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
    const size_t old_alloc = lex->alloc_indent_level;
    if (lex->num_indent_level >= old_alloc) {
        const size_t new_alloc = old_alloc + MICROPY_ALLOC_LEXEL_INDENT_INC;
        lex->indent_level = m_renew(uint16_t, lex->indent_level, old_alloc, new_alloc);
        lex->alloc_indent_level = new_alloc;
    }
    lex->indent_level[lex->num_indent_level] = indent;
    lex->num_indent_level += 1;
}

162

Damien George

2017-02-17 12:44:24 +1100

[diff] [blame]

163

// Return the indentation level on top of the indent stack (the most recently
// pushed entry). The stack must not be empty.
STATIC size_t indent_top(mp_lexer_t *lex) {
    const size_t top = lex->num_indent_level - 1;
    return lex->indent_level[top];
}

166

Damien George

2014-12-05 19:35:18 +0000

[diff] [blame]

167

// Discard the top entry of the indent stack. The backing array is not shrunk.
STATIC void indent_pop(mp_lexer_t *lex) {
    lex->num_indent_level--;
}

170

171

// some tricky operator encoding:

172

// <op> = begin with <op>, if this opchar matches then begin here

173

// e<op> = end with <op>, if this opchar matches then end

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

174

// c<op> = continue with <op>, if this opchar matches then continue matching

175

// this means if the start of two ops are the same then they are equal til the last char

176

Damien George

3ff16ff

2016-05-20 12:38:15 +0100

[diff] [blame]

177

STATIC const char *const tok_enc =

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

178

"()[]{},:;@~" // singles

179

"<e=c<e=" // < <= << <<=

180

">e=c>e=" // > >= >> >>=

181

"*e=c*e=" // * *= ** **=

"+e=" // + +=

"-e=e>" // - -= ->

"&e=" // & &=

"|e=" // | |=

"/e=c/e=" // / /= // //=

187

"%e=" // % %=

188

"^e=" // ^ ^=

189

"=e=" // = ==

Damien George

2017-03-29 10:55:36 +1100

[diff] [blame]

190

"!."; // start of special cases: != . ...

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

191

192

// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

193

STATIC const uint8_t tok_enc_kind[] = {

Damien

d99b052

2013-12-21 18:17:45 +0000

[diff] [blame]

194

MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,

195

MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,

196

MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,

197

MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

198

Damien

d99b052

2013-12-21 18:17:45 +0000

[diff] [blame]

199

MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,

200

MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,

201

MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,

202

MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,

203

MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,

204

MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,

205

MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,

206

MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,

207

MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,

208

MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,

209

MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

210

};

211

212

// must have the same order as enum in lexer.h

Damien George

2017-02-17 11:10:35 +1100

[diff] [blame]

213

// must be sorted according to strcmp

Damien George

3ff16ff

2016-05-20 12:38:15 +0100

[diff] [blame]

214

STATIC const char *const tok_kw[] = {

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

215

"False",

216

"None",

217

"True",

Damien George

2017-02-17 11:10:35 +1100

[diff] [blame]

218

"__debug__",

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

219

"and",

220

"as",

221

"assert",

pohmelie

81ebba7

2016-01-27 23:23:11 +0300

[diff] [blame]

222

#if MICROPY_PY_ASYNC_AWAIT

223

"async",

224

"await",

225

#endif

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

"break",

"class",

"continue",

"def",

"del",

"elif",

"else",

"except",

"finally",

"for",

"from",

"global",

"if",

"import",

"in",

"is",

"lambda",

"nonlocal",

"not",

"or",

"pass",

"raise",

"return",

"try",

"while",

"with",

"yield",

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

253

};

254

Paul Sokolovsky

2014-01-22 22:40:02 +0200

[diff] [blame]

255

// This is called with CUR_CHAR() before first hex digit, and should return with

256

// it pointing to last hex digit

Damien George

54eb4e7

2014-07-03 13:47:47 +0100

[diff] [blame]

257

// num_digits must be greater than zero

Damien George

2017-02-17 12:44:24 +1100

[diff] [blame]

258

STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {

Damien George

54eb4e7

2014-07-03 13:47:47 +0100

[diff] [blame]

259

mp_uint_t num = 0;

Paul Sokolovsky

2014-01-22 22:40:02 +0200

[diff] [blame]

260

while (num_digits-- != 0) {

261

next_char(lex);

262

unichar c = CUR_CHAR(lex);

263

if (!unichar_isxdigit(c)) {

264

return false;

265

}

Dave Hylands

3ad94d6

2015-05-18 14:41:25 -0700

[diff] [blame]

266

num = (num << 4) + unichar_xdigit_value(c);

Paul Sokolovsky

2014-01-22 22:40:02 +0200

[diff] [blame]

}

*result = num;

return true;

}

Damien George

2017-02-17 12:12:40 +1100

[diff] [blame]

272

// Parse one string/bytes literal and append its contents to lex->vstr.
// On entry the current character is the opening quote (any prefix letters
// have already been skipped) and lex->tok_kind says whether a string or a
// bytes object is being built. If is_raw is true, backslash escapes are not
// interpreted. On problems lex->tok_kind is changed to MP_TOKEN_INVALID or
// MP_TOKEN_LONELY_STRING_OPEN.
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
    // the opening quote decides which character closes the literal
    const char quote_char = is_char(lex, '\"') ? '\"' : '\'';
    next_char(lex);

    // work out if it's a single or triple quoted literal
    size_t num_quotes = 1;
    if (is_char_and(lex, quote_char, quote_char)) {
        // triple quotes
        next_char(lex);
        next_char(lex);
        num_quotes = 3;
    }

    // n_closing counts consecutive quote characters seen so far; the literal
    // ends once num_quotes of them have been collected. A single quoted
    // literal also ends (unterminated) at a newline.
    size_t n_closing = 0;
    while (n_closing < num_quotes && !is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n'))) {
        if (is_char(lex, quote_char)) {
            // possibly part of the terminator; it is cut off again at the end
            n_closing += 1;
            vstr_add_char(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
            continue;
        }

        n_closing = 0;

        if (!is_char(lex, '\\')) {
            // Add the "character" as a byte so that we remain 8-bit clean.
            // This way, strings are parsed correctly whether or not they contain utf-8 chars.
            vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
            continue;
        }

        // a backslash: look at the character that follows it
        next_char(lex);
        unichar c = CUR_CHAR(lex);
        if (is_raw) {
            // raw strings allow escaping of quotes, but the backslash is also emitted
            vstr_add_char(&lex->vstr, '\\');
        } else {
            switch (c) {
                // note: "c" can never be MP_LEXER_EOF because next_char
                // always inserts a newline at the end of the input stream
                case '\n':
                    // backslash escape the newline, just ignore it
                    c = MP_LEXER_EOF;
                    break;
                case '\\':
                case '\'':
                case '"':
                    // these stand for themselves
                    break;
                case 'a': c = 0x07; break;
                case 'b': c = 0x08; break;
                case 't': c = 0x09; break;
                case 'n': c = 0x0a; break;
                case 'v': c = 0x0b; break;
                case 'f': c = 0x0c; break;
                case 'r': c = 0x0d; break;
                case 'u':
                case 'U':
                    if (lex->tok_kind == MP_TOKEN_BYTES) {
                        // b'\u1234' == b'\\u1234'
                        vstr_add_char(&lex->vstr, '\\');
                        break;
                    }
                    // Otherwise fall through.
                case 'x':
                {
                    // \x takes 2 hex digits, \u takes 4 and \U takes 8
                    mp_uint_t num = 0;
                    if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
                        // not enough hex chars for escape sequence
                        lex->tok_kind = MP_TOKEN_INVALID;
                    }
                    c = num;
                    break;
                }
                case 'N':
                    // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
                    // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
                    // 3MB of text; even gzip-compressed and with minimal structure, it'll take
                    // roughly half a meg of storage. This form of Unicode escape may be added
                    // later on, but it's definitely not a priority right now. -- CJA 20140607
                    mp_not_implemented("unicode name escapes");
                    break;
                default:
                    if (c >= '0' && c <= '7') {
                        // Octal sequence, 1-3 chars
                        size_t digits = 3;
                        mp_uint_t num = c - '0';
                        while (is_following_odigit(lex) && --digits != 0) {
                            next_char(lex);
                            num = num * 8 + (CUR_CHAR(lex) - '0');
                        }
                        c = num;
                    } else {
                        // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
                        vstr_add_char(&lex->vstr, '\\');
                    }
                    break;
            }
        }

        // store the resulting character, unless the escape produced nothing
        if (c != MP_LEXER_EOF) {
            if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
                if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
                    vstr_add_char(&lex->vstr, c);
                } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
                    vstr_add_byte(&lex->vstr, c);
                } else {
                    // unicode character out of range
                    // this raises a generic SyntaxError; could provide more info
                    lex->tok_kind = MP_TOKEN_INVALID;
                }
            } else {
                // without unicode everything is just added as an 8-bit byte
                if (c < 0x100) {
                    vstr_add_byte(&lex->vstr, c);
                } else {
                    // 8-bit character out of range
                    // this raises a generic SyntaxError; could provide more info
                    lex->tok_kind = MP_TOKEN_INVALID;
                }
            }
        }

        next_char(lex);
    }

    // check we got the required end quotes
    if (n_closing < num_quotes) {
        lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
    }

    // cut off the end quotes from the token text
    vstr_cut_tail_bytes(&lex->vstr, n_closing);
}

402

403

// Skip over whitespace, comments and backslash line-continuations.
// If stop_at_newline is true, skipping stops in front of a newline that is
// not inside brackets. Returns true if at least one physical newline was
// consumed (a line-continuation does not count).
STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
    bool saw_newline = false;

    while (!is_end(lex)) {
        if (is_physical_newline(lex)) {
            if (stop_at_newline && lex->nested_bracket_level == 0) {
                break;
            }
            saw_newline = true;
            next_char(lex);
            continue;
        }

        if (is_whitespace(lex)) {
            next_char(lex);
            continue;
        }

        if (is_char(lex, '#')) {
            // a comment: consume the '#' and everything up to (not including)
            // the end of the line; the newline itself is seen on the next loop
            do {
                next_char(lex);
            } while (!is_end(lex) && !is_physical_newline(lex));
            continue;
        }

        if (is_char_and(lex, '\\', '\n')) {
            // line-continuation, so the newline is not recorded
            next_char(lex);
            next_char(lex);
            continue;
        }

        // something that is neither whitespace nor a comment
        break;
    }

    return saw_newline;
}

430

431

// Advance the lexer to the next token.
// On return lex->tok_kind, lex->tok_line and lex->tok_column describe the new
// token, and lex->vstr holds the text accumulated for it (string/bytes
// contents, identifier characters or number characters).
void mp_lexer_to_next(mp_lexer_t *lex) {
    // start new token text
    vstr_reset(&lex->vstr);

    // skip white space and comments
    bool had_physical_newline = skip_whitespace(lex, false);

    // set token source information
    lex->tok_line = lex->line;
    lex->tok_column = lex->column;

    if (lex->emit_dent < 0) {
        // pending DEDENT tokens are emitted one per call
        lex->tok_kind = MP_TOKEN_DEDENT;
        lex->emit_dent += 1;

    } else if (lex->emit_dent > 0) {
        // pending INDENT tokens are emitted one per call
        lex->tok_kind = MP_TOKEN_INDENT;
        lex->emit_dent -= 1;

    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
        lex->tok_kind = MP_TOKEN_NEWLINE;

        // compare the indentation of the new line against the indent stack
        // and record how many INDENT/DEDENT tokens must follow
        size_t num_spaces = lex->column - 1;
        if (num_spaces == indent_top(lex)) {
            // same indentation, nothing to emit
        } else if (num_spaces > indent_top(lex)) {
            indent_push(lex, num_spaces);
            lex->emit_dent += 1;
        } else {
            while (num_spaces < indent_top(lex)) {
                indent_pop(lex);
                lex->emit_dent -= 1;
            }
            if (num_spaces != indent_top(lex)) {
                lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
            }
        }

    } else if (is_end(lex)) {
        lex->tok_kind = MP_TOKEN_END;

    } else if (is_string_or_bytes(lex)) {
        // a string or bytes literal

        // Python requires adjacent string/bytes literals to be automatically
        // concatenated.  We do it here in the tokeniser to make efficient use of RAM,
        // because then the lexer's vstr can be used to accumulate the string literal,
        // in contrast to creating a parse tree of strings and then joining them later
        // in the compiler.  It's also more compact in code size to do it here.

        // MP_TOKEN_END is used to indicate that this is the first string token
        lex->tok_kind = MP_TOKEN_END;

        // Loop to accumulate string/bytes literals
        do {
            // parse type codes
            bool is_raw = false;
            mp_token_kind_t kind = MP_TOKEN_STRING;
            int n_char = 0;
            if (is_char(lex, 'u')) {
                n_char = 1;
            } else if (is_char(lex, 'b')) {
                kind = MP_TOKEN_BYTES;
                n_char = 1;
                if (is_char_following(lex, 'r')) {
                    is_raw = true;
                    n_char = 2;
                }
            } else if (is_char(lex, 'r')) {
                is_raw = true;
                n_char = 1;
                if (is_char_following(lex, 'b')) {
                    kind = MP_TOKEN_BYTES;
                    n_char = 2;
                }
            }

            // Set or check token kind
            if (lex->tok_kind == MP_TOKEN_END) {
                lex->tok_kind = kind;
            } else if (lex->tok_kind != kind) {
                // Can't concatenate string with bytes
                break;
            }

            // Skip any type code characters
            if (n_char != 0) {
                next_char(lex);
                if (n_char == 2) {
                    next_char(lex);
                }
            }

            // Parse the literal
            parse_string_literal(lex, is_raw);

            // Skip whitespace so we can check if there's another string following
            skip_whitespace(lex, true);

        } while (is_string_or_bytes(lex));

    } else if (is_head_of_identifier(lex)) {
        lex->tok_kind = MP_TOKEN_NAME;

        // get first char (add as byte to remain 8-bit clean and support utf-8)
        vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex) && is_tail_of_identifier(lex)) {
            vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
        }

        // Check if the name is a keyword.
        // We also check for __debug__ here and convert it to its value.  This is
        // so the parser gives a syntax error on, eg, x.__debug__.  Otherwise, we
        // need to check for this special token in many places in the compiler.
        const char *s = vstr_null_terminated_str(&lex->vstr);
        for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
            int cmp = strcmp(s, tok_kw[i]);
            if (cmp == 0) {
                lex->tok_kind = MP_TOKEN_KW_FALSE + i;
                if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
                    lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
                }
                break;
            } else if (cmp < 0) {
                // Table is sorted and comparison was less-than, so stop searching
                break;
            }
        }

    } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
        // a number; a leading 0b/0o/0x prefix forces an integer, so that an
        // 'e' within it is not taken as an exponent marker
        bool forced_integer = false;
        if (is_char(lex, '.')) {
            lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
        } else {
            lex->tok_kind = MP_TOKEN_INTEGER;
            if (is_char(lex, '0') && is_following_base_char(lex)) {
                forced_integer = true;
            }
        }

        // get first char
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // get tail chars
        while (!is_end(lex)) {
            if (!forced_integer && is_char_or(lex, 'e', 'E')) {
                // exponent marker (stored as lower-case 'e'), with optional sign
                lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
                vstr_add_char(&lex->vstr, 'e');
                next_char(lex);
                if (is_char(lex, '+') || is_char(lex, '-')) {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                    next_char(lex);
                }
            } else if (is_letter(lex) || is_digit(lex) || is_char(lex, '.')) {
                if (is_char_or3(lex, '.', 'j', 'J')) {
                    lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
                }
                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                next_char(lex);
            } else {
                break;
            }
        }

    } else {
        // search for encoded delimiter or operator

        // find the tok_enc entry whose first character is the current one;
        // tok_enc_index counts the entries skipped on the way
        const char *t = tok_enc;
        size_t tok_enc_index = 0;
        for (; *t != 0 && !is_char(lex, *t); t += 1) {
            if (*t == 'e' || *t == 'c') {
                // skip the operator character belonging to this e/c marker
                t += 1;
            }
            tok_enc_index += 1;
        }

        next_char(lex);

        if (*t == 0) {
            // didn't match any delimiter or operator characters
            lex->tok_kind = MP_TOKEN_INVALID;

        } else if (*t == '!') {
            // "!=" is a special case because "!" is not a valid operator
            if (is_char(lex, '=')) {
                next_char(lex);
                lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL;
            } else {
                lex->tok_kind = MP_TOKEN_INVALID;
            }

        } else if (*t == '.') {
            // "." and "..." are special cases because ".." is not a valid operator
            if (is_char_and(lex, '.', '.')) {
                next_char(lex);
                next_char(lex);
                lex->tok_kind = MP_TOKEN_ELLIPSIS;
            } else {
                lex->tok_kind = MP_TOKEN_DEL_PERIOD;
            }

        } else {
            // matched a delimiter or operator character

            // get the maximum characters for a valid token
            t += 1;
            size_t t_index = tok_enc_index;
            while (*t == 'c' || *t == 'e') {
                t_index += 1;
                if (is_char(lex, t[1])) {
                    next_char(lex);
                    tok_enc_index = t_index;
                    if (*t == 'e') {
                        break;
                    }
                } else if (*t == 'c') {
                    break;
                }
                t += 2;
            }

            // set token kind
            lex->tok_kind = tok_enc_kind[tok_enc_index];

            // compute bracket level for implicit line joining
            if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
                lex->nested_bracket_level += 1;
            } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
                // Never let an unmatched closing bracket push the level off
                // zero: every "nested_bracket_level == 0" test used for
                // NEWLINE emission and line joining relies on zero meaning
                // "not inside brackets".
                if (lex->nested_bracket_level > 0) {
                    lex->nested_bracket_level -= 1;
                }
            }
        }
    }
}

668

Damien George

2016-11-16 18:27:20 +1100

[diff] [blame]

669

// Create a new lexer that reads its input from the given reader, and
// position it on the first token of the input. src_name is stored as the
// name of the source.
mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
    mp_lexer_t *lex = m_new_obj(mp_lexer_t);

    // source and position state
    lex->source_name = src_name;
    lex->reader = reader;
    lex->line = 1;
    lex->column = -2; // account for 3 dummy bytes

    // indentation and bracket tracking
    lex->emit_dent = 0;
    lex->nested_bracket_level = 0;
    lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
    lex->num_indent_level = 1;
    lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);

    // buffer for the text of the current token
    vstr_init(&lex->vstr, 32);

    // store sentinel for first indentation level
    lex->indent_level[0] = 0;

    // load lexer with start of file, advancing lex->column to 1
    // start with dummy bytes and use next_char() for proper EOL/EOF handling
    lex->chr0 = 0;
    lex->chr1 = 0;
    lex->chr2 = 0;
    for (int i = 0; i < 3; ++i) {
        next_char(lex);
    }

    // preload first token
    mp_lexer_to_next(lex);

    // Check that the first token is in the first column.  If it's not then we
    // convert the token kind to INDENT so that the parser gives a syntax error.
    if (lex->tok_column != 1) {
        lex->tok_kind = MP_TOKEN_INDENT;
    }

    return lex;
}

Damien George

2017-02-17 12:44:24 +1100

[diff] [blame]

705

// Create a lexer over the `len` bytes starting at `str`, using a memory
// reader.  `free_len` is handed unchanged to mp_reader_new_mem().
mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, size_t len, size_t free_len) {
    mp_reader_t mem_reader;
    const byte *src = (const byte *)str;
    mp_reader_new_mem(&mem_reader, src, len, free_len);
    return mp_lexer_new(src_name, mem_reader);
}

710

Damien George

8beba73

2017-01-29 15:16:51 +1100

[diff] [blame]

711

#if MICROPY_READER_POSIX || MICROPY_READER_VFS

Damien George

e5ef15a

2016-11-16 16:25:06 +1100

[diff] [blame]

712

713

mp_lexer_t *mp_lexer_new_from_file(const char *filename) {

714

mp_reader_t reader;

Damien George

2017-03-14 11:16:31 +1100

[diff] [blame]

715

mp_reader_new_file(&reader, filename);

Damien George

2016-11-16 18:27:20 +1100

[diff] [blame]

716

return mp_lexer_new(qstr_from_str(filename), reader);

Damien George

e5ef15a

2016-11-16 16:25:06 +1100

[diff] [blame]

717

}

718

Damien George

66d955c

2016-11-16 18:12:55 +1100

[diff] [blame]

719

#if MICROPY_HELPER_LEXER_UNIX

720

721

// Create a lexer that reads from the file descriptor `fd`, with `filename` as
// the source name.  `close_fd` is handed unchanged to
// mp_reader_new_file_from_fd().
mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd) {
    mp_reader_t fd_reader;
    mp_reader_new_file_from_fd(&fd_reader, fd, close_fd);
    return mp_lexer_new(filename, fd_reader);
}

#endif

Damien George

2016-11-16 16:25:06 +1100

[diff] [blame]

729

#endif

730

Damien

d99b052

2013-12-21 18:17:45 +0000

[diff] [blame]

731

// Release a lexer: close its reader, clear its vstr, and free the indent-level
// array and the lexer object itself.  Does nothing if `lex` is a null pointer.
void mp_lexer_free(mp_lexer_t *lex) {
    if (!lex) {
        return;
    }

    lex->reader.close(lex->reader.data);
    vstr_clear(&lex->vstr);
    m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);
    m_del_obj(mp_lexer_t, lex);
}

739

Damien George

c305ae3

2016-12-22 10:49:54 +1100

[diff] [blame]

740

#if 0

741

// This function is used to print the current token and should only be

742

// needed to debug the lexer, so it's not available via a config option.

Damien George

2014-12-05 19:35:18 +0000

[diff] [blame]

743

void mp_lexer_show_token(const mp_lexer_t *lex) {

Damien George

451a087

2014-12-05 22:50:16 +0000

[diff] [blame]

744

printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);

Damien George

2014-12-05 19:35:18 +0000

[diff] [blame]

745

if (lex->vstr.len > 0) {

746

const byte *i = (const byte *)lex->vstr.buf;

747

const byte *j = (const byte *)i + lex->vstr.len;

748

printf(" ");

749

while (i < j) {

750

unichar c = utf8_get_char(i);

751

i = utf8_next_char(i);

752

if (unichar_isprint(c)) {

Damien George

7f19a39

2015-06-22 17:40:12 +0100

[diff] [blame]

753

printf("%c", (int)c);

Damien George

2014-12-05 19:35:18 +0000

[diff] [blame]

} else {

printf("?");

}

}

}

printf("\n");

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

760

}

Damien George