Blame - py/lexer.c - lite/micropython

2017-02-17 11:10:35 +1100

[diff] [blame]

28

#include <string.h>

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

29

#include <assert.h>

30

Damien George

511c083

2016-11-16 16:22:08 +1100

[diff] [blame]

31

#include "py/reader.h"

Damien George

51dfcb4

2015-01-01 20:27:54 +0000

[diff] [blame]

32

#include "py/lexer.h"

Damien George

081f932

2015-09-07 17:08:49 +0100

[diff] [blame]

33

#include "py/runtime.h"

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

34

Damien George

dd5353a

2015-12-18 12:35:44 +0000

[diff] [blame]

35

#if MICROPY_ENABLE_COMPILER

36

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

37

#define TAB_SIZE (8)

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

38

Damien

92c0656

2013-10-22 22:32:27 +0100

[diff] [blame]

39

// TODO: CPython appears to allow a NUL byte in the input stream. It is not
// clear whether that is intentional; this lexer does not allow it.

41

Damien George

2016-11-16 18:27:20 +1100

[diff] [blame]

42

#define MP_LEXER_EOF ((unichar)MP_READER_EOF)

Damien

2013-10-20 14:41:27 +0100

[diff] [blame]

43

#define CUR_CHAR(lex) ((lex)->chr0)

44

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

45

// Returns true when the current character is the end-of-input marker.
STATIC bool is_end(mp_lexer_t *lex) {
    return MP_LEXER_EOF == CUR_CHAR(lex);
}

48

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

49

// Returns true when the current character is a line feed ('\n').
STATIC bool is_physical_newline(mp_lexer_t *lex) {
    return '\n' == CUR_CHAR(lex);
}

52

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

53

// Returns true when the current character equals c.
STATIC bool is_char(mp_lexer_t *lex, byte c) {
    return CUR_CHAR(lex) == c;
}

56

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

57

// Returns true when the current character equals either c1 or c2.
STATIC bool is_char_or(mp_lexer_t *lex, byte c1, byte c2) {
    if (CUR_CHAR(lex) == c1) {
        return true;
    }
    return CUR_CHAR(lex) == c2;
}

60

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

61

// Returns true when the current character equals any of c1, c2 or c3.
STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
    if (CUR_CHAR(lex) == c1) {
        return true;
    }
    if (CUR_CHAR(lex) == c2) {
        return true;
    }
    return CUR_CHAR(lex) == c3;
}

64

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

65

// Returns true when the first lookahead character (chr1) equals c.
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
    return c == lex->chr1;
}

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

68

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

69

// Returns true when the first lookahead character (chr1) equals c1 or c2.
STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
    if (lex->chr1 == c1) {
        return true;
    }
    return lex->chr1 == c2;
}

72

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

73

// Returns true when the second lookahead character (chr2) equals c1 or c2.
STATIC bool is_char_following_following_or(mp_lexer_t *lex, byte c1, byte c2) {
    if (lex->chr2 == c1) {
        return true;
    }
    return lex->chr2 == c2;
}

76

Damien George

2015-03-19 00:21:29 +0000

[diff] [blame]

77

// Returns true when the current character is c1 and the one after it is c2.
STATIC bool is_char_and(mp_lexer_t *lex, byte c1, byte c2) {
    if (CUR_CHAR(lex) != c1) {
        return false;
    }
    return lex->chr1 == c2;
}

80

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

81

// Returns the result of unichar_isspace() for the current character.
STATIC bool is_whitespace(mp_lexer_t *lex) {
    return unichar_isspace(CUR_CHAR(lex));
}

84

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

85

// Returns the result of unichar_isalpha() for the current character.
STATIC bool is_letter(mp_lexer_t *lex) {
    return unichar_isalpha(CUR_CHAR(lex));
}

88

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

89

// Returns the result of unichar_isdigit() for the current character.
STATIC bool is_digit(mp_lexer_t *lex) {
    return unichar_isdigit(CUR_CHAR(lex));
}

92

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

93

// Returns the result of unichar_isdigit() for the first lookahead character.
STATIC bool is_following_digit(mp_lexer_t *lex) {
    const bool next_is_digit = unichar_isdigit(lex->chr1);
    return next_is_digit;
}

96

Damien George

2b00047

2015-09-07 17:33:44 +0100

[diff] [blame]

97

// Returns true when the first lookahead character is one of b, o, x in either
// case, i.e. the letter that follows '0' in a base-prefixed integer literal.
STATIC bool is_following_base_char(mp_lexer_t *lex) {
    // setting bit 0x20 folds ASCII upper-case letters onto lower-case ones
    const unichar folded = lex->chr1 | 0x20;
    switch (folded) {
        case 'b':
        case 'o':
        case 'x':
            return true;
        default:
            return false;
    }
}

101

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

102

// Returns true when the first lookahead character is an octal digit ('0'..'7').
STATIC bool is_following_odigit(mp_lexer_t *lex) {
    return !(lex->chr1 < '0' || lex->chr1 > '7');
}

105

Damien George

2017-02-17 12:12:40 +1100

[diff] [blame]

106

// Returns true when the upcoming characters start a string or bytes literal:
// a quote, a one-letter prefix (r, u, b) then a quote, or a two-letter
// prefix (rb, br) then a quote.
STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
    // no prefix: the literal starts directly with a quote
    if (is_char_or(lex, '\'', '\"')) {
        return true;
    }

    // one-letter prefix followed by a quote
    if (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"')) {
        return true;
    }

    // two-letter prefix followed by a quote
    if (is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) {
        return is_char_following_following_or(lex, '\'', '\"');
    }

    return false;
}

112

Damien George

2015-06-09 10:58:07 +0000

[diff] [blame]

113

// to easily parse utf-8 identifiers we allow any raw byte with high bit set

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

114

STATIC bool is_head_of_identifier(mp_lexer_t *lex) {

Damien George

2015-06-09 10:58:07 +0000

[diff] [blame]

115

return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

116

}

117

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

118

// Returns true when the current character may continue an identifier: any
// character that may start one, or a digit.
STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
    if (is_head_of_identifier(lex)) {
        return true;
    }
    return is_digit(lex);
}

121

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

122

// Consume the current character: update line/column for the character being
// left behind, shift the three-character window (chr0, chr1, chr2) by one and
// read one new byte from the reader. Line endings are normalised here and a
// final newline is synthesised if the input does not end with one.
STATIC void next_char(mp_lexer_t *lex) {
    // position bookkeeping is driven by the character we are moving past
    switch (lex->chr0) {
        case '\n':
            // start of a new line
            ++lex->line;
            lex->column = 1;
            break;
        case '\t':
            // advance to the column just after the next tab stop
            lex->column = (((lex->column - 1 + TAB_SIZE) / TAB_SIZE) * TAB_SIZE) + 1;
            break;
        default:
            // anything else occupies a single column
            ++lex->column;
            break;
    }

    // slide the lookahead window and pull in the next byte
    lex->chr0 = lex->chr1;
    lex->chr1 = lex->chr2;
    lex->chr2 = lex->reader.readbyte(lex->reader.data);

    if (lex->chr1 == '\r') {
        // a CR counts as a newline and is rewritten to LF
        lex->chr1 = '\n';
        if (lex->chr2 == '\n') {
            // CR LF is one newline: drop the LF by reading past it
            lex->chr2 = lex->reader.readbyte(lex->reader.data);
        }
    }

    // if the stream ends without a trailing newline, insert one before EOF
    if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
        lex->chr2 = '\n';
    }
}

Damien George

2017-02-17 12:44:24 +1100

[diff] [blame]

154

// Push a new indentation width onto the indent stack, growing the backing
// array by MICROPY_ALLOC_LEXEL_INDENT_INC entries when it is full.
// Entries are stored as uint16_t.
STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
    if (lex->num_indent_level >= lex->alloc_indent_level) {
        const size_t grown = lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC;
        lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, grown);
        lex->alloc_indent_level = grown;
    }
    lex->indent_level[lex->num_indent_level] = indent;
    lex->num_indent_level += 1;
}

161

Damien George

2017-02-17 12:44:24 +1100

[diff] [blame]

162

// Return the indentation width currently on top of the indent stack.
STATIC size_t indent_top(mp_lexer_t *lex) {
    const uint16_t *past_top = lex->indent_level + lex->num_indent_level;
    return past_top[-1];
}

165

Damien George

2014-12-05 19:35:18 +0000

[diff] [blame]

166

// Discard the top entry of the indent stack (the storage is not shrunk).
STATIC void indent_pop(mp_lexer_t *lex) {
    --lex->num_indent_level;
}

169

170

// some tricky operator encoding:

171

// <op> = begin with <op>, if this opchar matches then begin here

172

// e<op> = end with <op>, if this opchar matches then end

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

173

// c<op> = continue with <op>, if this opchar matches then continue matching

174

// this means if the start of two ops are the same then they are equal til the last char

175

Damien George

3ff16ff

2016-05-20 12:38:15 +0100

[diff] [blame]

176

STATIC const char *const tok_enc =

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

177

"()[]{},:;@~" // singles

178

"<e=c<e=" // < <= << <<=

179

">e=c>e=" // > >= >> >>=

180

"*e=c*e=" // * *= ** **=

"+e=" // + +=

"-e=e>" // - -= ->

"&e=" // & &=

"|e=" // | |=

"/e=c/e=" // / /= // //=

186

"%e=" // % %=

187

"^e=" // ^ ^=

188

"=e=" // = ==

Damien George

2017-03-29 10:55:36 +1100

[diff] [blame]

189

"!."; // start of special cases: != . ...

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

190

191

// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries

Paul Sokolovsky

2014-02-12 18:31:30 +0200

[diff] [blame]

192

STATIC const uint8_t tok_enc_kind[] = {

Damien

d99b052

2013-12-21 18:17:45 +0000

[diff] [blame]

193

MP_TOKEN_DEL_PAREN_OPEN, MP_TOKEN_DEL_PAREN_CLOSE,

194

MP_TOKEN_DEL_BRACKET_OPEN, MP_TOKEN_DEL_BRACKET_CLOSE,

195

MP_TOKEN_DEL_BRACE_OPEN, MP_TOKEN_DEL_BRACE_CLOSE,

196

MP_TOKEN_DEL_COMMA, MP_TOKEN_DEL_COLON, MP_TOKEN_DEL_SEMICOLON, MP_TOKEN_DEL_AT, MP_TOKEN_OP_TILDE,

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

197

Damien

d99b052

2013-12-21 18:17:45 +0000

[diff] [blame]

198

MP_TOKEN_OP_LESS, MP_TOKEN_OP_LESS_EQUAL, MP_TOKEN_OP_DBL_LESS, MP_TOKEN_DEL_DBL_LESS_EQUAL,

199

MP_TOKEN_OP_MORE, MP_TOKEN_OP_MORE_EQUAL, MP_TOKEN_OP_DBL_MORE, MP_TOKEN_DEL_DBL_MORE_EQUAL,

200

MP_TOKEN_OP_STAR, MP_TOKEN_DEL_STAR_EQUAL, MP_TOKEN_OP_DBL_STAR, MP_TOKEN_DEL_DBL_STAR_EQUAL,

201

MP_TOKEN_OP_PLUS, MP_TOKEN_DEL_PLUS_EQUAL,

202

MP_TOKEN_OP_MINUS, MP_TOKEN_DEL_MINUS_EQUAL, MP_TOKEN_DEL_MINUS_MORE,

203

MP_TOKEN_OP_AMPERSAND, MP_TOKEN_DEL_AMPERSAND_EQUAL,

204

MP_TOKEN_OP_PIPE, MP_TOKEN_DEL_PIPE_EQUAL,

205

MP_TOKEN_OP_SLASH, MP_TOKEN_DEL_SLASH_EQUAL, MP_TOKEN_OP_DBL_SLASH, MP_TOKEN_DEL_DBL_SLASH_EQUAL,

206

MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,

207

MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,

208

MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

209

};

210

211

// must have the same order as enum in lexer.h

Damien George

2017-02-17 11:10:35 +1100

[diff] [blame]

212

// must be sorted according to strcmp

Damien George

3ff16ff

2016-05-20 12:38:15 +0100

[diff] [blame]

213

STATIC const char *const tok_kw[] = {

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

214

"False",

215

"None",

216

"True",

Damien George

2017-02-17 11:10:35 +1100

[diff] [blame]

217

"__debug__",

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

218

"and",

219

"as",

220

"assert",

pohmelie

81ebba7

2016-01-27 23:23:11 +0300

[diff] [blame]

221

#if MICROPY_PY_ASYNC_AWAIT

222

"async",

223

"await",

224

#endif

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

"break",

"class",

"continue",

"def",

"del",

"elif",

"else",

"except",

"finally",

"for",

"from",

"global",

"if",

"import",

"in",

"is",

"lambda",

"nonlocal",

"not",

"or",

"pass",

"raise",

"return",

"try",

"while",

"with",

"yield",

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

252

};

253

Paul Sokolovsky

2014-01-22 22:40:02 +0200

[diff] [blame]

254

// This is called with CUR_CHAR() before first hex digit, and should return with

255

// it pointing to last hex digit

Damien George

54eb4e7

2014-07-03 13:47:47 +0100

[diff] [blame]

256

// num_digits must be greater than zero

Damien George

2017-02-17 12:44:24 +1100

[diff] [blame]

257

STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {

Damien George

54eb4e7

2014-07-03 13:47:47 +0100

[diff] [blame]

258

mp_uint_t num = 0;

Paul Sokolovsky

2014-01-22 22:40:02 +0200

[diff] [blame]

259

while (num_digits-- != 0) {

260

next_char(lex);

261

unichar c = CUR_CHAR(lex);

262

if (!unichar_isxdigit(c)) {

263

return false;

264

}

Dave Hylands

3ad94d6

2015-05-18 14:41:25 -0700

[diff] [blame]

265

num = (num << 4) + unichar_xdigit_value(c);

Paul Sokolovsky

2014-01-22 22:40:02 +0200

[diff] [blame]

}

*result = num;

return true;

}

Damien George

2017-02-17 12:12:40 +1100

[diff] [blame]

271

// Lex one string/bytes literal, appending its decoded contents to lex->vstr.
// The current character is taken to be the opening quote (any type-code
// prefix has already been skipped by the caller). If is_raw is true, escape
// sequences are not interpreted and the backslash is kept in the output.
// lex->tok_kind is read to tell str from bytes, and is overwritten with
// MP_TOKEN_INVALID (bad escape / out-of-range character) or
// MP_TOKEN_LONELY_STRING_OPEN (missing closing quotes) on error.
STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
    // which quote character opens (and must close) the literal
    const char quote_char = is_char(lex, '\"') ? '\"' : '\'';
    next_char(lex);

    // one quote so far; two more of the same make it triple-quoted
    size_t num_quotes = 1;
    if (is_char_and(lex, quote_char, quote_char)) {
        next_char(lex);
        next_char(lex);
        num_quotes = 3;
    }

    size_t n_closing = 0;
    for (;;) {
        // stop at end of input or once all closing quotes were seen
        if (is_end(lex) || n_closing >= num_quotes) {
            break;
        }
        // a single-quoted literal cannot span a newline
        if (num_quotes == 1 && is_char(lex, '\n')) {
            break;
        }

        if (is_char(lex, quote_char)) {
            // possibly part of the terminator; it is stored now and trimmed
            // off at the end if it turns out to be a closing quote
            n_closing += 1;
            vstr_add_char(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
            continue;
        }
        n_closing = 0;

        if (!is_char(lex, '\\')) {
            // Ordinary character: add it as a byte so that we remain 8-bit
            // clean, which parses strings correctly whether or not they
            // contain utf-8 chars.
            vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
            continue;
        }

        // backslash: look at the character being escaped
        next_char(lex);
        unichar c = CUR_CHAR(lex);
        if (is_raw) {
            // raw strings allow escaping of quotes, but the backslash is also emitted
            vstr_add_char(&lex->vstr, '\\');
        } else {
            switch (c) {
                // note: "c" can never be MP_LEXER_EOF because next_char
                // always inserts a newline at the end of the input stream
                case '\n':
                    // escaped newline: emit nothing
                    c = MP_LEXER_EOF;
                    break;
                case '\\':
                case '\'':
                case '"':
                    break;
                case 'a':
                    c = 0x07;
                    break;
                case 'b':
                    c = 0x08;
                    break;
                case 't':
                    c = 0x09;
                    break;
                case 'n':
                    c = 0x0a;
                    break;
                case 'v':
                    c = 0x0b;
                    break;
                case 'f':
                    c = 0x0c;
                    break;
                case 'r':
                    c = 0x0d;
                    break;
                case 'u':
                case 'U':
                    if (lex->tok_kind == MP_TOKEN_BYTES) {
                        // b'\u1234' == b'\\u1234'
                        vstr_add_char(&lex->vstr, '\\');
                        break;
                    }
                    // Otherwise fall through.
                case 'x':
                {
                    // \x takes 2 hex digits, \u takes 4, \U takes 8
                    size_t hex_len = 8;
                    if (c == 'x') {
                        hex_len = 2;
                    } else if (c == 'u') {
                        hex_len = 4;
                    }
                    mp_uint_t num = 0;
                    if (!get_hex(lex, hex_len, &num)) {
                        // not enough hex chars for escape sequence
                        lex->tok_kind = MP_TOKEN_INVALID;
                    }
                    c = num;
                    break;
                }
                case 'N':
                    // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require
                    // keeping the entire Unicode name table in the core, which is
                    // far too large, so this form of escape is not implemented.
                    mp_raise_NotImplementedError("unicode name escapes");
                    break;
                default:
                    if (c >= '0' && c <= '7') {
                        // Octal sequence, 1-3 chars
                        size_t digits = 3;
                        mp_uint_t num = c - '0';
                        while (is_following_odigit(lex) && --digits != 0) {
                            next_char(lex);
                            num = num * 8 + (CUR_CHAR(lex) - '0');
                        }
                        c = num;
                    } else {
                        // unrecognised escape character; CPython lets this
                        // through verbatim as '\' and then the character
                        vstr_add_char(&lex->vstr, '\\');
                    }
                    break;
            }
        }

        if (c != MP_LEXER_EOF) {
            if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
                if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
                    vstr_add_char(&lex->vstr, c);
                } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
                    vstr_add_byte(&lex->vstr, c);
                } else {
                    // unicode character out of range
                    // this raises a generic SyntaxError; could provide more info
                    lex->tok_kind = MP_TOKEN_INVALID;
                }
            } else {
                // without unicode everything is just added as an 8-bit byte
                if (c < 0x100) {
                    vstr_add_byte(&lex->vstr, c);
                } else {
                    // 8-bit character out of range
                    // this raises a generic SyntaxError; could provide more info
                    lex->tok_kind = MP_TOKEN_INVALID;
                }
            }
        }

        next_char(lex);
    }

    // the literal must have been closed by the right number of quotes
    if (n_closing < num_quotes) {
        lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
    }

    // the trailing quotes were stored above; remove them from the token text
    vstr_cut_tail_bytes(&lex->vstr, n_closing);
}

401

402

// Skip whitespace, '#' comments and backslash-newline continuations.
// If stop_at_newline is true and no bracket is open, scanning stops at (and
// does not consume) the first newline. Returns true if at least one newline
// was consumed; a continuation's newline does not count.
STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
    bool seen_newline = false;
    while (!is_end(lex)) {
        if (is_physical_newline(lex)) {
            if (stop_at_newline && lex->nested_bracket_level == 0) {
                return seen_newline;
            }
            seen_newline = true;
        } else if (is_whitespace(lex)) {
            // plain whitespace: consumed by the next_char() below
        } else if (is_char(lex, '#')) {
            // comment: swallow the '#' and everything up to, but excluding,
            // the newline; the next iteration then records that newline
            do {
                next_char(lex);
            } while (!is_end(lex) && !is_physical_newline(lex));
            continue;
        } else if (is_char_and(lex, '\\', '\n')) {
            // line-continuation: drop the backslash here and the newline
            // below, without flagging a newline
            next_char(lex);
        } else {
            // start of a real token
            return seen_newline;
        }
        next_char(lex);
    }
    return seen_newline;
}

429

430

// Advance the lexer to the next token. On return lex->tok_kind holds the
// token kind, lex->tok_line / lex->tok_column its starting position, and
// lex->vstr the token text for names, numbers and string/bytes literals.
void mp_lexer_to_next(mp_lexer_t *lex) {
    // begin with empty token text
    vstr_reset(&lex->vstr);

    // skip white space and comments, noting whether a line was crossed
    const bool crossed_newline = skip_whitespace(lex, false);

    // remember where the token starts
    lex->tok_line = lex->line;
    lex->tok_column = lex->column;

    // queued DEDENT / INDENT tokens are handed out before anything else
    if (lex->emit_dent < 0) {
        lex->tok_kind = MP_TOKEN_DEDENT;
        lex->emit_dent += 1;
        return;
    }
    if (lex->emit_dent > 0) {
        lex->tok_kind = MP_TOKEN_INDENT;
        lex->emit_dent -= 1;
        return;
    }

    // a logical line ended: emit NEWLINE and queue any indentation change
    if (crossed_newline && lex->nested_bracket_level == 0) {
        lex->tok_kind = MP_TOKEN_NEWLINE;

        const size_t indent = lex->column - 1;
        if (indent > indent_top(lex)) {
            // deeper than before: one INDENT to follow
            indent_push(lex, indent);
            lex->emit_dent += 1;
        } else if (indent < indent_top(lex)) {
            // shallower: one DEDENT per level that is closed
            do {
                indent_pop(lex);
                lex->emit_dent -= 1;
            } while (indent < indent_top(lex));
            if (indent != indent_top(lex)) {
                // landed between two known indentation levels
                lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
            }
        }
        return;
    }

    if (is_end(lex)) {
        lex->tok_kind = MP_TOKEN_END;
        return;
    }

    if (is_string_or_bytes(lex)) {
        // a string or bytes literal

        // Python requires adjacent string/bytes literals to be automatically
        // concatenated. It is done here in the tokeniser so that the lexer's
        // vstr accumulates the whole literal, which uses less RAM and less
        // code than joining parse-tree nodes later in the compiler.

        // MP_TOKEN_END marks that no literal has been parsed yet
        lex->tok_kind = MP_TOKEN_END;

        do {
            // decode the type-code prefix, if any
            bool raw = false;
            mp_token_kind_t literal_kind = MP_TOKEN_STRING;
            int prefix_len = 0;
            if (is_char(lex, 'u')) {
                prefix_len = 1;
            } else if (is_char(lex, 'b')) {
                literal_kind = MP_TOKEN_BYTES;
                raw = is_char_following(lex, 'r');
                prefix_len = raw ? 2 : 1;
            } else if (is_char(lex, 'r')) {
                raw = true;
                if (is_char_following(lex, 'b')) {
                    literal_kind = MP_TOKEN_BYTES;
                    prefix_len = 2;
                } else {
                    prefix_len = 1;
                }
            }

            // the first literal fixes the kind; later ones must agree
            if (lex->tok_kind == MP_TOKEN_END) {
                lex->tok_kind = literal_kind;
            } else if (lex->tok_kind != literal_kind) {
                // can't concatenate string with bytes
                break;
            }

            // step over the prefix characters
            for (int skipped = 0; skipped < prefix_len; ++skipped) {
                next_char(lex);
            }

            parse_string_literal(lex, raw);

            // skip whitespace so we can check if another literal follows
            skip_whitespace(lex, true);
        } while (is_string_or_bytes(lex));
        return;
    }

    if (is_head_of_identifier(lex)) {
        lex->tok_kind = MP_TOKEN_NAME;

        // collect the identifier (added as bytes to remain 8-bit clean and
        // support utf-8)
        do {
            vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
        } while (!is_end(lex) && is_tail_of_identifier(lex));

        // Check if the name is a keyword.
        // __debug__ is also handled here and converted to its value, so the
        // parser gives a syntax error on, eg, x.__debug__ instead of the
        // compiler needing to check for this special token in many places.
        const char *name = vstr_null_terminated_str(&lex->vstr);
        for (size_t kw = 0; kw < MP_ARRAY_SIZE(tok_kw); ++kw) {
            const int order = strcmp(name, tok_kw[kw]);
            if (order > 0) {
                continue;
            }
            if (order == 0) {
                lex->tok_kind = MP_TOKEN_KW_FALSE + kw;
                if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
                    lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
                }
            }
            // either matched, or the sorted table can no longer contain the name
            break;
        }
        return;
    }

    if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
        // numeric literal; a 0b/0o/0x prefix forces an integer so that an
        // 'e' digit is not mistaken for an exponent
        bool base_prefixed = false;
        if (is_char(lex, '.')) {
            lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
        } else {
            lex->tok_kind = MP_TOKEN_INTEGER;
            base_prefixed = is_char(lex, '0') && is_following_base_char(lex);
        }

        // first character
        vstr_add_char(&lex->vstr, CUR_CHAR(lex));
        next_char(lex);

        // remaining characters
        while (!is_end(lex)) {
            if (!base_prefixed && is_char_or(lex, 'e', 'E')) {
                // exponent marker (stored in lower case), optional sign
                lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
                vstr_add_char(&lex->vstr, 'e');
                next_char(lex);
                if (is_char_or(lex, '+', '-')) {
                    vstr_add_char(&lex->vstr, CUR_CHAR(lex));
                    next_char(lex);
                }
                continue;
            }
            if (!(is_letter(lex) || is_digit(lex) || is_char(lex, '.'))) {
                break;
            }
            if (is_char_or3(lex, '.', 'j', 'J')) {
                lex->tok_kind = MP_TOKEN_FLOAT_OR_IMAG;
            }
            vstr_add_char(&lex->vstr, CUR_CHAR(lex));
            next_char(lex);
        }
        return;
    }

    // otherwise: search for an encoded delimiter or operator
    const char *enc = tok_enc;
    size_t kind_idx = 0;
    while (*enc != 0 && !is_char(lex, *enc)) {
        if (*enc == 'e' || *enc == 'c') {
            // step over the operand of an e/c marker as well
            enc += 1;
        }
        enc += 1;
        kind_idx += 1;
    }

    next_char(lex);

    if (*enc == 0) {
        // didn't match any delimiter or operator characters
        lex->tok_kind = MP_TOKEN_INVALID;
        return;
    }

    if (*enc == '!') {
        // "!=" is a special case because "!" is not a valid operator
        if (is_char(lex, '=')) {
            next_char(lex);
            lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL;
        } else {
            lex->tok_kind = MP_TOKEN_INVALID;
        }
        return;
    }

    if (*enc == '.') {
        // "." and "..." are special cases because ".." is not a valid operator
        if (is_char_and(lex, '.', '.')) {
            next_char(lex);
            next_char(lex);
            lex->tok_kind = MP_TOKEN_ELLIPSIS;
        } else {
            lex->tok_kind = MP_TOKEN_DEL_PERIOD;
        }
        return;
    }

    // matched the first character of a delimiter or operator; extend the
    // match to the maximum characters for a valid token
    const char *step = enc + 1;
    size_t probe_idx = kind_idx;
    while (*step == 'c' || *step == 'e') {
        const bool ends_token = (*step == 'e');
        probe_idx += 1;
        if (is_char(lex, step[1])) {
            next_char(lex);
            kind_idx = probe_idx;
            if (ends_token) {
                break;
            }
        } else if (!ends_token) {
            // a 'c' step that does not match stops the search
            break;
        }
        step += 2;
    }

    // set token kind
    lex->tok_kind = tok_enc_kind[kind_idx];

    // compute bracket level for implicit line joining
    switch (lex->tok_kind) {
        case MP_TOKEN_DEL_PAREN_OPEN:
        case MP_TOKEN_DEL_BRACKET_OPEN:
        case MP_TOKEN_DEL_BRACE_OPEN:
            lex->nested_bracket_level += 1;
            break;
        case MP_TOKEN_DEL_PAREN_CLOSE:
        case MP_TOKEN_DEL_BRACKET_CLOSE:
        case MP_TOKEN_DEL_BRACE_CLOSE:
            lex->nested_bracket_level -= 1;
            break;
        default:
            break;
    }
}

667

Damien George

2016-11-16 18:27:20 +1100

[diff] [blame]

668

// Allocate and initialise a lexer that pulls bytes from the given reader,
// with src_name recorded as its source name. The first token is already
// loaded when the lexer is returned.
mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
    mp_lexer_t *lex = m_new_obj(mp_lexer_t);

    // input source
    lex->source_name = src_name;
    lex->reader = reader;

    // position; column starts at -2 to account for the 3 dummy bytes that
    // are shifted out below
    lex->line = 1;
    lex->column = (size_t)-2;

    // indentation / bracket tracking
    lex->emit_dent = 0;
    lex->nested_bracket_level = 0;
    lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
    lex->num_indent_level = 1;
    lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);

    // token text buffer
    vstr_init(&lex->vstr, 32);

    // store sentinel for first indentation level
    lex->indent_level[0] = 0;

    // Load the lexer with the start of the file, advancing lex->column to 1.
    // The window starts out as dummy bytes and is filled via next_char() so
    // that EOL/EOF handling applies from the very first character.
    lex->chr2 = 0;
    lex->chr1 = 0;
    lex->chr0 = 0;
    for (int shift = 0; shift < 3; ++shift) {
        next_char(lex);
    }

    // preload first token
    mp_lexer_to_next(lex);

    // The first token must be in the first column. If it is not, its kind is
    // changed to INDENT so that the parser gives a syntax error.
    if (lex->tok_column != 1) {
        lex->tok_kind = MP_TOKEN_INDENT;
    }

    return lex;
}

Damien George

2017-02-17 12:44:24 +1100

[diff] [blame]

704

// Create a lexer whose input is the `len` bytes starting at `str`.
// `free_len` is handed to mp_reader_new_mem() unchanged; see that function
// for its exact meaning.
mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, size_t len, size_t free_len) {
    mp_reader_t mem_reader;
    const byte *data = (const byte*)str;

    mp_reader_new_mem(&mem_reader, data, len, free_len);
    return mp_lexer_new(src_name, mem_reader);
}

709

Damien George

8beba73

2017-01-29 15:16:51 +1100

[diff] [blame]

710

#if MICROPY_READER_POSIX || MICROPY_READER_VFS

Damien George

e5ef15a

2016-11-16 16:25:06 +1100

[diff] [blame]

711

712

mp_lexer_t *mp_lexer_new_from_file(const char *filename) {

713

mp_reader_t reader;

Damien George

2017-03-14 11:16:31 +1100

[diff] [blame]

714

mp_reader_new_file(&reader, filename);

Damien George

2016-11-16 18:27:20 +1100

[diff] [blame]

715

return mp_lexer_new(qstr_from_str(filename), reader);

Damien George

e5ef15a

2016-11-16 16:25:06 +1100

[diff] [blame]

716

}

717

Damien George

66d955c

2016-11-16 18:12:55 +1100

[diff] [blame]

718

#if MICROPY_HELPER_LEXER_UNIX

719

720

// Create a lexer that reads from the file descriptor `fd`, using `filename`
// as the source name.  `close_fd` is passed on to
// mp_reader_new_file_from_fd() as is.
mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd) {
    mp_reader_t fd_reader;

    mp_reader_new_file_from_fd(&fd_reader, fd, close_fd);

    return mp_lexer_new(filename, fd_reader);
}

#endif

Damien George

2016-11-16 16:25:06 +1100

[diff] [blame]

728

#endif

729

Damien

d99b052

2013-12-21 18:17:45 +0000

[diff] [blame]

730

// Release a lexer and everything it holds.  Passing NULL is allowed and does
// nothing.
void mp_lexer_free(mp_lexer_t *lex) {
    if (!lex) {
        return;
    }

    // close the input through the reader's own close callback
    lex->reader.close(lex->reader.data);

    // release the token text buffer and the indentation stack
    vstr_clear(&lex->vstr);
    m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);

    // finally release the lexer object itself
    m_del_obj(mp_lexer_t, lex);
}

738

Damien George

c305ae3

2016-12-22 10:49:54 +1100

[diff] [blame]

739

#if 0

740

// This function is used to print the current token and should only be

741

// needed to debug the lexer, so it's not available via a config option.

Damien George

2014-12-05 19:35:18 +0000

[diff] [blame]

742

void mp_lexer_show_token(const mp_lexer_t *lex) {

Damien George

451a087

2014-12-05 22:50:16 +0000

[diff] [blame]

743

printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%zu", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);

Damien George

2014-12-05 19:35:18 +0000

[diff] [blame]

744

if (lex->vstr.len > 0) {

745

const byte *i = (const byte *)lex->vstr.buf;

746

const byte *j = (const byte *)i + lex->vstr.len;

747

printf(" ");

748

while (i < j) {

749

unichar c = utf8_get_char(i);

750

i = utf8_next_char(i);

751

if (unichar_isprint(c)) {

Damien George

7f19a39

2015-06-22 17:40:12 +0100

[diff] [blame]

752

printf("%c", (int)c);

Damien George

2014-12-05 19:35:18 +0000

[diff] [blame]

} else {

printf("?");

}

}

}

printf("\n");

Damien

2013-10-04 19:53:11 +0100

[diff] [blame]

759

}

Damien George