blob: b1b61d74fe56d6f6fa439179506cab00f345ab88 [file] [log] [blame]
Paul Sokolovsky83865342014-06-13 00:51:34 +03001/*
2 * This file is part of the Micro Python project, http://micropython.org/
3 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 * Copyright (c) 2014 Paul Sokolovsky
8 *
9 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to deal
11 * in the Software without restriction, including without limitation the rights
12 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 * copies of the Software, and to permit persons to whom the Software is
14 * furnished to do so, subject to the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included in
17 * all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 * THE SOFTWARE.
26 */
27
28#include <stdbool.h>
29#include <string.h>
30#include <assert.h>
31
32#include "mpconfig.h"
33#include "nlr.h"
34#include "misc.h"
35#include "qstr.h"
36#include "obj.h"
37#include "runtime0.h"
38#include "runtime.h"
39#include "pfenv.h"
40#include "objstr.h"
41#include "objlist.h"
42
43STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, uint n_args, const mp_obj_t *args, mp_obj_t dict);
44const mp_obj_t mp_const_empty_bytes;
45
// Declares `str_hash` (a uint) in the enclosing scope and stores the hash of
// str_obj_in in it: qstr_hash() of the qstr value when the object is a qstr,
// otherwise the `hash` field of the mp_obj_str_t.  The expansion is a
// declaration followed by an if/else, so use it only where a declaration is
// permitted.
#define GET_STR_HASH(str_obj_in, str_hash) \
    uint str_hash; \
    if (MP_OBJ_IS_QSTR(str_obj_in)) { \
        str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); \
    } else { \
        str_hash = ((mp_obj_str_t*)str_obj_in)->hash; \
    }
48
// Declares `str_len` (a uint) in the enclosing scope and stores the length of
// str_obj_in in it: qstr_len() of the qstr value when the object is a qstr,
// otherwise the `len` field of the mp_obj_str_t.  The expansion is a
// declaration followed by an if/else.
#define GET_STR_LEN(str_obj_in, str_len) \
    uint str_len; \
    if (MP_OBJ_IS_QSTR(str_obj_in)) { \
        str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); \
    } else { \
        str_len = ((mp_obj_str_t*)str_obj_in)->len; \
    }
51
// Declares `str_data` (const byte *) and `str_len` (uint) in the enclosing
// scope and fills both in from str_obj_in: via qstr_data() when the object is
// a qstr, otherwise from the `data` and `len` fields of the mp_obj_str_t.
// The expansion is two declarations followed by an if/else.
#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) \
    const byte *str_data; \
    uint str_len; \
    if (MP_OBJ_IS_QSTR(str_obj_in)) { \
        str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); \
    } else { \
        str_len = ((mp_obj_str_t*)str_obj_in)->len; \
        str_data = ((mp_obj_str_t*)str_obj_in)->data; \
    }
54
55STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str);
56STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str);
57STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in);
58STATIC NORETURN void arg_type_mixup();
59
60STATIC bool is_str_or_bytes(mp_obj_t o) {
61 return MP_OBJ_IS_STR(o) || MP_OBJ_IS_TYPE(o, &mp_type_bytes);
62}
63
64/******************************************************************************/
65/* str */
66
Chris Angelico64b468d2014-06-04 05:28:12 +100067void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len, bool is_bytes) {
Paul Sokolovsky83865342014-06-13 00:51:34 +030068 // this escapes characters, but it will be very slow to print (calling print many times)
69 bool has_single_quote = false;
70 bool has_double_quote = false;
71 for (const byte *s = str_data, *top = str_data + str_len; !has_double_quote && s < top; s++) {
72 if (*s == '\'') {
73 has_single_quote = true;
74 } else if (*s == '"') {
75 has_double_quote = true;
76 }
77 }
78 int quote_char = '\'';
79 if (has_single_quote && !has_double_quote) {
80 quote_char = '"';
81 }
82 print(env, "%c", quote_char);
Chris Angelico64b468d2014-06-04 05:28:12 +100083 const char *s = (const char *)str_data, *top = (const char *)str_data + str_len;
84 while (s < top) {
85 unichar ch;
86 if (is_bytes) {
87 ch = *(unsigned char *)s++; // Don't sign-extend bytes
Paul Sokolovsky83865342014-06-13 00:51:34 +030088 } else {
Chris Angelico64b468d2014-06-04 05:28:12 +100089 ch = utf8_get_char(s);
90 s = utf8_next_char(s);
91 }
92 if (ch == quote_char) {
93 print(env, "\\%c", quote_char);
94 } else if (ch == '\\') {
95 print(env, "\\\\");
96 } else if (32 <= ch && ch <= 126) {
97 print(env, "%c", ch);
98 } else if (ch == '\n') {
99 print(env, "\\n");
100 } else if (ch == '\r') {
101 print(env, "\\r");
102 } else if (ch == '\t') {
103 print(env, "\\t");
104 } else if (ch < 0x100) {
105 print(env, "\\x%02x", ch);
106 } else if (ch < 0x10000) {
107 print(env, "\\u%04x", ch);
108 } else {
109 print(env, "\\U%08x", ch);
Paul Sokolovsky83865342014-06-13 00:51:34 +0300110 }
111 }
112 print(env, "%c", quote_char);
113}
114
115STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) {
116 GET_STR_DATA_LEN(self_in, str_data, str_len);
117 bool is_bytes = MP_OBJ_IS_TYPE(self_in, &mp_type_bytes);
118 if (kind == PRINT_STR && !is_bytes) {
119 print(env, "%.*s", str_len, str_data);
120 } else {
121 if (is_bytes) {
122 print(env, "b");
123 }
Chris Angelico64b468d2014-06-04 05:28:12 +1000124 mp_str_print_quoted(print, env, str_data, str_len, is_bytes);
Paul Sokolovsky83865342014-06-13 00:51:34 +0300125 }
126}
127
128STATIC mp_obj_t str_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
129#if MICROPY_CPYTHON_COMPAT
130 if (n_kw != 0) {
131 mp_arg_error_unimpl_kw();
132 }
133#endif
134
135 switch (n_args) {
136 case 0:
137 return MP_OBJ_NEW_QSTR(MP_QSTR_);
138
139 case 1:
140 {
141 vstr_t *vstr = vstr_new();
142 mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[0], PRINT_STR);
143 mp_obj_t s = mp_obj_new_str(vstr->buf, vstr->len, false);
144 vstr_free(vstr);
145 return s;
146 }
147
148 case 2:
149 case 3:
150 {
151 // TODO: validate 2nd/3rd args
152 if (!MP_OBJ_IS_TYPE(args[0], &mp_type_bytes)) {
153 nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "bytes expected"));
154 }
155 GET_STR_DATA_LEN(args[0], str_data, str_len);
156 GET_STR_HASH(args[0], str_hash);
157 mp_obj_str_t *o = mp_obj_new_str_of_type(&mp_type_str, NULL, str_len);
158 o->data = str_data;
159 o->hash = str_hash;
160 return o;
161 }
162
163 default:
164 nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "str takes at most 3 arguments"));
165 }
166}
167
168STATIC mp_obj_t bytes_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) {
169 if (n_args == 0) {
170 return mp_const_empty_bytes;
171 }
172
173#if MICROPY_CPYTHON_COMPAT
174 if (n_kw != 0) {
175 mp_arg_error_unimpl_kw();
176 }
177#endif
178
179 if (MP_OBJ_IS_STR(args[0])) {
180 if (n_args < 2 || n_args > 3) {
181 goto wrong_args;
182 }
183 GET_STR_DATA_LEN(args[0], str_data, str_len);
184 GET_STR_HASH(args[0], str_hash);
185 mp_obj_str_t *o = mp_obj_new_str_of_type(&mp_type_bytes, NULL, str_len);
186 o->data = str_data;
187 o->hash = str_hash;
188 return o;
189 }
190
191 if (n_args > 1) {
192 goto wrong_args;
193 }
194
195 if (MP_OBJ_IS_SMALL_INT(args[0])) {
196 uint len = MP_OBJ_SMALL_INT_VALUE(args[0]);
197 byte *data;
198
199 mp_obj_t o = mp_obj_str_builder_start(&mp_type_bytes, len, &data);
200 memset(data, 0, len);
201 return mp_obj_str_builder_end(o);
202 }
203
204 int len;
205 byte *data;
206 vstr_t *vstr = NULL;
207 mp_obj_t o = NULL;
208 // Try to create array of exact len if initializer len is known
209 mp_obj_t len_in = mp_obj_len_maybe(args[0]);
210 if (len_in == MP_OBJ_NULL) {
211 len = -1;
212 vstr = vstr_new();
213 } else {
214 len = MP_OBJ_SMALL_INT_VALUE(len_in);
215 o = mp_obj_str_builder_start(&mp_type_bytes, len, &data);
216 }
217
218 mp_obj_t iterable = mp_getiter(args[0]);
219 mp_obj_t item;
220 while ((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
221 if (len == -1) {
222 vstr_add_char(vstr, MP_OBJ_SMALL_INT_VALUE(item));
223 } else {
224 *data++ = MP_OBJ_SMALL_INT_VALUE(item);
225 }
226 }
227
228 if (len == -1) {
229 vstr_shrink(vstr);
230 // TODO: Optimize, borrow buffer from vstr
231 len = vstr_len(vstr);
232 o = mp_obj_str_builder_start(&mp_type_bytes, len, &data);
233 memcpy(data, vstr_str(vstr), len);
234 vstr_free(vstr);
235 }
236
237 return mp_obj_str_builder_end(o);
238
239wrong_args:
240 nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "wrong number of arguments"));
241}
242
243// like strstr but with specified length and allows \0 bytes
244// TODO replace with something more efficient/standard
245STATIC const byte *find_subbytes(const byte *haystack, machine_uint_t hlen, const byte *needle, machine_uint_t nlen, machine_int_t direction) {
246 if (hlen >= nlen) {
247 machine_uint_t str_index, str_index_end;
248 if (direction > 0) {
249 str_index = 0;
250 str_index_end = hlen - nlen;
251 } else {
252 str_index = hlen - nlen;
253 str_index_end = 0;
254 }
255 for (;;) {
256 if (memcmp(&haystack[str_index], needle, nlen) == 0) {
257 //found
258 return haystack + str_index;
259 }
260 if (str_index == str_index_end) {
261 //not found
262 break;
263 }
264 str_index += direction;
265 }
266 }
267 return NULL;
268}
269
270STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
271 GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
272 mp_obj_type_t *lhs_type = mp_obj_get_type(lhs_in);
273 mp_obj_type_t *rhs_type = mp_obj_get_type(rhs_in);
274 switch (op) {
275 case MP_BINARY_OP_ADD:
276 case MP_BINARY_OP_INPLACE_ADD:
277 if (lhs_type == rhs_type) {
278 // add 2 strings or bytes
279
280 GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
281 int alloc_len = lhs_len + rhs_len;
282
283 /* code for making qstr
284 byte *q_ptr;
285 byte *val = qstr_build_start(alloc_len, &q_ptr);
286 memcpy(val, lhs_data, lhs_len);
287 memcpy(val + lhs_len, rhs_data, rhs_len);
288 return MP_OBJ_NEW_QSTR(qstr_build_end(q_ptr));
289 */
290
291 // code for non-qstr
292 byte *data;
293 mp_obj_t s = mp_obj_str_builder_start(lhs_type, alloc_len, &data);
294 memcpy(data, lhs_data, lhs_len);
295 memcpy(data + lhs_len, rhs_data, rhs_len);
296 return mp_obj_str_builder_end(s);
297 }
298 break;
299
300 case MP_BINARY_OP_IN:
301 /* NOTE `a in b` is `b.__contains__(a)` */
302 if (lhs_type == rhs_type) {
303 GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
304 return MP_BOOL(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len, 1) != NULL);
305 }
306 break;
307
308 case MP_BINARY_OP_MULTIPLY: {
309 if (!MP_OBJ_IS_SMALL_INT(rhs_in)) {
310 return MP_OBJ_NULL; // op not supported
311 }
312 int n = MP_OBJ_SMALL_INT_VALUE(rhs_in);
313 byte *data;
314 mp_obj_t s = mp_obj_str_builder_start(lhs_type, lhs_len * n, &data);
315 mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, data);
316 return mp_obj_str_builder_end(s);
317 }
318
319 case MP_BINARY_OP_MODULO: {
320 mp_obj_t *args;
321 uint n_args;
322 mp_obj_t dict = MP_OBJ_NULL;
323 if (MP_OBJ_IS_TYPE(rhs_in, &mp_type_tuple)) {
324 // TODO: Support tuple subclasses?
325 mp_obj_tuple_get(rhs_in, &n_args, &args);
326 } else if (MP_OBJ_IS_TYPE(rhs_in, &mp_type_dict)) {
327 args = NULL;
328 n_args = 0;
329 dict = rhs_in;
330 } else {
331 args = &rhs_in;
332 n_args = 1;
333 }
334 return str_modulo_format(lhs_in, n_args, args, dict);
335 }
336
337 //case MP_BINARY_OP_NOT_EQUAL: // This is never passed here
338 case MP_BINARY_OP_EQUAL: // This will be passed only for bytes, str is dealt with in mp_obj_equal()
339 case MP_BINARY_OP_LESS:
340 case MP_BINARY_OP_LESS_EQUAL:
341 case MP_BINARY_OP_MORE:
342 case MP_BINARY_OP_MORE_EQUAL:
343 if (lhs_type == rhs_type) {
344 GET_STR_DATA_LEN(rhs_in, rhs_data, rhs_len);
345 return MP_BOOL(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));
346 }
347 if (lhs_type == &mp_type_bytes) {
348 mp_buffer_info_t bufinfo;
349 if (!mp_get_buffer(rhs_in, &bufinfo, MP_BUFFER_READ)) {
350 goto uncomparable;
351 }
352 return MP_BOOL(mp_seq_cmp_bytes(op, lhs_data, lhs_len, bufinfo.buf, bufinfo.len));
353 }
354uncomparable:
355 if (op == MP_BINARY_OP_EQUAL) {
356 return mp_const_false;
357 }
358 }
359
360 return MP_OBJ_NULL; // op not supported
361}
362
Chris Angelico64b468d2014-06-04 05:28:12 +1000363// Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or
364// be capped to the first/last character of the string, depending on is_slice.
365STATIC const char *str_index_to_ptr(const char *self_data, uint self_len, mp_obj_t index, bool is_slice) {
366 machine_int_t i;
367 // Copied from mp_get_index; I don't want bounds checking, just give me
368 // the integer as-is. (I can't bounds-check without scanning the whole
369 // string; an out-of-bounds index will be caught in the loops below.)
370 if (MP_OBJ_IS_SMALL_INT(index)) {
371 i = MP_OBJ_SMALL_INT_VALUE(index);
372 } else if (!mp_obj_get_int_maybe(index, &i)) {
373 nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "string indices must be integers, not %s", mp_obj_get_type_str(index)));
374 }
375 const char *s, *top = self_data + self_len;
376 if (i < 0)
377 {
378 // Negative indexing is performed by counting from the end of the string.
379 for (s = top - 1; i; --s) {
380 if (s < self_data) {
381 if (is_slice) {
382 return self_data;
383 }
384 nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range"));
385 }
386 if (!UTF8_IS_CONT(*s)) {
387 ++i;
388 }
389 }
390 ++s;
391 } else if (!i) {
392 return self_data; // Shortcut - str[0] is its base pointer
393 } else {
394 // Positive indexing, correspondingly, counts from the start of the string.
395 // It's assumed that negative indexing will generally be used with small
396 // absolute values (eg str[-1], not str[-1000000]), which means it'll be
397 // more efficient this way.
398 for (s = self_data; true; ++s) {
399 if (s >= top) {
400 if (is_slice) {
401 return top;
402 }
403 nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range"));
404 }
405 while (UTF8_IS_CONT(*s)) {
406 ++s;
407 }
408 if (!i--) {
409 return s;
410 }
411 }
412 }
413 return s;
414}
415
Paul Sokolovsky83865342014-06-13 00:51:34 +0300416STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
417 mp_obj_type_t *type = mp_obj_get_type(self_in);
418 GET_STR_DATA_LEN(self_in, self_data, self_len);
419 if (value == MP_OBJ_SENTINEL) {
420 // load
421#if MICROPY_PY_BUILTINS_SLICE
422 if (MP_OBJ_IS_TYPE(index, &mp_type_slice)) {
Chris Angelico64b468d2014-06-04 05:28:12 +1000423 mp_obj_t ostart, ostop, ostep;
424 mp_obj_slice_get(index, &ostart, &ostop, &ostep);
425 if (ostep != mp_const_none && ostep != MP_OBJ_NEW_SMALL_INT(1)) {
Paul Sokolovsky83865342014-06-13 00:51:34 +0300426 nlr_raise(mp_obj_new_exception_msg(&mp_type_NotImplementedError,
427 "only slices with step=1 (aka None) are supported"));
428 }
Chris Angelico64b468d2014-06-04 05:28:12 +1000429
430 if (type == &mp_type_bytes) {
431 machine_int_t start = 0, stop = self_len;
432 if (ostart != mp_const_none) {
433 start = MP_OBJ_SMALL_INT_VALUE(ostart);
434 if (start < 0) {
435 start = self_len + start;
436 }
437 }
438 if (ostop != mp_const_none) {
439 stop = MP_OBJ_SMALL_INT_VALUE(ostop);
440 if (stop < 0) {
441 stop = self_len + stop;
442 }
443 }
444 return mp_obj_new_str_of_type(type, self_data + start, stop - start);
445 }
446 const char *pstart, *pstop;
447 if (ostart != mp_const_none) {
448 pstart = str_index_to_ptr((const char *)self_data, self_len, ostart, true);
449 } else {
450 pstart = (const char *)self_data;
451 }
452 if (ostop != mp_const_none) {
453 // pstop will point just after the stop character. This depends on
454 // the \0 at the end of the string.
455 pstop = str_index_to_ptr((const char *)self_data, self_len, ostop, true);
456 } else {
457 pstop = (const char *)self_data + self_len;
458 }
459 if (pstop < pstart) {
460 return MP_OBJ_NEW_QSTR(MP_QSTR_);
461 }
462 return mp_obj_new_str_of_type(type, (const byte *)pstart, pstop - pstart);
Paul Sokolovsky83865342014-06-13 00:51:34 +0300463 }
464#endif
Paul Sokolovsky83865342014-06-13 00:51:34 +0300465 if (type == &mp_type_bytes) {
Chris Angelico64b468d2014-06-04 05:28:12 +1000466 uint index_val = mp_get_index(type, self_len, index, false);
Paul Sokolovsky83865342014-06-13 00:51:34 +0300467 return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]);
Paul Sokolovsky83865342014-06-13 00:51:34 +0300468 }
Chris Angelico64b468d2014-06-04 05:28:12 +1000469 const char *s = str_index_to_ptr((const char *)self_data, self_len, index, false);
470 int len = 1;
471 if (UTF8_IS_NONASCII(*s)) {
472 // Count the number of 1 bits (after the first)
473 for (char mask = 0x40; *s & mask; mask >>= 1) {
474 ++len;
475 }
476 }
477 return mp_obj_new_str(s, len, true); // This will create a one-character string
Paul Sokolovsky83865342014-06-13 00:51:34 +0300478 } else {
479 return MP_OBJ_NULL; // op not supported
480 }
481}
482
483STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
484 assert(is_str_or_bytes(self_in));
485 const mp_obj_type_t *self_type = mp_obj_get_type(self_in);
486
487 // get separation string
488 GET_STR_DATA_LEN(self_in, sep_str, sep_len);
489
490 // process args
491 uint seq_len;
492 mp_obj_t *seq_items;
493 if (MP_OBJ_IS_TYPE(arg, &mp_type_tuple)) {
494 mp_obj_tuple_get(arg, &seq_len, &seq_items);
495 } else {
496 if (!MP_OBJ_IS_TYPE(arg, &mp_type_list)) {
497 // arg is not a list, try to convert it to one
498 // TODO: Try to optimize?
499 arg = mp_type_list.make_new((mp_obj_t)&mp_type_list, 1, 0, &arg);
500 }
501 mp_obj_list_get(arg, &seq_len, &seq_items);
502 }
503
504 // count required length
505 int required_len = 0;
506 for (int i = 0; i < seq_len; i++) {
507 if (mp_obj_get_type(seq_items[i]) != self_type) {
508 nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError,
509 "join expects a list of str/bytes objects consistent with self object"));
510 }
511 if (i > 0) {
512 required_len += sep_len;
513 }
514 GET_STR_LEN(seq_items[i], l);
515 required_len += l;
516 }
517
518 // make joined string
519 byte *data;
520 mp_obj_t joined_str = mp_obj_str_builder_start(self_type, required_len, &data);
521 for (int i = 0; i < seq_len; i++) {
522 if (i > 0) {
523 memcpy(data, sep_str, sep_len);
524 data += sep_len;
525 }
526 GET_STR_DATA_LEN(seq_items[i], s, l);
527 memcpy(data, s, l);
528 data += l;
529 }
530
531 // return joined string
532 return mp_obj_str_builder_end(joined_str);
533}
534
535#define is_ws(c) ((c) == ' ' || (c) == '\t')
536
537STATIC mp_obj_t str_split(uint n_args, const mp_obj_t *args) {
538 const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
539 machine_int_t splits = -1;
540 mp_obj_t sep = mp_const_none;
541 if (n_args > 1) {
542 sep = args[1];
543 if (n_args > 2) {
544 splits = mp_obj_get_int(args[2]);
545 }
546 }
547
548 mp_obj_t res = mp_obj_new_list(0, NULL);
549 GET_STR_DATA_LEN(args[0], s, len);
550 const byte *top = s + len;
551
552 if (sep == mp_const_none) {
553 // sep not given, so separate on whitespace
554
555 // Initial whitespace is not counted as split, so we pre-do it
556 while (s < top && is_ws(*s)) s++;
557 while (s < top && splits != 0) {
558 const byte *start = s;
559 while (s < top && !is_ws(*s)) s++;
560 mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start));
561 if (s >= top) {
562 break;
563 }
564 while (s < top && is_ws(*s)) s++;
565 if (splits > 0) {
566 splits--;
567 }
568 }
569
570 if (s < top) {
571 mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, s, top - s));
572 }
573
574 } else {
575 // sep given
576
577 uint sep_len;
578 const char *sep_str = mp_obj_str_get_data(sep, &sep_len);
579
580 if (sep_len == 0) {
581 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "empty separator"));
582 }
583
584 for (;;) {
585 const byte *start = s;
586 for (;;) {
587 if (splits == 0 || s + sep_len > top) {
588 s = top;
589 break;
590 } else if (memcmp(s, sep_str, sep_len) == 0) {
591 break;
592 }
593 s++;
594 }
595 mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start));
596 if (s >= top) {
597 break;
598 }
599 s += sep_len;
600 if (splits > 0) {
601 splits--;
602 }
603 }
604 }
605
606 return res;
607}
608
609STATIC mp_obj_t str_rsplit(uint n_args, const mp_obj_t *args) {
610 if (n_args < 3) {
611 // If we don't have split limit, it doesn't matter from which side
612 // we split.
613 return str_split(n_args, args);
614 }
615 const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
616 mp_obj_t sep = args[1];
617 GET_STR_DATA_LEN(args[0], s, len);
618
619 machine_int_t splits = mp_obj_get_int(args[2]);
620 machine_int_t org_splits = splits;
621 // Preallocate list to the max expected # of elements, as we
622 // will fill it from the end.
623 mp_obj_list_t *res = mp_obj_new_list(splits + 1, NULL);
624 int idx = splits;
625
626 if (sep == mp_const_none) {
627 assert(!"TODO: rsplit(None,n) not implemented");
628 } else {
629 uint sep_len;
630 const char *sep_str = mp_obj_str_get_data(sep, &sep_len);
631
632 if (sep_len == 0) {
633 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "empty separator"));
634 }
635
636 const byte *beg = s;
637 const byte *last = s + len;
638 for (;;) {
639 s = last - sep_len;
640 for (;;) {
641 if (splits == 0 || s < beg) {
642 break;
643 } else if (memcmp(s, sep_str, sep_len) == 0) {
644 break;
645 }
646 s--;
647 }
648 if (s < beg || splits == 0) {
649 res->items[idx] = mp_obj_new_str_of_type(self_type, beg, last - beg);
650 break;
651 }
652 res->items[idx--] = mp_obj_new_str_of_type(self_type, s + sep_len, last - s - sep_len);
653 last = s;
654 if (splits > 0) {
655 splits--;
656 }
657 }
658 if (idx != 0) {
659 // We split less parts than split limit, now go cleanup surplus
660 int used = org_splits + 1 - idx;
661 memcpy(res->items, &res->items[idx], used * sizeof(mp_obj_t));
662 mp_seq_clear(res->items, used, res->alloc, sizeof(*res->items));
663 res->len = used;
664 }
665 }
666
667 return res;
668}
669
670
671STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t direction, bool is_index) {
672 assert(2 <= n_args && n_args <= 4);
673 assert(MP_OBJ_IS_STR(args[0]));
674 assert(MP_OBJ_IS_STR(args[1]));
675
676 GET_STR_DATA_LEN(args[0], haystack, haystack_len);
677 GET_STR_DATA_LEN(args[1], needle, needle_len);
678
679 machine_uint_t start = 0;
680 machine_uint_t end = haystack_len;
681 if (n_args >= 3 && args[2] != mp_const_none) {
682 start = mp_get_index(&mp_type_str, haystack_len, args[2], true);
683 }
684 if (n_args >= 4 && args[3] != mp_const_none) {
685 end = mp_get_index(&mp_type_str, haystack_len, args[3], true);
686 }
687
688 const byte *p = find_subbytes(haystack + start, end - start, needle, needle_len, direction);
689 if (p == NULL) {
690 // not found
691 if (is_index) {
692 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "substring not found"));
693 } else {
694 return MP_OBJ_NEW_SMALL_INT(-1);
695 }
696 } else {
697 // found
698 return MP_OBJ_NEW_SMALL_INT(p - haystack);
699 }
700}
701
702STATIC mp_obj_t str_find(uint n_args, const mp_obj_t *args) {
703 return str_finder(n_args, args, 1, false);
704}
705
706STATIC mp_obj_t str_rfind(uint n_args, const mp_obj_t *args) {
707 return str_finder(n_args, args, -1, false);
708}
709
710STATIC mp_obj_t str_index(uint n_args, const mp_obj_t *args) {
711 return str_finder(n_args, args, 1, true);
712}
713
714STATIC mp_obj_t str_rindex(uint n_args, const mp_obj_t *args) {
715 return str_finder(n_args, args, -1, true);
716}
717
718// TODO: (Much) more variety in args
719STATIC mp_obj_t str_startswith(uint n_args, const mp_obj_t *args) {
720 GET_STR_DATA_LEN(args[0], str, str_len);
721 GET_STR_DATA_LEN(args[1], prefix, prefix_len);
722 uint index_val = 0;
723 if (n_args > 2) {
724 index_val = mp_get_index(&mp_type_str, str_len, args[2], true);
725 }
726 if (prefix_len + index_val > str_len) {
727 return mp_const_false;
728 }
729 return MP_BOOL(memcmp(str + index_val, prefix, prefix_len) == 0);
730}
731
732STATIC mp_obj_t str_endswith(uint n_args, const mp_obj_t *args) {
733 GET_STR_DATA_LEN(args[0], str, str_len);
734 GET_STR_DATA_LEN(args[1], suffix, suffix_len);
735 assert(n_args == 2);
736
737 if (suffix_len > str_len) {
738 return mp_const_false;
739 }
740 return MP_BOOL(memcmp(str + (str_len - suffix_len), suffix, suffix_len) == 0);
741}
742
743enum { LSTRIP, RSTRIP, STRIP };
744
745STATIC mp_obj_t str_uni_strip(int type, uint n_args, const mp_obj_t *args) {
746 assert(1 <= n_args && n_args <= 2);
747 assert(is_str_or_bytes(args[0]));
748 const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
749
750 const byte *chars_to_del;
751 uint chars_to_del_len;
752 static const byte whitespace[] = " \t\n\r\v\f";
753
754 if (n_args == 1) {
755 chars_to_del = whitespace;
756 chars_to_del_len = sizeof(whitespace);
757 } else {
758 if (mp_obj_get_type(args[1]) != self_type) {
759 arg_type_mixup();
760 }
761 GET_STR_DATA_LEN(args[1], s, l);
762 chars_to_del = s;
763 chars_to_del_len = l;
764 }
765
766 GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);
767
768 machine_uint_t first_good_char_pos = 0;
769 bool first_good_char_pos_set = false;
770 machine_uint_t last_good_char_pos = 0;
771 machine_uint_t i = 0;
772 machine_int_t delta = 1;
773 if (type == RSTRIP) {
774 i = orig_str_len - 1;
775 delta = -1;
776 }
777 for (machine_uint_t len = orig_str_len; len > 0; len--) {
778 if (find_subbytes(chars_to_del, chars_to_del_len, &orig_str[i], 1, 1) == NULL) {
779 if (!first_good_char_pos_set) {
780 first_good_char_pos_set = true;
781 first_good_char_pos = i;
782 if (type == LSTRIP) {
783 last_good_char_pos = orig_str_len - 1;
784 break;
785 } else if (type == RSTRIP) {
786 first_good_char_pos = 0;
787 last_good_char_pos = i;
788 break;
789 }
790 }
791 last_good_char_pos = i;
792 }
793 i += delta;
794 }
795
796 if (!first_good_char_pos_set) {
797 // string is all whitespace, return ''
798 return MP_OBJ_NEW_QSTR(MP_QSTR_);
799 }
800
801 assert(last_good_char_pos >= first_good_char_pos);
802 //+1 to accomodate the last character
803 machine_uint_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
804 if (stripped_len == orig_str_len) {
805 // If nothing was stripped, don't bother to dup original string
806 // TODO: watch out for this case when we'll get to bytearray.strip()
807 assert(first_good_char_pos == 0);
808 return args[0];
809 }
810 return mp_obj_new_str_of_type(self_type, orig_str + first_good_char_pos, stripped_len);
811}
812
813STATIC mp_obj_t str_strip(uint n_args, const mp_obj_t *args) {
814 return str_uni_strip(STRIP, n_args, args);
815}
816
817STATIC mp_obj_t str_lstrip(uint n_args, const mp_obj_t *args) {
818 return str_uni_strip(LSTRIP, n_args, args);
819}
820
821STATIC mp_obj_t str_rstrip(uint n_args, const mp_obj_t *args) {
822 return str_uni_strip(RSTRIP, n_args, args);
823}
824
// Parse a run of leading decimal digits from `str`.
// Only unsigned numbers are understood.  *num receives the parsed value, and
// is left untouched when `str` does not start with a digit.  Returns the
// number of characters consumed (0 when nothing was parsed).
static int str_to_int(const char *str, int *num) {
    const char *cursor = str;
    if (!unichar_isdigit(*cursor)) {
        return cursor - str;
    }

    int value = 0;
    while (unichar_isdigit(*cursor)) {
        value = value * 10 + (*cursor - '0');
        cursor++;
    }
    *num = value;
    return cursor - str;
}
839
// True when `ch` is one of the format-spec alignment characters < > = ^.
// The NUL character is never an alignment character.
static bool isalignment(char ch) {
    switch (ch) {
        case '<':
        case '>':
        case '=':
        case '^':
            return true;
        default:
            return false;
    }
}
843
// True when `ch` is one of the format-spec presentation type characters:
// b c d e E f F g G n o s x X %.  The NUL character is never a type.
static bool istype(char ch) {
    switch (ch) {
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'E':
        case 'f':
        case 'F':
        case 'g':
        case 'G':
        case 'n':
        case 'o':
        case 's':
        case 'x':
        case 'X':
        case '%':
            return true;
        default:
            return false;
    }
}
847
848static bool arg_looks_integer(mp_obj_t arg) {
849 return MP_OBJ_IS_TYPE(arg, &mp_type_bool) || MP_OBJ_IS_INT(arg);
850}
851
852static bool arg_looks_numeric(mp_obj_t arg) {
853 return arg_looks_integer(arg)
854#if MICROPY_PY_BUILTINS_FLOAT
855 || MP_OBJ_IS_TYPE(arg, &mp_type_float)
856#endif
857 ;
858}
859
860static mp_obj_t arg_as_int(mp_obj_t arg) {
861#if MICROPY_PY_BUILTINS_FLOAT
862 if (MP_OBJ_IS_TYPE(arg, &mp_type_float)) {
863
864 // TODO: Needs a way to construct an mpz integer from a float
865
866 mp_small_int_t num = mp_obj_get_float(arg);
867 return MP_OBJ_NEW_SMALL_INT(num);
868 }
869#endif
870 return arg;
871}
872
873mp_obj_t mp_obj_str_format(uint n_args, const mp_obj_t *args) {
874 assert(MP_OBJ_IS_STR(args[0]));
875
876 GET_STR_DATA_LEN(args[0], str, len);
877 int arg_i = 0;
878 vstr_t *vstr = vstr_new();
879 pfenv_t pfenv_vstr;
880 pfenv_vstr.data = vstr;
881 pfenv_vstr.print_strn = pfenv_vstr_add_strn;
882
883 for (const byte *top = str + len; str < top; str++) {
884 if (*str == '}') {
885 str++;
886 if (str < top && *str == '}') {
887 vstr_add_char(vstr, '}');
888 continue;
889 }
890 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "single '}' encountered in format string"));
891 }
892 if (*str != '{') {
893 vstr_add_char(vstr, *str);
894 continue;
895 }
896
897 str++;
898 if (str < top && *str == '{') {
899 vstr_add_char(vstr, '{');
900 continue;
901 }
902
903 // replacement_field ::= "{" [field_name] ["!" conversion] [":" format_spec] "}"
904
905 vstr_t *field_name = NULL;
906 char conversion = '\0';
907 vstr_t *format_spec = NULL;
908
909 if (str < top && *str != '}' && *str != '!' && *str != ':') {
910 field_name = vstr_new();
911 while (str < top && *str != '}' && *str != '!' && *str != ':') {
912 vstr_add_char(field_name, *str++);
913 }
914 vstr_add_char(field_name, '\0');
915 }
916
917 // conversion ::= "r" | "s"
918
919 if (str < top && *str == '!') {
920 str++;
921 if (str < top && (*str == 'r' || *str == 's')) {
922 conversion = *str++;
923 } else {
924 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "end of format while looking for conversion specifier"));
925 }
926 }
927
928 if (str < top && *str == ':') {
929 str++;
930 // {:} is the same as {}, which is the same as {!s}
931 // This makes a difference when passing in a True or False
932 // '{}'.format(True) returns 'True'
933 // '{:d}'.format(True) returns '1'
934 // So we treat {:} as {} and this later gets treated to be {!s}
935 if (*str != '}') {
936 format_spec = vstr_new();
937 while (str < top && *str != '}') {
938 vstr_add_char(format_spec, *str++);
939 }
940 vstr_add_char(format_spec, '\0');
941 }
942 }
943 if (str >= top) {
944 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "unmatched '{' in format"));
945 }
946 if (*str != '}') {
947 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "expected ':' after format specifier"));
948 }
949
950 mp_obj_t arg = mp_const_none;
951
952 if (field_name) {
953 if (arg_i > 0) {
954 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "can't switch from automatic field numbering to manual field specification"));
955 }
956 int index = 0;
957 if (str_to_int(vstr_str(field_name), &index) != vstr_len(field_name) - 1) {
958 nlr_raise(mp_obj_new_exception_msg(&mp_type_KeyError, "attributes not supported yet"));
959 }
960 if (index >= n_args - 1) {
961 nlr_raise(mp_obj_new_exception_msg(&mp_type_IndexError, "tuple index out of range"));
962 }
963 arg = args[index + 1];
964 arg_i = -1;
965 vstr_free(field_name);
966 field_name = NULL;
967 } else {
968 if (arg_i < 0) {
969 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "can't switch from manual field specification to automatic field numbering"));
970 }
971 if (arg_i >= n_args - 1) {
972 nlr_raise(mp_obj_new_exception_msg(&mp_type_IndexError, "tuple index out of range"));
973 }
974 arg = args[arg_i + 1];
975 arg_i++;
976 }
977 if (!format_spec && !conversion) {
978 conversion = 's';
979 }
980 if (conversion) {
981 mp_print_kind_t print_kind;
982 if (conversion == 's') {
983 print_kind = PRINT_STR;
984 } else if (conversion == 'r') {
985 print_kind = PRINT_REPR;
986 } else {
987 nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_ValueError, "unknown conversion specifier %c", conversion));
988 }
989 vstr_t *arg_vstr = vstr_new();
990 mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, arg_vstr, arg, print_kind);
991 arg = mp_obj_new_str(vstr_str(arg_vstr), vstr_len(arg_vstr), false);
992 vstr_free(arg_vstr);
993 }
994
995 char sign = '\0';
996 char fill = '\0';
997 char align = '\0';
998 int width = -1;
999 int precision = -1;
1000 char type = '\0';
1001 int flags = 0;
1002
1003 if (format_spec) {
1004 // The format specifier (from http://docs.python.org/2/library/string.html#formatspec)
1005 //
1006 // [[fill]align][sign][#][0][width][,][.precision][type]
1007 // fill ::= <any character>
1008 // align ::= "<" | ">" | "=" | "^"
1009 // sign ::= "+" | "-" | " "
1010 // width ::= integer
1011 // precision ::= integer
1012 // type ::= "b" | "c" | "d" | "e" | "E" | "f" | "F" | "g" | "G" | "n" | "o" | "s" | "x" | "X" | "%"
1013
1014 const char *s = vstr_str(format_spec);
1015 if (isalignment(*s)) {
1016 align = *s++;
1017 } else if (*s && isalignment(s[1])) {
1018 fill = *s++;
1019 align = *s++;
1020 }
1021 if (*s == '+' || *s == '-' || *s == ' ') {
1022 if (*s == '+') {
1023 flags |= PF_FLAG_SHOW_SIGN;
1024 } else if (*s == ' ') {
1025 flags |= PF_FLAG_SPACE_SIGN;
1026 }
1027 sign = *s++;
1028 }
1029 if (*s == '#') {
1030 flags |= PF_FLAG_SHOW_PREFIX;
1031 s++;
1032 }
1033 if (*s == '0') {
1034 if (!align) {
1035 align = '=';
1036 }
1037 if (!fill) {
1038 fill = '0';
1039 }
1040 }
1041 s += str_to_int(s, &width);
1042 if (*s == ',') {
1043 flags |= PF_FLAG_SHOW_COMMA;
1044 s++;
1045 }
1046 if (*s == '.') {
1047 s++;
1048 s += str_to_int(s, &precision);
1049 }
1050 if (istype(*s)) {
1051 type = *s++;
1052 }
1053 if (*s) {
1054 nlr_raise(mp_obj_new_exception_msg(&mp_type_KeyError, "Invalid conversion specification"));
1055 }
1056 vstr_free(format_spec);
1057 format_spec = NULL;
1058 }
1059 if (!align) {
1060 if (arg_looks_numeric(arg)) {
1061 align = '>';
1062 } else {
1063 align = '<';
1064 }
1065 }
1066 if (!fill) {
1067 fill = ' ';
1068 }
1069
1070 if (sign) {
1071 if (type == 's') {
1072 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "Sign not allowed in string format specifier"));
1073 }
1074 if (type == 'c') {
1075 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "Sign not allowed with integer format specifier 'c'"));
1076 }
1077 } else {
1078 sign = '-';
1079 }
1080
1081 switch (align) {
1082 case '<': flags |= PF_FLAG_LEFT_ADJUST; break;
1083 case '=': flags |= PF_FLAG_PAD_AFTER_SIGN; break;
1084 case '^': flags |= PF_FLAG_CENTER_ADJUST; break;
1085 }
1086
1087 if (arg_looks_integer(arg)) {
1088 switch (type) {
1089 case 'b':
1090 pfenv_print_mp_int(&pfenv_vstr, arg, 1, 2, 'a', flags, fill, width, 0);
1091 continue;
1092
1093 case 'c':
1094 {
1095 char ch = mp_obj_get_int(arg);
1096 pfenv_print_strn(&pfenv_vstr, &ch, 1, flags, fill, width);
1097 continue;
1098 }
1099
1100 case '\0': // No explicit format type implies 'd'
1101 case 'n': // I don't think we support locales in uPy so use 'd'
1102 case 'd':
1103 pfenv_print_mp_int(&pfenv_vstr, arg, 1, 10, 'a', flags, fill, width, 0);
1104 continue;
1105
1106 case 'o':
1107 if (flags & PF_FLAG_SHOW_PREFIX) {
1108 flags |= PF_FLAG_SHOW_OCTAL_LETTER;
1109 }
1110
1111 pfenv_print_mp_int(&pfenv_vstr, arg, 1, 8, 'a', flags, fill, width, 0);
1112 continue;
1113
1114 case 'X':
1115 case 'x':
1116 pfenv_print_mp_int(&pfenv_vstr, arg, 1, 16, type - ('X' - 'A'), flags, fill, width, 0);
1117 continue;
1118
1119 case 'e':
1120 case 'E':
1121 case 'f':
1122 case 'F':
1123 case 'g':
1124 case 'G':
1125 case '%':
1126 // The floating point formatters all work with anything that
1127 // looks like an integer
1128 break;
1129
1130 default:
1131 nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_ValueError,
1132 "unknown format code '%c' for object of type '%s'", type, mp_obj_get_type_str(arg)));
1133 }
1134 }
1135
1136 // NOTE: no else here. We need the e, f, g etc formats for integer
1137 // arguments (from above if) to take this if.
1138 if (arg_looks_numeric(arg)) {
1139 if (!type) {
1140
1141 // Even though the docs say that an unspecified type is the same
1142 // as 'g', there is one subtle difference, when the exponent
1143 // is one less than the precision.
1144 //
1145 // '{:10.1}'.format(0.0) ==> '0e+00'
1146 // '{:10.1g}'.format(0.0) ==> '0'
1147 //
1148 // TODO: Figure out how to deal with this.
1149 //
1150 // A proper solution would involve adding a special flag
1151 // or something to format_float, and create a format_double
1152 // to deal with doubles. In order to fix this when using
1153 // sprintf, we'd need to use the e format and tweak the
1154 // returned result to strip trailing zeros like the g format
1155 // does.
1156 //
1157 // {:10.3} and {:10.2e} with 1.23e2 both produce 1.23e+02
1158 // but with 1.e2 you get 1e+02 and 1.00e+02
1159 //
1160 // Stripping the trailing 0's (like g) does would make the
1161 // e format give us the right format.
1162 //
1163 // CPython sources say:
1164 // Omitted type specifier. Behaves in the same way as repr(x)
1165 // and str(x) if no precision is given, else like 'g', but with
1166 // at least one digit after the decimal point. */
1167
1168 type = 'g';
1169 }
1170 if (type == 'n') {
1171 type = 'g';
1172 }
1173
1174 flags |= PF_FLAG_PAD_NAN_INF; // '{:06e}'.format(float('-inf')) should give '-00inf'
1175 switch (type) {
1176#if MICROPY_PY_BUILTINS_FLOAT
1177 case 'e':
1178 case 'E':
1179 case 'f':
1180 case 'F':
1181 case 'g':
1182 case 'G':
1183 pfenv_print_float(&pfenv_vstr, mp_obj_get_float(arg), type, flags, fill, width, precision);
1184 break;
1185
1186 case '%':
1187 flags |= PF_FLAG_ADD_PERCENT;
1188 pfenv_print_float(&pfenv_vstr, mp_obj_get_float(arg) * 100.0F, 'f', flags, fill, width, precision);
1189 break;
1190#endif
1191
1192 default:
1193 nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_ValueError,
1194 "unknown format code '%c' for object of type 'float'",
1195 type, mp_obj_get_type_str(arg)));
1196 }
1197 } else {
1198 // arg doesn't look like a number
1199
1200 if (align == '=') {
1201 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "'=' alignment not allowed in string format specifier"));
1202 }
1203
1204 switch (type) {
1205 case '\0':
1206 mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, arg, PRINT_STR);
1207 break;
1208
1209 case 's':
1210 {
1211 uint len;
1212 const char *s = mp_obj_str_get_data(arg, &len);
1213 if (precision < 0) {
1214 precision = len;
1215 }
1216 if (len > precision) {
1217 len = precision;
1218 }
1219 pfenv_print_strn(&pfenv_vstr, s, len, flags, fill, width);
1220 break;
1221 }
1222
1223 default:
1224 nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_ValueError,
1225 "unknown format code '%c' for object of type 'str'",
1226 type, mp_obj_get_type_str(arg)));
1227 }
1228 }
1229 }
1230
1231 mp_obj_t s = mp_obj_new_str(vstr->buf, vstr->len, false);
1232 vstr_free(vstr);
1233 return s;
1234}
1235
1236STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, uint n_args, const mp_obj_t *args, mp_obj_t dict) {
1237 assert(MP_OBJ_IS_STR(pattern));
1238
1239 GET_STR_DATA_LEN(pattern, str, len);
1240 const byte *start_str = str;
1241 int arg_i = 0;
1242 vstr_t *vstr = vstr_new();
1243 pfenv_t pfenv_vstr;
1244 pfenv_vstr.data = vstr;
1245 pfenv_vstr.print_strn = pfenv_vstr_add_strn;
1246
1247 for (const byte *top = str + len; str < top; str++) {
1248 mp_obj_t arg = MP_OBJ_NULL;
1249 if (*str != '%') {
1250 vstr_add_char(vstr, *str);
1251 continue;
1252 }
1253 if (++str >= top) {
1254 break;
1255 }
1256 if (*str == '%') {
1257 vstr_add_char(vstr, '%');
1258 continue;
1259 }
1260
1261 // Dictionary value lookup
1262 if (*str == '(') {
1263 const byte *key = ++str;
1264 while (*str != ')') {
1265 if (str >= top) {
1266 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "incomplete format key"));
1267 }
1268 ++str;
1269 }
1270 mp_obj_t k_obj = mp_obj_new_str((const char*)key, str - key, true);
1271 arg = mp_obj_dict_get(dict, k_obj);
1272 str++;
1273 }
1274
1275 int flags = 0;
1276 char fill = ' ';
1277 int alt = 0;
1278 while (str < top) {
1279 if (*str == '-') flags |= PF_FLAG_LEFT_ADJUST;
1280 else if (*str == '+') flags |= PF_FLAG_SHOW_SIGN;
1281 else if (*str == ' ') flags |= PF_FLAG_SPACE_SIGN;
1282 else if (*str == '#') alt = PF_FLAG_SHOW_PREFIX;
1283 else if (*str == '0') {
1284 flags |= PF_FLAG_PAD_AFTER_SIGN;
1285 fill = '0';
1286 } else break;
1287 str++;
1288 }
1289 // parse width, if it exists
1290 int width = 0;
1291 if (str < top) {
1292 if (*str == '*') {
1293 if (arg_i >= n_args) {
1294 goto not_enough_args;
1295 }
1296 width = mp_obj_get_int(args[arg_i++]);
1297 str++;
1298 } else {
1299 for (; str < top && '0' <= *str && *str <= '9'; str++) {
1300 width = width * 10 + *str - '0';
1301 }
1302 }
1303 }
1304 int prec = -1;
1305 if (str < top && *str == '.') {
1306 if (++str < top) {
1307 if (*str == '*') {
1308 if (arg_i >= n_args) {
1309 goto not_enough_args;
1310 }
1311 prec = mp_obj_get_int(args[arg_i++]);
1312 str++;
1313 } else {
1314 prec = 0;
1315 for (; str < top && '0' <= *str && *str <= '9'; str++) {
1316 prec = prec * 10 + *str - '0';
1317 }
1318 }
1319 }
1320 }
1321
1322 if (str >= top) {
1323 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "incomplete format"));
1324 }
1325
1326 // Tuple value lookup
1327 if (arg == MP_OBJ_NULL) {
1328 if (arg_i >= n_args) {
1329not_enough_args:
1330 nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "not enough arguments for format string"));
1331 }
1332 arg = args[arg_i++];
1333 }
1334 switch (*str) {
1335 case 'c':
1336 if (MP_OBJ_IS_STR(arg)) {
1337 uint len;
1338 const char *s = mp_obj_str_get_data(arg, &len);
1339 if (len != 1) {
1340 nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "%%c requires int or char"));
1341 break;
1342 }
1343 pfenv_print_strn(&pfenv_vstr, s, 1, flags, ' ', width);
1344 break;
1345 }
1346 if (arg_looks_integer(arg)) {
1347 char ch = mp_obj_get_int(arg);
1348 pfenv_print_strn(&pfenv_vstr, &ch, 1, flags, ' ', width);
1349 break;
1350 }
1351#if MICROPY_PY_BUILTINS_FLOAT
1352 // This is what CPython reports, so we report the same.
1353 if (MP_OBJ_IS_TYPE(arg, &mp_type_float)) {
1354 nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "integer argument expected, got float"));
1355
1356 }
1357#endif
1358 nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "an integer is required"));
1359 break;
1360
1361 case 'd':
1362 case 'i':
1363 case 'u':
1364 pfenv_print_mp_int(&pfenv_vstr, arg_as_int(arg), 1, 10, 'a', flags, fill, width, prec);
1365 break;
1366
1367#if MICROPY_PY_BUILTINS_FLOAT
1368 case 'e':
1369 case 'E':
1370 case 'f':
1371 case 'F':
1372 case 'g':
1373 case 'G':
1374 pfenv_print_float(&pfenv_vstr, mp_obj_get_float(arg), *str, flags, fill, width, prec);
1375 break;
1376#endif
1377
1378 case 'o':
1379 if (alt) {
1380 flags |= (PF_FLAG_SHOW_PREFIX | PF_FLAG_SHOW_OCTAL_LETTER);
1381 }
1382 pfenv_print_mp_int(&pfenv_vstr, arg, 1, 8, 'a', flags, fill, width, prec);
1383 break;
1384
1385 case 'r':
1386 case 's':
1387 {
1388 vstr_t *arg_vstr = vstr_new();
1389 mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf,
1390 arg_vstr, arg, *str == 'r' ? PRINT_REPR : PRINT_STR);
1391 uint len = vstr_len(arg_vstr);
1392 if (prec < 0) {
1393 prec = len;
1394 }
1395 if (len > prec) {
1396 len = prec;
1397 }
1398 pfenv_print_strn(&pfenv_vstr, vstr_str(arg_vstr), len, flags, ' ', width);
1399 vstr_free(arg_vstr);
1400 break;
1401 }
1402
1403 case 'X':
1404 case 'x':
1405 pfenv_print_mp_int(&pfenv_vstr, arg, 1, 16, *str - ('X' - 'A'), flags | alt, fill, width, prec);
1406 break;
1407
1408 default:
1409 nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_ValueError,
1410 "unsupported format character '%c' (0x%x) at index %d",
1411 *str, *str, str - start_str));
1412 }
1413 }
1414
1415 if (arg_i != n_args) {
1416 nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "not all arguments converted during string formatting"));
1417 }
1418
1419 mp_obj_t s = mp_obj_new_str(vstr->buf, vstr->len, false);
1420 vstr_free(vstr);
1421 return s;
1422}
1423
1424STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
1425 assert(MP_OBJ_IS_STR(args[0]));
1426
1427 machine_int_t max_rep = -1;
1428 if (n_args == 4) {
1429 max_rep = mp_obj_get_int(args[3]);
1430 if (max_rep == 0) {
1431 return args[0];
1432 } else if (max_rep < 0) {
1433 max_rep = -1;
1434 }
1435 }
1436
1437 // if max_rep is still -1 by this point we will need to do all possible replacements
1438
1439 // check argument types
1440
1441 if (!MP_OBJ_IS_STR(args[1])) {
1442 bad_implicit_conversion(args[1]);
1443 }
1444
1445 if (!MP_OBJ_IS_STR(args[2])) {
1446 bad_implicit_conversion(args[2]);
1447 }
1448
1449 // extract string data
1450
1451 GET_STR_DATA_LEN(args[0], str, str_len);
1452 GET_STR_DATA_LEN(args[1], old, old_len);
1453 GET_STR_DATA_LEN(args[2], new, new_len);
1454
1455 // old won't exist in str if it's longer, so nothing to replace
1456 if (old_len > str_len) {
1457 return args[0];
1458 }
1459
1460 // data for the replaced string
1461 byte *data = NULL;
1462 mp_obj_t replaced_str = MP_OBJ_NULL;
1463
1464 // do 2 passes over the string:
1465 // first pass computes the required length of the replaced string
1466 // second pass does the replacements
1467 for (;;) {
1468 machine_uint_t replaced_str_index = 0;
1469 machine_uint_t num_replacements_done = 0;
1470 const byte *old_occurrence;
1471 const byte *offset_ptr = str;
1472 machine_uint_t str_len_remain = str_len;
1473 if (old_len == 0) {
1474 // if old_str is empty, copy new_str to start of replaced string
1475 // copy the replacement string
1476 if (data != NULL) {
1477 memcpy(data, new, new_len);
1478 }
1479 replaced_str_index += new_len;
1480 num_replacements_done++;
1481 }
1482 while (num_replacements_done != max_rep && str_len_remain > 0 && (old_occurrence = find_subbytes(offset_ptr, str_len_remain, old, old_len, 1)) != NULL) {
1483 if (old_len == 0) {
1484 old_occurrence += 1;
1485 }
1486 // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
1487 if (data != NULL) {
1488 memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
1489 }
1490 replaced_str_index += old_occurrence - offset_ptr;
1491 // copy the replacement string
1492 if (data != NULL) {
1493 memcpy(data + replaced_str_index, new, new_len);
1494 }
1495 replaced_str_index += new_len;
1496 offset_ptr = old_occurrence + old_len;
1497 str_len_remain = str + str_len - offset_ptr;
1498 num_replacements_done++;
1499 }
1500
1501 // copy from just after end of last occurrence of to-be-replaced string to end of old string
1502 if (data != NULL) {
1503 memcpy(data + replaced_str_index, offset_ptr, str_len_remain);
1504 }
1505 replaced_str_index += str_len_remain;
1506
1507 if (data == NULL) {
1508 // first pass
1509 if (num_replacements_done == 0) {
1510 // no substr found, return original string
1511 return args[0];
1512 } else {
1513 // substr found, allocate new string
1514 replaced_str = mp_obj_str_builder_start(mp_obj_get_type(args[0]), replaced_str_index, &data);
1515 assert(data != NULL);
1516 }
1517 } else {
1518 // second pass, we are done
1519 break;
1520 }
1521 }
1522
1523 return mp_obj_str_builder_end(replaced_str);
1524}
1525
1526STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
1527 assert(2 <= n_args && n_args <= 4);
1528 assert(MP_OBJ_IS_STR(args[0]));
1529 assert(MP_OBJ_IS_STR(args[1]));
1530
1531 GET_STR_DATA_LEN(args[0], haystack, haystack_len);
1532 GET_STR_DATA_LEN(args[1], needle, needle_len);
1533
1534 machine_uint_t start = 0;
1535 machine_uint_t end = haystack_len;
1536 if (n_args >= 3 && args[2] != mp_const_none) {
1537 start = mp_get_index(&mp_type_str, haystack_len, args[2], true);
1538 }
1539 if (n_args >= 4 && args[3] != mp_const_none) {
1540 end = mp_get_index(&mp_type_str, haystack_len, args[3], true);
1541 }
1542
1543 // if needle_len is zero then we count each gap between characters as an occurrence
1544 if (needle_len == 0) {
1545 return MP_OBJ_NEW_SMALL_INT(end - start + 1);
1546 }
1547
1548 // count the occurrences
1549 machine_int_t num_occurrences = 0;
1550 for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) {
1551 if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) {
1552 num_occurrences++;
1553 haystack_index += needle_len - 1;
1554 }
1555 }
1556
1557 return MP_OBJ_NEW_SMALL_INT(num_occurrences);
1558}
1559
1560STATIC mp_obj_t str_partitioner(mp_obj_t self_in, mp_obj_t arg, machine_int_t direction) {
1561 if (!is_str_or_bytes(self_in)) {
1562 assert(0);
1563 }
1564 mp_obj_type_t *self_type = mp_obj_get_type(self_in);
1565 if (self_type != mp_obj_get_type(arg)) {
1566 arg_type_mixup();
1567 }
1568
1569 GET_STR_DATA_LEN(self_in, str, str_len);
1570 GET_STR_DATA_LEN(arg, sep, sep_len);
1571
1572 if (sep_len == 0) {
1573 nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "empty separator"));
1574 }
1575
1576 mp_obj_t result[] = {MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_), MP_OBJ_NEW_QSTR(MP_QSTR_)};
1577
1578 if (direction > 0) {
1579 result[0] = self_in;
1580 } else {
1581 result[2] = self_in;
1582 }
1583
1584 const byte *position_ptr = find_subbytes(str, str_len, sep, sep_len, direction);
1585 if (position_ptr != NULL) {
1586 machine_uint_t position = position_ptr - str;
1587 result[0] = mp_obj_new_str_of_type(self_type, str, position);
1588 result[1] = arg;
1589 result[2] = mp_obj_new_str_of_type(self_type, str + position + sep_len, str_len - position - sep_len);
1590 }
1591
1592 return mp_obj_new_tuple(3, result);
1593}
1594
1595STATIC mp_obj_t str_partition(mp_obj_t self_in, mp_obj_t arg) {
1596 return str_partitioner(self_in, arg, 1);
1597}
1598
1599STATIC mp_obj_t str_rpartition(mp_obj_t self_in, mp_obj_t arg) {
1600 return str_partitioner(self_in, arg, -1);
1601}
1602
1603// Supposedly not too critical operations, so optimize for code size
1604STATIC mp_obj_t str_caseconv(unichar (*op)(unichar), mp_obj_t self_in) {
1605 GET_STR_DATA_LEN(self_in, self_data, self_len);
1606 byte *data;
1607 mp_obj_t s = mp_obj_str_builder_start(mp_obj_get_type(self_in), self_len, &data);
1608 for (int i = 0; i < self_len; i++) {
1609 *data++ = op(*self_data++);
1610 }
1611 *data = 0;
1612 return mp_obj_str_builder_end(s);
1613}
1614
1615STATIC mp_obj_t str_lower(mp_obj_t self_in) {
1616 return str_caseconv(unichar_tolower, self_in);
1617}
1618
1619STATIC mp_obj_t str_upper(mp_obj_t self_in) {
1620 return str_caseconv(unichar_toupper, self_in);
1621}
1622
1623STATIC mp_obj_t str_uni_istype(bool (*f)(unichar), mp_obj_t self_in) {
1624 GET_STR_DATA_LEN(self_in, self_data, self_len);
1625
1626 if (self_len == 0) {
1627 return mp_const_false; // default to False for empty str
1628 }
1629
1630 if (f != unichar_isupper && f != unichar_islower) {
1631 for (int i = 0; i < self_len; i++) {
1632 if (!f(*self_data++)) {
1633 return mp_const_false;
1634 }
1635 }
1636 } else {
1637 bool contains_alpha = false;
1638
1639 for (int i = 0; i < self_len; i++) { // only check alphanumeric characters
1640 if (unichar_isalpha(*self_data++)) {
1641 contains_alpha = true;
1642 if (!f(*(self_data - 1))) { // -1 because we already incremented above
1643 return mp_const_false;
1644 }
1645 }
1646 }
1647
1648 if (!contains_alpha) {
1649 return mp_const_false;
1650 }
1651 }
1652
1653 return mp_const_true;
1654}
1655
1656STATIC mp_obj_t str_isspace(mp_obj_t self_in) {
1657 return str_uni_istype(unichar_isspace, self_in);
1658}
1659
1660STATIC mp_obj_t str_isalpha(mp_obj_t self_in) {
1661 return str_uni_istype(unichar_isalpha, self_in);
1662}
1663
1664STATIC mp_obj_t str_isdigit(mp_obj_t self_in) {
1665 return str_uni_istype(unichar_isdigit, self_in);
1666}
1667
1668STATIC mp_obj_t str_isupper(mp_obj_t self_in) {
1669 return str_uni_istype(unichar_isupper, self_in);
1670}
1671
1672STATIC mp_obj_t str_islower(mp_obj_t self_in) {
1673 return str_uni_istype(unichar_islower, self_in);
1674}
1675
1676#if MICROPY_CPYTHON_COMPAT
1677// These methods are superfluous in the presense of str() and bytes()
1678// constructors.
1679// TODO: should accept kwargs too
1680STATIC mp_obj_t bytes_decode(uint n_args, const mp_obj_t *args) {
1681 mp_obj_t new_args[2];
1682 if (n_args == 1) {
1683 new_args[0] = args[0];
1684 new_args[1] = MP_OBJ_NEW_QSTR(MP_QSTR_utf_hyphen_8);
1685 args = new_args;
1686 n_args++;
1687 }
1688 return str_make_new(NULL, n_args, 0, args);
1689}
1690
1691// TODO: should accept kwargs too
1692STATIC mp_obj_t str_encode(uint n_args, const mp_obj_t *args) {
1693 mp_obj_t new_args[2];
1694 if (n_args == 1) {
1695 new_args[0] = args[0];
1696 new_args[1] = MP_OBJ_NEW_QSTR(MP_QSTR_utf_hyphen_8);
1697 args = new_args;
1698 n_args++;
1699 }
1700 return bytes_make_new(NULL, n_args, 0, args);
1701}
1702#endif
1703
1704STATIC machine_int_t str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, int flags) {
1705 if (flags == MP_BUFFER_READ) {
1706 GET_STR_DATA_LEN(self_in, str_data, str_len);
1707 bufinfo->buf = (void*)str_data;
1708 bufinfo->len = str_len;
1709 bufinfo->typecode = 'b';
1710 return 0;
1711 } else {
1712 // can't write to a string
1713 bufinfo->buf = NULL;
1714 bufinfo->len = 0;
1715 bufinfo->typecode = -1;
1716 return 1;
1717 }
1718}
1719
1720#if MICROPY_CPYTHON_COMPAT
1721STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(bytes_decode_obj, 1, 3, bytes_decode);
1722STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_encode_obj, 1, 3, str_encode);
1723#endif
1724STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
1725STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rfind_obj, 2, 4, str_rfind);
1726STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_index_obj, 2, 4, str_index);
1727STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rindex_obj, 2, 4, str_rindex);
1728STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
1729STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split);
1730STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rsplit_obj, 1, 3, str_rsplit);
1731STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_startswith_obj, 2, 3, str_startswith);
1732STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_endswith_obj, 2, 3, str_endswith);
1733STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
1734STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_lstrip_obj, 1, 2, str_lstrip);
1735STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rstrip_obj, 1, 2, str_rstrip);
1736STATIC MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, mp_obj_str_format);
1737STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace);
1738STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count);
1739STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition);
1740STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition);
1741STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_lower_obj, str_lower);
1742STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_upper_obj, str_upper);
1743STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_isspace_obj, str_isspace);
1744STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_isalpha_obj, str_isalpha);
1745STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_isdigit_obj, str_isdigit);
1746STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_isupper_obj, str_isupper);
1747STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_islower_obj, str_islower);
1748
1749STATIC const mp_map_elem_t str_locals_dict_table[] = {
1750#if MICROPY_CPYTHON_COMPAT
1751 { MP_OBJ_NEW_QSTR(MP_QSTR_decode), (mp_obj_t)&bytes_decode_obj },
1752 { MP_OBJ_NEW_QSTR(MP_QSTR_encode), (mp_obj_t)&str_encode_obj },
1753#endif
1754 { MP_OBJ_NEW_QSTR(MP_QSTR_find), (mp_obj_t)&str_find_obj },
1755 { MP_OBJ_NEW_QSTR(MP_QSTR_rfind), (mp_obj_t)&str_rfind_obj },
1756 { MP_OBJ_NEW_QSTR(MP_QSTR_index), (mp_obj_t)&str_index_obj },
1757 { MP_OBJ_NEW_QSTR(MP_QSTR_rindex), (mp_obj_t)&str_rindex_obj },
1758 { MP_OBJ_NEW_QSTR(MP_QSTR_join), (mp_obj_t)&str_join_obj },
1759 { MP_OBJ_NEW_QSTR(MP_QSTR_split), (mp_obj_t)&str_split_obj },
1760 { MP_OBJ_NEW_QSTR(MP_QSTR_rsplit), (mp_obj_t)&str_rsplit_obj },
1761 { MP_OBJ_NEW_QSTR(MP_QSTR_startswith), (mp_obj_t)&str_startswith_obj },
1762 { MP_OBJ_NEW_QSTR(MP_QSTR_endswith), (mp_obj_t)&str_endswith_obj },
1763 { MP_OBJ_NEW_QSTR(MP_QSTR_strip), (mp_obj_t)&str_strip_obj },
1764 { MP_OBJ_NEW_QSTR(MP_QSTR_lstrip), (mp_obj_t)&str_lstrip_obj },
1765 { MP_OBJ_NEW_QSTR(MP_QSTR_rstrip), (mp_obj_t)&str_rstrip_obj },
1766 { MP_OBJ_NEW_QSTR(MP_QSTR_format), (mp_obj_t)&str_format_obj },
1767 { MP_OBJ_NEW_QSTR(MP_QSTR_replace), (mp_obj_t)&str_replace_obj },
1768 { MP_OBJ_NEW_QSTR(MP_QSTR_count), (mp_obj_t)&str_count_obj },
1769 { MP_OBJ_NEW_QSTR(MP_QSTR_partition), (mp_obj_t)&str_partition_obj },
1770 { MP_OBJ_NEW_QSTR(MP_QSTR_rpartition), (mp_obj_t)&str_rpartition_obj },
1771 { MP_OBJ_NEW_QSTR(MP_QSTR_lower), (mp_obj_t)&str_lower_obj },
1772 { MP_OBJ_NEW_QSTR(MP_QSTR_upper), (mp_obj_t)&str_upper_obj },
1773 { MP_OBJ_NEW_QSTR(MP_QSTR_isspace), (mp_obj_t)&str_isspace_obj },
1774 { MP_OBJ_NEW_QSTR(MP_QSTR_isalpha), (mp_obj_t)&str_isalpha_obj },
1775 { MP_OBJ_NEW_QSTR(MP_QSTR_isdigit), (mp_obj_t)&str_isdigit_obj },
1776 { MP_OBJ_NEW_QSTR(MP_QSTR_isupper), (mp_obj_t)&str_isupper_obj },
1777 { MP_OBJ_NEW_QSTR(MP_QSTR_islower), (mp_obj_t)&str_islower_obj },
1778};
1779
1780STATIC MP_DEFINE_CONST_DICT(str_locals_dict, str_locals_dict_table);
1781
1782const mp_obj_type_t mp_type_str = {
1783 { &mp_type_type },
1784 .name = MP_QSTR_str,
1785 .print = str_print,
1786 .make_new = str_make_new,
1787 .binary_op = str_binary_op,
1788 .subscr = str_subscr,
1789 .getiter = mp_obj_new_str_iterator,
1790 .buffer_p = { .get_buffer = str_get_buffer },
1791 .locals_dict = (mp_obj_t)&str_locals_dict,
1792};
1793
1794// Reuses most of methods from str
1795const mp_obj_type_t mp_type_bytes = {
1796 { &mp_type_type },
1797 .name = MP_QSTR_bytes,
1798 .print = str_print,
1799 .make_new = bytes_make_new,
1800 .binary_op = str_binary_op,
1801 .subscr = str_subscr,
1802 .getiter = mp_obj_new_bytes_iterator,
1803 .buffer_p = { .get_buffer = str_get_buffer },
1804 .locals_dict = (mp_obj_t)&str_locals_dict,
1805};
1806
1807// the zero-length bytes
1808STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, NULL};
1809const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj;
1810
1811mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
1812 mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
1813 o->base.type = type;
1814 o->len = len;
1815 o->hash = 0;
1816 byte *p = m_new(byte, len + 1);
1817 o->data = p;
1818 *data = p;
1819 return o;
1820}
1821
1822mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in) {
1823 mp_obj_str_t *o = o_in;
1824 o->hash = qstr_compute_hash(o->data, o->len);
1825 byte *p = (byte*)o->data;
1826 p[o->len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
1827 return o;
1828}
1829
1830mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uint len) {
1831 mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
1832 o->base.type = type;
1833 o->len = len;
1834 if (data) {
1835 o->hash = qstr_compute_hash(data, len);
1836 byte *p = m_new(byte, len + 1);
1837 o->data = p;
1838 memcpy(p, data, len * sizeof(byte));
1839 p[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
1840 }
1841 return o;
1842}
1843
1844mp_obj_t mp_obj_new_str(const char* data, uint len, bool make_qstr_if_not_already) {
1845 if (make_qstr_if_not_already) {
1846 // use existing, or make a new qstr
1847 return MP_OBJ_NEW_QSTR(qstr_from_strn(data, len));
1848 } else {
1849 qstr q = qstr_find_strn(data, len);
1850 if (q != MP_QSTR_NULL) {
1851 // qstr with this data already exists
1852 return MP_OBJ_NEW_QSTR(q);
1853 } else {
1854 // no existing qstr, don't make one
1855 return mp_obj_new_str_of_type(&mp_type_str, (const byte*)data, len);
1856 }
1857 }
1858}
1859
1860mp_obj_t mp_obj_str_intern(mp_obj_t str) {
1861 GET_STR_DATA_LEN(str, data, len);
1862 return MP_OBJ_NEW_QSTR(qstr_from_strn((const char*)data, len));
1863}
1864
1865mp_obj_t mp_obj_new_bytes(const byte* data, uint len) {
1866 return mp_obj_new_str_of_type(&mp_type_bytes, data, len);
1867}
1868
1869bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
1870 if (MP_OBJ_IS_QSTR(s1) && MP_OBJ_IS_QSTR(s2)) {
1871 return s1 == s2;
1872 } else {
1873 GET_STR_HASH(s1, h1);
1874 GET_STR_HASH(s2, h2);
1875 // If any of hashes is 0, it means it's not valid
1876 if (h1 != 0 && h2 != 0 && h1 != h2) {
1877 return false;
1878 }
1879 GET_STR_DATA_LEN(s1, d1, l1);
1880 GET_STR_DATA_LEN(s2, d2, l2);
1881 if (l1 != l2) {
1882 return false;
1883 }
1884 return memcmp(d1, d2, l1) == 0;
1885 }
1886}
1887
1888STATIC void bad_implicit_conversion(mp_obj_t self_in) {
1889 nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(self_in)));
1890}
1891
1892STATIC void arg_type_mixup() {
1893 nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "Can't mix str and bytes arguments"));
1894}
1895
1896uint mp_obj_str_get_hash(mp_obj_t self_in) {
1897 // TODO: This has too big overhead for hash accessor
1898 if (MP_OBJ_IS_STR(self_in) || MP_OBJ_IS_TYPE(self_in, &mp_type_bytes)) {
1899 GET_STR_HASH(self_in, h);
1900 return h;
1901 } else {
1902 bad_implicit_conversion(self_in);
1903 }
1904}
1905
1906uint mp_obj_str_get_len(mp_obj_t self_in) {
1907 // TODO This has a double check for the type, one in obj.c and one here
1908 if (MP_OBJ_IS_STR(self_in) || MP_OBJ_IS_TYPE(self_in, &mp_type_bytes)) {
Chris Angelico64b468d2014-06-04 05:28:12 +10001909 GET_STR_DATA_LEN(self_in, self_data, self_len);
1910 return unichar_charlen((const char *)self_data, self_len);
Paul Sokolovsky83865342014-06-13 00:51:34 +03001911 } else {
1912 bad_implicit_conversion(self_in);
1913 }
1914}
1915
1916// use this if you will anyway convert the string to a qstr
1917// will be more efficient for the case where it's already a qstr
1918qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
1919 if (MP_OBJ_IS_QSTR(self_in)) {
1920 return MP_OBJ_QSTR_VALUE(self_in);
1921 } else if (MP_OBJ_IS_TYPE(self_in, &mp_type_str)) {
1922 mp_obj_str_t *self = self_in;
1923 return qstr_from_strn((char*)self->data, self->len);
1924 } else {
1925 bad_implicit_conversion(self_in);
1926 }
1927}
1928
1929// only use this function if you need the str data to be zero terminated
1930// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
1931const char *mp_obj_str_get_str(mp_obj_t self_in) {
1932 if (MP_OBJ_IS_STR(self_in)) {
1933 GET_STR_DATA_LEN(self_in, s, l);
1934 (void)l; // len unused
1935 return (const char*)s;
1936 } else {
1937 bad_implicit_conversion(self_in);
1938 }
1939}
1940
1941const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
1942 if (is_str_or_bytes(self_in)) {
1943 GET_STR_DATA_LEN(self_in, s, l);
1944 *len = l;
1945 return (const char*)s;
1946 } else {
1947 bad_implicit_conversion(self_in);
1948 }
1949}
1950
1951/******************************************************************************/
1952/* str iterator */
1953
1954typedef struct _mp_obj_str_it_t {
1955 mp_obj_base_t base;
1956 mp_obj_t str;
1957 machine_uint_t cur;
1958} mp_obj_str_it_t;
1959
1960STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
1961 mp_obj_str_it_t *self = self_in;
1962 GET_STR_DATA_LEN(self->str, str, len);
1963 if (self->cur < len) {
1964 mp_obj_t o_out = mp_obj_new_str((const char*)str + self->cur, 1, true);
1965 self->cur += 1;
1966 return o_out;
1967 } else {
1968 return MP_OBJ_STOP_ITERATION;
1969 }
1970}
1971
1972STATIC const mp_obj_type_t mp_type_str_it = {
1973 { &mp_type_type },
1974 .name = MP_QSTR_iterator,
1975 .getiter = mp_identity,
1976 .iternext = str_it_iternext,
1977};
1978
1979STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
1980 mp_obj_str_it_t *self = self_in;
1981 GET_STR_DATA_LEN(self->str, str, len);
1982 if (self->cur < len) {
1983 mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT((mp_small_int_t)str[self->cur]);
1984 self->cur += 1;
1985 return o_out;
1986 } else {
1987 return MP_OBJ_STOP_ITERATION;
1988 }
1989}
1990
1991STATIC const mp_obj_type_t mp_type_bytes_it = {
1992 { &mp_type_type },
1993 .name = MP_QSTR_iterator,
1994 .getiter = mp_identity,
1995 .iternext = bytes_it_iternext,
1996};
1997
1998mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) {
1999 mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
2000 o->base.type = &mp_type_str_it;
2001 o->str = str;
2002 o->cur = 0;
2003 return o;
2004}
2005
2006mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) {
2007 mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t);
2008 o->base.type = &mp_type_bytes_it;
2009 o->str = str;
2010 o->cur = 0;
2011 return o;
2012}