py: Improve allocation policy of qstr data.

Previous to this patch all interned strings lived in their own malloc'd
chunk.  On average this wastes N/2 bytes per interned string, where N is
the number-of-bytes for a quanta of the memory allocator (16 bytes on 32
bit archs).

With this patch interned strings are concatenated into the same malloc'd
chunk when possible.  Such chunks are enlarged inplace when possible,
and shrunk to fit when a new chunk is needed.

RAM savings with this patch are highly varied, but should always show an
improvement (unless only 3 or 4 strings are interned).  New version
typically uses about 70% of previous memory for the qstr data, and can
lead to savings of around 10% of total memory footprint of a running
script.

Costs about 120 bytes code size on Thumb2 archs (depends on how many
calls to gc_realloc are made).
diff --git a/py/gc.c b/py/gc.c
index 6cdafc0..7283e5f 100644
--- a/py/gc.c
+++ b/py/gc.c
@@ -522,7 +522,7 @@
 
 #else // Alternative gc_realloc impl
 
-void *gc_realloc(void *ptr_in, mp_uint_t n_bytes) {
+void *gc_realloc(void *ptr_in, mp_uint_t n_bytes, bool allow_move) {
     if (MP_STATE_MEM(gc_lock_depth) > 0) {
         return NULL;
     }
@@ -624,6 +624,11 @@
         return ptr_in;
     }
 
+    if (!allow_move) {
+        // not allowed to move memory block so return failure
+        return NULL;
+    }
+
     // can't resize inplace; try to find a new contiguous chain
     void *ptr_out = gc_alloc(n_bytes,
 #if MICROPY_ENABLE_FINALISER
diff --git a/py/gc.h b/py/gc.h
index bb7e2d4..c61892f 100644
--- a/py/gc.h
+++ b/py/gc.h
@@ -48,7 +48,7 @@
 void *gc_alloc(mp_uint_t n_bytes, bool has_finaliser);
 void gc_free(void *ptr);
 mp_uint_t gc_nbytes(const void *ptr);
-void *gc_realloc(void *ptr, mp_uint_t n_bytes);
+void *gc_realloc(void *ptr, mp_uint_t n_bytes, bool allow_move);
 
 typedef struct _gc_info_t {
     mp_uint_t total;
diff --git a/py/malloc.c b/py/malloc.c
index f8728c5..0aecbd7 100644
--- a/py/malloc.c
+++ b/py/malloc.c
@@ -56,7 +56,10 @@
 #define malloc(b) gc_alloc((b), false)
 #define malloc_with_finaliser(b) gc_alloc((b), true)
 #define free gc_free
-#define realloc gc_realloc
+#define realloc(ptr, n) gc_realloc(ptr, n, true)
+#define realloc_ext(ptr, n, mv) gc_realloc(ptr, n, mv)
+#else
+#define realloc_ext(ptr, n, mv) realloc(ptr, n)
 #endif // MICROPY_ENABLE_GC
 
 void *m_malloc(size_t num_bytes) {
@@ -134,11 +137,11 @@
 }
 
 #if MICROPY_MALLOC_USES_ALLOCATED_SIZE
-void *m_realloc_maybe(void *ptr, size_t old_num_bytes, size_t new_num_bytes) {
+void *m_realloc_maybe(void *ptr, size_t old_num_bytes, size_t new_num_bytes, bool allow_move) {
 #else
-void *m_realloc_maybe(void *ptr, size_t new_num_bytes) {
+void *m_realloc_maybe(void *ptr, size_t new_num_bytes, bool allow_move) {
 #endif
-    void *new_ptr = realloc(ptr, new_num_bytes);
+    void *new_ptr = realloc_ext(ptr, new_num_bytes, allow_move);
 #if MICROPY_MEM_STATS
     // At first thought, "Total bytes allocated" should only grow,
     // after all, it's *total*. But consider for example 2K block
diff --git a/py/misc.h b/py/misc.h
index 91bd000..1a0c085 100644
--- a/py/misc.h
+++ b/py/misc.h
@@ -63,12 +63,12 @@
 #endif
 #if MICROPY_MALLOC_USES_ALLOCATED_SIZE
 #define m_renew(type, ptr, old_num, new_num) ((type*)(m_realloc((ptr), sizeof(type) * (old_num), sizeof(type) * (new_num))))
-#define m_renew_maybe(type, ptr, old_num, new_num) ((type*)(m_realloc_maybe((ptr), sizeof(type) * (old_num), sizeof(type) * (new_num))))
+#define m_renew_maybe(type, ptr, old_num, new_num, allow_move) ((type*)(m_realloc_maybe((ptr), sizeof(type) * (old_num), sizeof(type) * (new_num), (allow_move))))
 #define m_del(type, ptr, num) m_free(ptr, sizeof(type) * (num))
 #define m_del_var(obj_type, var_type, var_num, ptr) (m_free(ptr, sizeof(obj_type) + sizeof(var_type) * (var_num)))
 #else
 #define m_renew(type, ptr, old_num, new_num) ((type*)(m_realloc((ptr), sizeof(type) * (new_num))))
-#define m_renew_maybe(type, ptr, old_num, new_num) ((type*)(m_realloc_maybe((ptr), sizeof(type) * (new_num))))
+#define m_renew_maybe(type, ptr, old_num, new_num, allow_move) ((type*)(m_realloc_maybe((ptr), sizeof(type) * (new_num), (allow_move))))
 #define m_del(type, ptr, num) ((void)(num), m_free(ptr))
 #define m_del_var(obj_type, var_type, var_num, ptr) ((void)(var_num), m_free(ptr))
 #endif
@@ -80,11 +80,11 @@
 void *m_malloc0(size_t num_bytes);
 #if MICROPY_MALLOC_USES_ALLOCATED_SIZE
 void *m_realloc(void *ptr, size_t old_num_bytes, size_t new_num_bytes);
-void *m_realloc_maybe(void *ptr, size_t old_num_bytes, size_t new_num_bytes);
+void *m_realloc_maybe(void *ptr, size_t old_num_bytes, size_t new_num_bytes, bool allow_move);
 void m_free(void *ptr, size_t num_bytes);
 #else
 void *m_realloc(void *ptr, size_t new_num_bytes);
-void *m_realloc_maybe(void *ptr, size_t new_num_bytes);
+void *m_realloc_maybe(void *ptr, size_t new_num_bytes, bool allow_move);
 void m_free(void *ptr);
 #endif
 void *m_malloc_fail(size_t num_bytes);
diff --git a/py/mpconfig.h b/py/mpconfig.h
index e92e4c4..0b86ec4 100644
--- a/py/mpconfig.h
+++ b/py/mpconfig.h
@@ -75,6 +75,14 @@
 #define MICROPY_ALLOC_GC_STACK_SIZE (64)
 #endif
 
+// Number of bytes to allocate initially when creating new chunks to store
+// interned string data.  Smaller numbers lead to more chunks being needed
+// and more wastage at the end of the chunk.  Larger numbers lead to wasted
+// space at the end when no more strings need interning.
+#ifndef MICROPY_ALLOC_QSTR_CHUNK_INIT
+#define MICROPY_ALLOC_QSTR_CHUNK_INIT (128)
+#endif
+
 // Initial amount for lexer indentation level
 #ifndef MICROPY_ALLOC_LEXER_INDENT_INIT
 #define MICROPY_ALLOC_LEXER_INDENT_INIT (10)
diff --git a/py/mpstate.h b/py/mpstate.h
index 42593e4..dd185a7 100644
--- a/py/mpstate.h
+++ b/py/mpstate.h
@@ -131,6 +131,12 @@
     // END ROOT POINTER SECTION
     ////////////////////////////////////////////////////////////
 
+    // pointer and sizes to store interned string data
+    // (qstr_last_chunk can be root pointer but is also stored in qstr pool)
+    byte *qstr_last_chunk;
+    mp_uint_t qstr_last_alloc;
+    mp_uint_t qstr_last_used;
+
     // Stack top at the start of program
     // Note: this entry is used to locate the end of the root pointer section.
     char *stack_top;
diff --git a/py/objexcept.c b/py/objexcept.c
index 75369ba..97536c6 100644
--- a/py/objexcept.c
+++ b/py/objexcept.c
@@ -444,7 +444,7 @@
         self->traceback_len = 0;
     } else if (self->traceback_len + 3 > self->traceback_alloc) {
         // be conservative with growing traceback data
-        mp_uint_t *tb_data = m_renew_maybe(mp_uint_t, self->traceback_data, self->traceback_alloc, self->traceback_alloc + 3);
+        mp_uint_t *tb_data = m_renew_maybe(mp_uint_t, self->traceback_data, self->traceback_alloc, self->traceback_alloc + 3, true);
         if (tb_data == NULL) {
             return;
         }
diff --git a/py/parse.c b/py/parse.c
index 35f5f82..c00c590 100644
--- a/py/parse.c
+++ b/py/parse.c
@@ -135,7 +135,7 @@
         return;
     }
     if (parser->rule_stack_top >= parser->rule_stack_alloc) {
-        rule_stack_t *rs = m_renew_maybe(rule_stack_t, parser->rule_stack, parser->rule_stack_alloc, parser->rule_stack_alloc + MICROPY_ALLOC_PARSE_RULE_INC);
+        rule_stack_t *rs = m_renew_maybe(rule_stack_t, parser->rule_stack, parser->rule_stack_alloc, parser->rule_stack_alloc + MICROPY_ALLOC_PARSE_RULE_INC, true);
         if (rs == NULL) {
             memory_error(parser);
             return;
@@ -293,7 +293,7 @@
         return;
     }
     if (parser->result_stack_top >= parser->result_stack_alloc) {
-        mp_parse_node_t *stack = m_renew_maybe(mp_parse_node_t, parser->result_stack, parser->result_stack_alloc, parser->result_stack_alloc + MICROPY_ALLOC_PARSE_RESULT_INC);
+        mp_parse_node_t *stack = m_renew_maybe(mp_parse_node_t, parser->result_stack, parser->result_stack_alloc, parser->result_stack_alloc + MICROPY_ALLOC_PARSE_RESULT_INC, true);
         if (stack == NULL) {
             memory_error(parser);
             return;
diff --git a/py/qstr.c b/py/qstr.c
index b1ae6ac..dd04ae8 100644
--- a/py/qstr.c
+++ b/py/qstr.c
@@ -92,6 +92,7 @@
 
 void qstr_init(void) {
     MP_STATE_VM(last_pool) = (qstr_pool_t*)&const_pool; // we won't modify the const_pool since it has no allocated room left
+    MP_STATE_VM(qstr_last_chunk) = NULL;
 }
 
 STATIC const byte *find_qstr(qstr q) {
@@ -152,8 +153,46 @@
     assert(len < (1 << (8 * MICROPY_QSTR_BYTES_IN_LEN)));
     qstr q = qstr_find_strn(str, len);
     if (q == 0) {
+        // qstr does not exist in interned pool so need to add it
+
+        // compute number of bytes needed to intern this string
+        mp_uint_t n_bytes = 2 + MICROPY_QSTR_BYTES_IN_LEN + len + 1;
+
+        if (MP_STATE_VM(qstr_last_chunk) != NULL && MP_STATE_VM(qstr_last_used) + n_bytes > MP_STATE_VM(qstr_last_alloc)) {
+            // not enough room at end of previously interned string so try to grow
+            byte *new_p = m_renew_maybe(byte, MP_STATE_VM(qstr_last_chunk), MP_STATE_VM(qstr_last_alloc), MP_STATE_VM(qstr_last_alloc) + n_bytes, false);
+            if (new_p == NULL) {
+                // could not grow existing memory; shrink it to fit previous
+                (void)m_renew(byte, MP_STATE_VM(qstr_last_chunk), MP_STATE_VM(qstr_last_alloc), MP_STATE_VM(qstr_last_used));
+                MP_STATE_VM(qstr_last_chunk) = NULL;
+            } else {
+                // could grow existing memory
+                MP_STATE_VM(qstr_last_alloc) += n_bytes;
+            }
+        }
+
+        if (MP_STATE_VM(qstr_last_chunk) == NULL) {
+            // no existing memory for the interned string so allocate a new chunk
+            mp_uint_t al = n_bytes;
+            if (al < MICROPY_ALLOC_QSTR_CHUNK_INIT) {
+                al = MICROPY_ALLOC_QSTR_CHUNK_INIT;
+            }
+            MP_STATE_VM(qstr_last_chunk) = m_new_maybe(byte, al);
+            if (MP_STATE_VM(qstr_last_chunk) == NULL) {
+                // failed to allocate a large chunk so try with exact size
+                MP_STATE_VM(qstr_last_chunk) = m_new(byte, n_bytes);
+                al = n_bytes;
+            }
+            MP_STATE_VM(qstr_last_alloc) = al;
+            MP_STATE_VM(qstr_last_used) = 0;
+        }
+
+        // allocate memory from the chunk for this new interned string's data
+        byte *q_ptr = MP_STATE_VM(qstr_last_chunk) + MP_STATE_VM(qstr_last_used);
+        MP_STATE_VM(qstr_last_used) += n_bytes;
+
+        // store the interned strings' data
         mp_uint_t hash = qstr_compute_hash((const byte*)str, len);
-        byte *q_ptr = m_new(byte, 2 + MICROPY_QSTR_BYTES_IN_LEN + len + 1);
         q_ptr[0] = hash;
         q_ptr[1] = hash >> 8;
         Q_SET_LENGTH(q_ptr, len);