py: Improve memory usage debugging; better GC AT dumping.

In the unix port, mem_info(1) now pretty-prints the GC allocation table (AT).
diff --git a/py/gc.c b/py/gc.c
index 8e71307..ce2fa3a 100644
--- a/py/gc.c
+++ b/py/gc.c
@@ -492,7 +492,7 @@
     }
 }
 
-mp_uint_t gc_nbytes(void *ptr_in) {
+mp_uint_t gc_nbytes(const void *ptr_in) {
     mp_uint_t ptr = (mp_uint_t)ptr_in;
 
     if (VERIFY_PTR(ptr)) {
@@ -681,31 +681,32 @@
     for (mp_uint_t bl = 0; bl < gc_alloc_table_byte_len * BLOCKS_PER_ATB; bl++) {
         if (bl % DUMP_BYTES_PER_LINE == 0) {
             // a new line of blocks
-            #if EXTENSIVE_HEAP_PROFILING
             {
                 // check if this line contains only free blocks
-                bool only_free_blocks = true;
-                for (mp_uint_t bl2 = bl; bl2 < gc_alloc_table_byte_len * BLOCKS_PER_ATB && bl2 < bl + DUMP_BYTES_PER_LINE; bl2++) {
-                    if (ATB_GET_KIND(bl2) != AT_FREE) {
-
-                        only_free_blocks = false;
+                mp_uint_t bl2 = bl;
+                while (bl2 < gc_alloc_table_byte_len * BLOCKS_PER_ATB && ATB_GET_KIND(bl2) == AT_FREE) {
+                    bl2++;
+                }
+                if (bl2 - bl >= 2 * DUMP_BYTES_PER_LINE) {
+                    // there are at least 2 lines containing only free blocks, so abbreviate their printing
+                    printf("\n       (" UINT_FMT " lines all free)", (bl2 - bl) / DUMP_BYTES_PER_LINE);
+                    bl = bl2 & (~(DUMP_BYTES_PER_LINE - 1));
+                    if (bl >= gc_alloc_table_byte_len * BLOCKS_PER_ATB) {
+                        // got to end of heap
                         break;
                     }
                 }
-                if (only_free_blocks) {
-                    // line contains only free blocks, so skip printing it
-                    bl += DUMP_BYTES_PER_LINE - 1;
-                    continue;
-                }
             }
-            #endif
             // print header for new line of blocks
-            printf("\n%04x: ", (uint)bl);
+            #if EXTENSIVE_HEAP_PROFILING
+            printf("\n%05x: ", (uint)(bl * BYTES_PER_BLOCK) & 0xfffff);
+            #else
+            printf("\n%05x: ", (uint)PTR_FROM_BLOCK(bl) & 0xfffff);
+            #endif
         }
         int c = ' ';
         switch (ATB_GET_KIND(bl)) {
             case AT_FREE: c = '.'; break;
-            case AT_HEAD: c = 'h'; break;
             /* this prints out if the object is reachable from BSS or STACK (for unix only)
             case AT_HEAD: {
                 extern char __bss_start, _end;
@@ -734,7 +735,7 @@
                 break;
             }
             */
-            /* this prints the uPy object type of the head block
+            /* this prints the uPy object type of the head block */
             case AT_HEAD: {
                 mp_uint_t *ptr = gc_pool_start + bl * WORDS_PER_BLOCK;
                 if (*ptr == (mp_uint_t)&mp_type_tuple) { c = 'T'; }
@@ -742,10 +743,10 @@
                 else if (*ptr == (mp_uint_t)&mp_type_dict) { c = 'D'; }
                 else if (*ptr == (mp_uint_t)&mp_type_float) { c = 'F'; }
                 else if (*ptr == (mp_uint_t)&mp_type_fun_bc) { c = 'B'; }
+                else if (*ptr == (mp_uint_t)&mp_type_module) { c = 'M'; }
                 else { c = 'h'; }
                 break;
             }
-            */
             case AT_TAIL: c = 't'; break;
             case AT_MARK: c = 'm'; break;
         }
diff --git a/py/gc.h b/py/gc.h
index fd48d26..dc276dd 100644
--- a/py/gc.h
+++ b/py/gc.h
@@ -40,7 +40,7 @@
 
 void *gc_alloc(mp_uint_t n_bytes, bool has_finaliser);
 void gc_free(void *ptr);
-mp_uint_t gc_nbytes(void *ptr);
+mp_uint_t gc_nbytes(const void *ptr);
 void *gc_realloc(void *ptr, mp_uint_t n_bytes);
 
 typedef struct _gc_info_t {
diff --git a/py/qstr.c b/py/qstr.c
index f841f1d..c2cfda8 100644
--- a/py/qstr.c
+++ b/py/qstr.c
@@ -30,6 +30,7 @@
 #include "mpconfig.h"
 #include "misc.h"
 #include "qstr.h"
+#include "gc.h"
 
 // NOTE: we are using linear arrays to store and search for qstr's (unique strings, interned strings)
 // ultimately we will replace this with a static hash table of some kind
@@ -220,9 +221,17 @@
         *n_pool += 1;
         *n_qstr += pool->len;
         for (const byte **q = pool->qstrs, **q_top = pool->qstrs + pool->len; q < q_top; q++) {
+            #if MICROPY_ENABLE_GC
+            *n_str_data_bytes += gc_nbytes(*q); // this counts actual bytes used in heap
+            #else
             *n_str_data_bytes += Q_GET_ALLOC(*q);
+            #endif
         }
+        #if MICROPY_ENABLE_GC
+        *n_total_bytes += gc_nbytes(pool); // this counts actual bytes used in heap
+        #else
         *n_total_bytes += sizeof(qstr_pool_t) + sizeof(qstr) * pool->alloc;
+        #endif
     }
     *n_total_bytes += *n_str_data_bytes;
 }