py: Optimise storage of iterator so it takes only 4 slots on Py stack.
diff --git a/py/compile.c b/py/compile.c
index 4fde278..5ea7bb4 100644
--- a/py/compile.c
+++ b/py/compile.c
@@ -2887,7 +2887,7 @@
             EMIT(yield_value);
             EMIT(pop_top);
         } else {
-            EMIT_ARG(store_comp, comp->scope_cur->kind, 5 * for_depth + 6);
+            EMIT_ARG(store_comp, comp->scope_cur->kind, 4 * for_depth + 5);
         }
     } else if (MP_PARSE_NODE_IS_STRUCT_KIND(pn_iter, PN_comp_if)) {
         // if condition
@@ -3070,13 +3070,13 @@
         #endif
         }
 
-        // dummy 4 objects
+        // There are 4 slots on the stack for the iterator, and the first one is
+        // NULL to indicate that the second one points to the iterator object.
         EMIT(load_null);
-        EMIT(load_null);
+        compile_load_id(comp, qstr_arg);
         EMIT(load_null);
         EMIT(load_null);
 
-        compile_load_id(comp, qstr_arg);
         compile_scope_comp_iter(comp, pns_comp_for, pns->nodes[0], 0);
 
         if (scope->kind == SCOPE_GEN_EXPR) {
diff --git a/py/emitbc.c b/py/emitbc.c
index 0d7e651..caa6761 100644
--- a/py/emitbc.c
+++ b/py/emitbc.c
@@ -735,7 +735,7 @@
             // need to pop the iterator if we are breaking out of a for loop
             emit_write_bytecode_byte(emit, MP_BC_POP_TOP);
             // also pop the iter_buf
-            for (size_t i = 0; i < sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t); ++i) {
+            for (size_t i = 0; i < sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t) - 1; ++i) {
                 emit_write_bytecode_byte(emit, MP_BC_POP_TOP);
             }
         }
@@ -778,7 +778,7 @@
 }
 
 void mp_emit_bc_get_iter(emit_t *emit, bool use_stack) {
-    emit_bc_pre(emit, use_stack ? sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t) : 0);
+    emit_bc_pre(emit, use_stack ? sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t) - 1 : 0); // presumably the stack delta: one less than the iter_buf slot count when use_stack, else 0
     emit_write_bytecode_byte(emit, use_stack ? MP_BC_GET_ITER_STACK : MP_BC_GET_ITER);
 }
 
@@ -788,7 +788,7 @@
 }
 
 void mp_emit_bc_for_iter_end(emit_t *emit, bool use_stack) {
-    emit_bc_pre(emit, use_stack ? -1 - sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t) : -1);
+    emit_bc_pre(emit, -(use_stack ? sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t) : 1)); // magnitude is the iter_buf slot count if use_stack, else 1; NOTE(review): the negated operand is a size_t -- confirm emit_bc_pre takes a signed type of equal width
 }
 
 void mp_emit_bc_pop_block(emit_t *emit) {
diff --git a/py/emitnative.c b/py/emitnative.c
index 6e8a37b..ffc4d91 100644
--- a/py/emitnative.c
+++ b/py/emitnative.c
@@ -105,8 +105,8 @@
     [MP_F_NATIVE_CALL_FUNCTION_N_KW] = 3,
     [MP_F_CALL_METHOD_N_KW] = 3,
     [MP_F_CALL_METHOD_N_KW_VAR] = 3,
-    [MP_F_GETITER] = 1,
-    [MP_F_ITERNEXT] = 1,
+    [MP_F_NATIVE_GETITER] = 2,
+    [MP_F_NATIVE_ITERNEXT] = 1,
     [MP_F_NLR_PUSH] = 1,
     [MP_F_NLR_POP] = 0,
     [MP_F_NATIVE_RAISE] = 1,
@@ -1808,20 +1808,20 @@
     assert(vtype == VTYPE_PYOBJ);
     if (use_stack) {
         emit_get_stack_pointer_to_reg_for_push(emit, REG_ARG_2, sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t));
+        emit_call(emit, MP_F_NATIVE_GETITER);
     } else {
         // mp_getiter will allocate the iter_buf on the heap
         ASM_MOV_IMM_TO_REG(emit->as, 0, REG_ARG_2);
+        emit_call(emit, MP_F_NATIVE_GETITER);
+        emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
     }
-    emit_call(emit, MP_F_GETITER);
-    emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
 }
 
 STATIC void emit_native_for_iter(emit_t *emit, mp_uint_t label) {
     emit_native_pre(emit);
-    vtype_kind_t vtype;
-    emit_access_stack(emit, 1, &vtype, REG_ARG_1);
-    assert(vtype == VTYPE_PYOBJ);
-    emit_call(emit, MP_F_ITERNEXT);
+    emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_1, sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t));
+    adjust_stack(emit, 4);
+    emit_call(emit, MP_F_NATIVE_ITERNEXT);
     ASM_MOV_IMM_TO_REG(emit->as, (mp_uint_t)MP_OBJ_STOP_ITERATION, REG_TEMP1);
     ASM_JUMP_IF_REG_EQ(emit->as, REG_RET, REG_TEMP1, label);
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
@@ -1830,10 +1830,7 @@
 STATIC void emit_native_for_iter_end(emit_t *emit, bool use_stack) {
     // adjust stack counter (we get here from for_iter ending, which popped the value for us)
     emit_native_pre(emit);
-    adjust_stack(emit, -1);
-    if (use_stack) {
-        adjust_stack(emit, -(sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t)));
-    }
+    adjust_stack(emit, -(use_stack ? sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t) : 1)); // drop every iter_buf slot if use_stack, otherwise just the single iterator slot
     emit_post(emit);
 }
 
diff --git a/py/nativeglue.c b/py/nativeglue.c
index 5db6308..694dfca 100644
--- a/py/nativeglue.c
+++ b/py/nativeglue.c
@@ -98,6 +98,32 @@
     }
 }
 
+// Wrapper around mp_getiter for native code.  With iter == NULL the iterator is returned directly;
+// otherwise the iterator is recorded in *iter (as described below) and MP_OBJ_NULL is returned.
+STATIC mp_obj_t mp_native_getiter(mp_obj_t obj, mp_obj_iter_buf_t *iter) {
+    if (iter == NULL) {
+        return mp_getiter(obj, NULL);
+    }
+    obj = mp_getiter(obj, iter);
+    if (obj != MP_OBJ_FROM_PTR(iter)) {
+        // Iterator didn't use the buffer: mark that with MP_OBJ_NULL and keep a reference in buf[0].
+        iter->base.type = MP_OBJ_NULL;
+        iter->buf[0] = obj;
+    }
+    return MP_OBJ_NULL; // the return type is mp_obj_t, so use the object-typed null rather than NULL
+}
+
+// Wrapper around mp_iternext for native code: picks the iterator object out
+// of the iterator buffer and advances it.
+STATIC mp_obj_t mp_native_iternext(mp_obj_iter_buf_t *iter) {
+    // A buffer whose type field is MP_OBJ_NULL only references the iterator through buf[0].
+    if (iter->base.type == MP_OBJ_NULL) {
+        return mp_iternext(iter->buf[0]);
+    }
+    // Otherwise the buffer itself is the iterator object.
+    return mp_iternext(MP_OBJ_FROM_PTR(iter));
+}
+
 // these must correspond to the respective enum in runtime0.h
 void *const mp_fun_table[MP_F_NUMBER_OF] = {
     mp_convert_obj_to_native,
@@ -127,8 +153,8 @@
     mp_native_call_function_n_kw,
     mp_call_method_n_kw,
     mp_call_method_n_kw_var,
-    mp_getiter,
-    mp_iternext,
+    mp_native_getiter,
+    mp_native_iternext,
     nlr_push,
     nlr_pop,
     mp_native_raise,
diff --git a/py/runtime0.h b/py/runtime0.h
index 8d62403..b1ed710 100644
--- a/py/runtime0.h
+++ b/py/runtime0.h
@@ -127,8 +127,8 @@
     MP_F_NATIVE_CALL_FUNCTION_N_KW,
     MP_F_CALL_METHOD_N_KW,
     MP_F_CALL_METHOD_N_KW_VAR,
-    MP_F_GETITER,
-    MP_F_ITERNEXT,
+    MP_F_NATIVE_GETITER,
+    MP_F_NATIVE_ITERNEXT,
     MP_F_NLR_PUSH,
     MP_F_NLR_POP,
     MP_F_NATIVE_RAISE,
diff --git a/py/vm.c b/py/vm.c
index f9eb269..7a906cd 100644
--- a/py/vm.c
+++ b/py/vm.c
@@ -728,12 +728,20 @@
                     SET_TOP(mp_getiter(TOP(), NULL));
                     DISPATCH();
 
+                // An iterator for a for-loop takes 4 slots on the stack.  They are either
+                // used to store the iterator object itself, or the first slot is NULL and
+                // the second slot holds a reference to the iterator object.
                 ENTRY(MP_BC_GET_ITER_STACK): {
                     MARK_EXC_IP_SELECTIVE();
                     mp_obj_t obj = TOP();
                     mp_obj_iter_buf_t *iter_buf = (mp_obj_iter_buf_t*)sp;
-                    sp += sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t);
-                    SET_TOP(mp_getiter(obj, iter_buf));
+                    sp += sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t) - 1; // iter_buf starts at the old top slot, so grow by one less than its slot count
+                    obj = mp_getiter(obj, iter_buf);
+                    if (obj != MP_OBJ_FROM_PTR(iter_buf)) {
+                        // Iterator didn't use the stack so indicate that with MP_OBJ_NULL.
+                        sp[-3] = MP_OBJ_NULL; // NOTE(review): the -3/-2 offsets assume iter_buf spans exactly 4 slots -- confirm
+                        sp[-2] = obj; // reference to the iterator object that lives outside iter_buf
+                    }
                     DISPATCH();
                 }
 
@@ -741,10 +749,15 @@
                     MARK_EXC_IP_SELECTIVE();
                     DECODE_ULABEL; // the jump offset if iteration finishes; for labels are always forward
                     code_state->sp = sp;
-                    assert(TOP());
-                    mp_obj_t value = mp_iternext_allow_raise(TOP());
+                    mp_obj_t obj;
+                    if (sp[-3] == MP_OBJ_NULL) {
+                        obj = sp[-2];
+                    } else {
+                        obj = MP_OBJ_FROM_PTR(&sp[-3]);
+                    }
+                    mp_obj_t value = mp_iternext_allow_raise(obj);
                     if (value == MP_OBJ_STOP_ITERATION) {
-                        sp -= 5; // pop the exhausted iterator
+                        sp -= 4; // pop the exhausted iterator
                         ip += ulab; // jump to after for-block
                     } else {
                         PUSH(value); // push the next iteration value
@@ -1294,7 +1307,7 @@
                         const byte *ip = code_state->ip + 1;
                         DECODE_ULABEL; // the jump offset if iteration finishes; for labels are always forward
                         code_state->ip = ip + ulab; // jump to after for-block
-                        code_state->sp -= 5; // pop the exhausted iterator
+                        code_state->sp -= 4; // pop the exhausted iterator
                         goto outer_dispatch_loop; // continue with dispatch loop
                     } else if (*code_state->ip == MP_BC_YIELD_FROM) {
                         // StopIteration inside yield from call means return a value of