py: Allow bytecode/native to put iter_buf on stack for simple for loops.

With this change, the "for x in it: ..." statement can run without using
the heap (so long as the iterator argument fits in an iter_buf structure).
diff --git a/py/bc0.h b/py/bc0.h
index 5ff9e50..c2b019f 100644
--- a/py/bc0.h
+++ b/py/bc0.h
@@ -79,6 +79,7 @@
 #define MP_BC_POP_BLOCK          (0x44)
 #define MP_BC_POP_EXCEPT         (0x45)
 #define MP_BC_UNWIND_JUMP        (0x46) // rel byte code offset, 16-bit signed, in excess; then a byte
+#define MP_BC_GET_ITER_STACK     (0x47)
 
 #define MP_BC_BUILD_TUPLE        (0x50) // uint
 #define MP_BC_BUILD_LIST         (0x51) // uint
diff --git a/py/compile.c b/py/compile.c
index b84793d..d2fa03f 100644
--- a/py/compile.c
+++ b/py/compile.c
@@ -1475,7 +1475,7 @@
     uint pop_label = comp_next_label(comp);
 
     compile_node(comp, pns->nodes[1]); // iterator
-    EMIT(get_iter);
+    EMIT_ARG(get_iter, true);
     EMIT_ARG(label_assign, continue_label);
     EMIT_ARG(for_iter, pop_label);
     c_assign(comp, pns->nodes[0], ASSIGN_STORE); // variable
@@ -1484,7 +1484,7 @@
         EMIT_ARG(jump, continue_label);
     }
     EMIT_ARG(label_assign, pop_label);
-    EMIT(for_iter_end);
+    EMIT_ARG(for_iter_end, true);
 
     // break/continue apply to outer loop (if any) in the else block
     END_BREAK_CONTINUE_BLOCK
@@ -1680,7 +1680,7 @@
 }
 
 STATIC void compile_yield_from(compiler_t *comp) {
-    EMIT(get_iter);
+    EMIT_ARG(get_iter, false);
     EMIT_ARG(load_const_tok, MP_TOKEN_KW_NONE);
     EMIT(yield_from);
 }
@@ -2372,7 +2372,7 @@
     close_over_variables_etc(comp, this_scope, 0, 0);
 
     compile_node(comp, pns_comp_for->nodes[1]); // source of the iterator
-    EMIT(get_iter);
+    EMIT_ARG(get_iter, false);
     EMIT_ARG(call_function, 1, 0, 0);
 }
 
@@ -2900,13 +2900,13 @@
         // for loop
         mp_parse_node_struct_t *pns_comp_for2 = (mp_parse_node_struct_t*)pn_iter;
         compile_node(comp, pns_comp_for2->nodes[1]);
-        EMIT(get_iter);
+        EMIT_ARG(get_iter, false);
         compile_scope_comp_iter(comp, pns_comp_for2, pn_inner_expr, for_depth + 1);
     }
 
     EMIT_ARG(jump, l_top);
     EMIT_ARG(label_assign, l_end);
-    EMIT(for_iter_end);
+    EMIT_ARG(for_iter_end, false);
 }
 
 STATIC void check_for_doc_string(compiler_t *comp, mp_parse_node_t pn) {
diff --git a/py/emit.h b/py/emit.h
index 7d00f11..6acae9e 100644
--- a/py/emit.h
+++ b/py/emit.h
@@ -110,9 +110,9 @@
     void (*setup_except)(emit_t *emit, mp_uint_t label);
     void (*setup_finally)(emit_t *emit, mp_uint_t label);
     void (*end_finally)(emit_t *emit);
-    void (*get_iter)(emit_t *emit);
+    void (*get_iter)(emit_t *emit, bool use_stack);
     void (*for_iter)(emit_t *emit, mp_uint_t label);
-    void (*for_iter_end)(emit_t *emit);
+    void (*for_iter_end)(emit_t *emit, bool use_stack);
     void (*pop_block)(emit_t *emit);
     void (*pop_except)(emit_t *emit);
     void (*unary_op)(emit_t *emit, mp_unary_op_t op);
@@ -228,9 +228,9 @@
 void mp_emit_bc_setup_except(emit_t *emit, mp_uint_t label);
 void mp_emit_bc_setup_finally(emit_t *emit, mp_uint_t label);
 void mp_emit_bc_end_finally(emit_t *emit);
-void mp_emit_bc_get_iter(emit_t *emit);
+void mp_emit_bc_get_iter(emit_t *emit, bool use_stack);
 void mp_emit_bc_for_iter(emit_t *emit, mp_uint_t label);
-void mp_emit_bc_for_iter_end(emit_t *emit);
+void mp_emit_bc_for_iter_end(emit_t *emit, bool use_stack);
 void mp_emit_bc_pop_block(emit_t *emit);
 void mp_emit_bc_pop_except(emit_t *emit);
 void mp_emit_bc_unary_op(emit_t *emit, mp_unary_op_t op);
diff --git a/py/emitbc.c b/py/emitbc.c
index e3a047f..3e0c0b3 100644
--- a/py/emitbc.c
+++ b/py/emitbc.c
@@ -734,6 +734,10 @@
         if (label & MP_EMIT_BREAK_FROM_FOR) {
             // need to pop the iterator if we are breaking out of a for loop
             emit_write_bytecode_byte(emit, MP_BC_POP_TOP);
+            // also pop the iter_buf
+            for (size_t i = 0; i < sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t); ++i) {
+                emit_write_bytecode_byte(emit, MP_BC_POP_TOP);
+            }
         }
         emit_write_bytecode_byte_signed_label(emit, MP_BC_JUMP, label & ~MP_EMIT_BREAK_FROM_FOR);
     } else {
@@ -773,9 +777,9 @@
     emit_write_bytecode_byte(emit, MP_BC_END_FINALLY);
 }
 
-void mp_emit_bc_get_iter(emit_t *emit) {
-    emit_bc_pre(emit, 0);
-    emit_write_bytecode_byte(emit, MP_BC_GET_ITER);
+void mp_emit_bc_get_iter(emit_t *emit, bool use_stack) {
+    emit_bc_pre(emit, use_stack ? sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t) : 0);
+    emit_write_bytecode_byte(emit, use_stack ? MP_BC_GET_ITER_STACK : MP_BC_GET_ITER);
 }
 
 void mp_emit_bc_for_iter(emit_t *emit, mp_uint_t label) {
@@ -783,8 +787,13 @@
     emit_write_bytecode_byte_unsigned_label(emit, MP_BC_FOR_ITER, label);
 }
 
-void mp_emit_bc_for_iter_end(emit_t *emit) {
+void mp_emit_bc_for_iter_end(emit_t *emit, bool use_stack) {
     emit_bc_pre(emit, -1);
+    if (use_stack) {
+        for (size_t i = 0; i < sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t); ++i) {
+            mp_emit_bc_pop_top(emit);
+        }
+    }
 }
 
 void mp_emit_bc_pop_block(emit_t *emit) {
diff --git a/py/emitnative.c b/py/emitnative.c
index e8e3754..6e8a37b 100644
--- a/py/emitnative.c
+++ b/py/emitnative.c
@@ -1799,14 +1799,19 @@
     emit_post(emit);
 }
 
-STATIC void emit_native_get_iter(emit_t *emit) {
+STATIC void emit_native_get_iter(emit_t *emit, bool use_stack) {
     // perhaps the difficult one, as we want to rewrite for loops using native code
     // in cases where we iterate over a Python object, can we use normal runtime calls?
 
     vtype_kind_t vtype;
     emit_pre_pop_reg(emit, &vtype, REG_ARG_1);
     assert(vtype == VTYPE_PYOBJ);
-    assert(0); // TODO allocate memory for iter_buf
+    if (use_stack) {
+        emit_get_stack_pointer_to_reg_for_push(emit, REG_ARG_2, sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t));
+    } else {
+        // mp_getiter will allocate the iter_buf on the heap
+        ASM_MOV_IMM_TO_REG(emit->as, 0, REG_ARG_2);
+    }
     emit_call(emit, MP_F_GETITER);
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
 }
@@ -1822,10 +1827,13 @@
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
 }
 
-STATIC void emit_native_for_iter_end(emit_t *emit) {
+STATIC void emit_native_for_iter_end(emit_t *emit, bool use_stack) {
     // adjust stack counter (we get here from for_iter ending, which popped the value for us)
     emit_native_pre(emit);
     adjust_stack(emit, -1);
+    if (use_stack) {
+        adjust_stack(emit, -(sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t)));
+    }
     emit_post(emit);
 }
 
diff --git a/py/showbc.c b/py/showbc.c
index 9d20dec..b52905f 100644
--- a/py/showbc.c
+++ b/py/showbc.c
@@ -387,6 +387,10 @@
             printf("GET_ITER");
             break;
 
+        case MP_BC_GET_ITER_STACK:
+            printf("GET_ITER_STACK");
+            break;
+
         case MP_BC_FOR_ITER:
             DECODE_ULABEL; // the jump offset if iteration finishes; for labels are always forward
             printf("FOR_ITER " UINT_FMT, (mp_uint_t)(ip + unum - mp_showbc_code_start));
diff --git a/py/vm.c b/py/vm.c
index 917d41a..848a77a 100644
--- a/py/vm.c
+++ b/py/vm.c
@@ -681,7 +681,9 @@
                     }
                     ip = (const byte*)MP_OBJ_TO_PTR(POP()); // pop destination ip for jump
                     if (unum != 0) {
+                        // pop iter and iter_buf
                         sp--;
+                        sp -= sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t);
                     }
                     DISPATCH_WITH_PEND_EXC_CHECK();
                 }
@@ -726,6 +728,15 @@
                     SET_TOP(mp_getiter(TOP(), NULL));
                     DISPATCH();
 
+                ENTRY(MP_BC_GET_ITER_STACK): {
+                    MARK_EXC_IP_SELECTIVE();
+                    mp_obj_t obj = TOP();
+                    mp_obj_iter_buf_t *iter_buf = (mp_obj_iter_buf_t*)sp;
+                    sp += sizeof(mp_obj_iter_buf_t) / sizeof(mp_obj_t);
+                    SET_TOP(mp_getiter(obj, iter_buf));
+                    DISPATCH();
+                }
+
                 ENTRY(MP_BC_FOR_ITER): {
                     MARK_EXC_IP_SELECTIVE();
                     DECODE_ULABEL; // the jump offset if iteration finishes; for labels are always forward
diff --git a/py/vmentrytable.h b/py/vmentrytable.h
index dd30dd7..8731c3d 100644
--- a/py/vmentrytable.h
+++ b/py/vmentrytable.h
@@ -73,6 +73,7 @@
     [MP_BC_SETUP_FINALLY] = &&entry_MP_BC_SETUP_FINALLY,
     [MP_BC_END_FINALLY] = &&entry_MP_BC_END_FINALLY,
     [MP_BC_GET_ITER] = &&entry_MP_BC_GET_ITER,
+    [MP_BC_GET_ITER_STACK] = &&entry_MP_BC_GET_ITER_STACK,
     [MP_BC_FOR_ITER] = &&entry_MP_BC_FOR_ITER,
     [MP_BC_POP_BLOCK] = &&entry_MP_BC_POP_BLOCK,
     [MP_BC_POP_EXCEPT] = &&entry_MP_BC_POP_EXCEPT,