py: Add casting to viper; add native mem stores to viper.

Viper can now do the following:

def store(p:ptr8, c:int):
    p[0] = c

This stores c to the memory pointed to by p using machine
instructions emitted inline in the code.
diff --git a/py/emitnative.c b/py/emitnative.c
index d58da7e..16f0a08 100644
--- a/py/emitnative.c
+++ b/py/emitnative.c
@@ -148,6 +148,10 @@
 #define ASM_ADD_REG_REG(as, reg_dest, reg_src) asm_x64_add_r64_r64((as), (reg_dest), (reg_src))
 #define ASM_SUB_REG_REG(as, reg_dest, reg_src) asm_x64_sub_r64_r64((as), (reg_dest), (reg_src))
 
+#define ASM_STORE_REG_REG(as, reg_src, reg_base) asm_x64_mov_r64_to_disp((as), (reg_src), (reg_base), 0)
+#define ASM_STORE8_REG_REG(as, reg_src, reg_base) asm_x64_mov_r8_to_disp((as), (reg_src), (reg_base), 0)
+#define ASM_STORE16_REG_REG(as, reg_src, reg_base) asm_x64_mov_r16_to_disp((as), (reg_src), (reg_base), 0)
+
 #elif N_X86
 
 // x86 specific stuff
@@ -269,6 +273,10 @@
 #define ASM_ADD_REG_REG(as, reg_dest, reg_src) asm_x86_add_r32_r32((as), (reg_dest), (reg_src))
 #define ASM_SUB_REG_REG(as, reg_dest, reg_src) asm_x86_sub_r32_r32((as), (reg_dest), (reg_src))
 
+#define ASM_STORE_REG_REG(as, reg_src, reg_base) asm_x86_mov_r32_to_disp((as), (reg_src), (reg_base), 0)
+#define ASM_STORE8_REG_REG(as, reg_src, reg_base) asm_x86_mov_r8_to_disp((as), (reg_src), (reg_base), 0)
+#define ASM_STORE16_REG_REG(as, reg_src, reg_base) asm_x86_mov_r16_to_disp((as), (reg_src), (reg_base), 0)
+
 #elif N_THUMB
 
 // thumb specific stuff
@@ -341,6 +349,10 @@
 #define ASM_ADD_REG_REG(as, reg_dest, reg_src) asm_thumb_add_rlo_rlo_rlo((as), (reg_dest), (reg_dest), (reg_src))
 #define ASM_SUB_REG_REG(as, reg_dest, reg_src) asm_thumb_sub_rlo_rlo_rlo((as), (reg_dest), (reg_dest), (reg_src))
 
+#define ASM_STORE_REG_REG(as, reg_src, reg_base) asm_thumb_str_rlo_rlo_i5((as), (reg_src), (reg_base), 0)
+#define ASM_STORE8_REG_REG(as, reg_src, reg_base) asm_thumb_strb_rlo_rlo_i5((as), (reg_src), (reg_base), 0)
+#define ASM_STORE16_REG_REG(as, reg_src, reg_base) asm_thumb_strh_rlo_rlo_i5((as), (reg_src), (reg_base), 0)
+
 #elif N_ARM
 
 // ARM specific stuff
@@ -414,6 +426,11 @@
 #define ASM_ADD_REG_REG(as, reg_dest, reg_src) asm_arm_add_reg_reg_reg((as), (reg_dest), (reg_dest), (reg_src))
 #define ASM_SUB_REG_REG(as, reg_dest, reg_src) asm_arm_sub_reg_reg_reg((as), (reg_dest), (reg_dest), (reg_src))
 
+// TODO someone please implement str
+#define ASM_STORE_REG_REG(as, reg_src, reg_base) asm_arm_str_reg_reg_i5((as), (reg_src), (reg_base), 0)
+#define ASM_STORE8_REG_REG(as, reg_src, reg_base) asm_arm_strb_reg_reg_i5((as), (reg_src), (reg_base), 0)
+#define ASM_STORE16_REG_REG(as, reg_src, reg_base) asm_arm_strh_reg_reg_i5((as), (reg_src), (reg_base), 0)
+
 #else
 
 #error unknown native emitter
@@ -426,15 +443,21 @@
     STACK_IMM,
 } stack_info_kind_t;
 
+// these enums must be distinct and the bottom 2 bits
+// must correspond to the correct MP_NATIVE_TYPE_xxx value
 typedef enum {
-    VTYPE_PYOBJ = MP_NATIVE_TYPE_OBJ,
-    VTYPE_BOOL = MP_NATIVE_TYPE_BOOL,
-    VTYPE_INT = MP_NATIVE_TYPE_INT,
-    VTYPE_UINT = MP_NATIVE_TYPE_UINT,
-    VTYPE_UNBOUND,
-    VTYPE_PTR,
-    VTYPE_PTR_NONE,
-    VTYPE_BUILTIN_V_INT,
+    VTYPE_PYOBJ = 0x00 | MP_NATIVE_TYPE_OBJ,
+    VTYPE_BOOL = 0x00 | MP_NATIVE_TYPE_BOOL,
+    VTYPE_INT = 0x00 | MP_NATIVE_TYPE_INT,
+    VTYPE_UINT = 0x00 | MP_NATIVE_TYPE_UINT,
+
+    VTYPE_PTR = 0x10 | MP_NATIVE_TYPE_UINT, // pointer to word sized entity
+    VTYPE_PTR8 = 0x20 | MP_NATIVE_TYPE_UINT,
+    VTYPE_PTR16 = 0x30 | MP_NATIVE_TYPE_UINT,
+    VTYPE_PTR_NONE = 0x40 | MP_NATIVE_TYPE_UINT,
+
+    VTYPE_UNBOUND = 0x50 | MP_NATIVE_TYPE_OBJ,
+    VTYPE_BUILTIN_CAST = 0x60 | MP_NATIVE_TYPE_OBJ,
 } vtype_kind_t;
 
 typedef struct _stack_info_t {
@@ -495,6 +518,9 @@
                 case MP_QSTR_bool: type = VTYPE_BOOL; break;
                 case MP_QSTR_int: type = VTYPE_INT; break;
                 case MP_QSTR_uint: type = VTYPE_UINT; break;
+                case MP_QSTR_ptr: type = VTYPE_PTR; break;
+                case MP_QSTR_ptr8: type = VTYPE_PTR8; break;
+                case MP_QSTR_ptr16: type = VTYPE_PTR16; break;
                 default: printf("ViperTypeError: unknown type %s\n", qstr_str(arg2)); return;
             }
             if (op == MP_EMIT_NATIVE_TYPE_RETURN) {
@@ -606,6 +632,7 @@
         }
     }
 
+    // TODO don't load r7 if we don't need it
     asm_thumb_mov_reg_i32(emit->as, ASM_THUMB_REG_R7, (mp_uint_t)mp_fun_table);
 #elif N_ARM
     for (int i = 0; i < scope->num_pos_args; i++) {
@@ -623,6 +650,7 @@
         }
     }
 
+    // TODO don't load r7 if we don't need it
     asm_arm_mov_reg_i32(emit->as, ASM_ARM_REG_R7, (mp_uint_t)mp_fun_table);
 #else
     #error not implemented
@@ -645,7 +673,7 @@
         mp_uint_t f_len = ASM_GET_CODE_SIZE(emit->as);
 
         // compute type signature
-        // TODO check that viper types here convert correctly to valid types for emit glue
+        // note that the lower 2 bits of a vtype are the correct MP_NATIVE_TYPE_xxx
         mp_uint_t type_sig = emit->return_vtype & 3;
         for (mp_uint_t i = 0; i < emit->scope->num_pos_args; i++) {
             type_sig |= (emit->local_vtype[i] & 3) << (i * 2 + 2);
@@ -796,12 +824,26 @@
     }
 }
 
-// If stacked value is in a register, then *reg_dest is set to that register.
-// Otherwise, the value is put in *reg_dest.
-STATIC void emit_pre_pop_reg_flexible(emit_t *emit, vtype_kind_t *vtype, int *reg_dest) {
+// does an efficient X=pop(); discard(); push(X)
+// needs a (non-temp) register in case the popped element was stored in the stack
+STATIC void emit_fold_stack_top(emit_t *emit, int reg_dest) {
+    stack_info_t *si = &emit->stack_info[emit->stack_size - 2];
+    si[0] = si[1];
+    if (si->kind == STACK_VALUE) {
+        // if folded element was on the stack we need to put it in a register
+        ASM_MOV_LOCAL_TO_REG(emit->as, emit->stack_start + emit->stack_size - 1, reg_dest);
+        si->kind = STACK_REG;
+        si->u_reg = reg_dest;
+    }
+    adjust_stack(emit, -1);
+}
+
+// If stacked value is in a register and the register is not r1 or r2, then
+// *reg_dest is set to that register.  Otherwise the value is put in *reg_dest.
+STATIC void emit_pre_pop_reg_flexible(emit_t *emit, vtype_kind_t *vtype, int *reg_dest, int not_r1, int not_r2) {
     emit->last_emit_was_return_value = false;
     stack_info_t *si = peek_stack(emit, 0);
-    if (si->kind == STACK_REG) {
+    if (si->kind == STACK_REG && si->u_reg != not_r1 && si->u_reg != not_r2) {
         *vtype = si->vtype;
         *reg_dest = si->u_reg;
         need_reg_single(emit, *reg_dest, 1);
@@ -836,6 +878,11 @@
 STATIC void emit_post(emit_t *emit) {
 }
 
+STATIC void emit_post_top_set_vtype(emit_t *emit, vtype_kind_t new_vtype) {
+    stack_info_t *si = &emit->stack_info[emit->stack_size - 1];
+    si->vtype = new_vtype;
+}
+
 STATIC void emit_post_push_reg(emit_t *emit, vtype_kind_t vtype, int reg) {
     stack_info_t *si = &emit->stack_info[emit->stack_size];
     si->vtype = vtype;
@@ -954,6 +1001,7 @@
             emit_call_with_imm_arg(emit, MP_F_CONVERT_NATIVE_TO_OBJ, si->vtype, REG_ARG_2); // arg2 = type
             ASM_MOV_REG_TO_LOCAL(emit->as, REG_RET, local_num);
             si->vtype = VTYPE_PYOBJ;
+            DEBUG_printf("  convert_native_to_obj(local_num=" UINT_FMT ")\n", local_num);
         }
     }
 
@@ -1173,9 +1221,23 @@
 }
 
 STATIC void emit_native_load_global(emit_t *emit, qstr qst) {
+    DEBUG_printf("load_global(%s)\n", qstr_str(qst));
     emit_native_pre(emit);
-    emit_call_with_imm_arg(emit, MP_F_LOAD_GLOBAL, qst, REG_ARG_1);
-    emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
+    // check for builtin casting operators
+    if (emit->do_viper_types && qst == MP_QSTR_int) {
+        emit_post_push_imm(emit, VTYPE_BUILTIN_CAST, VTYPE_INT);
+    } else if (emit->do_viper_types && qst == MP_QSTR_uint) {
+        emit_post_push_imm(emit, VTYPE_BUILTIN_CAST, VTYPE_UINT);
+    } else if (emit->do_viper_types && qst == MP_QSTR_ptr) {
+        emit_post_push_imm(emit, VTYPE_BUILTIN_CAST, VTYPE_PTR);
+    } else if (emit->do_viper_types && qst == MP_QSTR_ptr8) {
+        emit_post_push_imm(emit, VTYPE_BUILTIN_CAST, VTYPE_PTR8);
+    } else if (emit->do_viper_types && qst == MP_QSTR_ptr16) {
+        emit_post_push_imm(emit, VTYPE_BUILTIN_CAST, VTYPE_PTR16);
+    } else {
+        emit_call_with_imm_arg(emit, MP_F_LOAD_GLOBAL, qst, REG_ARG_1);
+        emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
+    }
 }
 
 STATIC void emit_native_load_attr(emit_t *emit, qstr qst) {
@@ -1314,16 +1376,103 @@
 }
 
 STATIC void emit_native_store_subscr(emit_t *emit) {
-    // depends on type of subject:
-    //  - integer, function, pointer to structure: error
-    //  - pointer to integers: store as per array
-    //  - Python object: call runtime with converted object or type info
-    vtype_kind_t vtype_index, vtype_base, vtype_value;
-    emit_pre_pop_reg_reg_reg(emit, &vtype_index, REG_ARG_2, &vtype_base, REG_ARG_1, &vtype_value, REG_ARG_3); // index, base, value to store
-    assert(vtype_index == VTYPE_PYOBJ);
-    assert(vtype_base == VTYPE_PYOBJ);
-    assert(vtype_value == VTYPE_PYOBJ);
-    emit_call(emit, MP_F_OBJ_SUBSCR);
+    DEBUG_printf("store_subscr\n");
+    // need to compile: base[index] = value
+
+    // pop: index, base, value
+    // optimise case where index is an immediate
+    vtype_kind_t vtype_base = peek_vtype(emit, 1);
+
+    if (vtype_base == VTYPE_PYOBJ) {
+        // standard Python call
+        vtype_kind_t vtype_index, vtype_value;
+        emit_pre_pop_reg_reg_reg(emit, &vtype_index, REG_ARG_2, &vtype_base, REG_ARG_1, &vtype_value, REG_ARG_3);
+        assert(vtype_index == VTYPE_PYOBJ);
+        assert(vtype_value == VTYPE_PYOBJ);
+        emit_call(emit, MP_F_OBJ_SUBSCR);
+    } else {
+        // viper call
+        stack_info_t *top = peek_stack(emit, 0);
+        if (top->vtype == VTYPE_INT && top->kind == STACK_IMM) {
+            // index is an immediate
+            mp_int_t index_value = top->u_imm;
+            emit_pre_pop_discard(emit); // discard index
+            vtype_kind_t vtype_value;
+            int reg_base = REG_ARG_1;
+            int reg_index = REG_ARG_2;
+            int reg_value = REG_ARG_3;
+            emit_pre_pop_reg_flexible(emit, &vtype_base, &reg_base, reg_index, reg_value);
+            emit_pre_pop_reg_flexible(emit, &vtype_value, &reg_value, reg_base, reg_index);
+            switch (vtype_base) {
+                case VTYPE_PTR8: {
+                    // pointer to 8-bit memory
+                    // TODO optimise to use thumb strb r1, [r2, r3]
+                    if (index_value != 0) {
+                        // index is non-zero
+                        #if N_THUMB
+                        if (index_value > 0 && index_value < 32) {
+                            asm_thumb_strb_rlo_rlo_i5(emit->as, reg_value, reg_base, index_value);
+                            break;
+                        }
+                        #endif
+                        ASM_MOV_IMM_TO_REG(emit->as, index_value, reg_index);
+                        ASM_ADD_REG_REG(emit->as, reg_index, reg_base); // add index to base
+                        reg_base = reg_index;
+                    }
+                    ASM_STORE8_REG_REG(emit->as, reg_value, reg_base); // store value to (base+index)
+                    break;
+                }
+                case VTYPE_PTR16: {
+                    // pointer to 16-bit memory
+                    if (index_value != 0) {
+                        // index is a non-zero immediate
+                        #if N_THUMB
+                        if (index_value > 0 && index_value < 32) {
+                            asm_thumb_strh_rlo_rlo_i5(emit->as, reg_value, reg_base, index_value);
+                            break;
+                        }
+                        #endif
+                        ASM_MOV_IMM_TO_REG(emit->as, index_value << 1, reg_index);
+                        ASM_ADD_REG_REG(emit->as, reg_index, reg_base); // add 2*index to base
+                        reg_base = reg_index;
+                    }
+                    ASM_STORE16_REG_REG(emit->as, reg_value, reg_base); // store value to (base+2*index)
+                    break;
+                }
+                default:
+                    printf("ViperTypeError: can't store to type %d\n", vtype_base);
+            }
+        } else {
+            // index is not an immediate
+            vtype_kind_t vtype_index, vtype_value;
+            int reg_index = REG_ARG_2;
+            int reg_value = REG_ARG_3;
+            emit_pre_pop_reg_flexible(emit, &vtype_index, &reg_index, REG_ARG_1, reg_value);
+            emit_pre_pop_reg(emit, &vtype_base, REG_ARG_1);
+            emit_pre_pop_reg_flexible(emit, &vtype_value, &reg_value, REG_ARG_1, reg_index);
+            switch (vtype_base) {
+                case VTYPE_PTR8: {
+                    // pointer to 8-bit memory
+                    // TODO optimise to use thumb strb r1, [r2, r3]
+                    assert(vtype_index == VTYPE_INT);
+                    ASM_ADD_REG_REG(emit->as, REG_ARG_1, reg_index); // add index to base
+                    ASM_STORE8_REG_REG(emit->as, reg_value, REG_ARG_1); // store value to (base+index)
+                    break;
+                }
+                case VTYPE_PTR16: {
+                    // pointer to 16-bit memory
+                    assert(vtype_index == VTYPE_INT);
+                    ASM_ADD_REG_REG(emit->as, REG_ARG_1, reg_index); // add index to base
+                    ASM_ADD_REG_REG(emit->as, REG_ARG_1, reg_index); // add index to base
+                    ASM_STORE16_REG_REG(emit->as, reg_value, REG_ARG_1); // store value to (base+2*index)
+                    break;
+                }
+                default:
+                    printf("ViperTypeError: can't store to type %d\n", vtype_base);
+            }
+        }
+
+    }
 }
 
 STATIC void emit_native_delete_fast(emit_t *emit, qstr qst, mp_uint_t local_num) {
@@ -1581,7 +1730,7 @@
         }
         #endif
         int reg_rhs = REG_ARG_3;
-        emit_pre_pop_reg_flexible(emit, &vtype_rhs, &reg_rhs);
+        emit_pre_pop_reg_flexible(emit, &vtype_rhs, &reg_rhs, REG_RET, REG_ARG_2);
         emit_pre_pop_reg(emit, &vtype_lhs, REG_ARG_2);
         if (0) {
             // dummy
@@ -1813,64 +1962,53 @@
 STATIC void emit_native_call_function(emit_t *emit, mp_uint_t n_positional, mp_uint_t n_keyword, mp_uint_t star_flags) {
     DEBUG_printf("call_function(n_pos=" UINT_FMT ", n_kw=" UINT_FMT ", star_flags=" UINT_FMT ")\n", n_positional, n_keyword, star_flags);
 
-    // call special viper runtime routine with type info for args, and wanted type info for return
+    // TODO: in viper mode, call special runtime routine with type info for args,
+    // and wanted type info for return, to remove need for boxing/unboxing
+
     assert(!star_flags);
 
-    /* we no longer have these _n specific call_function's
-     * they anyway push args into an array
-     * and they would take too much room in the native dispatch table
-    if (n_positional == 0) {
-        vtype_kind_t vtype_fun;
+    emit_native_pre(emit);
+    vtype_kind_t vtype_fun = peek_vtype(emit, n_positional + 2 * n_keyword);
+    if (vtype_fun == VTYPE_BUILTIN_CAST) {
+        // casting operator
+        assert(n_positional == 1 && n_keyword == 0);
+        DEBUG_printf("  cast to %d\n", vtype_fun);
+        vtype_kind_t vtype_cast = peek_stack(emit, 1)->u_imm;
+        switch (peek_vtype(emit, 0)) {
+            case VTYPE_PYOBJ: {
+                vtype_kind_t vtype;
+                emit_pre_pop_reg(emit, &vtype, REG_ARG_1);
+                emit_pre_pop_discard(emit);
+                emit_call_with_imm_arg(emit, MP_F_CONVERT_OBJ_TO_NATIVE, MP_NATIVE_TYPE_UINT, REG_ARG_2); // arg2 = type
+                emit_post_push_reg(emit, vtype_cast, REG_RET);
+                break;
+            }
+            case VTYPE_BOOL:
+            case VTYPE_INT:
+            case VTYPE_UINT:
+            case VTYPE_PTR:
+            case VTYPE_PTR8:
+            case VTYPE_PTR16:
+            case VTYPE_PTR_NONE:
+                emit_fold_stack_top(emit, REG_ARG_1);
+                emit_post_top_set_vtype(emit, vtype_cast);
+                break;
+            default:
+                assert(!"TODO: convert obj to int");
+        }
+    } else {
+        if (n_positional != 0 || n_keyword != 0) {
+            emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, n_positional + 2 * n_keyword); // pointer to args
+        }
         emit_pre_pop_reg(emit, &vtype_fun, REG_ARG_1); // the function
         assert(vtype_fun == VTYPE_PYOBJ);
-        emit_call(emit, MP_F_CALL_FUNCTION_0);
-    } else if (n_positional == 1) {
-        vtype_kind_t vtype_fun, vtype_arg1;
-        emit_pre_pop_reg_reg(emit, &vtype_arg1, REG_ARG_2, &vtype_fun, REG_ARG_1); // the single argument, the function
-        assert(vtype_fun == VTYPE_PYOBJ);
-        assert(vtype_arg1 == VTYPE_PYOBJ);
-        emit_call(emit, MP_F_CALL_FUNCTION_1);
-    } else if (n_positional == 2) {
-        vtype_kind_t vtype_fun, vtype_arg1, vtype_arg2;
-        emit_pre_pop_reg_reg_reg(emit, &vtype_arg2, REG_ARG_3, &vtype_arg1, REG_ARG_2, &vtype_fun, REG_ARG_1); // the second argument, the first argument, the function
-        assert(vtype_fun == VTYPE_PYOBJ);
-        assert(vtype_arg1 == VTYPE_PYOBJ);
-        assert(vtype_arg2 == VTYPE_PYOBJ);
-        emit_call(emit, MP_F_CALL_FUNCTION_2);
-    } else {
-    */
-
-    emit_native_pre(emit);
-    if (n_positional != 0 || n_keyword != 0) {
-        emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, n_positional + 2 * n_keyword); // pointer to args
+        emit_call_with_imm_arg(emit, MP_F_NATIVE_CALL_FUNCTION_N_KW, n_positional | (n_keyword << 8), REG_ARG_2);
+        emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
     }
-    vtype_kind_t vtype_fun;
-    emit_pre_pop_reg(emit, &vtype_fun, REG_ARG_1); // the function
-    assert(vtype_fun == VTYPE_PYOBJ);
-    emit_call_with_imm_arg(emit, MP_F_NATIVE_CALL_FUNCTION_N_KW, n_positional | (n_keyword << 8), REG_ARG_2);
-    emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
 }
 
 STATIC void emit_native_call_method(emit_t *emit, mp_uint_t n_positional, mp_uint_t n_keyword, mp_uint_t star_flags) {
     assert(!star_flags);
-
-    /*
-    if (n_positional == 0) {
-        vtype_kind_t vtype_meth, vtype_self;
-        emit_pre_pop_reg_reg(emit, &vtype_self, REG_ARG_2, &vtype_meth, REG_ARG_1); // the self object (or NULL), the method
-        assert(vtype_meth == VTYPE_PYOBJ);
-        assert(vtype_self == VTYPE_PYOBJ);
-        emit_call(emit, MP_F_CALL_METHOD_1);
-    } else if (n_positional == 1) {
-        vtype_kind_t vtype_meth, vtype_self, vtype_arg1;
-        emit_pre_pop_reg_reg_reg(emit, &vtype_arg1, REG_ARG_3, &vtype_self, REG_ARG_2, &vtype_meth, REG_ARG_1); // the first argument, the self object (or NULL), the method
-        assert(vtype_meth == VTYPE_PYOBJ);
-        assert(vtype_self == VTYPE_PYOBJ);
-        assert(vtype_arg1 == VTYPE_PYOBJ);
-        emit_call(emit, MP_F_CALL_METHOD_2);
-    } else {
-    */
-
     emit_native_pre(emit);
     emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, 2 + n_positional + 2 * n_keyword); // pointer to items, including meth and self
     emit_call_with_2_imm_args(emit, MP_F_CALL_METHOD_N_KW, n_positional, REG_ARG_1, n_keyword, REG_ARG_2);
@@ -1879,17 +2017,24 @@
 
 STATIC void emit_native_return_value(emit_t *emit) {
     DEBUG_printf("return_value\n");
-    vtype_kind_t vtype;
-    emit_pre_pop_reg(emit, &vtype, REG_RET);
     if (emit->do_viper_types) {
-        if (vtype == VTYPE_PTR_NONE) {
+        if (peek_vtype(emit, 0) == VTYPE_PTR_NONE) {
+            emit_pre_pop_discard(emit);
             if (emit->return_vtype == VTYPE_PYOBJ) {
                 ASM_MOV_IMM_TO_REG(emit->as, (mp_uint_t)mp_const_none, REG_RET);
+            } else {
+                ASM_MOV_IMM_TO_REG(emit->as, 0, REG_RET);
             }
-        } else if (vtype != emit->return_vtype) {
-            printf("ViperTypeError: incompatible return type\n");
+        } else {
+            vtype_kind_t vtype;
+            emit_pre_pop_reg(emit, &vtype, REG_RET);
+            if (vtype != emit->return_vtype) {
+                printf("ViperTypeError: incompatible return type\n");
+            }
         }
     } else {
+        vtype_kind_t vtype;
+        emit_pre_pop_reg(emit, &vtype, REG_RET);
         assert(vtype == VTYPE_PYOBJ);
     }
     emit->last_emit_was_return_value = true;