py: Add LOAD_SUPER_METHOD bytecode to allow heap-free super meth calls.

This patch allows the following code to run without allocating on the heap:

    super().foo(...)

Before this patch such a call would allocate a super object on the heap and
then load the foo method and call it right away.  The super object is only
needed to perform the lookup of the method and not needed after that.  This
patch makes an optimisation to allocate the super object on the C stack and
discard it right after use.

Changes in code size due to this patch are:

   bare-arm: +128
    minimal: +232
   unix x64: +416
unix nanbox: +364
     stmhal: +184
    esp8266: +340
     cc3200: +128
diff --git a/py/bc.c b/py/bc.c
index 4c0eb53..fc17946 100644
--- a/py/bc.c
+++ b/py/bc.c
@@ -304,7 +304,7 @@
     OC4(U, U, U, U), // 0x0c-0x0f
     OC4(B, B, B, U), // 0x10-0x13
     OC4(V, U, Q, V), // 0x14-0x17
-    OC4(B, U, V, V), // 0x18-0x1b
+    OC4(B, V, V, Q), // 0x18-0x1b
     OC4(Q, Q, Q, Q), // 0x1c-0x1f
     OC4(B, B, V, V), // 0x20-0x23
     OC4(Q, Q, Q, B), // 0x24-0x27
diff --git a/py/bc0.h b/py/bc0.h
index c2b019f..b5650ab 100644
--- a/py/bc0.h
+++ b/py/bc0.h
@@ -37,12 +37,13 @@
 #define MP_BC_LOAD_CONST_OBJ     (0x17) // ptr
 #define MP_BC_LOAD_NULL          (0x18)
 
-#define MP_BC_LOAD_FAST_N        (0x1a) // uint
-#define MP_BC_LOAD_DEREF         (0x1b) // uint
-#define MP_BC_LOAD_NAME          (0x1c) // qstr
-#define MP_BC_LOAD_GLOBAL        (0x1d) // qstr
-#define MP_BC_LOAD_ATTR          (0x1e) // qstr
-#define MP_BC_LOAD_METHOD        (0x1f) // qstr
+#define MP_BC_LOAD_FAST_N        (0x19) // uint
+#define MP_BC_LOAD_DEREF         (0x1a) // uint
+#define MP_BC_LOAD_NAME          (0x1b) // qstr
+#define MP_BC_LOAD_GLOBAL        (0x1c) // qstr
+#define MP_BC_LOAD_ATTR          (0x1d) // qstr
+#define MP_BC_LOAD_METHOD        (0x1e) // qstr
+#define MP_BC_LOAD_SUPER_METHOD  (0x1f) // qstr
 #define MP_BC_LOAD_BUILD_CLASS   (0x20)
 #define MP_BC_LOAD_SUBSCR        (0x21)
 
diff --git a/py/compile.c b/py/compile.c
index 42c2cc3..8533e05 100644
--- a/py/compile.c
+++ b/py/compile.c
@@ -1694,7 +1694,7 @@
 
 #if MICROPY_PY_ASYNC_AWAIT
 STATIC void compile_await_object_method(compiler_t *comp, qstr method) {
-    EMIT_ARG(load_method, method);
+    EMIT_ARG(load_method, method, false);
     EMIT_ARG(call_method, 0, 0, 0);
     compile_yield_from(comp);
 }
@@ -1785,7 +1785,7 @@
         }
 
         compile_load_id(comp, context);
-        EMIT_ARG(load_method, MP_QSTR___aexit__);
+        EMIT_ARG(load_method, MP_QSTR___aexit__, false);
 
         EMIT_ARG(setup_except, try_exception_label);
         compile_increase_except_level(comp);
@@ -2219,9 +2219,20 @@
             return;
         }
 
-        // a super() call
-        EMIT_ARG(call_function, 2, 0, 0);
-        i = 1;
+        if (num_trail >= 3
+            && MP_PARSE_NODE_STRUCT_KIND(pns_trail[1]) == PN_trailer_period
+            && MP_PARSE_NODE_STRUCT_KIND(pns_trail[2]) == PN_trailer_paren) {
+            // optimisation for method calls super().f(...), to eliminate heap allocation
+            mp_parse_node_struct_t *pns_period = pns_trail[1];
+            mp_parse_node_struct_t *pns_paren = pns_trail[2];
+            EMIT_ARG(load_method, MP_PARSE_NODE_LEAF_ARG(pns_period->nodes[0]), true);
+            compile_trailer_paren_helper(comp, pns_paren->nodes[0], true, 0);
+            i = 3;
+        } else {
+            // a super() call
+            EMIT_ARG(call_function, 2, 0, 0);
+            i = 1;
+        }
     }
 
     // compile the remaining trailers
@@ -2232,7 +2243,7 @@
             // optimisation for method calls a.f(...), following PyPy
             mp_parse_node_struct_t *pns_period = pns_trail[i];
             mp_parse_node_struct_t *pns_paren = pns_trail[i + 1];
-            EMIT_ARG(load_method, MP_PARSE_NODE_LEAF_ARG(pns_period->nodes[0]));
+            EMIT_ARG(load_method, MP_PARSE_NODE_LEAF_ARG(pns_period->nodes[0]), false);
             compile_trailer_paren_helper(comp, pns_paren->nodes[0], true, 0);
             i += 1;
         } else {
diff --git a/py/emit.h b/py/emit.h
index 64bb957..0236a9b 100644
--- a/py/emit.h
+++ b/py/emit.h
@@ -88,7 +88,7 @@
     void (*load_const_obj)(emit_t *emit, mp_obj_t obj);
     void (*load_null)(emit_t *emit);
     void (*load_attr)(emit_t *emit, qstr qst);
-    void (*load_method)(emit_t *emit, qstr qst);
+    void (*load_method)(emit_t *emit, qstr qst, bool is_super);
     void (*load_build_class)(emit_t *emit);
     void (*load_subscr)(emit_t *emit);
     void (*store_attr)(emit_t *emit, qstr qst);
@@ -205,7 +205,7 @@
 void mp_emit_bc_load_const_obj(emit_t *emit, mp_obj_t obj);
 void mp_emit_bc_load_null(emit_t *emit);
 void mp_emit_bc_load_attr(emit_t *emit, qstr qst);
-void mp_emit_bc_load_method(emit_t *emit, qstr qst);
+void mp_emit_bc_load_method(emit_t *emit, qstr qst, bool is_super);
 void mp_emit_bc_load_build_class(emit_t *emit);
 void mp_emit_bc_load_subscr(emit_t *emit);
 void mp_emit_bc_store_attr(emit_t *emit, qstr qst);
diff --git a/py/emitbc.c b/py/emitbc.c
index 673cd40..6d8db81 100644
--- a/py/emitbc.c
+++ b/py/emitbc.c
@@ -594,9 +594,9 @@
     }
 }
 
-void mp_emit_bc_load_method(emit_t *emit, qstr qst) {
-    emit_bc_pre(emit, 1);
-    emit_write_bytecode_byte_qstr(emit, MP_BC_LOAD_METHOD, qst);
+void mp_emit_bc_load_method(emit_t *emit, qstr qst, bool is_super) { // is_super selects MP_BC_LOAD_SUPER_METHOD instead of MP_BC_LOAD_METHOD
+    emit_bc_pre(emit, 1 - 2 * is_super); // passes +1 for a plain load, -1 for a super load (presumably the stack-size change -- confirm against emit_bc_pre)
+    emit_write_bytecode_byte_qstr(emit, is_super ? MP_BC_LOAD_SUPER_METHOD : MP_BC_LOAD_METHOD, qst); // chosen opcode together with the method-name qstr
+}
 
 void mp_emit_bc_load_build_class(emit_t *emit) {
diff --git a/py/emitnative.c b/py/emitnative.c
index 3ab001f..99adc80 100644
--- a/py/emitnative.c
+++ b/py/emitnative.c
@@ -85,6 +85,7 @@
     [MP_F_LOAD_BUILD_CLASS] = 0,
     [MP_F_LOAD_ATTR] = 2,
     [MP_F_LOAD_METHOD] = 3,
+    [MP_F_LOAD_SUPER_METHOD] = 2,
     [MP_F_STORE_NAME] = 2,
     [MP_F_STORE_GLOBAL] = 2,
     [MP_F_STORE_ATTR] = 3,
@@ -1065,12 +1066,18 @@
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
 }
 
-STATIC void emit_native_load_method(emit_t *emit, qstr qst) {
-    vtype_kind_t vtype_base;
-    emit_pre_pop_reg(emit, &vtype_base, REG_ARG_1); // arg1 = base
-    assert(vtype_base == VTYPE_PYOBJ);
-    emit_get_stack_pointer_to_reg_for_push(emit, REG_ARG_3, 2); // arg3 = dest ptr
-    emit_call_with_imm_arg(emit, MP_F_LOAD_METHOD, qst, REG_ARG_2); // arg2 = method name
+STATIC void emit_native_load_method(emit_t *emit, qstr qst, bool is_super) { // emits a call to MP_F_LOAD_SUPER_METHOD (is_super) or MP_F_LOAD_METHOD
+    if (is_super) { // super path: no base object is popped into a register; only (qstr, dest ptr) are passed
+        emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_2, 3); // NOTE(review): REG_ARG_2 is reassigned on the next line; this call appears to exist to account for 3 stack entries being consumed -- confirm
+        emit_get_stack_pointer_to_reg_for_push(emit, REG_ARG_2, 2); // arg2 = dest ptr
+        emit_call_with_imm_arg(emit, MP_F_LOAD_SUPER_METHOD, qst, REG_ARG_1); // arg1 = method name
+    } else { // ordinary path: (base, qstr, dest ptr)
+        vtype_kind_t vtype_base; // filled in by emit_pre_pop_reg below
+        emit_pre_pop_reg(emit, &vtype_base, REG_ARG_1); // arg1 = base
+        assert(vtype_base == VTYPE_PYOBJ); // the base must be a Python object
+        emit_get_stack_pointer_to_reg_for_push(emit, REG_ARG_3, 2); // arg3 = dest ptr
+        emit_call_with_imm_arg(emit, MP_F_LOAD_METHOD, qst, REG_ARG_2); // arg2 = method name
+    }
+}
 
 STATIC void emit_native_load_build_class(emit_t *emit) {
diff --git a/py/nativeglue.c b/py/nativeglue.c
index 694dfca..c75e5ec 100644
--- a/py/nativeglue.c
+++ b/py/nativeglue.c
@@ -133,6 +133,7 @@
     mp_load_build_class,
     mp_load_attr,
     mp_load_method,
+    mp_load_super_method,
     mp_store_name,
     mp_store_global,
     mp_store_attr,
diff --git a/py/objtype.c b/py/objtype.c
index de1ee8c..2a119e4 100644
--- a/py/objtype.c
+++ b/py/objtype.c
@@ -1070,6 +1070,11 @@
     .attr = super_attr,
 };
 
+void mp_load_super_method(qstr attr, mp_obj_t *dest) { // looks up attr through a temporary super object; dest is read (dest[1], dest[2]) and then handed to mp_load_method
+    mp_obj_super_t super = {{&mp_type_super}, dest[1], dest[2]}; // local, non-heap super object initialised from dest[1] and dest[2] (presumably the two super() arguments -- confirm against the caller)
+    mp_load_method(MP_OBJ_FROM_PTR(&super), attr, dest); // dest receives whatever mp_load_method writes; `super` is not referenced after this call
+}
+
 /******************************************************************************/
 // subclassing and built-ins specific to types
 
diff --git a/py/persistentcode.c b/py/persistentcode.c
index 2a9a5b7..a71045a 100644
--- a/py/persistentcode.c
+++ b/py/persistentcode.c
@@ -39,7 +39,7 @@
 #include "py/smallint.h"
 
 // The current version of .mpy files
-#define MPY_VERSION (1)
+#define MPY_VERSION (2)
 
 // The feature flags byte encodes the compile-time config options that
 // affect the generate bytecode.
diff --git a/py/runtime.h b/py/runtime.h
index 1778691..d75d23f 100644
--- a/py/runtime.h
+++ b/py/runtime.h
@@ -131,6 +131,7 @@
 void mp_convert_member_lookup(mp_obj_t obj, const mp_obj_type_t *type, mp_obj_t member, mp_obj_t *dest);
 void mp_load_method(mp_obj_t base, qstr attr, mp_obj_t *dest);
 void mp_load_method_maybe(mp_obj_t base, qstr attr, mp_obj_t *dest);
+void mp_load_super_method(qstr attr, mp_obj_t *dest);
 void mp_store_attr(mp_obj_t base, qstr attr, mp_obj_t val);
 
 mp_obj_t mp_getiter(mp_obj_t o, mp_obj_iter_buf_t *iter_buf);
diff --git a/py/runtime0.h b/py/runtime0.h
index b1ed710..720fe6a 100644
--- a/py/runtime0.h
+++ b/py/runtime0.h
@@ -107,6 +107,7 @@
     MP_F_LOAD_BUILD_CLASS,
     MP_F_LOAD_ATTR,
     MP_F_LOAD_METHOD,
+    MP_F_LOAD_SUPER_METHOD,
     MP_F_STORE_NAME,
     MP_F_STORE_GLOBAL,
     MP_F_STORE_ATTR,
diff --git a/py/showbc.c b/py/showbc.c
index b52905f..0bccf84 100644
--- a/py/showbc.c
+++ b/py/showbc.c
@@ -245,6 +245,11 @@
             printf("LOAD_METHOD %s", qstr_str(qst));
             break;
 
+        case MP_BC_LOAD_SUPER_METHOD:
+            DECODE_QSTR;
+            printf("LOAD_SUPER_METHOD %s", qstr_str(qst));
+            break;
+
         case MP_BC_LOAD_BUILD_CLASS:
             printf("LOAD_BUILD_CLASS");
             break;
diff --git a/py/vm.c b/py/vm.c
index 8ce635c..469528d 100644
--- a/py/vm.c
+++ b/py/vm.c
@@ -376,6 +376,14 @@
                     DISPATCH();
                 }
 
+                ENTRY(MP_BC_LOAD_SUPER_METHOD): {
+                    MARK_EXC_IP_SELECTIVE();
+                    DECODE_QSTR;
+                    sp -= 1;
+                    mp_load_super_method(qst, sp - 1);
+                    DISPATCH();
+                }
+
                 ENTRY(MP_BC_LOAD_BUILD_CLASS):
                     MARK_EXC_IP_SELECTIVE();
                     PUSH(mp_load_build_class());
diff --git a/py/vmentrytable.h b/py/vmentrytable.h
index 8731c3d..dd9789e 100644
--- a/py/vmentrytable.h
+++ b/py/vmentrytable.h
@@ -44,6 +44,7 @@
     [MP_BC_LOAD_GLOBAL] = &&entry_MP_BC_LOAD_GLOBAL,
     [MP_BC_LOAD_ATTR] = &&entry_MP_BC_LOAD_ATTR,
     [MP_BC_LOAD_METHOD] = &&entry_MP_BC_LOAD_METHOD,
+    [MP_BC_LOAD_SUPER_METHOD] = &&entry_MP_BC_LOAD_SUPER_METHOD,
     [MP_BC_LOAD_BUILD_CLASS] = &&entry_MP_BC_LOAD_BUILD_CLASS,
     [MP_BC_LOAD_SUBSCR] = &&entry_MP_BC_LOAD_SUBSCR,
     [MP_BC_STORE_FAST_N] = &&entry_MP_BC_STORE_FAST_N,