py: Add LOAD_SUPER_METHOD bytecode to allow heap-free super meth calls.

This patch allows the following code to run without allocating on the heap:

    super().foo(...)

Before this patch such a call would allocate a super object on the heap and
then load the foo method and call it right away.  The super object is only
needed to perform the lookup of the method and not needed after that.  This
patch makes an optimisation to allocate the super object on the C stack and
discard it right after use.

Changes in code size due to this patch are:

   bare-arm: +128
    minimal: +232
   unix x64: +416
unix nanbox: +364
     stmhal: +184
    esp8266: +340
     cc3200: +128
diff --git a/py/compile.c b/py/compile.c
index 42c2cc3..8533e05 100644
--- a/py/compile.c
+++ b/py/compile.c
@@ -1694,7 +1694,7 @@
 
 #if MICROPY_PY_ASYNC_AWAIT
 STATIC void compile_await_object_method(compiler_t *comp, qstr method) {
-    EMIT_ARG(load_method, method);
+    EMIT_ARG(load_method, method, false);
     EMIT_ARG(call_method, 0, 0, 0);
     compile_yield_from(comp);
 }
@@ -1785,7 +1785,7 @@
         }
 
         compile_load_id(comp, context);
-        EMIT_ARG(load_method, MP_QSTR___aexit__);
+        EMIT_ARG(load_method, MP_QSTR___aexit__, false);
 
         EMIT_ARG(setup_except, try_exception_label);
         compile_increase_except_level(comp);
@@ -2219,9 +2219,20 @@
             return;
         }
 
-        // a super() call
-        EMIT_ARG(call_function, 2, 0, 0);
-        i = 1;
+        if (num_trail >= 3
+            && MP_PARSE_NODE_STRUCT_KIND(pns_trail[1]) == PN_trailer_period
+            && MP_PARSE_NODE_STRUCT_KIND(pns_trail[2]) == PN_trailer_paren) {
+            // optimisation for method calls super().f(...), to eliminate heap allocation
+            mp_parse_node_struct_t *pns_period = pns_trail[1];
+            mp_parse_node_struct_t *pns_paren = pns_trail[2];
+            EMIT_ARG(load_method, MP_PARSE_NODE_LEAF_ARG(pns_period->nodes[0]), true);
+            compile_trailer_paren_helper(comp, pns_paren->nodes[0], true, 0);
+            i = 3;
+        } else {
+            // a super() call
+            EMIT_ARG(call_function, 2, 0, 0);
+            i = 1;
+        }
     }
 
     // compile the remaining trailers
@@ -2232,7 +2243,7 @@
             // optimisation for method calls a.f(...), following PyPy
             mp_parse_node_struct_t *pns_period = pns_trail[i];
             mp_parse_node_struct_t *pns_paren = pns_trail[i + 1];
-            EMIT_ARG(load_method, MP_PARSE_NODE_LEAF_ARG(pns_period->nodes[0]));
+            EMIT_ARG(load_method, MP_PARSE_NODE_LEAF_ARG(pns_period->nodes[0]), false);
             compile_trailer_paren_helper(comp, pns_paren->nodes[0], true, 0);
             i += 1;
         } else {