py: Compress load-int, load-fast, store-fast, unop, binop bytecodes.

There is a lot of potential in compressing bytecodes and making more use
of the coding space.  This patch introduces "multi" bytecodes which have
their argument included in the bytecode (by addition).

UNARY_OP and BINARY_OP no longer take a 1-byte argument for the
operator.  Rather, the operator is included in the first byte itself.

LOAD_FAST_[0,1,2] and STORE_FAST_[0,1,2] are removed in favour of their
multi versions, which can take an argument between 0 and 15 inclusive.
The majority of LOAD_FAST/STORE_FAST codes fit in this range and so this
saves a byte for each of these.

LOAD_CONST_SMALL_INT_MULTI is used to load small ints between -16 and 47
inclusive.  Such ints are quite common and now only need 1 byte to
store, and now have much faster decoding.

In all, this patch saves about 2% RAM for typical bytecode (1.8% on
64-bit test, 2.5% on pyboard test).  It also reduces the binary size
(because bytecodes are simplified) and doesn't harm performance.
diff --git a/py/vm.c b/py/vm.c
index d959880..36ea10f 100644
--- a/py/vm.c
+++ b/py/vm.c
@@ -223,18 +223,6 @@
                     PUSH(MP_OBJ_NULL);
                     DISPATCH();
 
-                ENTRY(MP_BC_LOAD_FAST_0):
-                    obj_shared = fastn[0];
-                    goto load_check;
-
-                ENTRY(MP_BC_LOAD_FAST_1):
-                    obj_shared = fastn[-1];
-                    goto load_check;
-
-                ENTRY(MP_BC_LOAD_FAST_2):
-                    obj_shared = fastn[-2];
-                    goto load_check;
-
                 ENTRY(MP_BC_LOAD_FAST_N):
                     DECODE_UINT;
                     obj_shared = fastn[-unum];
@@ -288,18 +276,6 @@
                     DISPATCH();
                 }
 
-                ENTRY(MP_BC_STORE_FAST_0):
-                    fastn[0] = POP();
-                    DISPATCH();
-
-                ENTRY(MP_BC_STORE_FAST_1):
-                    fastn[-1] = POP();
-                    DISPATCH();
-
-                ENTRY(MP_BC_STORE_FAST_2):
-                    fastn[-2] = POP();
-                    DISPATCH();
-
                 ENTRY(MP_BC_STORE_FAST_N):
                     DECODE_UINT;
                     fastn[-unum] = POP();
@@ -606,19 +582,6 @@
                     }
                     DISPATCH();
 
-                ENTRY(MP_BC_UNARY_OP):
-                    unum = *ip++;
-                    SET_TOP(mp_unary_op(unum, TOP()));
-                    DISPATCH();
-
-                ENTRY(MP_BC_BINARY_OP): {
-                    unum = *ip++;
-                    mp_obj_t rhs = POP();
-                    mp_obj_t lhs = TOP();
-                    SET_TOP(mp_binary_op(unum, lhs, rhs));
-                    DISPATCH();
-                }
-
                 ENTRY(MP_BC_BUILD_TUPLE):
                     DECODE_UINT;
                     sp -= unum - 1;
@@ -890,7 +853,53 @@
                     mp_import_all(POP());
                     DISPATCH();
 
-                ENTRY_DEFAULT: {
+#if MICROPY_OPT_COMPUTED_GOTO
+                ENTRY(MP_BC_LOAD_CONST_SMALL_INT_MULTI):
+                    PUSH(MP_OBJ_NEW_SMALL_INT((mp_int_t)ip[-1] - MP_BC_LOAD_CONST_SMALL_INT_MULTI - 16));
+                    DISPATCH();
+
+                ENTRY(MP_BC_LOAD_FAST_MULTI):
+                    obj_shared = fastn[MP_BC_LOAD_FAST_MULTI - (mp_int_t)ip[-1]];
+                    goto load_check;
+
+                ENTRY(MP_BC_STORE_FAST_MULTI):
+                    fastn[MP_BC_STORE_FAST_MULTI - (mp_int_t)ip[-1]] = POP();
+                    DISPATCH();
+
+                ENTRY(MP_BC_UNARY_OP_MULTI):
+                    SET_TOP(mp_unary_op(ip[-1] - MP_BC_UNARY_OP_MULTI, TOP()));
+                    DISPATCH();
+
+                ENTRY(MP_BC_BINARY_OP_MULTI): {
+                    mp_obj_t rhs = POP();
+                    mp_obj_t lhs = TOP();
+                    SET_TOP(mp_binary_op(ip[-1] - MP_BC_BINARY_OP_MULTI, lhs, rhs));
+                    DISPATCH();
+                }
+
+                ENTRY_DEFAULT:
+#else
+                ENTRY_DEFAULT:
+                    if (ip[-1] < MP_BC_LOAD_CONST_SMALL_INT_MULTI + 64) {
+                        PUSH(MP_OBJ_NEW_SMALL_INT((mp_int_t)ip[-1] - MP_BC_LOAD_CONST_SMALL_INT_MULTI - 16));
+                        DISPATCH();
+                    } else if (ip[-1] < MP_BC_LOAD_FAST_MULTI + 16) {
+                        obj_shared = fastn[MP_BC_LOAD_FAST_MULTI - (mp_int_t)ip[-1]];
+                        goto load_check;
+                    } else if (ip[-1] < MP_BC_STORE_FAST_MULTI + 16) {
+                        fastn[MP_BC_STORE_FAST_MULTI - (mp_int_t)ip[-1]] = POP();
+                        DISPATCH();
+                    } else if (ip[-1] < MP_BC_UNARY_OP_MULTI + 5) {
+                        SET_TOP(mp_unary_op(ip[-1] - MP_BC_UNARY_OP_MULTI, TOP()));
+                        DISPATCH();
+                    } else if (ip[-1] < MP_BC_BINARY_OP_MULTI + 35) {
+                        mp_obj_t rhs = POP();
+                        mp_obj_t lhs = TOP();
+                        SET_TOP(mp_binary_op(ip[-1] - MP_BC_BINARY_OP_MULTI, lhs, rhs));
+                        DISPATCH();
+                    } else
+#endif
+                {
                     mp_obj_t obj = mp_obj_new_exception_msg(&mp_type_NotImplementedError, "byte code not implemented");
                     nlr_pop();
                     fastn[0] = obj;