py: Partially fix native emitter to work with latest runtime.

The native emitter has been broken since the stack order was changed
from reverse to standard.  This fix gets it partially working.
diff --git a/py/asmx64.c b/py/asmx64.c
index de34332..197ccd8 100644
--- a/py/asmx64.c
+++ b/py/asmx64.c
@@ -94,6 +94,7 @@
 
     uint max_num_labels;
     int *label_offsets;
+    int num_locals;
 };
 
 // for allocating memory, see src/v8/src/platform-linux.cc
@@ -108,8 +109,8 @@
     return ptr;
 }
 
-asm_x64_t* asm_x64_new(uint max_num_labels) {
-    asm_x64_t* as;
+asm_x64_t *asm_x64_new(uint max_num_labels) {
+    asm_x64_t *as;
 
     as = m_new(asm_x64_t, 1);
     as->pass = 0;
@@ -118,11 +119,12 @@
     as->code_base = NULL;
     as->max_num_labels = max_num_labels;
     as->label_offsets = m_new(int, max_num_labels);
+    as->num_locals = 0;
 
     return as;
 }
 
-void asm_x64_free(asm_x64_t* as, bool free_code) {
+void asm_x64_free(asm_x64_t *as, bool free_code) {
     if (free_code) {
         // need to un-mmap
         //m_free(as->code_base);
@@ -174,7 +176,7 @@
 }
 
 // all functions must go through this one to emit bytes
-static byte* asm_x64_get_cur_to_write_bytes(asm_x64_t* as, int num_bytes_to_write) {
+static byte *asm_x64_get_cur_to_write_bytes(asm_x64_t *as, int num_bytes_to_write) {
     //printf("emit %d\n", num_bytes_to_write);
     if (as->pass < ASM_X64_PASS_3) {
         as->code_offset += num_bytes_to_write;
@@ -187,33 +189,33 @@
     }
 }
 
-uint asm_x64_get_code_size(asm_x64_t* as) {
+uint asm_x64_get_code_size(asm_x64_t *as) {
     return as->code_size;
 }
 
-void* asm_x64_get_code(asm_x64_t* as) {
+void *asm_x64_get_code(asm_x64_t *as) {
     return as->code_base;
 }
 
-static void asm_x64_write_byte_1(asm_x64_t* as, byte b1) {
+static void asm_x64_write_byte_1(asm_x64_t *as, byte b1) {
     byte* c = asm_x64_get_cur_to_write_bytes(as, 1);
     c[0] = b1;
 }
 
-static void asm_x64_write_byte_2(asm_x64_t* as, byte b1, byte b2) {
+static void asm_x64_write_byte_2(asm_x64_t *as, byte b1, byte b2) {
     byte* c = asm_x64_get_cur_to_write_bytes(as, 2);
     c[0] = b1;
     c[1] = b2;
 }
 
-static void asm_x64_write_byte_3(asm_x64_t* as, byte b1, byte b2, byte b3) {
+static void asm_x64_write_byte_3(asm_x64_t *as, byte b1, byte b2, byte b3) {
     byte* c = asm_x64_get_cur_to_write_bytes(as, 3);
     c[0] = b1;
     c[1] = b2;
     c[2] = b3;
 }
 
-static void asm_x64_write_word32(asm_x64_t* as, int w32) {
+static void asm_x64_write_word32(asm_x64_t *as, int w32) {
     byte* c = asm_x64_get_cur_to_write_bytes(as, 4);
     c[0] = IMM32_L0(w32);
     c[1] = IMM32_L1(w32);
@@ -221,7 +223,7 @@
     c[3] = IMM32_L3(w32);
 }
 
-static void asm_x64_write_word64(asm_x64_t* as, int64_t w64) {
+static void asm_x64_write_word64(asm_x64_t *as, int64_t w64) {
     byte* c = asm_x64_get_cur_to_write_bytes(as, 8);
     c[0] = IMM32_L0(w64);
     c[1] = IMM32_L1(w64);
@@ -234,7 +236,7 @@
 }
 
 /* unused
-static void asm_x64_write_word32_to(asm_x64_t* as, int offset, int w32) {
+static void asm_x64_write_word32_to(asm_x64_t *as, int offset, int w32) {
     byte* c;
     assert(offset + 4 <= as->code_size);
     c = as->code_base + offset;
@@ -245,7 +247,7 @@
 }
 */
 
-static void asm_x64_write_r64_disp(asm_x64_t* as, int r64, int disp_r64, int disp_offset) {
+static void asm_x64_write_r64_disp(asm_x64_t *as, int r64, int disp_r64, int disp_offset) {
     assert(disp_r64 != REG_RSP);
 
     if (disp_offset == 0 && disp_r64 != REG_RBP) {
@@ -258,60 +260,55 @@
     }
 }
 
-void asm_x64_nop(asm_x64_t* as)
-{
+void asm_x64_nop(asm_x64_t *as) {
     asm_x64_write_byte_1(as, OPCODE_NOP);
 }
 
-void asm_x64_push_r64(asm_x64_t* as, int src_r64)
-{
+void asm_x64_push_r64(asm_x64_t *as, int src_r64) {
     asm_x64_write_byte_1(as, OPCODE_PUSH_R64 | src_r64);
 }
 
-void asm_x64_push_i32(asm_x64_t* as, int src_i32)
-{
+void asm_x64_push_i32(asm_x64_t *as, int src_i32) {
     asm_x64_write_byte_1(as, OPCODE_PUSH_I64);
     asm_x64_write_word32(as, src_i32); // will be sign extended to 64 bits
 }
 
-void asm_x64_push_disp(asm_x64_t* as, int src_r64, int src_offset) {
+void asm_x64_push_disp(asm_x64_t *as, int src_r64, int src_offset) {
     asm_x64_write_byte_1(as, OPCODE_PUSH_M64);
     asm_x64_write_r64_disp(as, 6, src_r64, src_offset);
 }
 
-void asm_x64_pop_r64(asm_x64_t* as, int dest_r64)
-{
+void asm_x64_pop_r64(asm_x64_t *as, int dest_r64) {
     asm_x64_write_byte_1(as, OPCODE_POP_R64 | dest_r64);
 }
 
-static void asm_x64_ret(asm_x64_t* as)
-{
+static void asm_x64_ret(asm_x64_t *as) {
     asm_x64_write_byte_1(as, OPCODE_RET);
 }
 
-void asm_x64_mov_r32_to_r32(asm_x64_t* as, int src_r32, int dest_r32) {
+void asm_x64_mov_r32_to_r32(asm_x64_t *as, int src_r32, int dest_r32) {
     // defaults to 32 bit operation
     asm_x64_write_byte_2(as, OPCODE_MOV_R64_TO_RM64, MODRM_R64(src_r32) | MODRM_RM_REG | MODRM_RM_R64(dest_r32));
 }
 
-void asm_x64_mov_r64_to_r64(asm_x64_t* as, int src_r64, int dest_r64) {
+void asm_x64_mov_r64_to_r64(asm_x64_t *as, int src_r64, int dest_r64) {
     // use REX prefix for 64 bit operation
     asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_MOV_R64_TO_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
 }
 
-void asm_x64_mov_r64_to_disp(asm_x64_t* as, int src_r64, int dest_r64, int dest_disp) {
+void asm_x64_mov_r64_to_disp(asm_x64_t *as, int src_r64, int dest_r64, int dest_disp) {
     // use REX prefix for 64 bit operation
     asm_x64_write_byte_2(as, REX_PREFIX | REX_W, OPCODE_MOV_R64_TO_RM64);
     asm_x64_write_r64_disp(as, src_r64, dest_r64, dest_disp);
 }
 
-void asm_x64_mov_disp_to_r64(asm_x64_t* as, int src_r64, int src_disp, int dest_r64) {
+void asm_x64_mov_disp_to_r64(asm_x64_t *as, int src_r64, int src_disp, int dest_r64) {
     // use REX prefix for 64 bit operation
     asm_x64_write_byte_2(as, REX_PREFIX | REX_W, OPCODE_MOV_RM64_TO_R64);
     asm_x64_write_r64_disp(as, dest_r64, src_r64, src_disp);
 }
 
-void asm_x64_lea_disp_to_r64(asm_x64_t* as, int src_r64, int src_disp, int dest_r64) {
+void asm_x64_lea_disp_to_r64(asm_x64_t *as, int src_r64, int src_disp, int dest_r64) {
     // use REX prefix for 64 bit operation
     asm_x64_write_byte_2(as, REX_PREFIX | REX_W, OPCODE_LEA_MEM_TO_R64);
     asm_x64_write_r64_disp(as, dest_r64, src_r64, src_disp);
@@ -321,13 +318,13 @@
     asm_x64_write_byte_2(as, OPCODE_MOV_I8_TO_R8 | dest_r64, src_i8);
 }
 
-void asm_x64_mov_i32_to_r64(asm_x64_t* as, int src_i32, int dest_r64) {
+void asm_x64_mov_i32_to_r64(asm_x64_t *as, int src_i32, int dest_r64) {
     // cpu defaults to i32 to r64, with zero extension
     asm_x64_write_byte_1(as, OPCODE_MOV_I64_TO_R64 | dest_r64);
     asm_x64_write_word32(as, src_i32);
 }
 
-void asm_x64_mov_i64_to_r64(asm_x64_t* as, int64_t src_i64, int dest_r64) {
+void asm_x64_mov_i64_to_r64(asm_x64_t *as, int64_t src_i64, int dest_r64) {
     // cpu defaults to i32 to r64
     // to mov i64 to r64 need to use REX prefix
     asm_x64_write_byte_2(as, REX_PREFIX | REX_W, OPCODE_MOV_I64_TO_R64 | dest_r64);
@@ -344,7 +341,7 @@
     }
 }
 
-void asm_x64_mov_i32_to_disp(asm_x64_t* as, int src_i32, int dest_r32, int dest_disp)
+void asm_x64_mov_i32_to_disp(asm_x64_t *as, int src_i32, int dest_r32, int dest_disp)
 {
     assert(0);
     asm_x64_write_byte_1(as, OPCODE_MOV_I32_TO_RM32);
@@ -356,11 +353,11 @@
     asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_XOR_R64_TO_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
 }
 
-void asm_x64_add_r64_to_r64(asm_x64_t* as, int src_r64, int dest_r64) {
+void asm_x64_add_r64_to_r64(asm_x64_t *as, int src_r64, int dest_r64) {
     asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_ADD_R64_TO_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
 }
 
-void asm_x64_add_i32_to_r32(asm_x64_t* as, int src_i32, int dest_r32)
+void asm_x64_add_i32_to_r32(asm_x64_t *as, int src_i32, int dest_r32)
 {
     assert(dest_r32 != REG_RSP); // in this case i think src_i32 must be 64 bits
     if (SIGNED_FIT8(src_i32))
@@ -375,17 +372,17 @@
     }
 }
 
-void asm_x64_sub_r32_from_r32(asm_x64_t* as, int src_r32, int dest_r32) {
+void asm_x64_sub_r32_from_r32(asm_x64_t *as, int src_r32, int dest_r32) {
     // defaults to 32 bit operation
     asm_x64_write_byte_2(as, OPCODE_SUB_R64_FROM_RM64, MODRM_R64(src_r32) | MODRM_RM_REG | MODRM_RM_R64(dest_r32));
 }
 
-void asm_x64_sub_r64_from_r64(asm_x64_t* as, int src_r64, int dest_r64) {
+void asm_x64_sub_r64_from_r64(asm_x64_t *as, int src_r64, int dest_r64) {
     // use REX prefix for 64 bit operation
     asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_SUB_R64_FROM_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
 }
 
-void asm_x64_sub_i32_from_r32(asm_x64_t* as, int src_i32, int dest_r32) {
+void asm_x64_sub_i32_from_r32(asm_x64_t *as, int src_i32, int dest_r32) {
     if (SIGNED_FIT8(src_i32)) {
         // defaults to 32 bit operation
         asm_x64_write_byte_2(as, OPCODE_SUB_I8_FROM_RM64, MODRM_R64(5) | MODRM_RM_REG | MODRM_RM_R64(dest_r32));
@@ -397,7 +394,7 @@
     }
 }
 
-void asm_x64_sub_i32_from_r64(asm_x64_t* as, int src_i32, int dest_r64) {
+void asm_x64_sub_i32_from_r64(asm_x64_t *as, int src_i32, int dest_r64) {
     if (SIGNED_FIT8(src_i32)) {
         // use REX prefix for 64 bit operation
         asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_SUB_I8_FROM_RM64, MODRM_R64(5) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
@@ -410,38 +407,38 @@
 }
 
 /* shifts not tested */
-void asm_x64_shl_r32_by_imm(asm_x64_t* as, int r32, int imm) {
+void asm_x64_shl_r32_by_imm(asm_x64_t *as, int r32, int imm) {
     asm_x64_write_byte_2(as, OPCODE_SHL_RM32_BY_I8, MODRM_R64(4) | MODRM_RM_REG | MODRM_RM_R64(r32));
     asm_x64_write_byte_1(as, imm);
 }
 
-void asm_x64_shr_r32_by_imm(asm_x64_t* as, int r32, int imm) {
+void asm_x64_shr_r32_by_imm(asm_x64_t *as, int r32, int imm) {
     asm_x64_write_byte_2(as, OPCODE_SHR_RM32_BY_I8, MODRM_R64(5) | MODRM_RM_REG | MODRM_RM_R64(r32));
     asm_x64_write_byte_1(as, imm);
 }
 
-void asm_x64_sar_r32_by_imm(asm_x64_t* as, int r32, int imm) {
+void asm_x64_sar_r32_by_imm(asm_x64_t *as, int r32, int imm) {
     asm_x64_write_byte_2(as, OPCODE_SAR_RM32_BY_I8, MODRM_R64(7) | MODRM_RM_REG | MODRM_RM_R64(r32));
     asm_x64_write_byte_1(as, imm);
 }
 
-void asm_x64_cmp_r64_with_r64(asm_x64_t* as, int src_r64_a, int src_r64_b) {
+void asm_x64_cmp_r64_with_r64(asm_x64_t *as, int src_r64_a, int src_r64_b) {
     asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_CMP_R64_WITH_RM64, MODRM_R64(src_r64_a) | MODRM_RM_REG | MODRM_RM_R64(src_r64_b));
 }
 
-void asm_x64_cmp_r32_with_disp(asm_x64_t* as, int src_r32_a, int src_r32_b, int src_disp_b) {
+void asm_x64_cmp_r32_with_disp(asm_x64_t *as, int src_r32_a, int src_r32_b, int src_disp_b) {
     assert(0);
     asm_x64_write_byte_1(as, OPCODE_CMP_R64_WITH_RM64);
     //asm_x64_write_r32_disp(as, src_r32_a, src_r32_b, src_disp_b);
 }
 
-void asm_x64_cmp_disp_with_r32(asm_x64_t* as, int src_r32_a, int src_disp_a, int src_r32_b) {
+void asm_x64_cmp_disp_with_r32(asm_x64_t *as, int src_r32_a, int src_disp_a, int src_r32_b) {
     assert(0);
     asm_x64_write_byte_1(as, OPCODE_CMP_RM32_WITH_R32);
     //asm_x64_write_r32_disp(as, src_r32_b, src_r32_a, src_disp_a);
 }
 
-void asm_x64_cmp_i32_with_r32(asm_x64_t* as, int src_i32, int src_r32) {
+void asm_x64_cmp_i32_with_r32(asm_x64_t *as, int src_i32, int src_r32) {
     if (SIGNED_FIT8(src_i32)) {
         asm_x64_write_byte_2(as, OPCODE_CMP_I8_WITH_RM32, MODRM_R64(7) | MODRM_RM_REG | MODRM_RM_R64(src_r32));
         asm_x64_write_byte_1(as, src_i32 & 0xff);
@@ -451,18 +448,18 @@
     }
 }
 
-void asm_x64_test_r8_with_r8(asm_x64_t* as, int src_r64_a, int src_r64_b) {
+void asm_x64_test_r8_with_r8(asm_x64_t *as, int src_r64_a, int src_r64_b) {
     // TODO implement for other registers
     assert(src_r64_a == REG_RAX);
     assert(src_r64_b == REG_RAX);
     asm_x64_write_byte_2(as, OPCODE_TEST_R8_WITH_RM8, MODRM_R64(src_r64_a) | MODRM_RM_REG | MODRM_RM_R64(src_r64_b));
 }
 
-void asm_x64_setcc_r8(asm_x64_t* as, int jcc_type, int dest_r8) {
+void asm_x64_setcc_r8(asm_x64_t *as, int jcc_type, int dest_r8) {
     asm_x64_write_byte_3(as, OPCODE_SETCC_RM8_A, OPCODE_SETCC_RM8_B | jcc_type, MODRM_R64(0) | MODRM_RM_REG | MODRM_RM_R64(dest_r8));
 }
 
-void asm_x64_label_assign(asm_x64_t* as, int label) {
+void asm_x64_label_assign(asm_x64_t *as, int label) {
     assert(label < as->max_num_labels);
     if (as->pass == ASM_X64_PASS_2) {
         // assign label offset
@@ -524,7 +521,7 @@
     }
 }
 
-void asm_x64_entry(asm_x64_t* as, int num_locals) {
+void asm_x64_entry(asm_x64_t *as, int num_locals) {
     asm_x64_push_r64(as, REG_RBP);
     asm_x64_mov_r64_to_r64(as, REG_RSP, REG_RBP);
     if (num_locals < 0) {
@@ -533,44 +530,55 @@
     num_locals |= 1; // make it odd so stack is aligned on 16 byte boundary
     asm_x64_sub_i32_from_r64(as, num_locals * WORD_SIZE, REG_RSP);
     asm_x64_push_r64(as, REG_RBX);
+    as->num_locals = num_locals;
 }
 
-void asm_x64_exit(asm_x64_t* as) {
+void asm_x64_exit(asm_x64_t *as) {
     asm_x64_pop_r64(as, REG_RBX);
     asm_x64_write_byte_1(as, OPCODE_LEAVE);
     asm_x64_ret(as);
 }
 
-void asm_x64_push_arg(asm_x64_t* as, int src_arg_num) {
+void asm_x64_push_arg(asm_x64_t *as, int src_arg_num) {
     assert(0);
     asm_x64_push_disp(as, REG_RBP, 8 + src_arg_num * WORD_SIZE);
 }
 
-void asm_x64_mov_arg_to_r32(asm_x64_t* as, int src_arg_num, int dest_r32) {
+void asm_x64_mov_arg_to_r32(asm_x64_t *as, int src_arg_num, int dest_r32) {
     assert(0);
     //asm_x64_mov_disp_to_r32(as, REG_RBP, 8 + src_arg_num * WORD_SIZE, dest_r32);
 }
 
-void asm_x64_mov_r32_to_arg(asm_x64_t* as, int src_r32, int dest_arg_num) {
+void asm_x64_mov_r32_to_arg(asm_x64_t *as, int src_r32, int dest_arg_num) {
     assert(0);
     //asm_x64_mov_r32_to_disp(as, src_r32, REG_RBP, 8 + dest_arg_num * WORD_SIZE);
 }
 
-static int asm_x64_local_offset_from_ebp(int local_num)
-{
-    return -(local_num + 1) * WORD_SIZE;
+// locals:
+//  - stored on the stack in ascending order
+//  - numbered 0 through as->num_locals-1
+//  - RBP points above the last local
+//
+//                          | RBP
+//                          v
+//  l0  l1  l2  ...  l(n-1)
+//  ^                ^
+//  | low address    | high address in RAM
+//
+static int asm_x64_local_offset_from_ebp(asm_x64_t *as, int local_num) {
+    return (-as->num_locals + local_num) * WORD_SIZE;
 }
 
-void asm_x64_mov_local_to_r64(asm_x64_t* as, int src_local_num, int dest_r64) {
-    asm_x64_mov_disp_to_r64(as, REG_RBP, asm_x64_local_offset_from_ebp(src_local_num), dest_r64);
+void asm_x64_mov_local_to_r64(asm_x64_t *as, int src_local_num, int dest_r64) {
+    asm_x64_mov_disp_to_r64(as, REG_RBP, asm_x64_local_offset_from_ebp(as, src_local_num), dest_r64);
 }
 
-void asm_x64_mov_r64_to_local(asm_x64_t* as, int src_r64, int dest_local_num) {
-    asm_x64_mov_r64_to_disp(as, src_r64, REG_RBP, asm_x64_local_offset_from_ebp(dest_local_num));
+void asm_x64_mov_r64_to_local(asm_x64_t *as, int src_r64, int dest_local_num) {
+    asm_x64_mov_r64_to_disp(as, src_r64, REG_RBP, asm_x64_local_offset_from_ebp(as, dest_local_num));
 }
 
-void asm_x64_mov_local_addr_to_r64(asm_x64_t* as, int local_num, int dest_r64) {
-    int offset = asm_x64_local_offset_from_ebp(local_num);
+void asm_x64_mov_local_addr_to_r64(asm_x64_t *as, int local_num, int dest_r64) {
+    int offset = asm_x64_local_offset_from_ebp(as, local_num);
     if (offset == 0) {
         asm_x64_mov_r64_to_r64(as, REG_RBP, dest_r64);
     } else {
@@ -578,21 +586,21 @@
     }
 }
 
-void asm_x64_push_local(asm_x64_t* as, int local_num) {
-    asm_x64_push_disp(as, REG_RBP, asm_x64_local_offset_from_ebp(local_num));
+void asm_x64_push_local(asm_x64_t *as, int local_num) {
+    asm_x64_push_disp(as, REG_RBP, asm_x64_local_offset_from_ebp(as, local_num));
 }
 
-void asm_x64_push_local_addr(asm_x64_t* as, int local_num, int temp_r64)
+void asm_x64_push_local_addr(asm_x64_t *as, int local_num, int temp_r64)
 {
     asm_x64_mov_r64_to_r64(as, REG_RBP, temp_r64);
-    asm_x64_add_i32_to_r32(as, asm_x64_local_offset_from_ebp(local_num), temp_r64);
+    asm_x64_add_i32_to_r32(as, asm_x64_local_offset_from_ebp(as, local_num), temp_r64);
     asm_x64_push_r64(as, temp_r64);
 }
 
 /*
    can't use these because code might be relocated when resized
 
-void asm_x64_call(asm_x64_t* as, void* func)
+void asm_x64_call(asm_x64_t *as, void* func)
 {
     asm_x64_sub_i32_from_r32(as, 8, REG_RSP);
     asm_x64_write_byte_1(as, OPCODE_CALL_REL32);
@@ -600,7 +608,7 @@
     asm_x64_mov_r64_to_r64(as, REG_RBP, REG_RSP);
 }
 
-void asm_x64_call_i1(asm_x64_t* as, void* func, int i1)
+void asm_x64_call_i1(asm_x64_t *as, void* func, int i1)
 {
     asm_x64_sub_i32_from_r32(as, 8, REG_RSP);
     asm_x64_sub_i32_from_r32(as, 12, REG_RSP);
@@ -612,7 +620,7 @@
 }
 */
 
-void asm_x64_call_ind(asm_x64_t* as, void *ptr, int temp_r64) {
+void asm_x64_call_ind(asm_x64_t *as, void *ptr, int temp_r64) {
 #ifdef __LP64__
     asm_x64_mov_i64_to_r64_optimised(as, (int64_t)ptr, temp_r64);
 #else