py: Implement more binary ops for viper emitter.

This included a bit of restructuring of the assembler backends: the x64
register-register operations now share a single generic encoder, and several
functions were renamed and switched to (dest, src) argument order.  Note
that the ARM backend is missing a few functions and won't compile.
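
(Encoding note for reviewers: the new asm_x64_generic_r64_r64() helper in this
patch emits the common REX.W + opcode + ModRM byte pattern for 64-bit
register/register instructions.  It also covers the D3 /4 (SHL) and D3 /7
(SAR) shift-by-CL forms by passing the opcode extension digit in the "source
register" slot of ModRM.  The standalone program below is only an illustrative
sketch of that encoding, not part of the patch; the constants mirror those in
py/asmx64.c.)

    #include <stdio.h>
    #include <stdint.h>

    #define REX_PREFIX 0x40
    #define REX_W      0x08  /* 64-bit operand size */
    #define REX_R      0x04  /* extends the ModRM.reg field (src) */
    #define REX_B      0x01  /* extends the ModRM.rm field (dest) */

    /* Same byte layout as asm_x64_generic_r64_r64(): REX.W prefix, opcode,
     * then ModRM with mod=11 (register direct), reg=src, rm=dest. */
    static void emit_r64_r64(uint8_t buf[3], int dest_r64, int src_r64, uint8_t op) {
        buf[0] = REX_PREFIX | REX_W
               | (src_r64  < 8 ? 0 : REX_R)
               | (dest_r64 < 8 ? 0 : REX_B);
        buf[1] = op;
        buf[2] = 0xc0 | ((src_r64 & 7) << 3) | (dest_r64 & 7);
    }

    int main(void) {
        uint8_t b[3];
        emit_r64_r64(b, 5, 4, 0x89);   /* mov rbp, rsp       -> 48 89 e5 */
        printf("%02x %02x %02x\n", b[0], b[1], b[2]);
        emit_r64_r64(b, 8, 9, 0x01);   /* add r8, r9         -> 4d 01 c8 */
        printf("%02x %02x %02x\n", b[0], b[1], b[2]);
        emit_r64_r64(b, 0, 4, 0xd3);   /* shl rax, cl (/4)   -> 48 d3 e0 */
        printf("%02x %02x %02x\n", b[0], b[1], b[2]);
        return 0;
    }
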
diff --git a/py/asmx64.c b/py/asmx64.c
index 8d074dc..3f11178 100644
--- a/py/asmx64.c
+++ b/py/asmx64.c
@@ -54,19 +54,21 @@
 #define OPCODE_MOV_RM64_TO_R64   (0x8b)
 #define OPCODE_LEA_MEM_TO_R64    (0x8d) /* /r */
 #define OPCODE_XOR_R64_TO_RM64   (0x31) /* /r */
-#define OPCODE_ADD_R64_TO_RM64   (0x01)
+#define OPCODE_ADD_R64_TO_RM64   (0x01) /* /r */
 #define OPCODE_ADD_I32_TO_RM32   (0x81) /* /0 */
 #define OPCODE_ADD_I8_TO_RM32    (0x83) /* /0 */
 #define OPCODE_SUB_R64_FROM_RM64 (0x29)
 #define OPCODE_SUB_I32_FROM_RM64 (0x81) /* /5 */
 #define OPCODE_SUB_I8_FROM_RM64  (0x83) /* /5 */
-#define OPCODE_SHL_RM32_BY_I8    (0xc1) /* /4 */
-#define OPCODE_SHR_RM32_BY_I8    (0xc1) /* /5 */
-#define OPCODE_SAR_RM32_BY_I8    (0xc1) /* /7 */
-#define OPCODE_CMP_I32_WITH_RM32 (0x81) /* /7 */
-#define OPCODE_CMP_I8_WITH_RM32  (0x83) /* /7 */
-#define OPCODE_CMP_R64_WITH_RM64 (0x39)
-#define OPCODE_CMP_RM32_WITH_R32 (0x3b)
+//#define OPCODE_SHL_RM32_BY_I8    (0xc1) /* /4 */
+//#define OPCODE_SHR_RM32_BY_I8    (0xc1) /* /5 */
+//#define OPCODE_SAR_RM32_BY_I8    (0xc1) /* /7 */
+#define OPCODE_SHL_RM64_CL       (0xd3) /* /4 */
+#define OPCODE_SAR_RM64_CL       (0xd3) /* /7 */
+//#define OPCODE_CMP_I32_WITH_RM32 (0x81) /* /7 */
+//#define OPCODE_CMP_I8_WITH_RM32  (0x83) /* /7 */
+#define OPCODE_CMP_R64_WITH_RM64 (0x39) /* /r */
+//#define OPCODE_CMP_RM32_WITH_R32 (0x3b)
 #define OPCODE_TEST_R8_WITH_RM8  (0x84) /* /r */
 #define OPCODE_JMP_REL8          (0xeb)
 #define OPCODE_JMP_REL32         (0xe9)
@@ -253,6 +255,10 @@
     }
 }
 
+STATIC void asm_x64_generic_r64_r64(asm_x64_t *as, int dest_r64, int src_r64, int op) {
+    asm_x64_write_byte_3(as, REX_PREFIX | REX_W | (src_r64 < 8 ? 0 : REX_R) | (dest_r64 < 8 ? 0 : REX_B), op, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
+}
+
 void asm_x64_nop(asm_x64_t *as) {
     asm_x64_write_byte_1(as, OPCODE_NOP);
 }
@@ -290,9 +296,8 @@
     asm_x64_write_byte_1(as, OPCODE_RET);
 }
 
-void asm_x64_mov_r64_to_r64(asm_x64_t *as, int src_r64, int dest_r64) {
-    // use REX prefix for 64 bit operation
-    asm_x64_write_byte_3(as, REX_PREFIX | REX_W | (src_r64 < 8 ? 0 : REX_R) | (dest_r64 < 8 ? 0 : REX_B), OPCODE_MOV_R64_TO_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
+void asm_x64_mov_r64_r64(asm_x64_t *as, int dest_r64, int src_r64) {
+    asm_x64_generic_r64_r64(as, dest_r64, src_r64, OPCODE_MOV_R64_TO_RM64);
 }
 
 void asm_x64_mov_r8_to_disp(asm_x64_t *as, int src_r64, int dest_r64, int dest_disp) {
@@ -377,30 +382,24 @@
     asm_x64_mov_i64_to_r64(as, src_i64, dest_r64);
 }
 
-void asm_x64_xor_r64_to_r64(asm_x64_t *as, int src_r64, int dest_r64) {
-    assert(src_r64 < 8);
-    assert(dest_r64 < 8);
-    asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_XOR_R64_TO_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
+void asm_x64_xor_r64_r64(asm_x64_t *as, int dest_r64, int src_r64) {
+    asm_x64_generic_r64_r64(as, dest_r64, src_r64, OPCODE_XOR_R64_TO_RM64);
 }
 
-void asm_x64_add_r64_to_r64(asm_x64_t *as, int src_r64, int dest_r64) {
-    assert(src_r64 < 8);
-    assert(dest_r64 < 8);
-    asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_ADD_R64_TO_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
+void asm_x64_shl_r64_cl(asm_x64_t *as, int dest_r64) {
+    asm_x64_generic_r64_r64(as, dest_r64, 4, OPCODE_SHL_RM64_CL);
 }
 
-/*
-void asm_x64_sub_r32_from_r32(asm_x64_t *as, int src_r32, int dest_r32) {
-    // defaults to 32 bit operation
-    asm_x64_write_byte_2(as, OPCODE_SUB_R64_FROM_RM64, MODRM_R64(src_r32) | MODRM_RM_REG | MODRM_RM_R64(dest_r32));
+void asm_x64_sar_r64_cl(asm_x64_t *as, int dest_r64) {
+    asm_x64_generic_r64_r64(as, dest_r64, 7, OPCODE_SAR_RM64_CL);
 }
-*/
 
-void asm_x64_sub_r64_from_r64(asm_x64_t *as, int src_r64, int dest_r64) {
-    // use REX prefix for 64 bit operation
-    assert(src_r64 < 8);
-    assert(dest_r64 < 8);
-    asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_SUB_R64_FROM_RM64, MODRM_R64(src_r64) | MODRM_RM_REG | MODRM_RM_R64(dest_r64));
+void asm_x64_add_r64_r64(asm_x64_t *as, int dest_r64, int src_r64) {
+    asm_x64_generic_r64_r64(as, dest_r64, src_r64, OPCODE_ADD_R64_TO_RM64);
+}
+
+void asm_x64_sub_r64_r64(asm_x64_t *as, int dest_r64, int src_r64) {
+    asm_x64_generic_r64_r64(as, dest_r64, src_r64, OPCODE_SUB_R64_FROM_RM64);
 }
 
 /*
@@ -417,7 +416,7 @@
 }
 */
 
-void asm_x64_sub_i32_from_r64(asm_x64_t *as, int src_i32, int dest_r64) {
+STATIC void asm_x64_sub_r64_i32(asm_x64_t *as, int dest_r64, int src_i32) {
     assert(dest_r64 < 8);
     if (SIGNED_FIT8(src_i32)) {
         // use REX prefix for 64 bit operation
@@ -448,9 +447,7 @@
 */
 
 void asm_x64_cmp_r64_with_r64(asm_x64_t *as, int src_r64_a, int src_r64_b) {
-    assert(src_r64_a < 8);
-    assert(src_r64_b < 8);
-    asm_x64_write_byte_3(as, REX_PREFIX | REX_W, OPCODE_CMP_R64_WITH_RM64, MODRM_R64(src_r64_a) | MODRM_RM_REG | MODRM_RM_R64(src_r64_b));
+    asm_x64_generic_r64_r64(as, src_r64_b, src_r64_a, OPCODE_CMP_R64_WITH_RM64);
 }
 
 /*
@@ -541,12 +538,12 @@
 
 void asm_x64_entry(asm_x64_t *as, int num_locals) {
     asm_x64_push_r64(as, ASM_X64_REG_RBP);
-    asm_x64_mov_r64_to_r64(as, ASM_X64_REG_RSP, ASM_X64_REG_RBP);
+    asm_x64_mov_r64_r64(as, ASM_X64_REG_RBP, ASM_X64_REG_RSP);
     if (num_locals < 0) {
         num_locals = 0;
     }
     num_locals |= 1; // make it odd so stack is aligned on 16 byte boundary
-    asm_x64_sub_i32_from_r64(as, num_locals * WORD_SIZE, ASM_X64_REG_RSP);
+    asm_x64_sub_r64_i32(as, ASM_X64_REG_RSP, num_locals * WORD_SIZE);
     asm_x64_push_r64(as, ASM_X64_REG_RBX);
     asm_x64_push_r64(as, ASM_X64_REG_R12);
     asm_x64_push_r64(as, ASM_X64_REG_R13);
@@ -587,7 +584,7 @@
 void asm_x64_mov_local_addr_to_r64(asm_x64_t *as, int local_num, int dest_r64) {
     int offset = asm_x64_local_offset_from_ebp(as, local_num);
     if (offset == 0) {
-        asm_x64_mov_r64_to_r64(as, ASM_X64_REG_RBP, dest_r64);
+        asm_x64_mov_r64_r64(as, dest_r64, ASM_X64_REG_RBP);
     } else {
         asm_x64_lea_disp_to_r64(as, ASM_X64_REG_RBP, offset, dest_r64);
     }
@@ -600,7 +597,7 @@
 
 void asm_x64_push_local_addr(asm_x64_t *as, int local_num, int temp_r64)
 {
-    asm_x64_mov_r64_to_r64(as, ASM_X64_REG_RBP, temp_r64);
+    asm_x64_mov_r64_r64(as, temp_r64, ASM_X64_REG_RBP);
     asm_x64_add_i32_to_r32(as, asm_x64_local_offset_from_ebp(as, local_num), temp_r64);
     asm_x64_push_r64(as, temp_r64);
 }
@@ -614,7 +611,7 @@
     asm_x64_sub_i32_from_r32(as, 8, ASM_X64_REG_RSP);
     asm_x64_write_byte_1(as, OPCODE_CALL_REL32);
     asm_x64_write_word32(as, func - (void*)(as->code_cur + 4));
-    asm_x64_mov_r64_to_r64(as, ASM_X64_REG_RBP, ASM_X64_REG_RSP);
+    asm_x64_mov_r64_r64(as, ASM_X64_REG_RSP, ASM_X64_REG_RBP);
 }
 
 void asm_x64_call_i1(asm_x64_t *as, void* func, int i1)
@@ -625,7 +622,7 @@
     asm_x64_write_byte_1(as, OPCODE_CALL_REL32);
     asm_x64_write_word32(as, func - (void*)(as->code_cur + 4));
     asm_x64_add_i32_to_r32(as, 16, ASM_X64_REG_RSP);
-    asm_x64_mov_r64_to_r64(as, ASM_X64_REG_RBP, ASM_X64_REG_RSP);
+    asm_x64_mov_r64_r64(as, ASM_X64_REG_RSP, ASM_X64_REG_RBP);
 }
 */
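
(Usage note, assumed rather than taken from this hunk: the D3 /4 and D3 /7
forms shift by the count held in CL, so before calling asm_x64_shl_r64_cl() or
asm_x64_sar_r64_cl() the shift count has to be moved into RCX.  A minimal
sketch follows, assuming the register constants ASM_X64_REG_RAX/RCX/RDX are
defined in py/asmx64.h alongside the RBP/RSP ones used above.)

    #include "asmx64.h"

    /* Hypothetical example, not part of the patch: emit "RAX = RAX << RDX"
     * using the new shift primitives. */
    static void example_emit_shl(asm_x64_t *as) {
        asm_x64_mov_r64_r64(as, ASM_X64_REG_RCX, ASM_X64_REG_RDX); /* count -> RCX (CL) */
        asm_x64_shl_r64_cl(as, ASM_X64_REG_RAX);                   /* RAX <<= CL */
    }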