Add support for DC ZVA (#127)

The DC ZVA instruction zeroes a block of memory at a given virtual
address. The block size is implementation defined, typically 64 bytes.
Add support for the instruction in the assembler, disassembler and
simulator, with tests.
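
For reference, software discovers the block size by reading DCZID_EL0:
BS (bits [3:0]) holds log2 of the block size in four-byte words, and
DZP (bit 4), when set, prohibits DC ZVA. A minimal sketch of that
decoding, mirroring what the simulator does below (the helper name is
illustrative, not part of this patch):

    #include <cstdint>

    // Returns the DC ZVA block size in bytes, or 0 if DC ZVA is
    // prohibited (DZP, bit 4, is set). BS (bits [3:0]) is log2 of the
    // block size in four-byte words, so BS = 4 => 64 bytes.
    uint64_t DcZvaBlockSizeInBytes(uint64_t dczid) {
      if ((dczid & 0x10) != 0) return 0;
      return (UINT64_C(1) << (dczid & 0xf)) * 4;
    }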
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index 3ab0faa..8e7cee5 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -7177,6 +7177,7 @@
       return CPUHas(CPUFeatures::kRNG);
     case FPCR:
     case NZCV:
+    case DCZID_EL0:
       break;
   }
   return true;
diff --git a/src/aarch64/constants-aarch64.h b/src/aarch64/constants-aarch64.h
index 0846952..279587c 100644
--- a/src/aarch64/constants-aarch64.h
+++ b/src/aarch64/constants-aarch64.h
@@ -501,7 +501,8 @@
   NZCV = SystemRegisterEncoder<3, 3, 4, 2, 0>::value,
   FPCR = SystemRegisterEncoder<3, 3, 4, 4, 0>::value,
   RNDR = SystemRegisterEncoder<3, 3, 2, 4, 0>::value,    // Random number.
-  RNDRRS = SystemRegisterEncoder<3, 3, 2, 4, 1>::value   // Reseeded random number.
+  RNDRRS = SystemRegisterEncoder<3, 3, 2, 4, 1>::value,  // Reseeded random number.
+  DCZID_EL0 = SystemRegisterEncoder<3, 3, 0, 0, 7>::value  // Data cache zero ID.
 };
 
 template<int op1, int crn, int crm, int op2>
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index 5bcee74..930dfd6 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -7016,6 +7016,9 @@
         case RNDRRS:
           AppendToOutput("rndrrs");
           break;
+        case DCZID_EL0:
+          AppendToOutput("dczid_el0");
+          break;
         default:
           AppendToOutput("S%d_%d_c%d_c%d_%d",
                          instr->GetSysOp0(),
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index 2130c46..0c842ab 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -6926,7 +6926,7 @@
 }
 
 
-void Simulator::SysOp_W(int op, int64_t val) {
+bool Simulator::SysOp_W(int op, int64_t val) {
   switch (op) {
     case IVAU:
     case CVAC:
@@ -6948,12 +6948,27 @@
       volatile uint8_t y = *MemRead<uint8_t>(val);
       MetaDataDepot::MetaDataMTE::SetActive(mte_enabled);
       USE(y);
-      // TODO: Implement ZVA, GVA, GZVA.
       break;
     }
+    case ZVA: {
+      if ((dczid_ & 0x10) != 0) {  // DZP set: DC ZVA is prohibited.
+        return false;
+      }
+      // BS (bits [3:0]) is log2 of the block size in words.
+      int blocksize = (1 << (dczid_ & 0xf)) * kWRegSizeInBytes;
+      VIXL_ASSERT(IsMultiple(blocksize, sizeof(uint64_t)));
+      uintptr_t addr = AlignDown(val, blocksize);
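+      // Zero the whole block containing the (possibly unaligned) input
+      // address, one 64-bit word at a time, tracing each store.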
+      for (int i = 0; i < blocksize; i += sizeof(uint64_t)) {
+        MemWrite<uint64_t>(addr + i, 0);
+        LogWriteU64(0, addr + i);
+      }
+      break;
+    }
+    // TODO: Implement GVA, GZVA.
     default:
       VIXL_UNIMPLEMENTED();
+      return false;
   }
+  return true;
 }
 
 void Simulator::PACHelper(int dst,
@@ -7036,6 +7051,9 @@
           LogSystemRegister(NZCV);
           break;
         }
+        case DCZID_EL0:
+          WriteXRegister(instr->GetRt(), dczid_);
+          break;
         default:
           VIXL_UNIMPLEMENTED();
       }
@@ -7153,7 +7171,9 @@
       } else if (sysop == GCSPUSHM) {
         GCSPush(ReadXRegister(instr->GetRt()));
       } else {
-        SysOp_W(sysop, rt);
+        if (!SysOp_W(sysop, rt)) {
+          VisitUnallocated(instr);
+        }
       }
       break;
     }
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index cbb1c4c..5d6d975 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -2575,6 +2575,14 @@
   void PrintPWrite(int rt_code, uintptr_t address) {
     PrintPAccess(rt_code, "->", address);
   }
+  void PrintWriteU64(uint64_t x, uintptr_t address) {
+    fprintf(stream_,
+            "#      0x%016lx -> %s0x%016" PRIxPTR "%s\n",
+            x,
+            clr_memory_address,
+            address,
+            clr_normal);
+  }
 
   // Like Print* (above), but respect GetTraceParameters().
   void LogRead(int rt_code, PrintRegisterFormat format, uintptr_t address) {
@@ -2609,6 +2617,9 @@
   void LogPWrite(int rt_code, uintptr_t address) {
     if (ShouldTraceWrites()) PrintPWrite(rt_code, address);
   }
+  void LogWriteU64(uint64_t x, uintptr_t address) {
+    if (ShouldTraceWrites()) PrintWriteU64(x, address);
+  }
   void LogMemTransfer(uintptr_t dst, uintptr_t src, uint8_t value) {
     if (ShouldTraceWrites()) PrintMemTransfer(dst, src, value);
   }
@@ -5006,7 +5017,7 @@
   uint32_t Crc32Checksum(uint32_t acc, T val, uint32_t poly);
   uint32_t Crc32Checksum(uint32_t acc, uint64_t val, uint32_t poly);
 
-  void SysOp_W(int op, int64_t val);
+  bool SysOp_W(int op, int64_t val);
 
   template <typename T>
   T FPRecipSqrtEstimate(T op);
@@ -5456,6 +5467,9 @@
   // A configurable size of SVE vector registers.
   unsigned vector_length_;
 
+  // DCZID_EL0 value: DZP (bit 4) clear, so DC ZVA is permitted; BS = 4.
+  unsigned dczid_ = (0 << 4) | 4;  // 2^4 words of 4 bytes => 64-byte blocks.
+
   // Representation of memory attributes such as MTE tagging and BTI page
   // protection in addition to branch interceptions.
   MetaDataDepot meta_data_;
diff --git a/test/aarch64/test-assembler-aarch64.cc b/test/aarch64/test-assembler-aarch64.cc
index 9da3f12..a86b32e 100644
--- a/test/aarch64/test-assembler-aarch64.cc
+++ b/test/aarch64/test-assembler-aarch64.cc
@@ -15321,6 +15321,61 @@
 }
 #endif  // VIXL_NEGATIVE_TESTING
 
+TEST(dc_zva) {
+  SETUP_WITH_FEATURES(CPUFeatures::kNEON);
+
+  const int zva_blocksize = 64;  // Assumed block size.
+  uint8_t buf[2 * zva_blocksize];
+  uintptr_t buf_addr = reinterpret_cast<uintptr_t>(buf);
+  uintptr_t aligned_addr = AlignUp(buf_addr, zva_blocksize);
+
+  START();
+  // Skip this test if the ZVA block size is not 64 bytes.
+  // Set up initial register values to allow the test to pass when skipped.
+  Label skip;
+  __ Movi(q0.V16B(), 0);
+  __ Movi(q1.V16B(), 0);
+  __ Movi(q2.V16B(), 0);
+  __ Movi(q3.V16B(), 0);
+
+  __ Mrs(x1, DCZID_EL0);
+  __ Cmp(x1, 4);  // DZP = 0, BS = 4: DC ZVA permitted, 64-byte blocks.
+  __ B(ne, &skip);
+
+  // Fill aligned region with a pattern.
+  __ Mov(x0, aligned_addr);
+  __ Movi(q0.V16B(), 0x55);
+  __ Movi(q1.V16B(), 0xaa);
+  __ Movi(q2.V16B(), 0x55);
+  __ Movi(q3.V16B(), 0xaa);
+  __ St4(q0.V16B(), q1.V16B(), q2.V16B(), q3.V16B(), MemOperand(x0));
+
+  // Misalign the address to check that DC ZVA zeroes the whole enclosing block.
+  __ Add(x0, x0, 42);
+
+  // Clear the aligned region.
+  __ Dc(ZVA, x0);
+
+  // Reload the aligned region to check contents.
+  __ Mov(x0, aligned_addr);
+  __ Ld1(q0.V16B(), q1.V16B(), q2.V16B(), q3.V16B(), MemOperand(x0));
+
+  __ Bind(&skip);
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    if (core.xreg(1) == 4) {
+      ASSERT_EQUAL_128(0, 0, q0);
+      ASSERT_EQUAL_128(0, 0, q1);
+      ASSERT_EQUAL_128(0, 0, q2);
+      ASSERT_EQUAL_128(0, 0, q3);
+    } else {
+      printf("SKIPPED: DC ZVA chunksize not 64-bytes");
+    }
+  }
+}
+
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
 // Test the pseudo-instructions that control CPUFeatures dynamically in the
 // Simulator. These are used by the test infrastructure itself, but in a fairly
diff --git a/test/aarch64/test-disasm-aarch64.cc b/test/aarch64/test-disasm-aarch64.cc
index 14a354b..40abef1 100644
--- a/test/aarch64/test-disasm-aarch64.cc
+++ b/test/aarch64/test-disasm-aarch64.cc
@@ -2611,6 +2611,7 @@
   COMPARE(mrs(x15, FPCR), "mrs x15, fpcr");
   COMPARE(mrs(x20, RNDR), "mrs x20, rndr");
   COMPARE(mrs(x5, RNDRRS), "mrs x5, rndrrs");
+  COMPARE(mrs(x9, DCZID_EL0), "mrs x9, dczid_el0");
 
   // Test mrs that use system registers we haven't named.
   COMPARE(dci(MRS | (0x5555 << 5)), "mrs x0, S3_2_c10_c10_5");