Merge branch 'main' into mte
diff --git a/src/aarch64/cpu-aarch64.cc b/src/aarch64/cpu-aarch64.cc
index bb3f4de..1700ec5 100644
--- a/src/aarch64/cpu-aarch64.cc
+++ b/src/aarch64/cpu-aarch64.cc
@@ -48,6 +48,7 @@
 const IDRegister::Field AA64PFR1::kBT(0);
 const IDRegister::Field AA64PFR1::kSSBS(4);
 const IDRegister::Field AA64PFR1::kMTE(8);
+const IDRegister::Field AA64PFR1::kSME(24);
 
 const IDRegister::Field AA64ISAR0::kAES(4);
 const IDRegister::Field AA64ISAR0::kSHA1(8);
@@ -78,6 +79,7 @@
 const IDRegister::Field AA64ISAR1::kDGH(48);
 const IDRegister::Field AA64ISAR1::kI8MM(52);
 
+const IDRegister::Field AA64ISAR2::kWFXT(0);
 const IDRegister::Field AA64ISAR2::kRPRES(4);
 
 const IDRegister::Field AA64MMFR0::kECV(60);
@@ -97,6 +99,14 @@
 const IDRegister::Field AA64ZFR0::kF32MM(52);
 const IDRegister::Field AA64ZFR0::kF64MM(56);
 
+const IDRegister::Field AA64SMFR0::kSMEf32f32(32, 1);
+const IDRegister::Field AA64SMFR0::kSMEb16f32(34, 1);
+const IDRegister::Field AA64SMFR0::kSMEf16f32(35, 1);
+const IDRegister::Field AA64SMFR0::kSMEi8i32(36);
+const IDRegister::Field AA64SMFR0::kSMEf64f64(48, 1);
+const IDRegister::Field AA64SMFR0::kSMEi16i64(52);
+const IDRegister::Field AA64SMFR0::kSMEfa64(63, 1);
+
 CPUFeatures AA64PFR0::GetCPUFeatures() const {
   CPUFeatures f;
   if (Get(kFP) >= 0) f.Combine(CPUFeatures::kFP);
@@ -119,6 +129,8 @@
   if (Get(kSSBS) >= 2) f.Combine(CPUFeatures::kSSBSControl);
   if (Get(kMTE) >= 1) f.Combine(CPUFeatures::kMTEInstructions);
   if (Get(kMTE) >= 2) f.Combine(CPUFeatures::kMTE);
+  if (Get(kMTE) >= 3) f.Combine(CPUFeatures::kMTE3);
+  if (Get(kSME) >= 1) f.Combine(CPUFeatures::kSME);
   return f;
 }
 
@@ -155,6 +167,7 @@
   if (Get(kSB) >= 1) f.Combine(CPUFeatures::kSB);
   if (Get(kSPECRES) >= 1) f.Combine(CPUFeatures::kSPECRES);
   if (Get(kBF16) >= 1) f.Combine(CPUFeatures::kBF16);
+  if (Get(kBF16) >= 2) f.Combine(CPUFeatures::kEBF16);
   if (Get(kDGH) >= 1) f.Combine(CPUFeatures::kDGH);
   if (Get(kI8MM) >= 1) f.Combine(CPUFeatures::kI8MM);
 
@@ -180,6 +193,7 @@
 
 CPUFeatures AA64ISAR2::GetCPUFeatures() const {
   CPUFeatures f;
+  if (Get(kWFXT) >= 2) f.Combine(CPUFeatures::kWFXT);
   if (Get(kRPRES) >= 1) f.Combine(CPUFeatures::kRPRES);
   return f;
 }
@@ -220,6 +234,18 @@
   return f;
 }
 
+CPUFeatures AA64SMFR0::GetCPUFeatures() const {
+  CPUFeatures f;
+  if (Get(kSMEf32f32) >= 1) f.Combine(CPUFeatures::kSMEf32f32);
+  if (Get(kSMEb16f32) >= 1) f.Combine(CPUFeatures::kSMEb16f32);
+  if (Get(kSMEf16f32) >= 1) f.Combine(CPUFeatures::kSMEf16f32);
+  if (Get(kSMEi8i32) >= 15) f.Combine(CPUFeatures::kSMEi8i32);
+  if (Get(kSMEf64f64) >= 1) f.Combine(CPUFeatures::kSMEf64f64);
+  if (Get(kSMEi16i64) >= 15) f.Combine(CPUFeatures::kSMEi16i64);
+  if (Get(kSMEfa64) >= 1) f.Combine(CPUFeatures::kSMEfa64);
+  return f;
+}
+
 int IDRegister::Get(IDRegister::Field field) const {
   int msb = field.GetMsb();
   int lsb = field.GetLsb();
@@ -252,7 +278,7 @@
   // Map each set bit onto a feature. Ideally, we'd use HWCAP_* macros rather
   // than explicit bits, but explicit bits allow us to identify features that
   // the toolchain doesn't know about.
-  static const CPUFeatures::Feature kFeatureBits[] =
+  static const CPUFeatures::Feature kFeatureBitsLow[] =
       {// Bits 0-7
        CPUFeatures::kFP,
        CPUFeatures::kNEON,
@@ -288,8 +314,11 @@
        CPUFeatures::kSSBSControl,
        CPUFeatures::kSB,
        CPUFeatures::kPAuth,
-       CPUFeatures::kPAuthGeneric,
-       // Bits 32-39
+       CPUFeatures::kPAuthGeneric};
+  VIXL_STATIC_ASSERT(ArrayLength(kFeatureBitsLow) < 64);
+
+  static const CPUFeatures::Feature kFeatureBitsHigh[] =
+      {// Bits 0-7
        CPUFeatures::kDCCVADP,
        CPUFeatures::kSVE2,
        CPUFeatures::kSVEAES,
@@ -298,7 +327,7 @@
        CPUFeatures::kSVESHA3,
        CPUFeatures::kSVESM4,
        CPUFeatures::kAXFlag,
-       // Bits 40-47
+       // Bits 8-15
        CPUFeatures::kFrintToFixedSizedInt,
        CPUFeatures::kSVEI8MM,
        CPUFeatures::kSVEF32MM,
@@ -307,24 +336,42 @@
        CPUFeatures::kI8MM,
        CPUFeatures::kBF16,
        CPUFeatures::kDGH,
-       // Bits 48+
+       // Bits 16-23
        CPUFeatures::kRNG,
        CPUFeatures::kBTI,
        CPUFeatures::kMTE,
        CPUFeatures::kECV,
        CPUFeatures::kAFP,
-       CPUFeatures::kRPRES};
+       CPUFeatures::kRPRES,
+       CPUFeatures::kMTE3,
+       CPUFeatures::kSME,
+       // Bits 24-31
+       CPUFeatures::kSMEi16i64,
+       CPUFeatures::kSMEf64f64,
+       CPUFeatures::kSMEi8i32,
+       CPUFeatures::kSMEf16f32,
+       CPUFeatures::kSMEb16f32,
+       CPUFeatures::kSMEf32f32,
+       CPUFeatures::kSMEfa64,
+       CPUFeatures::kWFXT,
+       // Bits 32-39
+       CPUFeatures::kEBF16};
+  VIXL_STATIC_ASSERT(ArrayLength(kFeatureBitsHigh) < 64);
 
-  uint64_t hwcap_low32 = getauxval(AT_HWCAP);
-  uint64_t hwcap_high32 = getauxval(AT_HWCAP2);
-  VIXL_ASSERT(IsUint32(hwcap_low32));
-  VIXL_ASSERT(IsUint32(hwcap_high32));
-  uint64_t hwcap = hwcap_low32 | (hwcap_high32 << 32);
+  auto combine_features = [&features](uint64_t hwcap,
+                                      const CPUFeatures::Feature* feature_array,
+                                      size_t features_size) {
+    for (size_t i = 0; i < features_size; i++) {
+      if (hwcap & (UINT64_C(1) << i)) features.Combine(feature_array[i]);
+    }
+  };
 
-  VIXL_STATIC_ASSERT(ArrayLength(kFeatureBits) < 64);
-  for (size_t i = 0; i < ArrayLength(kFeatureBits); i++) {
-    if (hwcap & (UINT64_C(1) << i)) features.Combine(kFeatureBits[i]);
-  }
+  uint64_t hwcap_low = getauxval(AT_HWCAP);
+  uint64_t hwcap_high = getauxval(AT_HWCAP2);
+
+  combine_features(hwcap_low, kFeatureBitsLow, ArrayLength(kFeatureBitsLow));
+  combine_features(hwcap_high, kFeatureBitsHigh, ArrayLength(kFeatureBitsHigh));
+
   // MTE support from HWCAP2 signifies FEAT_MTE1 and FEAT_MTE2 support
   if (features.Has(CPUFeatures::kMTE)) {
     features.Combine(CPUFeatures::kMTEInstructions);
@@ -425,7 +472,7 @@
 }
 
 
-void CPU::EnsureIAndDCacheCoherency(void *address, size_t length) {
+void CPU::EnsureIAndDCacheCoherency(void* address, size_t length) {
 #ifdef __aarch64__
   // Implement the cache synchronisation for all targets where AArch64 is the
   // host, even if we're building the simulator for an AAarch64 host. This
diff --git a/src/aarch64/cpu-aarch64.h b/src/aarch64/cpu-aarch64.h
index ca033a8..7ec3a25 100644
--- a/src/aarch64/cpu-aarch64.h
+++ b/src/aarch64/cpu-aarch64.h
@@ -57,24 +57,24 @@
    public:
     enum Type { kUnsigned, kSigned };
 
+    static const int kMaxWidthInBits = 4;
+
     // This needs to be constexpr so that fields have "constant initialisation".
     // This avoids initialisation order problems when these values are used to
     // (dynamically) initialise static variables, etc.
-    explicit constexpr Field(int lsb, Type type = kUnsigned)
-        : lsb_(lsb), type_(type) {}
+    explicit constexpr Field(int lsb,
+                             int bitWidth = kMaxWidthInBits,
+                             Type type = kUnsigned)
+        : lsb_(lsb), bitWidth_(bitWidth), type_(type) {}
 
-    static const int kMaxWidthInBits = 4;
-
-    int GetWidthInBits() const {
-      // All current ID fields have four bits.
-      return kMaxWidthInBits;
-    }
+    int GetWidthInBits() const { return bitWidth_; }
     int GetLsb() const { return lsb_; }
     int GetMsb() const { return lsb_ + GetWidthInBits() - 1; }
     Type GetType() const { return type_; }
 
    private:
     int lsb_;
+    int bitWidth_;
     Type type_;
   };
 
@@ -114,6 +114,7 @@
   static const Field kBT;
   static const Field kSSBS;
   static const Field kMTE;
+  static const Field kSME;
 };
 
 class AA64ISAR0 : public IDRegister {
@@ -168,6 +169,7 @@
   CPUFeatures GetCPUFeatures() const;
 
  private:
+  static const Field kWFXT;
   static const Field kRPRES;
 };
 
@@ -220,6 +222,22 @@
   static const Field kF64MM;
 };
 
+class AA64SMFR0 : public IDRegister {
+ public:
+  explicit AA64SMFR0(uint64_t value) : IDRegister(value) {}
+
+  CPUFeatures GetCPUFeatures() const;
+
+ private:
+  static const Field kSMEf32f32;
+  static const Field kSMEb16f32;
+  static const Field kSMEf16f32;
+  static const Field kSMEi8i32;
+  static const Field kSMEf64f64;
+  static const Field kSMEi16i64;
+  static const Field kSMEfa64;
+};
+
 class CPU {
  public:
   // Initialise CPU support.
@@ -286,6 +304,7 @@
   V(AA64MMFR1, "ID_AA64MMFR1_EL1")                                            \
   /* These registers are RES0 in the baseline Arm8.0. We can always safely */ \
   /* read them, but some compilers don't accept the symbolic names. */        \
+  V(AA64SMFR0, "S3_0_C0_C4_5")                                                \
   V(AA64ISAR2, "S3_0_C0_C6_2")                                                \
   V(AA64MMFR2, "S3_0_C0_C7_2")                                                \
   V(AA64ZFR0, "S3_0_C0_C4_4")
diff --git a/src/cpu-features.h b/src/cpu-features.h
index 24781da..de9e157 100644
--- a/src/cpu-features.h
+++ b/src/cpu-features.h
@@ -166,6 +166,7 @@
   /* Memory Tagging Extension.                                              */ \
   V(kMTEInstructions,     "MTE (EL0 instructions)", NULL)                      \
   V(kMTE,                 "MTE",                    NULL)                      \
+  V(kMTE3,                "MTE (asymmetric)",       "mte3")                    \
   /* PAuth extensions.                                                      */ \
   V(kPAuthEnhancedPAC,    "PAuth EnhancedPAC",      NULL)                      \
   V(kPAuthEnhancedPAC2,   "PAuth EnhancedPAC2",     NULL)                      \
@@ -185,7 +186,20 @@
   /* Increased precision of Reciprocal Estimate and Square Root Estimate    */ \
   V(kRPRES,               "RPRES",                  "rpres")                   \
   /* Memory operation instructions, for memcpy, memset                      */ \
-  V(kMOPS,                "Memory ops",             NULL)
+  V(kMOPS,                "Memory ops",             NULL)                      \
+  /* Scalable Matrix Extension (SME)                                        */ \
+  V(kSME,                 "SME",                    "sme")                     \
+  V(kSMEi16i64,           "SME (i16i64)",           "smei16i64")               \
+  V(kSMEf64f64,           "SME (f64f64)",           "smef64f64")               \
+  V(kSMEi8i32,            "SME (i8i32)",            "smei8i32")                \
+  V(kSMEf16f32,           "SME (f16f32)",           "smef16f32")               \
+  V(kSMEb16f32,           "SME (b16f32)",           "smeb16f32")               \
+  V(kSMEf32f32,           "SME (f32f32)",           "smef32f32")               \
+  V(kSMEfa64,             "SME (fa64)",             "smefa64")                 \
+  /* WFET and WFIT instruction support                                      */ \
+  V(kWFXT,                "WFXT",                   "wfxt")                    \
+  /* Extended BFloat16 instructions                                         */ \
+  V(kEBF16,               "EBF16",                  "ebf16")
 // clang-format on