[gcs] Support Guarded Control Stacks (#100)

Add support for Guarded Control Stacks (GCS) across all components (assembler,
disassembler, macro assembler, CPU features auditor and simulator), and allow
allocating new stacks in the simulator.
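
As a rough usage sketch (not part of this patch), the pieces added here can be
combined to probe for GCS and to swap onto a freshly allocated stack, mirroring
the gcs_stack_swap test added below. The helper name, the masm pointer and the
new_gcs value are placeholders; on the simulator the stack id would come from
the simulator's GetGCSManager().AllocateStack().

    #include "aarch64/macro-assembler-aarch64.h"

    using namespace vixl::aarch64;

    // Sketch only: emit a GCS feature probe and a stack swap around a call.
    void EmitGcsStackSwapSketch(MacroAssembler* masm, uint64_t new_gcs) {
      // chkfeat clears bit 0 of x16 when GCS checking is active.
      masm->Mov(x16, 1);
      masm->Chkfeat(x16);

      // Switch to the new stack (gcsss1) and capture a value identifying the
      // outgoing stack (gcsss2); gcsss1 consumes the new stack's seal and
      // leaves an in-progress token for gcsss2 to validate.
      masm->Mov(x0, new_gcs);
      masm->Gcsss1(x0);
      masm->Gcsss2(x1);

      // ... BL/RET pairs executed here are checked against the new stack ...

      // Swap back to the original stack.
      masm->Gcsss1(x1);
      masm->Gcsss2(x0);
    }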
diff --git a/SConstruct b/SConstruct
index ac31cb2..b855d64 100644
--- a/SConstruct
+++ b/SConstruct
@@ -98,7 +98,9 @@
       'CCFLAGS' : ['-O3'],
       },
     'simulator:aarch64' : {
-      'CCFLAGS' : ['-DVIXL_INCLUDE_SIMULATOR_AARCH64'],
+      'CCFLAGS' : ['-DVIXL_INCLUDE_SIMULATOR_AARCH64',
+                   '-pthread'],
+      'LINKFLAGS' : ['-pthread']
       },
     'symbols:on' : {
       'CCFLAGS' : ['-g'],
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index c0f66ec..ad2e7c9 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -1918,6 +1918,12 @@
 }
 
 
+void Assembler::sysl(int op, const Register& xt) {
+  VIXL_ASSERT(xt.Is64Bits());
+  Emit(SYSL | SysOp(op) | Rt(xt));
+}
+
+
 void Assembler::dc(DataCacheOp op, const Register& rt) {
   if (op == CVAP) VIXL_ASSERT(CPUHas(CPUFeatures::kDCPoP));
   if (op == CVADP) VIXL_ASSERT(CPUHas(CPUFeatures::kDCCVADP));
@@ -1930,6 +1936,35 @@
   sys(op, rt);
 }
 
+void Assembler::gcspushm(const Register& rt) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kGCS));
+  sys(GCSPUSHM, rt);
+}
+
+void Assembler::gcspopm(const Register& rt) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kGCS));
+  sysl(GCSPOPM, rt);
+}
+
+
+void Assembler::gcsss1(const Register& rt) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kGCS));
+  sys(GCSSS1, rt);
+}
+
+
+void Assembler::gcsss2(const Register& rt) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kGCS));
+  sysl(GCSSS2, rt);
+}
+
+
+void Assembler::chkfeat(const Register& rd) {
+  VIXL_ASSERT(rd.Is(x16));
+  USE(rd);
+  hint(CHKFEAT);
+}
+
 
 void Assembler::hint(SystemHint code) { hint(static_cast<int>(code)); }
 
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index 1028da2..9bc7076 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -2158,6 +2158,9 @@
   // System instruction with pre-encoded op (op1:crn:crm:op2).
   void sys(int op, const Register& xt = xzr);
 
+  // System instruction with result.
+  void sysl(int op, const Register& xt = xzr);
+
   // System data cache operation.
   void dc(DataCacheOp op, const Register& rt);
 
@@ -7072,6 +7075,21 @@
   // Unsigned Minimum.
   void umin(const Register& rd, const Register& rn, const Operand& op);
 
+  // Check feature status.
+  void chkfeat(const Register& rd);
+
+  // Guarded Control Stack Push.
+  void gcspushm(const Register& rt);
+
+  // Guarded Control Stack Pop.
+  void gcspopm(const Register& rt);
+
+  // Guarded Control Stack Switch Stack 1.
+  void gcsss1(const Register& rt);
+
+  // Guarded Control Stack Switch Stack 2.
+  void gcsss2(const Register& rt);
+
   // Emit generic instructions.
 
   // Emit raw instructions into the instruction stream.
diff --git a/src/aarch64/constants-aarch64.h b/src/aarch64/constants-aarch64.h
index 20bd12f..0846952 100644
--- a/src/aarch64/constants-aarch64.h
+++ b/src/aarch64/constants-aarch64.h
@@ -389,7 +389,8 @@
   BTI    = 32,
   BTI_c  = 34,
   BTI_j  = 36,
-  BTI_jc = 38
+  BTI_jc = 38,
+  CHKFEAT = 40
 };
 
 enum BranchTargetIdentifier {
@@ -534,6 +535,13 @@
   CIGDVAC = CacheOpEncoder<3, 7, 14, 5>::value
 };
 
+enum GCSOp {
+  GCSPUSHM = CacheOpEncoder<3, 7, 7, 0>::value,
+  GCSPOPM = CacheOpEncoder<3, 7, 7, 1>::value,
+  GCSSS1 = CacheOpEncoder<3, 7, 7, 2>::value,
+  GCSSS2 = CacheOpEncoder<3, 7, 7, 3>::value
+};
+
 // Some SVE instructions support a predicate constraint pattern. This is
 // interpreted as a VL-dependent value, and is typically used to initialise
 // predicates, or to otherwise limit the number of processed elements.
@@ -942,7 +950,8 @@
   SystemSysFixed  = 0xD5080000,
   SystemSysFMask  = 0xFFF80000,
   SystemSysMask   = 0xFFF80000,
-  SYS             = SystemSysFixed | 0x00000000
+  SYS             = SystemSysFixed | 0x00000000,
+  SYSL            = SystemSysFixed | 0x00200000
 };
 
 // Exception.
diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc
index 66ac97a..3925ced 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.cc
+++ b/src/aarch64/cpu-features-auditor-aarch64.cc
@@ -1276,91 +1276,93 @@
 
 void CPUFeaturesAuditor::VisitSystem(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
-  if (instr->Mask(SystemHintFMask) == SystemHintFixed) {
-    CPUFeatures required;
-    switch (instr->GetInstructionBits()) {
-      case PACIA1716:
-      case PACIB1716:
-      case AUTIA1716:
-      case AUTIB1716:
-      case PACIAZ:
-      case PACIASP:
-      case PACIBZ:
-      case PACIBSP:
-      case AUTIAZ:
-      case AUTIASP:
-      case AUTIBZ:
-      case AUTIBSP:
-      case XPACLRI:
-        required.Combine(CPUFeatures::kPAuth);
-        break;
-      default:
-        switch (instr->GetImmHint()) {
-          case ESB:
-            required.Combine(CPUFeatures::kRAS);
-            break;
-          case BTI:
-          case BTI_j:
-          case BTI_c:
-          case BTI_jc:
-            required.Combine(CPUFeatures::kBTI);
-            break;
-          default:
-            break;
-        }
-        break;
-    }
 
-    // These are all HINT instructions, and behave as NOPs if the corresponding
-    // features are not implemented, so we record the corresponding features
-    // only if they are available.
-    if (available_.Has(required)) scope.Record(required);
-  } else if (instr->Mask(SystemSysMask) == SYS) {
-    switch (instr->GetSysOp()) {
-      // DC instruction variants.
-      case CGVAC:
-      case CGDVAC:
-      case CGVAP:
-      case CGDVAP:
-      case CIGVAC:
-      case CIGDVAC:
-      case GVA:
-      case GZVA:
-        scope.Record(CPUFeatures::kMTE);
-        break;
-      case CVAP:
-        scope.Record(CPUFeatures::kDCPoP);
-        break;
-      case CVADP:
-        scope.Record(CPUFeatures::kDCCVADP);
-        break;
-      case IVAU:
-      case CVAC:
-      case CVAU:
-      case CIVAC:
-      case ZVA:
-        // No special CPU features.
-        break;
-    }
-  } else if (instr->Mask(SystemPStateFMask) == SystemPStateFixed) {
-    switch (instr->Mask(SystemPStateMask)) {
-      case CFINV:
-        scope.Record(CPUFeatures::kFlagM);
-        break;
-      case AXFLAG:
-      case XAFLAG:
-        scope.Record(CPUFeatures::kAXFlag);
-        break;
-    }
-  } else if (instr->Mask(SystemSysRegFMask) == SystemSysRegFixed) {
-    if (instr->Mask(SystemSysRegMask) == MRS) {
+  CPUFeatures required;
+  switch (form_hash_) {
+    case "pacib1716_hi_hints"_h:
+    case "pacia1716_hi_hints"_h:
+    case "pacibsp_hi_hints"_h:
+    case "paciasp_hi_hints"_h:
+    case "pacibz_hi_hints"_h:
+    case "paciaz_hi_hints"_h:
+    case "autib1716_hi_hints"_h:
+    case "autia1716_hi_hints"_h:
+    case "autibsp_hi_hints"_h:
+    case "autiasp_hi_hints"_h:
+    case "autibz_hi_hints"_h:
+    case "autiaz_hi_hints"_h:
+    case "xpaclri_hi_hints"_h:
+      required.Combine(CPUFeatures::kPAuth);
+      break;
+    case "esb_hi_hints"_h:
+      required.Combine(CPUFeatures::kRAS);
+      break;
+    case "bti_hb_hints"_h:
+      required.Combine(CPUFeatures::kBTI);
+      break;
+  }
+
+  // The instructions above are all HINTs and behave as NOPs if the
+  // corresponding features are not implemented, so we record the corresponding
+  // features only if they are available.
+  if (available_.Has(required)) scope.Record(required);
+
+  switch (form_hash_) {
+    case "cfinv_m_pstate"_h:
+      scope.Record(CPUFeatures::kFlagM);
+      break;
+    case "axflag_m_pstate"_h:
+    case "xaflag_m_pstate"_h:
+      scope.Record(CPUFeatures::kAXFlag);
+      break;
+    case "mrs_rs_systemmove"_h:
       switch (instr->GetImmSystemRegister()) {
         case RNDR:
         case RNDRRS:
           scope.Record(CPUFeatures::kRNG);
           break;
       }
-    }
+      break;
+    case "sys_cr_systeminstrs"_h:
+      switch (instr->GetSysOp()) {
+        // DC instruction variants.
+        case CGVAC:
+        case CGDVAC:
+        case CGVAP:
+        case CGDVAP:
+        case CIGVAC:
+        case CIGDVAC:
+        case GVA:
+        case GZVA:
+          scope.Record(CPUFeatures::kMTE);
+          break;
+        case CVAP:
+          scope.Record(CPUFeatures::kDCPoP);
+          break;
+        case CVADP:
+          scope.Record(CPUFeatures::kDCCVADP);
+          break;
+        case IVAU:
+        case CVAC:
+        case CVAU:
+        case CIVAC:
+        case ZVA:
+          // No special CPU features.
+          break;
+        case GCSPUSHM:
+        case GCSSS1:
+          scope.Record(CPUFeatures::kGCS);
+          break;
+      }
+      break;
+    case "sysl_rc_systeminstrs"_h:
+      switch (instr->GetSysOp()) {
+        case GCSPOPM:
+        case GCSSS2:
+          scope.Record(CPUFeatures::kGCS);
+          break;
+      }
+      break;
   }
 }
 
diff --git a/src/aarch64/decoder-constants-aarch64.h b/src/aarch64/decoder-constants-aarch64.h
index 70e01a1..af50a55 100644
--- a/src/aarch64/decoder-constants-aarch64.h
+++ b/src/aarch64/decoder-constants-aarch64.h
@@ -3764,7 +3764,7 @@
       {"001110"_b, "autiaz_hi_hints"},
       {"001111"_b, "autibz_hi_hints"},
       {"0100xx"_b, "bti_hb_hints"},
-      {"010100"_b, "chkfeat_hi_hints"},
+      {"010100"_b, "chkfeat_hf_hints"},
       {"0101x1"_b, "hint_hm_hints"},
       {"01x110"_b, "hint_hm_hints"},
       {"10xxxx"_b, "hint_hm_hints"},
diff --git a/src/aarch64/decoder-visitor-map-aarch64.h b/src/aarch64/decoder-visitor-map-aarch64.h
index b40e0ae..a0a6ef2 100644
--- a/src/aarch64/decoder-visitor-map-aarch64.h
+++ b/src/aarch64/decoder-visitor-map-aarch64.h
@@ -2591,6 +2591,7 @@
       {"dmb_bo_barriers"_h, &VISITORCLASS::VisitSystem},                       \
       {"dsb_bo_barriers"_h, &VISITORCLASS::VisitSystem},                       \
       {"hint_hm_hints"_h, &VISITORCLASS::VisitSystem},                         \
+      {"chkfeat_hf_hints"_h, &VISITORCLASS::VisitSystem},                      \
       {"mrs_rs_systemmove"_h, &VISITORCLASS::VisitSystem},                     \
       {"msr_sr_systemmove"_h, &VISITORCLASS::VisitSystem},                     \
       {"psb_hc_hints"_h, &VISITORCLASS::VisitSystem},                          \
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index 6a25271..9f53e81 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -2018,7 +2018,7 @@
 
 void Disassembler::VisitSystem(const Instruction *instr) {
   const char *mnemonic = mnemonic_.c_str();
-  const char *form = "(System)";
+  const char *form = "";
   const char *suffix = NULL;
 
   switch (form_hash_) {
@@ -2047,6 +2047,10 @@
           break;
       }
       break;
+    case "chkfeat_hf_hints"_h:
+      mnemonic = "chkfeat";
+      form = "x16";
+      break;
     case "hint_hm_hints"_h:
       form = "'IH";
       break;
@@ -2067,9 +2071,6 @@
       break;
     }
     case Hash("sys_cr_systeminstrs"): {
-      mnemonic = "dc";
-      suffix = ", 'Xt";
-
       const std::map<uint32_t, const char *> dcop = {
           {IVAU, "ivau"},
           {CVAC, "cvac"},
@@ -2092,17 +2093,36 @@
       if (dcop.count(sysop)) {
         if (sysop == IVAU) {
           mnemonic = "ic";
+        } else {
+          mnemonic = "dc";
         }
         form = dcop.at(sysop);
+        suffix = ", 'Xt";
+      } else if (sysop == GCSSS1) {
+        mnemonic = "gcsss1";
+        form = "'Xt";
+      } else if (sysop == GCSPUSHM) {
+        mnemonic = "gcspushm";
+        form = "'Xt";
       } else {
         mnemonic = "sys";
         form = "'G1, 'Kn, 'Km, 'G2";
-        if (instr->GetRt() == 31) {
-          suffix = NULL;
+        if (instr->GetRt() < 31) {
+          suffix = ", 'Xt";
         }
-        break;
       }
+      break;
     }
+    case "sysl_rc_systeminstrs"_h:
+      uint32_t sysop = instr->GetSysOp();
+      if (sysop == GCSPOPM) {
+        mnemonic = "gcspopm";
+        form = (instr->GetRt() == 31) ? "" : "'Xt";
+      } else if (sysop == GCSSS2) {
+        mnemonic = "gcsss2";
+        form = "'Xt";
+      }
+      break;
   }
   Format(instr, mnemonic, form, suffix);
 }
diff --git a/src/aarch64/macro-assembler-aarch64.cc b/src/aarch64/macro-assembler-aarch64.cc
index 8ee7144..af90a42 100644
--- a/src/aarch64/macro-assembler-aarch64.cc
+++ b/src/aarch64/macro-assembler-aarch64.cc
@@ -1970,6 +1970,22 @@
   setf16(wn);
 }
 
+void MacroAssembler::Chkfeat(const Register& xdn) {
+  VIXL_ASSERT(allow_macro_instructions_);
+  MacroEmissionCheckScope guard(this);
+  if (xdn.Is(x16)) {
+    chkfeat(xdn);
+  } else {
+    UseScratchRegisterScope temps(this);
+    if (temps.TryAcquire(x16)) {
+      Mov(x16, xdn);
+      chkfeat(x16);
+      Mov(xdn, x16);
+    } else {
+      VIXL_ABORT();
+    }
+  }
+}
 
 #define DEFINE_FUNCTION(FN, REGTYPE, REG, OP)                          \
   void MacroAssembler::FN(const REGTYPE REG, const MemOperand& addr) { \
diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h
index a46e2cc..f6fc4d7 100644
--- a/src/aarch64/macro-assembler-aarch64.h
+++ b/src/aarch64/macro-assembler-aarch64.h
@@ -2717,6 +2717,27 @@
     subps(xd, xn, xm);
   }
   void Cmpp(const Register& xn, const Register& xm) { Subps(xzr, xn, xm); }
+  void Chkfeat(const Register& xdn);
+  void Gcspushm(const Register& rt) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    gcspushm(rt);
+  }
+  void Gcspopm(const Register& rt = xzr) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    gcspopm(rt);
+  }
+  void Gcsss1(const Register& rt) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    gcsss1(rt);
+  }
+  void Gcsss2(const Register& rt) {
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    gcsss2(rt);
+  }
 
 // NEON 3 vector register instructions.
 #define NEON_3VREG_MACRO_LIST(V) \
@@ -8584,6 +8605,16 @@
     return AcquireFrom(available, kGoverningPRegisterMask).P();
   }
 
+  // TODO: extend to other scratch register lists.
+  bool TryAcquire(const Register& required_reg) {
+    CPURegList* list = masm_->GetScratchRegisterList();
+    if (list->IncludesAliasOf(required_reg)) {
+      list->Remove(required_reg);
+      return true;
+    }
+    return false;
+  }
+
   Register AcquireRegisterOfSize(int size_in_bits);
   Register AcquireSameSizeAs(const Register& reg) {
     return AcquireRegisterOfSize(reg.GetSizeInBits());
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index b9a1bdc..81bab07 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -540,7 +540,9 @@
 Simulator::Simulator(Decoder* decoder, FILE* stream, SimStack::Allocated stack)
     : memory_(std::move(stack)),
       last_instr_(NULL),
-      cpu_features_auditor_(decoder, CPUFeatures::All()) {
+      cpu_features_auditor_(decoder, CPUFeatures::All()),
+      gcs_(kGCSNoStack),
+      gcs_enabled_(false) {
   // Ensure that shift operations act as the simulator expects.
   VIXL_ASSERT((static_cast<int32_t>(-1) >> 1) == -1);
   VIXL_ASSERT((static_cast<uint32_t>(-1) >> 1) == 0x7fffffff);
@@ -660,6 +662,8 @@
   ResetPRegisters();
 
   WriteSp(memory_.GetStack().GetBase());
+  ResetGCSState();
+  EnableGCSCheck();
 
   pc_ = NULL;
   pc_modified_ = false;
@@ -697,6 +701,9 @@
   delete print_disasm_;
   close(placeholder_pipe_fd_[0]);
   close(placeholder_pipe_fd_[1]);
+  if (IsAllocatedGCS(gcs_)) {
+    GetGCSManager().FreeStack(gcs_);
+  }
 }
 
 
@@ -1797,6 +1804,18 @@
   }
 }
 
+void Simulator::PrintGCS(bool is_push, uint64_t addr, size_t entry) {
+  const char* arrow = is_push ? "<-" : "->";
+  fprintf(stream_,
+          "# %sgcs0x%04" PRIx64 "[%" PRIu64 "]: %s %s 0x%016" PRIxPTR "\n",
+          clr_flag_name,
+          gcs_,
+          entry,
+          clr_normal,
+          arrow,
+          addr);
+}
+
 uint16_t Simulator::PrintPartialAccess(uint16_t access_mask,
                                        uint16_t future_access_mask,
                                        int struct_element_count,
@@ -3774,6 +3793,7 @@
   switch (instr->Mask(UnconditionalBranchMask)) {
     case BL:
       WriteLr(instr->GetNextInstruction());
+      GCSPush(reinterpret_cast<uint64_t>(instr->GetNextInstruction()));
       VIXL_FALLTHROUGH();
     case B:
       WritePc(instr->GetImmPCOffsetTarget());
@@ -3817,6 +3837,7 @@
   bool authenticate = false;
   bool link = false;
   bool ret = false;
+  bool compare_gcs = false;
   uint64_t addr = ReadXRegister(instr->GetRn());
   uint64_t context = 0;
 
@@ -3853,16 +3874,13 @@
       context = ReadXRegister(31, Reg31IsStackPointer);
       VIXL_FALLTHROUGH();
     case RET:
+      compare_gcs = true;
       ret = true;
       break;
     default:
       VIXL_UNREACHABLE();
   }
 
-  if (link) {
-    WriteLr(instr->GetNextInstruction());
-  }
-
   if (authenticate) {
     PACKey key = (instr->ExtractBit(10) == 0) ? kPACKeyIA : kPACKeyIB;
     addr = AuthPAC(addr, context, key, kInstructionPointer);
@@ -3873,6 +3891,33 @@
     }
   }
 
+  if (compare_gcs) {
+    uint64_t expected_lr = GCSPeek();
+    char msg[128];
+    if (expected_lr != 0) {
+      if ((expected_lr & 0x3) != 0) {
+        snprintf(msg,
+                 sizeof(msg),
+                 "GCS contains misaligned return address: 0x%016" PRIx64 "\n",
+                 expected_lr);
+        ReportGCSFailure(msg);
+      } else if ((addr != 0) && (addr != expected_lr)) {
+        snprintf(msg,
+                 sizeof(msg),
+                 "GCS mismatch: lr = 0x%016" PRIx64 ", gcs = 0x%016" PRIx64 "\n",
+                 addr,
+                 expected_lr);
+        ReportGCSFailure(msg);
+      }
+      GCSPop();
+    }
+  }
+
+  if (link) {
+    WriteLr(instr->GetNextInstruction());
+    GCSPush(reinterpret_cast<uint64_t>(instr->GetNextInstruction()));
+  }
+
   if (!ret) {
     // Check for interceptions to the target address, if one is found, call it.
     MetaDataDepot::BranchInterceptionAbstract* interception =
@@ -6909,6 +6954,14 @@
           VIXL_UNIMPLEMENTED();
       }
       break;
+    case "chkfeat_hf_hints"_h: {
+      uint64_t feat_select = ReadXRegister(16);
+      uint64_t gcs_enabled = IsGCSCheckEnabled() ? 1 : 0;
+      feat_select &= ~gcs_enabled;
+      WriteXRegister(16, feat_select);
+      break;
+    }
+    case "hint_hm_hints"_h:
     case "nop_hi_hints"_h:
     case "esb_hi_hints"_h:
     case "csdb_hi_hints"_h:
@@ -6992,9 +7045,64 @@
     case "isb_bi_barriers"_h:
       __sync_synchronize();
       break;
-    case "sys_cr_systeminstrs"_h:
-      SysOp_W(instr->GetSysOp(), ReadXRegister(instr->GetRt()));
+    case "sys_cr_systeminstrs"_h: {
+      uint64_t rt = ReadXRegister(instr->GetRt());
+      uint32_t sysop = instr->GetSysOp();
+      if (sysop == GCSSS1) {
+        uint64_t incoming_size = rt >> 32;
+        // Drop upper 32 bits to get GCS index.
+        uint64_t incoming_gcs = rt & 0xffffffff;
+        uint64_t outgoing_gcs = ActivateGCS(incoming_gcs);
+        uint64_t incoming_seal = GCSPop();
+        if (((incoming_seal ^ rt) != 1) ||
+            (GetActiveGCSPtr()->size() != incoming_size)) {
+          char msg[128];
+          snprintf(msg,
+                   sizeof(msg),
+                   "GCS: invalid incoming stack: 0x%016" PRIx64 "\n",
+                   incoming_seal);
+          ReportGCSFailure(msg);
+        }
+        GCSPush(outgoing_gcs + 5);
+      } else if (sysop == GCSPUSHM) {
+        GCSPush(ReadXRegister(instr->GetRt()));
+      } else {
+        SysOp_W(sysop, rt);
+      }
       break;
+    }
+    case "sysl_rc_systeminstrs"_h: {
+      uint32_t sysop = instr->GetSysOp();
+      if (sysop == GCSPOPM) {
+        uint64_t addr = GCSPop();
+        WriteXRegister(instr->GetRt(), addr);
+      } else if (sysop == GCSSS2) {
+        uint64_t outgoing_gcs = GCSPop();
+        // Check for token inserted by gcsss1.
+        if ((outgoing_gcs & 7) != 5) {
+          char msg[128];
+          snprintf(msg,
+                   sizeof(msg),
+                   "GCS: outgoing stack has no token: 0x%016" PRIx64 "\n",
+                   outgoing_gcs);
+          ReportGCSFailure(msg);
+        }
+        uint64_t incoming_gcs = ActivateGCS(outgoing_gcs);
+        outgoing_gcs &= ~UINT64_C(0x3ff);
+
+        // Encode the size into the outgoing stack seal, to check later.
+        uint64_t size = GetActiveGCSPtr()->size();
+        VIXL_ASSERT(IsUint32(size));
+        VIXL_ASSERT(IsUint32(outgoing_gcs + 1));
+        uint64_t outgoing_seal = (size << 32) | (outgoing_gcs + 1);
+        GCSPush(outgoing_seal);
+        ActivateGCS(incoming_gcs);
+        WriteXRegister(instr->GetRt(), outgoing_seal - 1);
+      } else {
+        VIXL_UNIMPLEMENTED();
+      }
+      break;
+    }
     default:
       VIXL_UNIMPLEMENTED();
   }
@@ -14796,12 +14904,35 @@
       reinterpret_cast<void (*)(Simulator*, uintptr_t)>(call_wrapper_address);
 
   if (static_cast<RuntimeCallType>(call_type) == kCallRuntime) {
-    WriteRegister(kLinkRegCode,
-                  instr->GetInstructionAtOffset(kRuntimeCallLength));
+    const Instruction* addr = instr->GetInstructionAtOffset(kRuntimeCallLength);
+    WriteLr(addr);
+    GCSPush(reinterpret_cast<uint64_t>(addr));
   }
   runtime_call_wrapper(this, function_address);
   // Read the return address from `lr` and write it into `pc`.
-  WritePc(ReadRegister<Instruction*>(kLinkRegCode));
+  uint64_t addr = ReadRegister<uint64_t>(kLinkRegCode);
+  if (IsGCSCheckEnabled()) {
+    uint64_t expected_lr = GCSPeek();
+    char msg[128];
+    if (expected_lr != 0) {
+      if ((expected_lr & 0x3) != 0) {
+        snprintf(msg,
+                 sizeof(msg),
+                 "GCS contains misaligned return address: 0x%016" PRIx64 "\n",
+                 expected_lr);
+        ReportGCSFailure(msg);
+      } else if ((addr != 0) && (addr != expected_lr)) {
+        snprintf(msg,
+                 sizeof(msg),
+                 "GCS mismatch: lr = 0x%016" PRIx64 ", gcs = 0x%016" PRIx64 "\n",
+                 addr,
+                 expected_lr);
+        ReportGCSFailure(msg);
+      }
+      GCSPop();
+    }
+  }
+  WritePc(reinterpret_cast<Instruction*>(addr));
 }
 #else
 void Simulator::DoRuntimeCall(const Instruction* instr) {
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index b4b7aa6..760fa6c 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -28,6 +28,7 @@
 #define VIXL_AARCH64_SIMULATOR_AARCH64_H_
 
 #include <memory>
+#include <mutex>
 #include <unordered_map>
 #include <vector>
 
@@ -2529,12 +2530,16 @@
   // Other state updates, including system registers.
   void PrintSystemRegister(SystemRegister id);
   void PrintTakenBranch(const Instruction* target);
+  void PrintGCS(bool is_push, uint64_t addr, size_t entry);
   void LogSystemRegister(SystemRegister id) {
     if (ShouldTraceSysRegs()) PrintSystemRegister(id);
   }
   void LogTakenBranch(const Instruction* target) {
     if (ShouldTraceBranches()) PrintTakenBranch(target);
   }
+  void LogGCS(bool is_push, uint64_t addr, size_t entry) {
+    if (ShouldTraceSysRegs()) PrintGCS(is_push, addr, entry);
+  }
 
   // Trace memory accesses.
 
@@ -5352,6 +5357,161 @@
 
   // Debugger for the simulator.
   std::unique_ptr<Debugger> debugger_;
+
+  // The Guarded Control Stack is represented using a vector, where the more
+  // recently stored addresses are at higher-numbered indices.
+  using GuardedControlStack = std::vector<uint64_t>;
+
+  // The GCSManager handles the synchronisation of GCS across multiple
+  // Simulator instances. Each Simulator has its own stack, but all share
+  // a GCSManager instance. This allows exchanging stacks between Simulators
+  // in a threaded application.
+  class GCSManager {
+   public:
+    // Allocate a new Guarded Control Stack and add it to the vector of stacks.
+    uint64_t AllocateStack() {
+      const std::lock_guard<std::mutex> lock(stacks_mtx_);
+
+      GuardedControlStack* new_stack = new GuardedControlStack;
+      uint64_t result;
+
+      // Put the new stack into the first available slot.
+      for (result = 0; result < stacks_.size(); result++) {
+        if (stacks_[result] == nullptr) {
+          stacks_[result] = new_stack;
+          break;
+        }
+      }
+
+      // If there were no slots, create a new one.
+      if (result == stacks_.size()) {
+        stacks_.push_back(new_stack);
+      }
+
+      // Shift the index to look like a stack pointer aligned to a page.
+      result <<= kPageSizeLog2;
+
+      // Push the tagged index onto the new stack as a seal.
+      new_stack->push_back(result + 1);
+      return result;
+    }
+
+    // Free a Guarded Control Stack and set the stacks_ slot to null.
+    void FreeStack(uint64_t gcs) {
+      const std::lock_guard<std::mutex> lock(stacks_mtx_);
+      uint64_t gcs_index = GetGCSIndex(gcs);
+      GuardedControlStack* gcsptr = stacks_[gcs_index];
+      if (gcsptr == nullptr) {
+        VIXL_ABORT_WITH_MSG("Tried to free unallocated GCS ");
+      } else {
+        delete gcsptr;
+        stacks_[gcs_index] = nullptr;
+      }
+    }
+
+    // Get a pointer to the GCS vector using a GCS id.
+    GuardedControlStack* GetGCSPtr(uint64_t gcs) const {
+      return stacks_[GetGCSIndex(gcs)];
+    }
+
+   private:
+    uint64_t GetGCSIndex(uint64_t gcs) const { return gcs >> 12; }
+
+    std::vector<GuardedControlStack*> stacks_;
+    std::mutex stacks_mtx_;
+  };
+
+  // A GCS id indicating no GCS has been allocated.
+  static const uint64_t kGCSNoStack = kPageSize - 1;
+  uint64_t gcs_;
+  bool gcs_enabled_;
+
+ public:
+  GCSManager& GetGCSManager() {
+    static GCSManager manager;
+    return manager;
+  }
+
+  void EnableGCSCheck() { gcs_enabled_ = true; }
+  void DisableGCSCheck() { gcs_enabled_ = false; }
+  bool IsGCSCheckEnabled() const { return gcs_enabled_; }
+
+ private:
+  bool IsAllocatedGCS(uint64_t gcs) const { return gcs != kGCSNoStack; }
+  void ResetGCSState() {
+    GCSManager& m = GetGCSManager();
+    if (IsAllocatedGCS(gcs_)) {
+      m.FreeStack(gcs_);
+    }
+    ActivateGCS(m.AllocateStack());
+    GCSPop();  // Remove seal.
+  }
+
+  GuardedControlStack* GetGCSPtr(uint64_t gcs) {
+    GCSManager& m = GetGCSManager();
+    GuardedControlStack* result = m.GetGCSPtr(gcs);
+    return result;
+  }
+  GuardedControlStack* GetActiveGCSPtr() { return GetGCSPtr(gcs_); }
+
+  uint64_t ActivateGCS(uint64_t gcs) {
+    uint64_t outgoing_gcs = gcs_;
+    gcs_ = gcs;
+    return outgoing_gcs;
+  }
+
+  void GCSPush(uint64_t addr) {
+    GetActiveGCSPtr()->push_back(addr);
+    size_t entry = GetActiveGCSPtr()->size() - 1;
+    LogGCS(/* is_push = */ true, addr, entry);
+  }
+
+  uint64_t GCSPop() {
+    GuardedControlStack* gcs = GetActiveGCSPtr();
+    if (gcs->empty()) {
+      return 0;
+    }
+    uint64_t return_addr = gcs->back();
+    size_t entry = gcs->size() - 1;
+    gcs->pop_back();
+    LogGCS(/* is_push = */ false, return_addr, entry);
+    return return_addr;
+  }
+
+  uint64_t GCSPeek() {
+    GuardedControlStack* gcs = GetActiveGCSPtr();
+    if (gcs->empty()) {
+      return 0;
+    }
+    uint64_t return_addr = gcs->back();
+    return return_addr;
+  }
+
+  void ReportGCSFailure(const char* msg) {
+    if (IsGCSCheckEnabled()) {
+      GuardedControlStack* gcs = GetActiveGCSPtr();
+      printf("%s", msg);
+      if (gcs == nullptr) {
+        printf("GCS pointer is null\n");
+      } else {
+        printf("GCS records, most recent first:\n");
+        int most_recent_index = static_cast<int>(gcs->size()) - 1;
+        for (int i = 0; i < 8; i++) {
+          if (!gcs->empty()) {
+            uint64_t entry = gcs->back();
+            gcs->pop_back();
+            int index = most_recent_index - i;
+            printf(" gcs%" PRIu64 "[%d]: 0x%016" PRIx64 "\n",
+                   gcs_,
+                   index,
+                   entry);
+          }
+        }
+        printf("End of GCS records.\n");
+      }
+      VIXL_ABORT_WITH_MSG("GCS failed ");
+    }
+  }
 };
 
 #if defined(VIXL_HAS_SIMULATED_RUNTIME_CALL_SUPPORT) && __cplusplus < 201402L
diff --git a/src/cpu-features.h b/src/cpu-features.h
index 97eb661..1a041f6 100644
--- a/src/cpu-features.h
+++ b/src/cpu-features.h
@@ -201,7 +201,8 @@
   /* Extended BFloat16 instructions                                         */ \
   V(kEBF16,               "EBF16",                  "ebf16")                   \
   V(kSVE_EBF16,           "EBF16 (SVE)",            "sveebf16")                \
-  V(kCSSC,                "CSSC",                   "cssc")
+  V(kCSSC,                "CSSC",                   "cssc")                    \
+  V(kGCS,                 "GCS",                    "gcs")
 // clang-format on
 
 
diff --git a/test/aarch64/test-api-aarch64.cc b/test/aarch64/test-api-aarch64.cc
index c724f17..3ac9efb 100644
--- a/test/aarch64/test-api-aarch64.cc
+++ b/test/aarch64/test-api-aarch64.cc
@@ -27,6 +27,7 @@
 #include <cstdio>
 #include <cstring>
 #include <string>
+#include <thread>
 
 #include "test-runner.h"
 #include "test-utils.h"
@@ -1763,6 +1764,24 @@
   VIXL_CHECK(s.IsAccessInGuardRegion(s.GetLimit() - 1280, 2048));
   VIXL_CHECK(s.IsAccessInGuardRegion(s.GetLimit() - 1280, 10000));
 }
+
+void AllocateAndFreeGCS() {
+  Decoder d;
+  Simulator s(&d);
+
+  for (int i = 0; i < 100000; i++) {
+    uint64_t gcs = s.GetGCSManager().AllocateStack();
+    s.GetGCSManager().FreeStack(gcs);
+  }
+}
+
+TEST(sim_gcs_manager) {
+  std::thread t1(AllocateAndFreeGCS);
+  std::thread t2(AllocateAndFreeGCS);
+
+  t1.join();
+  t2.join();
+}
 #endif
 
 }  // namespace aarch64
diff --git a/test/aarch64/test-assembler-aarch64.cc b/test/aarch64/test-assembler-aarch64.cc
index 0015547..81d6b7a 100644
--- a/test/aarch64/test-assembler-aarch64.cc
+++ b/test/aarch64/test-assembler-aarch64.cc
@@ -7501,11 +7501,12 @@
   __ Blr(x0);
   __ Adr(ipreg, &jump_call_target);
   __ Blr(ipreg);
-  __ Adr(lr, &done);  // Make Ret return to done label.
+  __ Mov(lr, 0);  // Zero lr so we branch to done.
   __ Br(ipreg);
   __ Bind(&call_target, EmitBTI_c);
   __ Ret();
   __ Bind(&jump_call_target, EmitBTI_jc);
+  __ Cbz(lr, &done);
   __ Ret();
   __ Bind(&done);
   END();
@@ -7529,28 +7530,36 @@
   SETUP_WITH_FEATURES(CPUFeatures::kBTI);
 
   Label start, none, c, j, jc;
+  Label jump_to_c, call_to_j;
   START();
   __ B(&start);
   __ Bind(&none, EmitBTI);
   __ Bind(&c, EmitBTI_c);
   __ Bind(&j, EmitBTI_j);
   __ Bind(&jc, EmitBTI_jc);
-  VIXL_CHECK(__ GetSizeOfCodeGeneratedSince(&none) == 4 * kInstructionSize);
+  __ Hint(BTI);
+  __ Hint(BTI_c);
+  __ Hint(BTI_j);
+  __ Hint(BTI_jc);
+  VIXL_CHECK(__ GetSizeOfCodeGeneratedSince(&none) == 8 * kInstructionSize);
+  __ Cmp(x1, 1);
+  __ B(lt, &jump_to_c);
+  __ B(eq, &call_to_j);
   __ Ret();
 
-  Label jump_to_c, call_to_j;
   __ Bind(&start);
   __ Adr(x0, &none);
-  __ Adr(lr, &jump_to_c);
+  __ Mov(x1, 0);
   __ Br(x0);
 
   __ Bind(&jump_to_c);
   __ Adr(x0, &c);
-  __ Adr(lr, &call_to_j);
+  __ Mov(x1, 1);
   __ Br(x0);
 
   __ Bind(&call_to_j);
   __ Adr(x0, &j);
+  __ Mov(x1, 2);
   __ Blr(x0);
   END();
 
@@ -14723,6 +14732,228 @@
   MinMaxHelper(op, true, s64min, s64max, 0, s64max);
 }
 
+TEST(gcs_chkfeat) {
+  SETUP();
+
+  START();
+  __ Mov(x16, 0x0123'4567'89ab'cdef);
+  __ Chkfeat(x16);
+  __ Mov(x0, x16);
+
+  __ Mov(x1, 0x0123'4567'89ab'cdef);
+  __ Chkfeat(x1);
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_64(0x0123'4567'89ab'cdee, x0);
+    ASSERT_EQUAL_64(x0, x1);
+  }
+}
+
+TEST(gcs_feature_off) {
+  SETUP();
+
+  START();
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+  simulator.DisableGCSCheck();
+#else
+// TODO: Disable GCS via operating system for this test, here and in the
+// gcs_off_pac_on test below.
+#endif
+  __ Mov(x16, 0x0123'4567'89ab'cdef);
+  __ Chkfeat(x16);
+
+  // This sequence would fail with GCS enabled.
+  Label lab, end;
+  __ Bl(&lab);
+  __ B(&end);
+
+  __ Bind(&lab);
+  __ Adr(lr, &end);
+  __ Ret();
+
+  __ Bind(&end);
+  END();
+
+  if (CAN_RUN()) {
+    // TODO: This will currently fail on GCS-supporting hardware.
+    RUN();
+    ASSERT_EQUAL_64(0x0123'4567'89ab'cdef, x16);
+  }
+}
+
+TEST(gcs_gcspushm) {
+  SETUP_WITH_FEATURES(CPUFeatures::kGCS);
+
+  Label ret;
+  START();
+  __ Adr(x0, &ret);
+  __ Gcspushm(x0);
+  __ Ret(x0);
+  __ Nop();
+  __ Bind(&ret);
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+  }
+}
+
+TEST(gcs_gcspopm) {
+  SETUP_WITH_FEATURES(CPUFeatures::kGCS);
+
+  Label lab, ret;
+  START();
+  __ Adr(x0, &ret);
+  __ Bl(&lab);
+  __ Bind(&ret);
+  __ Nop();
+  __ Bind(&lab);
+  __ Gcspopm(x1);
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_64(x0, x1);
+  }
+}
+
+TEST(gcs_gcsss1) {
+  SETUP_WITH_FEATURES(CPUFeatures::kGCS);
+
+  START();
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+  uint64_t new_gcs = simulator.GetGCSManager().AllocateStack();
+  __ Mov(x0, new_gcs);
+#else
+// TODO: Request new GCS from the operating system.
+#endif
+
+  // Partial stack swap to check GCS has changed, and a token is at the top
+  // of the new stack.
+  __ Gcsss1(x0);
+  __ Gcspopm(x1);
+
+  __ Bic(x0, x0, 7);  // Clear LSB of new GCS.
+  __ Bic(x2, x1, 7);  // Clear LSB of old GCS.
+  __ Cmp(x0, x2);
+  __ Cset(x0, eq);
+  __ And(x1, x1, 7);  // In progress token.
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_64(0, x0);  // GCS must not be equal.
+    ASSERT_EQUAL_64(5, x1);  // In progress token must be present.
+  }
+}
+
+// TODO: Add extra tests for combinations of PAC and GCS enabled.
+TEST(gcs_stack_swap) {
+  SETUP_WITH_FEATURES(CPUFeatures::kGCS);
+
+  START();
+  Label stack_swap, sub_fn, end;
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+  uint64_t new_gcs = simulator.GetGCSManager().AllocateStack();
+  __ Mov(x0, new_gcs);
+#else
+// TODO: Request new GCS from the operating system.
+#endif
+  __ Bl(&stack_swap);
+  __ B(&end);
+
+  __ Bind(&stack_swap);
+  __ Gcsss1(x0);  // x0 = new GCS.
+  __ Gcsss2(x1);  // x1 = old GCS.
+  __ Mov(x29, lr);
+  __ Bl(&sub_fn);
+  __ Mov(lr, x29);
+  __ Gcsss1(x1);  // Restore old GCS.
+  __ Gcsss2(x0);
+  __ Ret();
+
+  __ Bind(&sub_fn);
+  __ Mov(x2, 42);
+  __ Ret();
+
+  __ Bind(&end);
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_64(42, x2);
+  }
+}
+
+TEST(gcs_off_pac_on) {
+  SETUP_WITH_FEATURES(CPUFeatures::kPAuth);
+
+  START();
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+  simulator.DisableGCSCheck();
+#else
+// TODO: Disable GCS via operating system for this test, and enable for native.
+#endif
+  __ Mov(x16, 1);
+  __ Chkfeat(x16);
+  __ Mov(x1, x16);
+
+  Label fn1, after_fn1;
+
+  __ Mov(x28, sp);
+  __ Mov(x29, lr);
+  __ Mov(sp, 0x477d469dec0b8760);
+
+  __ Mov(x0, 0);
+  __ B(&after_fn1);
+
+  __ Bind(&fn1);
+  __ Mov(x0, 42);
+  __ Paciasp();
+  __ Retaa();
+
+  __ Bind(&after_fn1);
+  __ Bl(&fn1);
+
+  __ Mov(sp, x28);
+  __ Mov(lr, x29);
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+
+    ASSERT_EQUAL_64(42, x0);
+    ASSERT_EQUAL_64(1, x1);
+  }
+}
+
+#ifdef VIXL_NEGATIVE_TESTING
+TEST(gcs_negative_test) {
+  SETUP_WITH_FEATURES(CPUFeatures::kGCS);
+
+  Label fn, bad_return_addr, done;
+  START();
+  __ Bl(&fn);
+  __ Nop();  // GCS enforces that fn() returns here...
+
+  __ Bind(&bad_return_addr);
+  __ B(&done);  // ... but this test attempts to return here.
+
+  __ Bind(&fn);
+  __ Adr(lr, &bad_return_addr);
+  __ Ret();
+
+  __ Bind(&done);
+  END();
+
+  if (CAN_RUN()) {
+    MUST_FAIL_WITH_MESSAGE(RUN(), "GCS failed");
+  }
+}
+#endif  // VIXL_NEGATIVE_TESTING
+
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
 // Test the pseudo-instructions that control CPUFeatures dynamically in the
 // Simulator. These are used by the test infrastructure itself, but in a fairly
diff --git a/test/aarch64/test-disasm-aarch64.cc b/test/aarch64/test-disasm-aarch64.cc
index 28eb283..7c8f2cc 100644
--- a/test/aarch64/test-disasm-aarch64.cc
+++ b/test/aarch64/test-disasm-aarch64.cc
@@ -3359,6 +3359,20 @@
   CLEANUP();
 }
 
+TEST(gcs) {
+  SETUP();
+
+  COMPARE_MACRO(Chkfeat(x16), "chkfeat x16");
+  COMPARE_MACRO(Gcspopm(x0), "gcspopm x0");
+  COMPARE_MACRO(Gcspopm(), "gcspopm");
+  COMPARE_MACRO(Gcspopm(xzr), "gcspopm");
+  COMPARE_MACRO(Gcsss1(x4), "gcsss1 x4");
+  COMPARE_MACRO(Gcsss2(x2), "gcsss2 x2");
+  COMPARE_MACRO(Gcspushm(x1), "gcspushm x1");
+
+  CLEANUP();
+}
+
 TEST(architecture_features) {
   SETUP();
 
diff --git a/tools/code_coverage.log b/tools/code_coverage.log
index 091124c..f913151 100644
--- a/tools/code_coverage.log
+++ b/tools/code_coverage.log
@@ -14,12 +14,15 @@
 1660224011 82.79% 97.51% 95.50%
 1663161852 82.79% 97.51% 95.50%
 1666104118 82.79% 97.51% 95.50%
+1668785529 82.75% 97.44% 95.40%
 1669202345 82.79% 97.51% 95.51%
 1673432155 82.79% 97.51% 95.51%
 1677171445 82.78% 97.56% 94.81%
 1681814646 82.90% 97.57% 94.87%
 1686666000 82.90% 97.57% 94.87%
 1693487542 82.91% 97.57% 94.87%
+1694008240 82.72% 97.50% 94.95%
+1697036303 82.87% 97.56% 94.76%
 1702052331 82.89% 97.59% 94.77%
 1706691191 82.87% 97.59% 94.74%
 1707395574 82.89% 97.59% 94.77%