Initial commit of cheriot ibex into hw/matcha

-Sync to commit 31dbab1

Bypass-Presubmit-Reason: failed test related to test environment change
Change-Id: I28699fb4cd29b805c60549251b4980c96f2c177b
diff --git a/hw/ip/cheriot-ibex/cheriot_core.core b/hw/ip/cheriot-ibex/cheriot_core.core
new file mode 100644
index 0000000..31ac2b7
--- /dev/null
+++ b/hw/ip/cheriot-ibex/cheriot_core.core
@@ -0,0 +1,186 @@
+CAPI=2:
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+name: "lowrisc:ibex:cheriot_core:0.1"
+description: "Ibex CPU Core Components"
+
+filesets:
+  files_rtl:
+    depend:
+      - lowrisc:prim:assert
+      - lowrisc:prim:clock_gating
+      - lowrisc:prim:lfsr
+      - lowrisc:ibex:cheriot_pkg
+      - lowrisc:ibex:cheriot_icache
+      - lowrisc:dv:dv_fcov_macros
+    files:
+      - rtl/cheri_decoder.sv
+      - rtl/cheri_ex.sv
+      - rtl/cheri_tbre.sv
+      - rtl/cheri_stkz.sv
+      - rtl/cheri_tbre_wrapper.sv
+      - rtl/cheri_trvk_stage.sv
+      - rtl/cheriot_alu.sv
+      - rtl/cheriot_branch_predict.sv
+      - rtl/cheriot_compressed_decoder.sv
+      - rtl/cheriot_controller.sv
+      - rtl/cheriot_cs_registers.sv
+      - rtl/cheriot_csr.sv
+      - rtl/cheriot_counter.sv
+      - rtl/cheriot_decoder.sv
+      - rtl/cheriot_ex_block.sv
+      - rtl/cheriot_fetch_fifo.sv
+      - rtl/cheriot_id_stage.sv
+      - rtl/cheriot_if_stage.sv
+      - rtl/cheriot_load_store_unit.sv
+      - rtl/cheriot_multdiv_fast.sv
+      - rtl/cheriot_multdiv_slow.sv
+      - rtl/cheriot_prefetch_buffer.sv
+      - rtl/cheriot_pmp.sv
+      - rtl/cheriot_wb_stage.sv
+      - rtl/cheriot_dummy_instr.sv
+      - rtl/cheriot_core.sv
+      - rtl/cheriot_pmp_reset_default.svh: {is_include_file: true}
+    file_type: systemVerilogSource
+
+  files_lint_verilator:
+    files:
+      - lint/verilator_waiver.vlt: {file_type: vlt}
+
+  files_lint_verible:
+    files:
+      - lint/verible_waiver.vbw: {file_type: veribleLintWaiver}
+
+  files_check_tool_requirements:
+    depend:
+     - lowrisc:tool:check_tool_requirements
+
+parameters:
+  RVFI:
+    datatype: bool
+    paramtype: vlogdefine
+
+  SYNTHESIS:
+    datatype: bool
+    paramtype: vlogdefine
+
+  FPGA_XILINX:
+    datatype: bool
+    description: Identifies Xilinx FPGA targets to set DSP pragmas for performance counters.
+    default: false
+    paramtype: vlogdefine
+
+  RV32E:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+
+  RV32M:
+    datatype: str
+    default: cheriot_pkg::RV32MFast
+    paramtype: vlogdefine
+    description: "RV32M implementation parameter enum. See the cheriot_pkg::rv32m_e enum in cheriot_pkg.sv for permitted values."
+
+  RV32B:
+    datatype: str
+    default: cheriot_pkg::RV32BNone
+    paramtype: vlogdefine
+    description: "Bitmanip implementation parameter enum. See the cheriot_pkg::rv32b_e enum in cheriot_pkg.sv for permitted values."
+
+  RegFile:
+    datatype: str
+    default: cheriot_pkg::RegFileFF
+    paramtype: vlogdefine
+    description: "Register file implementation parameter enum. See the cheriot_pkg::regfile_e enum in cheriot_pkg.sv for permitted values."
+
+  ICache:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enable instruction cache"
+
+  ICacheECC:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enable ECC protection in instruction cache"
+
+  BranchTargetALU:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enables separate branch target ALU (increasing branch performance EXPERIMENTAL) [0/1]"
+
+  WritebackStage:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enables third pipeline stage (EXPERIMENTAL) [0/1]"
+
+  BranchPredictor:
+    datatype: int
+    paramtype: vlogparam
+    default: 0
+    description: "Enables static branch prediction (EXPERIMENTAL)"
+
+  SecureIbex:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enables security hardening features (EXPERIMENTAL) [0/1]"
+
+  PMPEnable:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enable PMP"
+
+  PMPGranularity:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Granularity of NAPOT range, 0 = 4 byte, 1 = 8 byte, 2 = 16 byte, 3 = 32 byte etc"
+
+  PMPNumRegions:
+    datatype: int
+    default: 4
+    paramtype: vlogparam
+    description: "Number of PMP regions"
+
+targets:
+  default: &default_target
+    filesets:
+      - tool_verilator ? (files_lint_verilator)
+      - tool_veriblelint ? (files_lint_verible)
+      - files_rtl
+      - files_check_tool_requirements
+    toplevel: cheriot_core
+    parameters:
+      - tool_vivado ? (FPGA_XILINX=true)
+  lint:
+    <<: *default_target
+    parameters:
+      - SYNTHESIS=true
+      - RVFI=true
+    default_tool: verilator
+    tools:
+      verilator:
+        mode: lint-only
+        verilator_options:
+          - "-Wall"
+          # RAM primitives wider than 64bit (required for ECC) fail to build in
+          # Verilator without increasing the unroll count (see Verilator#1266)
+          - "--unroll-count 72"
+  format:
+    filesets:
+      - files_rtl
+    parameters:
+      - SYNTHESIS=true
+      - RVFI=true
+    default_tool: veribleformat
+    toplevel: cheriot_core
+    tools:
+      veribleformat:
+        verible_format_args:
+          - "--inplace"
diff --git a/hw/ip/cheriot-ibex/cheriot_icache.core b/hw/ip/cheriot-ibex/cheriot_icache.core
new file mode 100644
index 0000000..6f963c5
--- /dev/null
+++ b/hw/ip/cheriot-ibex/cheriot_icache.core
@@ -0,0 +1,22 @@
+CAPI=2:
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+name: "lowrisc:ibex:cheriot_icache:0.1"
+description: "Ibex instruction cache"
+filesets:
+  files_rtl:
+    depend:
+      - lowrisc:prim:secded
+      - lowrisc:prim:assert
+      - lowrisc:ibex:cheriot_pkg
+    files:
+      - rtl/cheriot_icache.sv
+    file_type: systemVerilogSource
+
+targets:
+  default: &default_target
+    filesets:
+      - files_rtl
+    toplevel: cheriot_icache
+    default_tool: vcs
diff --git a/hw/ip/cheriot-ibex/cheriot_multdiv.core b/hw/ip/cheriot-ibex/cheriot_multdiv.core
new file mode 100644
index 0000000..6898853
--- /dev/null
+++ b/hw/ip/cheriot-ibex/cheriot_multdiv.core
@@ -0,0 +1,28 @@
+CAPI=2:
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+name: "lowrisc:ibex:cheriot_multdiv:0.1"  # cheriot_ prefix avoids clashing with stock lowrisc:ibex:ibex_multdiv
+description: "Multiplier and divider"
+
+filesets:
+  files_rtl:
+    depend:
+      - lowrisc:prim:assert
+      - lowrisc:ibex:cheriot_pkg
+    files:
+      - rtl/cheriot_multdiv_fast.sv
+      - rtl/cheriot_multdiv_slow.sv
+    file_type: systemVerilogSource
+
+parameters:
+  RV32M:
+    datatype: int
+    default: 2
+    paramtype: vlogparam
+    description: "Selection of multiplication implementation. Switch to enable single cycle multiplications."
+
+targets:
+  default: &default_target
+    filesets:
+      - files_rtl
diff --git a/hw/ip/cheriot-ibex/cheriot_pkg.core b/hw/ip/cheriot-ibex/cheriot_pkg.core
new file mode 100644
index 0000000..4c60a18
--- /dev/null
+++ b/hw/ip/cheriot-ibex/cheriot_pkg.core
@@ -0,0 +1,18 @@
+CAPI=2:
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+name: "lowrisc:ibex:cheriot_pkg:0.1"
+description: "Header package for Ibex"
+
+filesets:
+  files_rtl:
+    files:
+      - rtl/cheriot_pkg.sv
+      - rtl/cheri_pkg.sv
+    file_type: systemVerilogSource
+
+targets:
+  default:
+    filesets:
+      - files_rtl
diff --git a/hw/ip/cheriot-ibex/cheriot_top.core b/hw/ip/cheriot-ibex/cheriot_top.core
new file mode 100644
index 0000000..5d08123
--- /dev/null
+++ b/hw/ip/cheriot-ibex/cheriot_top.core
@@ -0,0 +1,175 @@
+CAPI=2:
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+name: "lowrisc:ibex:cheriot_top:0.1"
+description: "Ibex, a small RV32 CPU core"
+
+filesets:
+  files_rtl:
+    depend:
+      - lowrisc:ibex:cheriot_pkg
+      - lowrisc:ibex:cheriot_core
+      - lowrisc:prim:buf
+      - lowrisc:prim:clock_mux2
+      - lowrisc:prim:flop
+      - lowrisc:prim:ram_1p_scr
+    files:
+      - rtl/cheriot_register_file_ff.sv # generic FF-based
+      - rtl/cheriot_register_file_fpga.sv # FPGA
+      - rtl/cheriot_register_file_latch.sv # ASIC
+      - rtl/cheri_regfile.sv # generic FF-based
+      - rtl/cheriot_lockstep.sv
+      - rtl/cheriot_top.sv
+    file_type: systemVerilogSource
+
+  files_lint_verilator:
+    files:
+      - lint/verilator_waiver.vlt: {file_type: vlt}
+
+  files_lint_verible:
+    files:
+      - lint/verible_waiver.vbw: {file_type: veribleLintWaiver}
+
+  files_check_tool_requirements:
+    depend:
+     - lowrisc:tool:check_tool_requirements
+
+parameters:
+  RVFI:
+    datatype: bool
+    paramtype: vlogdefine
+
+  SYNTHESIS:
+    datatype: bool
+    paramtype: vlogdefine
+
+  FPGA_XILINX:
+    datatype: bool
+    description: Identifies Xilinx FPGA targets to set DSP pragmas for performance counters.
+    default: false
+    paramtype: vlogdefine
+
+  RV32E:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+
+  RV32M:
+    datatype: str
+    default: cheriot_pkg::RV32MFast
+    paramtype: vlogdefine
+    description: "RV32M implementation parameter enum. See the cheriot_pkg::rv32m_e enum in cheriot_pkg.sv for permitted values."
+
+  RV32B:
+    datatype: str
+    default: cheriot_pkg::RV32BNone
+    paramtype: vlogdefine
+    description: "Bitmanip implementation parameter enum. See the cheriot_pkg::rv32b_e enum in cheriot_pkg.sv for permitted values."
+
+  RegFile:
+    datatype: str
+    default: cheriot_pkg::RegFileFF
+    paramtype: vlogdefine
+    description: "Register file implementation parameter enum. See the cheriot_pkg::regfile_e enum in cheriot_pkg.sv for permitted values."
+
+  ICache:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enable instruction cache"
+
+  ICacheECC:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enable ECC protection in instruction cache"
+
+  BranchTargetALU:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enables separate branch target ALU (increasing branch performance EXPERIMENTAL) [0/1]"
+
+  WritebackStage:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enables third pipeline stage (EXPERIMENTAL) [0/1]"
+
+  BranchPredictor:
+    datatype: int
+    paramtype: vlogparam
+    default: 0
+    description: "Enables static branch prediction (EXPERIMENTAL)"
+
+  SecureIbex:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enables security hardening features (EXPERIMENTAL) [0/1]"
+
+  ICacheScramble:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enables ICache scrambling feature (EXPERIMENTAL) [0/1]"
+
+  PMPEnable:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enable PMP"
+
+  PMPGranularity:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Granularity of NAPOT range, 0 = 4 byte, 1 = 8 byte, 2 = 16 byte, 3 = 32 byte etc"
+
+  PMPNumRegions:
+    datatype: int
+    default: 4
+    paramtype: vlogparam
+    description: "Number of PMP regions"
+
+targets:
+  default: &default_target
+    filesets:
+      - tool_verilator ? (files_lint_verilator)
+      - tool_veriblelint ? (files_lint_verible)
+      - files_rtl
+      - files_check_tool_requirements
+    toplevel: cheriot_top
+    parameters:
+      - tool_vivado ? (FPGA_XILINX=true)
+  lint:
+    <<: *default_target
+    parameters:
+      - SYNTHESIS=true
+      - RVFI=true
+    default_tool: verilator
+    tools:
+      verilator:
+        mode: lint-only
+        verilator_options:
+          - "-Wall"
+          # RAM primitives wider than 64bit (required for ECC) fail to build in
+          # Verilator without increasing the unroll count (see Verilator#1266)
+          - "--unroll-count 72"
+  format:
+    filesets:
+      - files_rtl
+    parameters:
+      - SYNTHESIS=true
+      - RVFI=true
+    default_tool: veribleformat
+    toplevel: cheriot_top
+    tools:
+      veribleformat:
+        verible_format_args:
+          - "--inplace"
+          - "--formal_parameters_indentation=indent"
+          - "--named_parameter_indentation=indent"
+          - "--named_port_indentation=indent"
+          - "--port_declarations_indentation=indent"
diff --git a/hw/ip/cheriot-ibex/cheriot_top_tracing.core b/hw/ip/cheriot-ibex/cheriot_top_tracing.core
new file mode 100644
index 0000000..48c6995
--- /dev/null
+++ b/hw/ip/cheriot-ibex/cheriot_top_tracing.core
@@ -0,0 +1,161 @@
+CAPI=2:
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+name: "lowrisc:ibex:cheriot_top_tracing:0.1"
+description: "Ibex, a small RV32 CPU core with tracing enabled"
+filesets:
+  files_rtl:
+    depend:
+      - lowrisc:ibex:cheriot_top
+      - lowrisc:ibex:cheriot_tracer
+    files:
+      - rtl/cheriot_top_tracing.sv
+    file_type: systemVerilogSource
+
+parameters:
+  # The tracer uses the RISC-V Formal Interface (RVFI) to collect trace signals.
+  RVFI:
+    datatype: bool
+    paramtype: vlogdefine
+    default: true
+
+  SYNTHESIS:
+    datatype: bool
+    paramtype: vlogdefine
+
+  RV32E:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+
+  RV32M:
+    datatype: str
+    default: cheriot_pkg::RV32MFast
+    paramtype: vlogdefine
+    description: "RV32M implementation parameter enum. See the cheriot_pkg::rv32m_e enum in cheriot_pkg.sv for permitted values."
+
+  RV32B:
+    datatype: str
+    default: cheriot_pkg::RV32BNone
+    paramtype: vlogdefine
+    description: "Bitmanip implementation parameter enum. See the cheriot_pkg::rv32b_e enum in cheriot_pkg.sv for permitted values."
+
+  RegFile:
+    datatype: str
+    default: cheriot_pkg::RegFileFF
+    paramtype: vlogdefine
+    description: "Register file implementation parameter enum. See the cheriot_pkg::regfile_e enum in cheriot_pkg.sv for permitted values."
+
+  ICache:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enable instruction cache"
+
+  ICacheECC:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enable ECC protection in instruction cache"
+
+  BranchTargetALU:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enables separate branch target ALU (increasing branch performance EXPERIMENTAL) [0/1]"
+
+  WritebackStage:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enables third pipeline stage (EXPERIMENTAL) [0/1]"
+
+  BranchPredictor:
+    datatype: int
+    paramtype: vlogparam
+    default: 0
+    description: "Enables static branch prediction (EXPERIMENTAL)"
+
+  SecureIbex:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enables security hardening features (EXPERIMENTAL) [0/1]"
+
+  ICacheScramble:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enables ICache scrambling feature (EXPERIMENTAL) [0/1]"
+
+  PMPEnable:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Enable PMP"
+
+  PMPGranularity:
+    datatype: int
+    default: 0
+    paramtype: vlogparam
+    description: "Granularity of NAPOT range, 0 = 4 byte, 1 = 8 byte, 2 = 16 byte, 3 = 32 byte etc"
+
+  PMPNumRegions:
+    datatype: int
+    default: 4
+    paramtype: vlogparam
+    description: "Number of PMP regions"
+
+targets:
+  default: &default_target
+    filesets:
+      - files_rtl
+    parameters:
+      - RVFI=true
+    toplevel: cheriot_top_tracing
+
+  lint:
+    <<: *default_target
+    parameters:
+      - RVFI=true
+      - SYNTHESIS=true
+      - RV32E
+      - RV32M
+      - RV32B
+      - RegFile
+      - ICache
+      - ICacheECC
+      - BranchTargetALU
+      - WritebackStage
+      - BranchPredictor
+      - SecureIbex
+      - ICacheScramble
+      - PMPEnable
+      - PMPGranularity
+      - PMPNumRegions
+    default_tool: verilator
+    tools:
+      verilator:
+        mode: lint-only
+        verilator_options:
+          - "-Wall"
+          # RAM primitives wider than 64bit (required for ECC) fail to build in
+          # Verilator without increasing the unroll count (see Verilator#1266)
+          - "--unroll-count 72"
+  format:
+    filesets:
+      - files_rtl
+    parameters:
+      - SYNTHESIS=true
+      - RVFI=true
+    default_tool: veribleformat
+    toplevel: cheriot_top_tracing
+    tools:
+      veribleformat:
+        verible_format_args:
+          - "--inplace"
+          - "--formal_parameters_indentation=indent"
+          - "--named_parameter_indentation=indent"
+          - "--named_port_indentation=indent"
+          - "--port_declarations_indentation=indent"
diff --git a/hw/ip/cheriot-ibex/cheriot_tracer.core b/hw/ip/cheriot-ibex/cheriot_tracer.core
new file mode 100644
index 0000000..e9bbce5
--- /dev/null
+++ b/hw/ip/cheriot-ibex/cheriot_tracer.core
@@ -0,0 +1,20 @@
+CAPI=2:
+# Copyright lowRISC contributors.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+name: "lowrisc:ibex:cheriot_tracer:0.1"
+description: "Tracer for use with Ibex using the RVFI interface"
+filesets:
+  files_rtl:
+    depend:
+      - lowrisc:prim:assert
+      - lowrisc:ibex:cheriot_pkg
+    files:
+      - rtl/cheriot_tracer_pkg.sv
+      - rtl/cheriot_tracer.sv
+    file_type: systemVerilogSource
+
+targets:
+  default:
+    filesets:
+      - files_rtl
diff --git a/hw/ip/cheriot-ibex/lint/verible_waiver.vbw b/hw/ip/cheriot-ibex/lint/verible_waiver.vbw
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/hw/ip/cheriot-ibex/lint/verible_waiver.vbw
diff --git a/hw/ip/cheriot-ibex/lint/verilator_waiver.vlt b/hw/ip/cheriot-ibex/lint/verilator_waiver.vlt
new file mode 100644
index 0000000..b7c952c
--- /dev/null
+++ b/hw/ip/cheriot-ibex/lint/verilator_waiver.vlt
@@ -0,0 +1,72 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Lint waivers for Verilator
+// See https://www.veripool.org/projects/verilator/wiki/Manual-verilator#CONFIGURATION-FILES
+// for documentation.
+//
+// Important: This file must be included *before* any other Verilog file is read.
+// Otherwise, only global waivers are applied, but not file-specific waivers.
+
+`verilator_config
+lint_off -rule PINCONNECTEMPTY
+
+// We have some boolean top-level parameters in e.g. cheriot_top_tracing.sv.
+// When building with fusesoc, these get set with defines like
+// -GRV32M=1 (rather than -GRV32M=1'b1), leading to warnings like:
+//
+//   Operator VAR '<varname>' expects 1 bits on the Initial value, but
+//   Initial value's CONST '32'h1' generates 32 bits.
+//
+// This signoff rule ignores errors like this. Note that it only
+// matches when you set a 1-bit value to a literal 1, so it won't hide
+// silly mistakes like setting it to 2.
+//
+lint_off -rule WIDTH -file "*/rtl/cheriot_top_tracing.sv"
+         -match "*expects 1 bits*Initial value's CONST '32'h1'*"
+
+// Operator expects 1 bit on initial value but initial value's CONST generates
+// 32 bits, need a specific RV32B waiver as it uses enums so the above catch-all
+// waiver doesn't work.
+lint_off -rule WIDTH -file "*/rtl/cheriot_top_tracing.sv" -match "*'RV32B'*"
+
+// Bits of signal are not used: be_i[3:1]
+// Bits of signal are not used: addr_i[31:10,1:0]
+// Bits of signal are not used: wdata_i[31:8]
+//
+// simulator_ctrl exposes a 32-bit write-only interface to its control
+// registers, but actually only looks at the bottom byte and rounds addresses
+// down to be 4-byte aligned.
+//
+lint_off -rule UNUSED -file "*/rtl/sim/simulator_ctrl.sv" -match "*'be_i'[3:1]*"
+lint_off -rule UNUSED -file "*/rtl/sim/simulator_ctrl.sv" -match "*'addr_i'[31:10,1:0]*"
+lint_off -rule UNUSED -file "*/rtl/sim/simulator_ctrl.sv" -match "*'wdata_i'[31:8]*"
+
+// Bits of signal are not used: timer_addr_i[31:10]
+//
+// The upper bits of this address are used to select whether the timer is
+// addressed at all (encoded in the timer_req_i input). However, we pass the
+// entire 32-bit address around to make the code a bit cleaner.
+lint_off -rule UNUSED -file "*/rtl/timer.sv" -match "*'timer_addr_i'[31:10]*"
+
+// Signal is not used: clk_i
+// leaving clk and reset connected in-case we want to add assertions
+lint_off -rule UNUSED -file "*/rtl/cheriot_pmp.sv" -match "*clk_i*"
+lint_off -rule UNUSED -file "*/rtl/cheriot_compressed_decoder.sv" -match "*clk_i*"
+lint_off -rule UNUSED -file "*/rtl/cheriot_decoder.sv" -match "*clk_i*"
+lint_off -rule UNUSED -file "*/rtl/cheriot_branch_predict.sv" -match "*clk_i*"
+
+// Signal is not used: rst_ni
+// leaving clk and reset connected in-case we want to add assertions
+lint_off -rule UNUSED -file "*/rtl/cheriot_pmp.sv" -match "*rst_ni*"
+lint_off -rule UNUSED -file "*/rtl/cheriot_compressed_decoder.sv" -match "*rst_ni*"
+lint_off -rule UNUSED -file "*/rtl/cheriot_decoder.sv" -match "*rst_ni*"
+lint_off -rule UNUSED -file "*/rtl/cheriot_branch_predict.sv" -match "*rst_ni*"
+
+// Temporary waivers until OpenTitan primitives are lint-clean
+// https://github.com/lowRISC/opentitan/issues/2313
+lint_off -file "*/lowrisc_prim_*/rtl/*.sv"
+
+lint_off -rule UNUSED -file "*/rtl/cheriot_top_tracing.sv" -match "*RndCnstLfsrSeed*"
+lint_off -rule UNUSED -file "*/rtl/cheriot_top_tracing.sv" -match "*RndCnstLfsrPerm*"
diff --git a/hw/ip/cheriot-ibex/rtl/cheri_decoder.sv b/hw/ip/cheriot-ibex/rtl/cheri_decoder.sv
new file mode 100644
index 0000000..113c95f
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheri_decoder.sv
@@ -0,0 +1,130 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Cheri instruction decoder
+// should we merge this with cheri_EX? let's leave it alone for now since we may look into
+// a separate decoder PL stage later
+
+module cheri_decoder import cheri_pkg::*; # (
+  parameter bit CheriPPLBC = 1'b1,  // 0: CLOAD_CAP reported multicycle when cheri_tsafe_en_i is set
+  parameter bit CheriSBND2 = 1'b0   // 1: CSET_BOUNDS/_IMM/_EX, CRRL, CRAM reported multicycle
+) (
+  input  logic [31:0]     instr_rdata_i,            // instruction word; all fields are sliced from it
+  input  logic            cheri_opcode_en_i,       // op = 0x5b
+  input  logic            cheri_tsafe_en_i,         // qualifies the CLOAD_CAP multicycle indication
+  input  logic            cheri_auipcc_en_i,        // op = 0x17 (AUIPC)
+  input  logic            cheri_auicgp_en_i,        // op = 0x7b (AUICGP)
+  input  logic            cheri_jalr_en_i,          // op = 0x67 (JALR)
+  input  logic            cheri_jal_en_i,           // op = 0x6f (JAL)
+  input  logic            cheri_cload_en_i,         // op = 0x3, [14:12] = 0x3 (LD)
+  input  logic            cheri_cstore_en_i,        // op = 0x23, [14:12] = 0x3 (SD)
+  output logic            instr_is_cheri_o,         // instr in cheri space
+  output logic            instr_is_legal_cheri_o,   // legal cheri instruction
+  output logic [11:0]     cheri_imm12_o,            // instr[31:20], or {instr[31:25], instr[11:7]} for stores
+  output logic [19:0]     cheri_imm20_o,            // instr[31:12] for CAUIPCC/CAUICGP, else 0
+  output logic [20:0]     cheri_imm21_o,            // CJAL offset with bit 0 forced to 0, else 0
+  output logic [OPDW-1:0] cheri_operator_o,         // one bit per operator, indexed by cheri_pkg names
+  output logic  [4:0]     cheri_cs2_dec_o,          // instr[24:20] for CCSR_RW, else 0
+  output logic            cheri_rf_ren_a_o,         // tied to 1
+  output logic            cheri_rf_ren_b_o,
+  output logic            cheri_rf_we_dec_o,
+  output logic            cheri_multicycle_dec_o
+  );
+
+  logic  [6:0] unused_opcode;   // instr[6:0]; assigned but not read by any logic in this module
+  logic  [2:0] func3_op;
+  logic  [6:0] func7_op;
+  logic  [4:0] imm5_op;
+  logic  [4:0] rd_op;
+
+  // note there are 3 encoding formats of CHERI instructions
+  //  - fmt1: I-format, func3(14:12) = subFunc.
+  //  - fmt2: R-format, func3(14:12) = 0x0, func7(31:25) = subFunc, etc.
+  //  - fmt3: I-format, func3(14:12) = 0x0, func7(31:25) = 0x7f, imm5(24:20) = subFunc
+  //  - opcode [6:0] == 0x5b for all CHERI instructions
+  assign unused_opcode = instr_rdata_i[6:0];
+  assign func3_op      = instr_rdata_i[14:12];
+  assign func7_op      = instr_rdata_i[31:25];
+  assign imm5_op       = instr_rdata_i[24:20];
+  assign rd_op         = instr_rdata_i[11:7];
+
+  always_comb begin
+    cheri_operator_o = OPDW'('h0);
+    // fmt2: func3 == 0, sub-function selected by func7
+    cheri_operator_o[CCSR_RW]         = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h01);
+    cheri_operator_o[CSET_BOUNDS]     = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h08);
+    cheri_operator_o[CSET_BOUNDS_EX]  = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h09);
+    cheri_operator_o[CSET_BOUNDS_RNDN]= cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h0a);
+    cheri_operator_o[CSEAL]           = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h0b);
+    cheri_operator_o[CUNSEAL]         = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h0c);
+    cheri_operator_o[CAND_PERM]       = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h0d);
+    cheri_operator_o[CSET_ADDR]       = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h10);
+    cheri_operator_o[CINC_ADDR]       = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h11);
+    cheri_operator_o[CSUB_CAP]        = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h14);
+    cheri_operator_o[CSET_HIGH]       = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h16);
+    cheri_operator_o[CIS_SUBSET]      = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h20);
+    cheri_operator_o[CIS_EQUAL]       = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h21);
+
+    // fmt3: func3 == 0 and func7 == 0x7f, sub-function selected by imm5 (instr[24:20])
+    cheri_operator_o[CGET_PERM]       = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h7f) && (imm5_op==5'h00);
+    cheri_operator_o[CGET_TYPE]       = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h7f) && (imm5_op==5'h01);
+    cheri_operator_o[CGET_BASE]       = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h7f) && (imm5_op==5'h02);
+    cheri_operator_o[CGET_HIGH]        = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h7f) && (imm5_op==5'h17);
+    cheri_operator_o[CGET_TOP]        = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h7f) && (imm5_op==5'h18);
+    cheri_operator_o[CGET_LEN]        = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h7f) && (imm5_op==5'h03);
+    cheri_operator_o[CGET_TAG]        = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h7f) && (imm5_op==5'h04);
+    cheri_operator_o[CRRL]            = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h7f) && (imm5_op==5'h08);
+    cheri_operator_o[CRAM]            = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h7f) && (imm5_op==5'h09);
+    cheri_operator_o[CGET_ADDR]       = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h7f) && (imm5_op==5'h0f);
+    cheri_operator_o[CMOVE_CAP]       = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h7f) && (imm5_op==5'h0a);
+    cheri_operator_o[CCLEAR_TAG]      = cheri_opcode_en_i && (func3_op==0) && (func7_op==7'h7f) && (imm5_op==5'h0b);
+
+    cheri_operator_o[CINC_ADDR_IMM]   = cheri_opcode_en_i && (func3_op == 1);  // fmt1
+    cheri_operator_o[CSET_BOUNDS_IMM] = cheri_opcode_en_i && (func3_op == 2);  // fmt1
+
+    // operators outside opcode 0x5b: selected directly by the dedicated *_en_i inputs
+    cheri_operator_o[CAUIPCC]         = cheri_auipcc_en_i;
+    cheri_operator_o[CAUICGP]         = cheri_auicgp_en_i;
+    cheri_operator_o[CJALR]           = cheri_jalr_en_i;
+    cheri_operator_o[CJAL]            = cheri_jal_en_i;
+    cheri_operator_o[CLOAD_CAP]       = cheri_cload_en_i;
+    // cheri_operator_o[CLBC]            = cheri_cload_en_i & ~func3_op[2] & cheri_tsafe_en_i;
+    cheri_operator_o[CSTORE_CAP]      = cheri_cstore_en_i;
+  end
+
+  // partially decoded, early signal to control muxing and regfile read
+  assign instr_is_cheri_o       = cheri_opcode_en_i | cheri_jalr_en_i | cheri_jal_en_i |
+                                  cheri_auipcc_en_i | cheri_auicgp_en_i | cheri_cload_en_i | cheri_cstore_en_i;
+
+  assign instr_is_legal_cheri_o = |cheri_operator_o;  // legal iff at least one operator bit decoded
+
+  assign cheri_cs2_dec_o  = cheri_operator_o[CCSR_RW] ? imm5_op : 0;
+
+  assign cheri_imm12_o    = (cheri_operator_o[CJALR]|cheri_operator_o[CSET_BOUNDS_IMM]|
+                             cheri_operator_o[CINC_ADDR_IMM]|cheri_operator_o[CLOAD_CAP]) ?
+                            {func7_op, imm5_op}:(cheri_operator_o[CSTORE_CAP]?{func7_op, rd_op}:0);
+
+  assign cheri_imm20_o    = (cheri_operator_o[CAUIPCC]|cheri_operator_o[CAUICGP])  ? instr_rdata_i[31:12] : 0;
+
+  assign cheri_imm21_o    = cheri_operator_o[CJAL]  ? {instr_rdata_i[31], instr_rdata_i[19:12],
+                                                       instr_rdata_i[20], instr_rdata_i[30:21], 1'b0} : 0;
+
+  // register dependency decoding (ren_a, ren_b, we)
+  // only handled opcode=0x5b case here.
+  // Will be qualified and combined with other cases by the main decoder ("ibexc_decoder" in the
+  // original note; presumably cheriot_decoder - TODO confirm). ren_b excludes fmt3 and CCSR_RW.
+  assign cheri_rf_ren_a_o = 1'b1;
+  assign cheri_rf_ren_b_o = (func3_op == 0) && (func7_op != 7'h7f) && (func7_op !=7'h01);
+ 
+  // cheri_rf_we_dec_o is not used to generate the actual regfile write enables in the case of
+  // cheri instructions (which is in cheri_ex and  muxed with rf_we in wb_stage).
+  // However it is merged into the overall rf_we and used to generate stall_cheri_trvk
+  assign cheri_rf_we_dec_o = cheri_opcode_en_i & (|cheri_operator_o);
+
+  assign cheri_multicycle_dec_o = (cheri_operator_o[CLOAD_CAP] & cheri_tsafe_en_i & ~CheriPPLBC) |
+                                  (CheriSBND2 & (cheri_operator_o[CSET_BOUNDS] |
+                                   cheri_operator_o[CSET_BOUNDS_IMM] |
+                                   cheri_operator_o[CSET_BOUNDS_EX] |
+                                   cheri_operator_o[CRRL] | cheri_operator_o[CRAM]));
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheri_ex.sv b/hw/ip/cheriot-ibex/rtl/cheri_ex.sv
new file mode 100644
index 0000000..45dd6c2
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheri_ex.sv
@@ -0,0 +1,1172 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+module cheri_ex import cheri_pkg::*; #(
+  parameter bit          WritebackStage = 1'b0,
+  parameter bit          MemCapFmt      = 1'b0,
+  parameter int unsigned HeapBase       = 32'h2001_0000,
+  parameter int unsigned TSMapBase      = 32'h2002_f000,
+  parameter int unsigned TSMapSize      = 1024,
+  parameter bit          CheriPPLBC     = 1'b1,
+  parameter bit          CheriSBND2     = 1'b0,
+  parameter bit          CheriStkZ      = 1'b1,
+  parameter bit          CheriCapIT8    = 1'b0
+)(
+   // Clock and Reset
+  input  logic          clk_i,
+  input  logic          rst_ni,
+
+  // configuration & control
+  input  logic          cheri_pmode_i,
+  input  logic          cheri_tsafe_en_i,
+  input  logic          debug_mode_i,
+
+  // data forwarded from WB stage
+  input  logic          fwd_we_i,
+  input  logic  [4:0]   fwd_waddr_i,
+  input  logic [31:0]   fwd_wdata_i,
+  input  reg_cap_t      fwd_wcap_i,
+
+  // regfile interface
+  input  logic  [4:0]   rf_raddr_a_i,
+  input  logic [31:0]   rf_rdata_a_i,
+  input  reg_cap_t      rf_rcap_a_i,
+  input  logic  [4:0]   rf_raddr_b_i,
+  input  logic [31:0]   rf_rdata_b_i,
+  input  reg_cap_t      rf_rcap_b_i,
+  output logic          rf_trsv_en_o,
+  input  logic  [4:0]   rf_waddr_i,
+
+  // pcc interface
+  input  pcc_cap_t      pcc_cap_i,
+  output pcc_cap_t      pcc_cap_o,
+  input  logic [31:0]   pc_id_i,
+
+  // use branch_req_o also to update pcc cap
+  output logic          branch_req_o,          // update PCC (goes to cs_registers)
+  output logic          branch_req_spec_o,     // speculative branch request (go to IF)
+  output logic [31:0]   branch_target_o,
+
+  // Interface to ID stage control logic
+  input  logic          cheri_exec_id_i,
+  input  logic          instr_first_cycle_i,   // 1st exec cycle allowing lsu_req
+
+  // inputs from decoder
+  input  logic          instr_valid_i,
+  input  logic          instr_is_cheri_i,
+  input  logic          instr_is_rv32lsu_i,
+  input  logic          instr_is_compressed_i,
+  input  logic [11:0]   cheri_imm12_i,
+  input  logic [19:0]   cheri_imm20_i,
+  input  logic [20:0]   cheri_imm21_i,
+  input  logic  [4:0]   cheri_cs2_dec_i,       // cs2 used for CSR address
+  input  logic [OPDW-1:0] cheri_operator_i,
+
+  // output to wb stage
+  output logic          cheri_rf_we_o,
+  output logic [31:0]   result_data_o,
+  output reg_cap_t      result_cap_o,
+
+  output logic          cheri_ex_valid_o,
+  output logic          cheri_ex_err_o,
+  output logic [11:0]   cheri_ex_err_info_o,
+  output logic          cheri_wb_err_o,
+  output logic [15:0]   cheri_wb_err_info_o,
+
+  // lsu interface
+  output logic          lsu_req_o,
+  output logic          lsu_cheri_err_o,
+  output logic          lsu_is_cap_o,
+  output logic  [3:0]   lsu_lc_clrperm_o,
+  output logic          lsu_we_o,
+  output logic [31:0]   lsu_addr_o,
+  output logic [1:0]    lsu_type_o,
+  output logic [32:0]   lsu_wdata_o,
+  output reg_cap_t      lsu_wcap_o,
+  output logic          lsu_sign_ext_o,
+  output logic          cpu_stall_by_stkz_o,
+  output logic          cpu_grant_to_stkz_o,
+
+  input  logic          addr_incr_req_i,
+  input  logic [31:0]   addr_last_i,
+  input  logic          lsu_req_done_i,
+  input  logic [32:0]   lsu_rdata_i,
+  input  reg_cap_t      lsu_rcap_i,
+
+  // LSU interface to the existing core (muxed)
+  input  logic          rv32_lsu_req_i,
+  input  logic          rv32_lsu_we_i,
+  input  logic [1:0]    rv32_lsu_type_i,
+  input  logic [31:0]   rv32_lsu_wdata_i,
+  input  logic          rv32_lsu_sign_ext_i,
+  input  logic [31:0]   rv32_lsu_addr_i,
+  output logic          rv32_addr_incr_req_o,
+  output logic [31:0]   rv32_addr_last_o,
+
+  // TBRE LSU request (for muxing)
+  input  logic          lsu_tbre_sel_i,
+  input  logic          tbre_lsu_req_i,
+  input  logic          tbre_lsu_is_cap_i,
+  input  logic          tbre_lsu_we_i,
+  input  logic [31:0]   tbre_lsu_addr_i,
+  input  logic [32:0]   tbre_lsu_wdata_i,
+  output logic          cpu_lsu_dec_o,
+
+  input  logic [31:0]   csr_rdata_i,
+  input  reg_cap_t      csr_rcap_i,
+  input  logic          csr_mstatus_mie_i,
+  output logic          csr_access_o,
+  output logic  [4:0]   csr_addr_o,
+  output logic [31:0]   csr_wdata_o,
+  output reg_cap_t      csr_wcap_o,
+  output cheri_csr_op_e csr_op_o,
+  output logic          csr_op_en_o,
+  output logic          csr_set_mie_o,
+  output logic          csr_clr_mie_o,
+
+  // stack highwater mark updates
+  input  logic [31:0]   csr_mshwm_i,
+  input  logic [31:0]   csr_mshwmb_i,
+  output logic          csr_mshwm_set_o,
+  output logic [31:0]   csr_mshwm_new_o,
+  
+  // stack fast clearing control signals
+  input  logic          stkz_active_i,
+  input  logic          stkz_abort_i,
+  input  logic [31:0]   stkz_ptr_i,
+  input  logic [31:0]   stkz_base_i,
+
+  output logic          ztop_wr_o,
+  output logic [31:0]   ztop_wdata_o,
+  output full_cap_t     ztop_wfcap_o,
+  input  logic [31:0]   ztop_rdata_i,
+  input  reg_cap_t      ztop_rcap_i,
+
+  // debug feature
+  input  logic          csr_dbg_tclr_fault_i
+);
+
+  localparam int unsigned TSMapTop = TSMapBase+TSMapSize*4;
+
+  logic          cheri_lsu_req;
+  logic          cheri_lsu_we;
+  logic [31:0]   cheri_lsu_addr;
+  logic [32:0]   cheri_lsu_wdata;
+  reg_cap_t      cheri_lsu_wcap;
+  logic          cheri_lsu_err;
+  logic          cheri_lsu_is_cap;
+
+  logic [31:0]   rf_rdata_a, rf_rdata_ng_a;
+  logic [31:0]   rf_rdata_b, rf_rdata_ng_b;
+
+  reg_cap_t      rf_rcap_a, rf_rcap_ng_a;
+  reg_cap_t      rf_rcap_b, rf_rcap_ng_b;
+
+  full_cap_t     rf_fullcap_a, rf_fullcap_b;
+
+  reg_cap_t      csc_wcap;
+
+  logic          is_load_cap, is_store_cap, is_cap;
+
+  logic          addr_bound_vio;
+  logic          perm_vio, perm_vio_slc;
+  logic          rv32_lsu_err;
+  logic          addr_bound_vio_rv32;
+  logic          perm_vio_rv32;
+
+  logic [W_PVIO-1:0]  perm_vio_vec, perm_vio_vec_rv32;
+
+  logic  [31:0]  cs1_addr_plusimm;
+  logic  [31:0]  cs1_imm;
+  logic  [31:0]  addr_result;
+
+
+  logic          cheri_rf_we_raw, branch_req_raw, branch_req_spec_raw;
+  logic          csr_set_mie_raw, csr_clr_mie_raw;
+  logic          cheri_ex_valid_raw, cheri_ex_err_raw;
+  logic          csr_op_en_raw;
+  logic          cheri_wb_err_raw;
+  logic          cheri_wb_err_q, cheri_wb_err_d; 
+  logic          ztop_wr_raw;
+
+  logic   [3:0]  cheri_lsu_lc_clrperm;
+  logic          lc_cglg, lc_csdlm, lc_ctag;
+  logic  [31:0]  pc_id_nxt;
+
+  full_cap_t     setaddr1_outcap, setbounds_outcap, setbounds_rndn_outcap;
+  logic  [15:0]  cheri_wb_err_info_q, cheri_wb_err_info_d;
+  logic          set_bounds_done;
+
+  logic   [4:0]  cheri_err_cause, rv32_err_cause;
+  logic   [31:0] cpu_lsu_addr;
+  logic   [32:0] cpu_lsu_wdata;
+  logic          cpu_lsu_we;
+  logic          cpu_lsu_cheri_err, cpu_lsu_is_cap;
+
+  logic          illegal_scr_addr;
+  logic          scr_legalization;
+
+  // data forwarding for CHERI instructions
+  //  - note address 0 is a read-only location per RISC-V
+  always_comb begin : fwd_data_merger
+    logic fwd_hit_a, fwd_hit_b;
+
+    // a WB-stage write is forwarded when it targets the register being read;
+    // reads of register 0 are never forwarded
+    fwd_hit_a = (rf_raddr_a_i == fwd_waddr_i) && fwd_we_i && (|rf_raddr_a_i);
+    fwd_hit_b = (rf_raddr_b_i == fwd_waddr_i) && fwd_we_i && (|rf_raddr_b_i);
+
+    // start from the regfile read values, then override with the forwarded value on a hit
+    rf_rdata_ng_a = rf_rdata_a_i;
+    rf_rcap_ng_a  = rf_rcap_a_i;
+    rf_rdata_ng_b = rf_rdata_b_i;
+    rf_rcap_ng_b  = rf_rcap_b_i;
+
+    if (fwd_hit_a) begin rf_rdata_ng_a = fwd_wdata_i; rf_rcap_ng_a = fwd_wcap_i; end
+    if (fwd_hit_b) begin rf_rdata_ng_b = fwd_wdata_i; rf_rcap_ng_b = fwd_wcap_i; end
+  end
+
+  // 1st level of operand gating (power-saving)
+  //  - gate off the input to reg2full conversion logic
+  //  - note rv32 lsu requests only use cs1, so only operand a is kept alive for them
+  //  - may need to use dont_touch gates
+  assign rf_rcap_a   = (instr_is_cheri_i | instr_is_rv32lsu_i) ? rf_rcap_ng_a : NULL_REG_CAP;
+  assign rf_rdata_a  = (instr_is_cheri_i | instr_is_rv32lsu_i) ? rf_rdata_ng_a : 32'h0;
+
+  assign rf_rcap_b   = instr_is_cheri_i ? rf_rcap_ng_b : NULL_REG_CAP;
+  assign rf_rdata_b  = instr_is_cheri_i ? rf_rdata_ng_b : 32'h0;
+
+  // expand the (gated) register capabilities into full capabilities
+  assign rf_fullcap_a = reg2fullcap(rf_rcap_a, rf_rdata_a);
+  assign rf_fullcap_b = reg2fullcap(rf_rcap_b, rf_rdata_b);
+
+  // gate these signals with cheri_exec_id to make sure they are only active when needed
+  // (only 1 cycle in all cases other than cheri_rf_we)
+  // -- safest approach and probably the right thing to do in case there is a wb_exception
+  assign cheri_rf_we_o     = cheri_rf_we_raw & cheri_exec_id_i;
+  assign branch_req_o      = branch_req_raw & cheri_exec_id_i;
+  assign branch_req_spec_o = branch_req_spec_raw & cheri_exec_id_i;
+  assign csr_set_mie_o     = csr_set_mie_raw & cheri_exec_id_i;
+  assign csr_clr_mie_o     = csr_clr_mie_raw & cheri_exec_id_i;
+  assign csr_op_en_o       = csr_op_en_raw & cheri_exec_id_i;
+  assign ztop_wr_o         = ztop_wr_raw & cheri_exec_id_i;
+
+  // ex_valid is only used in the multicycle case
+  // ex_err is used for id exceptions (and is suppressed while in debug mode)
+  assign cheri_ex_valid_o = cheri_ex_valid_raw & cheri_exec_id_i;
+  assign cheri_ex_err_o   = cheri_ex_err_raw & cheri_exec_id_i & ~debug_mode_i;
+
+  if (WritebackStage) begin      // report the _q version of the wb error when a WB stage exists
+    assign cheri_wb_err_o   = cheri_wb_err_q;
+  end else begin                 // otherwise report the _d version directly
+    assign cheri_wb_err_o   = cheri_wb_err_d;
+  end
+
+  assign cheri_lsu_lc_clrperm = debug_mode_i ? 4'h0 : {lc_ctag, 1'b0, lc_csdlm, lc_cglg};  // all-zero in debug mode
+
+  always_comb begin : main_ex   // result/control generation, one case arm per CHERI operator (group)
+    logic [PERMS_W-1:0] perms_temp;
+    full_cap_t          tfcap;
+
+    // defaults: every output is inactive/NULL unless the selected operator overrides it below
+    cheri_rf_we_raw      = 1'b0;
+    result_data_o        = 32'h0;
+    result_cap_o         = NULL_REG_CAP;
+    csc_wcap             = NULL_REG_CAP;
+    cheri_ex_valid_raw   = 1'b0;
+    cheri_ex_err_raw     = 1'b0;
+    cheri_wb_err_raw     = 1'b0;
+    perms_temp           = 0;    // NOTE(review): not read by any case arm in this block
+
+    csr_access_o         = 1'b0;
+    csr_addr_o           = 5'h0;
+    csr_wdata_o          = 32'h0;
+    csr_wcap_o           = NULL_REG_CAP;
+    csr_op_o             = CHERI_CSR_NULL;
+    csr_op_en_raw        = 1'b0;
+    scr_legalization     = 1'b0;
+
+    branch_req_raw       = 1'b0;
+    branch_req_spec_raw  = 1'b0;
+    csr_set_mie_raw      = 1'b0;
+    csr_clr_mie_raw      = 1'b0;
+    branch_target_o      = 32'h0;
+    pcc_cap_o            = NULL_PCC_CAP;
+    tfcap                = NULL_FULL_CAP;
+    lc_cglg              = 1'b0;
+    lc_csdlm             = 1'b0;
+    lc_ctag              = 1'b0;
+    rf_trsv_en_o         = 1'b0;
+    ztop_wr_raw          = 1'b0;
+    ztop_wdata_o         = 32'h0;
+    ztop_wfcap_o         = NULL_FULL_CAP;
+
+    unique case (1'b1)           // each arm matches when its operator bit (or OR of bits) is set
+      cheri_operator_i[CGET_PERM]:
+        begin
+          result_data_o       = {19'h0, rf_fullcap_a.perms};
+          result_cap_o        = NULL_REG_CAP;   // zero out the cap msw
+          cheri_rf_we_raw     = 1'b1;
+          cheri_ex_valid_raw  = 1'b1;
+        end
+      cheri_operator_i[CGET_TYPE]:
+        begin
+          result_data_o       = {28'h0, decode_otype(rf_fullcap_a.otype, rf_fullcap_a.perms[PERM_EX])};
+          result_cap_o        = NULL_REG_CAP;
+          cheri_rf_we_raw     = 1'b1;
+          cheri_ex_valid_raw  = 1'b1;
+        end
+      cheri_operator_i[CGET_BASE]:
+        begin
+          result_data_o       = rf_fullcap_a.base32;
+          result_cap_o        = NULL_REG_CAP;
+          cheri_rf_we_raw     = 1'b1;
+          cheri_ex_valid_raw  = 1'b1;
+        end
+      cheri_operator_i[CGET_TOP]:
+        begin
+          result_data_o       = rf_fullcap_a.top33[32] ? 32'hffff_ffff : rf_fullcap_a.top33[31:0];
+          result_cap_o        = NULL_REG_CAP;
+          cheri_rf_we_raw     = 1'b1;
+          cheri_ex_valid_raw  = 1'b1;
+        end
+      cheri_operator_i[CGET_LEN]:
+        begin
+          result_data_o       = get_cap_len(rf_fullcap_a);
+          result_cap_o        = NULL_REG_CAP;
+          cheri_rf_we_raw     = 1'b1;
+          cheri_ex_valid_raw  = 1'b1;
+        end
+      cheri_operator_i[CGET_TAG]:
+        begin
+          result_data_o       = {31'h0, rf_fullcap_a.valid};
+          result_cap_o        = NULL_REG_CAP;
+          cheri_rf_we_raw     = 1'b1;
+          cheri_ex_valid_raw  = 1'b1;
+        end
+      cheri_operator_i[CGET_ADDR]:
+        begin
+          result_data_o       = rf_rdata_a;
+          result_cap_o        = NULL_REG_CAP;
+          cheri_rf_we_raw     = 1'b1;
+          cheri_ex_valid_raw  = 1'b1;
+        end
+      cheri_operator_i[CGET_HIGH]:
+        begin
+          logic [65:0] tmp66;
+          tmp66 = MemCapFmt ? (CheriCapIT8 ? reg2mem_it8_fmt1(rf_rcap_a, rf_rdata_a) :
+                                             reg2mem_fmt1(rf_rcap_a, rf_rdata_a)) :
+                              (CheriCapIT8 ? {reg2memcap_it8_fmt0(rf_rcap_a), 1'b0, rf_rdata_a[31:0]} :
+                                             {reg2memcap_fmt0(rf_rcap_a), 1'b0, rf_rdata_a[31:0]});
+          result_data_o       = tmp66[64:33];
+          result_cap_o        = NULL_REG_CAP;
+          cheri_rf_we_raw     = 1'b1;
+          cheri_ex_valid_raw  = 1'b1;
+        end
+      (cheri_operator_i[CSEAL] | cheri_operator_i[CUNSEAL]):
+        begin                   // cd <-- cs1; cd.otype <-- cs2.otype; cd.sealed <-- val
+          result_data_o        = rf_rdata_a;
+
+          if (cheri_operator_i[CSEAL])
+            result_cap_o       = full2regcap(seal_cap(rf_fullcap_a, rf_rdata_b[OTYPE_W-1:0]));
+          else begin
+            tfcap                = unseal_cap(rf_fullcap_a);
+            tfcap.perms[PERM_GL] = rf_fullcap_a.perms[PERM_GL] & rf_fullcap_b.perms[PERM_GL];
+            tfcap.cperms         = compress_perms(tfcap.perms, tfcap.cperms[5:4]);
+            result_cap_o         = full2regcap(tfcap);
+          end
+
+          result_cap_o.valid   = result_cap_o.valid & (~addr_bound_vio) & (~perm_vio);
+          cheri_rf_we_raw      = 1'b1;
+          cheri_ex_valid_raw   = 1'b1;
+        end
+      cheri_operator_i[CAND_PERM]:         // cd <-- cs1; cd.perm <-- cd.perm & rs2
+        begin
+          logic [PERMS_W-1:0] pmask;
+          result_data_o      = rf_rdata_a;
+          tfcap              = rf_fullcap_a;
+          tfcap.perms        = tfcap.perms & rf_rdata_b[PERMS_W-1:0];
+          tfcap.cperms       = compress_perms(tfcap.perms, tfcap.cperms[5:4]);
+          // for sealed caps, clear tag unless perm mask (excluding GL) == all '1'
+          pmask              = rf_rdata_b[PERMS_W-1:0];
+          pmask[PERM_GL]     = 1'b1;
+          tfcap.valid        = tfcap.valid & (~is_cap_sealed(rf_fullcap_a) | (&pmask));
+          result_cap_o       = full2regcap(tfcap);
+          cheri_rf_we_raw    = 1'b1;
+          cheri_ex_valid_raw = 1'b1;
+        end
+      cheri_operator_i[CSET_HIGH]:         // cd <-- cs1; cd.high <-- convert(rs2)
+        begin
+          // this only works for memcap_fmt0 for now QQQ
+          result_data_o      = rf_rdata_a;
+          result_cap_o       = CheriCapIT8 ? mem2regcap_it8_fmt0({1'b0, rf_rdata_b}, {1'b0, rf_rdata_a}, 4'h0) :
+                                             mem2regcap_fmt0({1'b0, rf_rdata_b}, {1'b0, rf_rdata_a}, 4'h0);
+          cheri_rf_we_raw    = 1'b1;
+          cheri_ex_valid_raw = 1'b1;
+        end
+
+      // setaddr/incoffset: cd <-- cs1; cd.offset <-- rs2, or cs1.addr + rs2, or cs1.addr + imm12
+      // auipcc: cd <-- pcc, cd.address <-- pc + (sign-extended imm20 << 11), see shared_adder
+      (cheri_operator_i[CSET_ADDR] | cheri_operator_i[CINC_ADDR] |
+       cheri_operator_i[CINC_ADDR_IMM] | cheri_operator_i[CAUIPCC] | cheri_operator_i[CAUICGP]):
+        begin
+          logic clr_sealed;
+          logic instr_fault;
+
+          result_data_o        = addr_result;
+
+          // for pointer operations, follow C convention and allow newptr == top
+          clr_sealed           = cheri_operator_i[CAUIPCC] ? 1'b0 : is_cap_sealed(rf_fullcap_a);
+          tfcap                = setaddr1_outcap;
+          tfcap.valid          = tfcap.valid & ~clr_sealed;
+          result_cap_o         = full2regcap(tfcap);
+          instr_fault          = csr_dbg_tclr_fault_i & (rf_fullcap_a.valid | cheri_operator_i[CAUIPCC]) &
+                                 ~result_cap_o.valid;
+          cheri_wb_err_raw     = instr_fault;
+          cheri_rf_we_raw      = ~instr_fault;
+          cheri_ex_valid_raw   = 1'b1;
+        end
+      (cheri_operator_i[CSET_BOUNDS] | cheri_operator_i[CSET_BOUNDS_IMM] | cheri_operator_i[CSET_BOUNDS_EX] |
+       cheri_operator_i[CRRL] | cheri_operator_i[CRAM] | cheri_operator_i[CSET_BOUNDS_RNDN]):
+        begin                  // cd <-- cs1; cd.base <-- cs1.address, cd.len <-- rs2 or imm12
+          logic instr_fault;
+
+          tfcap            = cheri_operator_i[CSET_BOUNDS_RNDN] ? setbounds_rndn_outcap : setbounds_outcap;
+          tfcap.valid      = tfcap.valid & ~is_cap_sealed(rf_fullcap_a);
+
+          if (cheri_operator_i[CRRL]) begin
+            result_data_o = tfcap.rlen;
+            result_cap_o  = NULL_REG_CAP;
+          end else if (cheri_operator_i[CRAM]) begin
+            result_data_o = tfcap.maska;
+            result_cap_o  = NULL_REG_CAP;
+          end else begin
+            result_data_o = rf_rdata_a;
+            result_cap_o  = full2regcap(tfcap);
+          end
+
+          cheri_ex_valid_raw = set_bounds_done;
+          instr_fault        = csr_dbg_tclr_fault_i & rf_fullcap_a.valid & ~result_cap_o.valid &
+                             (cheri_operator_i[CSET_BOUNDS] | cheri_operator_i[CSET_BOUNDS_IMM] |
+                              cheri_operator_i[CSET_BOUNDS_EX] | cheri_operator_i[CSET_BOUNDS_RNDN]);
+          cheri_rf_we_raw    = ~instr_fault;
+          cheri_wb_err_raw   = instr_fault;
+        end
+      cheri_operator_i[CCLEAR_TAG]:         // cd <-- cs1; cd.tag <-- '0'
+        begin
+          result_data_o        = rf_rdata_a;
+          result_cap_o         = rf_rcap_a;
+          result_cap_o.valid   = 1'b0;
+          cheri_rf_we_raw      = 1'b1;
+          cheri_ex_valid_raw   = 1'b1;
+        end
+      cheri_operator_i[CIS_SUBSET]:      // rd <-- (cs1.tag == cs2.tag) && (cs2 is_subset_of cs1)
+        begin
+          result_data_o       = 32'((rf_fullcap_a.valid  == rf_fullcap_b.valid) &&
+                                 ~addr_bound_vio && (&(rf_fullcap_a.perms | ~rf_fullcap_b.perms)));
+          result_cap_o        = NULL_REG_CAP;
+          cheri_rf_we_raw     = 1'b1;
+          cheri_ex_valid_raw  = 1'b1;
+        end
+      cheri_operator_i[CIS_EQUAL]:       // rd <-- (cs1 == cs2)
+        begin
+          result_data_o       = 32'(is_equal(rf_fullcap_a, rf_fullcap_b, rf_rdata_a, rf_rdata_b));
+          result_cap_o        = NULL_REG_CAP;
+          cheri_rf_we_raw     = 1'b1;
+          cheri_ex_valid_raw  = 1'b1;
+        end
+      cheri_operator_i[CSUB_CAP]:          // rd <-- cs1.addr - cs2.addr
+        begin
+          result_data_o       = rf_rdata_a - rf_rdata_b;
+          result_cap_o        = NULL_REG_CAP;
+          cheri_rf_we_raw     = 1'b1;
+          cheri_ex_valid_raw  = 1'b1;
+        end
+      cheri_operator_i[CMOVE_CAP]:         // cd <-- cs1
+        begin
+          result_data_o       = rf_rdata_a;
+          result_cap_o        = rf_rcap_a;
+          cheri_rf_we_raw     = 1'b1;
+          cheri_ex_valid_raw  = 1'b1;
+        end
+      cheri_operator_i[CLOAD_CAP]:
+        begin
+          lc_cglg              = ~rf_fullcap_a.perms[PERM_LG];
+          lc_csdlm             = ~rf_fullcap_a.perms[PERM_LM];
+          lc_ctag              = ~rf_fullcap_a.perms[PERM_MC];
+
+          result_data_o        = 32'h0;
+          result_cap_o         = NULL_REG_CAP;
+          cheri_rf_we_raw      = 1'b0;
+          cheri_ex_valid_raw   = 1'b1;             // lsu_req_done is factored in by id_stage
+          cheri_ex_err_raw     = 1'b0;             // acc err passed to LSU and processed later in WB
+          rf_trsv_en_o         = CheriPPLBC & cheri_tsafe_en_i & lsu_req_done_i;
+        end
+      cheri_operator_i[CSTORE_CAP]:
+        begin
+          result_data_o        = 32'h0;
+          result_cap_o         = NULL_REG_CAP;
+          cheri_rf_we_raw      = 1'b0;
+          cheri_ex_valid_raw   = 1'b1;
+          cheri_ex_err_raw     = 1'b0;       // acc err passed to LSU and processed later in WB
+          csc_wcap             = rf_rcap_b;
+          csc_wcap.valid       = rf_rcap_b.valid & ~perm_vio_slc;
+        end
+      cheri_operator_i[CCSR_RW]:           // cd <-- scr; scr <-- cs1 if cs1 != C0
+        begin
+          logic [31:0] tmp32;              // NOTE(review): not used anywhere in this arm
+          logic        is_ztop, is_write;
+          reg_cap_t    trcap;
+          logic        instr_fault;
+
+          is_ztop            = (cheri_cs2_dec_i==CHERI_SCR_ZTOPC);
+          is_write           = (rf_raddr_a_i != 0);
+          instr_fault        = perm_vio | illegal_scr_addr;
+
+          csr_access_o       = ~instr_fault;
+          csr_op_o           = CHERI_CSR_RW;
+          csr_op_en_raw      = ~instr_fault && is_write && ~is_ztop;
+          ztop_wr_raw        = ~instr_fault && is_write && is_ztop;
+          csr_addr_o         = cheri_cs2_dec_i;
+
+          if (cheri_cs2_dec_i == CHERI_SCR_MTCC) begin
+            // MTVEC/MTCC legalization (clear tag if checking fails)
+            // note we don't really need set_address checks here - it's only used to update temp fields
+            //   so that RTL behavior would match sail
+            scr_legalization = 1'b1;
+            csr_wdata_o      = {rf_rdata_a[31:2], 2'b00};
+            trcap            = full2regcap(setaddr1_outcap);
+            if ((rf_rdata_a[1:0] != 2'b00) || ~rf_fullcap_a.perms[PERM_EX] || (rf_fullcap_a.otype != 0))
+              trcap.valid = 1'b0;
+            else
+              trcap.valid = rf_fullcap_a.valid;
+            csr_wcap_o       = trcap;
+          end else if (cheri_cs2_dec_i == CHERI_SCR_MEPCC) begin
+            // MEPCC legalization (clear tag if checking fails)
+            scr_legalization = 1'b1;
+            csr_wdata_o      = {rf_rdata_a[31:1], 1'b0};
+            trcap            = full2regcap(setaddr1_outcap);
+            if ((rf_rdata_a[0] != 1'b0) || ~rf_fullcap_a.perms[PERM_EX] || (rf_fullcap_a.otype != 0))
+              trcap.valid = 1'b0;
+            else
+              trcap.valid = rf_fullcap_a.valid;
+            csr_wcap_o       = trcap;
+          end else begin
+            scr_legalization = 1'b0;
+            csr_wdata_o      = rf_rdata_a;
+            csr_wcap_o       = rf_rcap_a;
+          end
+
+          if (is_ztop) begin
+            result_data_o    = ztop_rdata_i;
+            result_cap_o     = ztop_rcap_i;
+            ztop_wfcap_o     = rf_fullcap_a;
+            ztop_wdata_o     = rf_rdata_a;
+          end else begin
+            result_data_o    = csr_rdata_i;
+            result_cap_o     = csr_rcap_i;
+            ztop_wfcap_o     = NULL_FULL_CAP;
+            ztop_wdata_o     = 32'h0;
+          end
+          cheri_rf_we_raw    = ~instr_fault;
+          cheri_ex_valid_raw = 1'b1;
+          cheri_wb_err_raw   = instr_fault;
+        end
+      (cheri_operator_i[CJALR] | cheri_operator_i[CJAL]):
+        begin                  // cd <-- pcc; pcc <-- cs1/pc+offset; pcc.address[0] <--'0'; pcc.sealed <--'0'
+          logic [2:0] seal_type;
+          logic       instr_fault;
+
+          // note this is the RV32 definition of JALR arithmetic (add first then mask off the lsb)
+          branch_target_o      = {addr_result[31:1], 1'b0};
+          pcc_cap_o            = full2pcap(unseal_cap(rf_fullcap_a));
+          // Note we can't directly use pc_if here
+          // (link address == pc_id + delta, but pc_if should be the next executed PC (the jump target)
+          //  if branch prediction works)
+          result_data_o        = pc_id_nxt;
+          seal_type            = csr_mstatus_mie_i ? OTYPE_SENTRY_IE_BKWD : OTYPE_SENTRY_ID_BKWD;
+          //tfcap                = seal_cap(setaddr1_outcap, seal_type);
+          tfcap                = (rf_waddr_i == 5'h1) ? seal_cap(setaddr1_outcap, seal_type) :
+                                                        setaddr1_outcap;
+          result_cap_o         = full2regcap(tfcap);
+
+          // problem with instr_fault: the pcc_cap.valid check causing timing issue on instr_addr_o
+          // -- use the speculative version for instruction fetch
+          // -- the ID exception (cheri_ex_err) flushes the pipeline and re-set PC so
+          //    the speculatively fetched instruction will be flushed
+          // -- this is now mitigated since we no longer do address bound checking here
+          //    but let's keep the solution for now
+
+          instr_fault           = perm_vio;
+
+          cheri_rf_we_raw      = ~instr_fault;    // err -> wb exception
+          branch_req_raw       = ~instr_fault & cheri_operator_i[CJALR];    // update PCC in CSR
+          // branch_req_spec_raw  = 1'b1;
+          branch_req_spec_raw  = ~instr_fault;    // set fetch PC
+
+          cheri_wb_err_raw     = instr_fault;
+          cheri_ex_err_raw     = 1'b0;
+          csr_set_mie_raw      = ~instr_fault && cheri_operator_i[CJALR] &&
+                                 ((rf_fullcap_a.otype == OTYPE_SENTRY_IE_FWD) ||
+                                  (rf_fullcap_a.otype == OTYPE_SENTRY_IE_BKWD)) ;
+          csr_clr_mie_raw      = ~instr_fault && cheri_operator_i[CJALR] &&
+                                 ((rf_fullcap_a.otype == OTYPE_SENTRY_ID_FWD) ||
+                                  (rf_fullcap_a.otype == OTYPE_SENTRY_ID_BKWD)) ;
+          cheri_ex_valid_raw   = 1'b1;
+        end
+      default:;
+    endcase
+  end   // always_comb main_ex
+
+  assign is_load_cap  = cheri_operator_i[CLOAD_CAP];
+  assign is_store_cap = cheri_operator_i[CSTORE_CAP];
+
+  assign is_cap   = cheri_operator_i[CLOAD_CAP] | cheri_operator_i[CSTORE_CAP];
+
+  // muxing between "normal" cheri LSU requests (clc/csc) and CLBC
+
+  if (WritebackStage) begin
+    // assert LSU req until instruction is retired (req_done from LSU)
+    // note if the previous instr is also a load/store, cheri_exec_id won't be asserted
+    // till WB is ready (lsu_resp for the previous instr)
+    assign cheri_lsu_req      = is_cap & cheri_exec_id_i;
+  end else begin
+    // no WB stage, only assert req in the first_cycle phase of the instruction
+    // (consistent with the RV32 load/store instructions)
+    // Here the instruction won't complete till lsu_resp_valid, so
+    // keeping lsu_req asserted causes a problem as the LSU sees it as a new request
+    assign cheri_lsu_req      = is_cap & cheri_exec_id_i & instr_first_cycle_i;
+  end
+
+  assign cheri_lsu_we       = is_store_cap;
+  assign cheri_lsu_addr     = cs1_addr_plusimm + {29'h0, addr_incr_req_i, 2'b00};  // adds 4 when addr_incr_req_i is set
+  assign cheri_lsu_is_cap   = is_cap;
+
+  assign cheri_lsu_wdata    = is_store_cap ? {csc_wcap.valid, rf_rdata_b} : 33'h0;  // bit 32 carries the tag
+  assign cheri_lsu_wcap     = is_store_cap  ? csc_wcap : NULL_REG_CAP;
+
+  // RS1/CS1 + offset:
+  //   - kept separate to help timing on the memory interface
+  //   - the starting address for cheri L*/S*.CAP instructions
+  assign cs1_imm = (is_cap|cheri_operator_i[CJALR]) ? {{20{cheri_imm12_i[11]}}, cheri_imm12_i} : 0;
+
+  assign cs1_addr_plusimm   = rf_rdata_a + cs1_imm;
+
+  assign pc_id_nxt = pc_id_i + (instr_is_compressed_i ? 2 : 4);   // next sequential PC (+2 if compressed, else +4)
+
+  //
+  // shared adder for address calculation
+  //
+  always_comb begin : shared_adder
+    logic [31:0] op_imm, op_base, imm12_sx;
+
+    imm12_sx = {{20{cheri_imm12_i[11]}}, cheri_imm12_i};   // sign-extended imm12
+    // immediate/offset operand (AUIPCC/AUICGP: sign-extended imm20 shifted left by 11);
+    // adjacent cases with the same source are merged, the priority order is unchanged
+    if      (cheri_operator_i[CJALR])                                   op_imm = imm12_sx;
+    else if (cheri_operator_i[CJAL])                                    op_imm = {{11{cheri_imm21_i[20]}}, cheri_imm21_i};
+    else if (cheri_operator_i[CAUIPCC] | cheri_operator_i[CAUICGP])     op_imm = {cheri_imm20_i[19], cheri_imm20_i, 11'h0};
+    else if (cheri_operator_i[CSET_ADDR] | cheri_operator_i[CINC_ADDR]) op_imm = rf_rdata_b;
+    else if (cheri_operator_i[CINC_ADDR_IMM])                           op_imm = imm12_sx;
+    else                                                                op_imm = 32'h0;
+
+    // base operand: the cs1 address, the current PC, or zero (CSET_ADDR replaces the address)
+    if      (cheri_operator_i[CJALR])                                       op_base = rf_rdata_a;
+    else if (cheri_operator_i[CJAL] | cheri_operator_i[CAUIPCC])            op_base = pc_id_i;
+    else if (cheri_operator_i[CAUICGP])                                     op_base = rf_rdata_a;
+    else if (cheri_operator_i[CSET_ADDR])                                   op_base = 32'h0;
+    else if (cheri_operator_i[CINC_ADDR] | cheri_operator_i[CINC_ADDR_IMM]) op_base = rf_rdata_a;
+    else                                                                    op_base = 32'h0;
+
+    addr_result  = op_imm + op_base;
+  end
+
+  //
+  // Big combinational functions
+  //  - break out to make sure we can properly gate off operands to save power
+  //
+  always_comb begin: set_address_comb
+    full_cap_t   src_cap;
+    logic [31:0] new_addr;
+    logic        is_jump, is_ptr_arith;
+
+    // operator groups that share the same operand selection
+    is_jump      = cheri_operator_i[CJAL] | cheri_operator_i[CJALR];
+    is_ptr_arith = cheri_operator_i[CSET_ADDR] | cheri_operator_i[CINC_ADDR] |
+                   cheri_operator_i[CINC_ADDR_IMM] | cheri_operator_i[CAUICGP];
+
+    // source capability: PCC for jumps (pcc to link register) and AUIPCC, cs1 for
+    // pointer arithmetic and SCR legalization, NULL otherwise
+    if      (is_jump | cheri_operator_i[CAUIPCC])  src_cap = pcc2fullcap(pcc_cap_i);
+    else if (is_ptr_arith | scr_legalization)      src_cap = rf_fullcap_a;
+    else                                           src_cap = NULL_FULL_CAP;
+
+    // new address: next PC for jumps, the shared-adder result for AUIPCC and pointer
+    // arithmetic, the SCR write data for SCR legalization
+    // (jumps don't really need the representability check, but update_temp_fields is necessary)
+    if      (is_jump)                                   new_addr = pc_id_nxt;
+    else if (cheri_operator_i[CAUIPCC] | is_ptr_arith)  new_addr = addr_result;
+    else if (scr_legalization)                          new_addr = csr_wdata_o;
+    else                                                new_addr = 32'h0;
+
+    // representability check only
+    setaddr1_outcap = set_address(src_cap, new_addr, 0, 0);
+  end
+
+  bound_req_t bound_req1, bound_req2;
+
+  always_comb begin: set_bounds_comb
+    logic [31:0] req_len;
+    logic        exact;
+    logic [31:0] base_addr;
+    full_cap_t   src_cap;
+    logic        is_sb_reg, is_sb_any;
+
+    // CSET_BOUNDS / CSET_BOUNDS_RNDN are checked first, then _EX, then _IMM, then CRRL/CRAM
+    is_sb_reg = cheri_operator_i[CSET_BOUNDS] | cheri_operator_i[CSET_BOUNDS_RNDN];
+    is_sb_any = is_sb_reg | cheri_operator_i[CSET_BOUNDS_EX] | cheri_operator_i[CSET_BOUNDS_IMM];
+
+    // source capability and base address: cs1 for every set-bounds flavour,
+    // NULL/0 for CRRL/CRAM and for all other operators
+    if (is_sb_any) begin
+      src_cap   = rf_fullcap_a;
+      base_addr = rf_rdata_a;
+    end else begin
+      src_cap   = NULL_FULL_CAP;
+      base_addr = 32'h0;
+    end
+
+    // requested length: rs2, the unsigned imm12, or rs1 (CRRL/CRAM)
+    if      (is_sb_reg | cheri_operator_i[CSET_BOUNDS_EX])    req_len = rf_rdata_b;
+    else if (cheri_operator_i[CSET_BOUNDS_IMM])               req_len = 32'(cheri_imm12_i);
+    else if (cheri_operator_i[CRRL] | cheri_operator_i[CRAM]) req_len = rf_rdata_a;
+    else                                                      req_len = 32'h0;
+
+    // only CSET_BOUNDS_EX requests exact bounds
+    if      (is_sb_reg)                        exact = 1'b0;
+    else if (cheri_operator_i[CSET_BOUNDS_EX]) exact = 1'b1;
+    else                                       exact = 1'b0;
+
+    // step 1: prepare the bound request from the selected operands
+    bound_req1 = CheriCapIT8 ? prep_bound_req_it8 (src_cap, base_addr, req_len) :
+                               prep_bound_req (src_cap, base_addr, req_len);
+
+    // step 2: compute the new bounds from bound_req2 (driven from bound_req1 elsewhere)
+    setbounds_outcap = set_bounds(src_cap, base_addr, bound_req2, exact);
+
+    setbounds_rndn_outcap = CheriCapIT8 ? set_bounds_rndn_it8(src_cap, base_addr, bound_req2) :
+                                          set_bounds_rndn(src_cap, base_addr, bound_req2);
+  end
+
+  if (CheriSBND2) begin    // 2-cycle set-bounds: bound_req1 is registered into bound_req2
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        bound_req2      <= '{0, 0, 0, 0, 0, 0};
+        set_bounds_done <= 1'b0;
+      end else begin
+        bound_req2      <= bound_req1;
+        // set_bounds_done is asserted in the 2nd cycle of execution when CheriSBND2 == 1
+        // note in ibex it actually is ok to hold set_bounds_done high for both cycles
+        // since the multicycle control logic won't look at ex_valid till the 2nd cycle
+        // however this is the cleaner solution.
+        set_bounds_done <= (cheri_operator_i[CSET_BOUNDS] | cheri_operator_i[CSET_BOUNDS_IMM] |
+                            cheri_operator_i[CSET_BOUNDS_EX] | cheri_operator_i[CRRL] |
+                            cheri_operator_i[CRAM]) & cheri_exec_id_i & ~set_bounds_done ;  // NOTE(review): CSET_BOUNDS_RNDN is not listed although it also consumes bound_req2 - confirm
+      end
+    end
+  end else begin           // single-cycle set-bounds: bound_req1 passes straight through
+    assign bound_req2      = bound_req1;
+    assign set_bounds_done = 1'b1;
+  end
+
+
+
+  // address bound and permission checks for
+  //    - cheri no-LSU instructions
+  //    - cheri LSU (cap) instructions (including internal instr like LBC)
+  //    - RV32  LSU (data) instructions
+  // this is a architectural access check (apply to the whole duration of an instruction)
+  //    - based on architectural capability registers and addresses
+
+  // - originally we combined the checking for CHERI and RV32, but it caused a combinational loop
+  //   that goes from instr_executing -> rv32_lsu_req -> lsu_error -> cheri_ex_err -> instr_executing
+  //   it's not a real runtime issue but it does confuse timing tools, so the checks are split for now.
+  //   Besides - note that checking/lsu_cheri_err_o is a timing-critical path
+  logic [31:0] rv32_ls_chkaddr;
+  assign rv32_ls_chkaddr = rv32_lsu_addr_i;
+
+  // Bounds and permission check of an RV32 (data) load/store against the cs1 capability.
+  always_comb begin : check_rv32
+    logic [31:0] acc_bytes;       // access size in bytes, selected by rv32_lsu_type_i
+    logic        cap_fits_acc;    // top33 is at least acc_bytes
+    logic [32:0] start_addr33;    // access start address, zero-extended to 33 bits
+    logic [32:0] last_start33;    // top33 - acc_bytes: highest start address that still fits
+    logic        over_top, under_base;
+
+    case (rv32_lsu_type_i)
+      2'b00: begin
+        acc_bytes    = 32'h4;
+        cap_fits_acc = |rf_fullcap_a.top33[32:2];     // at least 4 bytes
+      end
+      2'b01: begin
+        acc_bytes    = 32'h2;
+        cap_fits_acc = |rf_fullcap_a.top33[32:1];
+      end
+      default: begin
+        acc_bytes    = 32'h1;
+        cap_fits_acc = |rf_fullcap_a.top33[32:0];
+      end
+    endcase
+
+    // The access size is subtracted from the top bound instead of being added to the
+    // address; cap_fits_acc flags the case where that subtraction would wrap.
+    start_addr33 = {1'b0, rv32_ls_chkaddr};
+    last_start33 = rf_fullcap_a.top33 - acc_bytes;
+
+    over_top   = (start_addr33 > last_start33) || ~cap_fits_acc;
+    under_base = (rv32_ls_chkaddr < rf_fullcap_a.base32);
+
+    // timing critical (data_req_o) path - don't add any unnecessary terms.
+    // selection by is_cheri happens later, on the LSU interface.
+    //   for an unaligned access only the starting (1st) address is checked
+    //   (if there is an error, addr_incr_req won't be there anyway)
+    addr_bound_vio_rv32 = ~addr_incr_req_i & (over_top | under_base);
+
+    // permission checks; bits not assigned below stay 0
+    perm_vio_vec_rv32 = '0;
+
+    perm_vio_vec_rv32[PVIO_TAG]  = ~rf_fullcap_a.valid;
+    perm_vio_vec_rv32[PVIO_SEAL] = is_cap_sealed(rf_fullcap_a);
+    perm_vio_vec_rv32[PVIO_LD]   = ~rv32_lsu_we_i & ~rf_fullcap_a.perms[PERM_LD];
+    perm_vio_vec_rv32[PVIO_SD]   =  rv32_lsu_we_i & ~rf_fullcap_a.perms[PERM_SD];
+
+    perm_vio_rv32 = |perm_vio_vec_rv32;
+  end
+
+  // errors are only reported in CHERI pure-capability mode and outside debug mode
+  assign rv32_lsu_err = (addr_bound_vio_rv32 | perm_vio_rv32) & cheri_pmode_i & ~debug_mode_i;
+
+  // Cheri instr address bound checking
+  //   -- we choose to centralize the address bound checking here
+  //      so that we can mux the inputs and save some area
+  //
+  // Outputs driven by check_cheri: addr_bound_vio, perm_vio_vec, perm_vio, perm_vio_slc and
+  // illegal_scr_addr.
+
+
+  logic [31:0] cheri_ls_chkaddr;
+  assign cheri_ls_chkaddr = cs1_addr_plusimm;
+
+  always_comb begin : check_cheri
+    logic [31:0] top_offset;
+    logic [32:0] top_bound;
+    logic [31:0] base_bound, base_chkaddr;
+    logic [32:0] top_chkaddr;
+    logic        top_vio, base_vio, top_equal;
+    logic        cs2_bad_type;
+    logic        cs1_otype_0, cs1_otype_1, cs1_otype_45, cs1_otype_23;
+    logic        cs2_otype_45;
+
+    // select the address that is compared against the base bound (operation dependent)
+    if (cheri_operator_i[CSEAL])
+      base_chkaddr = rf_rdata_b;           // cs2.address
+    else if (cheri_operator_i[CUNSEAL])
+      // inCapBounds(cs2_val, zero_extend(cs1_val.otype), 1)
+      base_chkaddr =  {28'h0, decode_otype(rf_fullcap_a.otype, rf_fullcap_a.perms[PERM_EX])};  
+    else if (cheri_operator_i[CIS_SUBSET])
+      base_chkaddr = rf_fullcap_b.base32;  // cs2.base32
+    else   // CLC/CSC
+      base_chkaddr = cheri_ls_chkaddr;     // cs1.address + offset
+
+    // select the address that is compared against the top bound
+    if (cheri_operator_i[CIS_SUBSET])
+      top_chkaddr = rf_fullcap_b.top33;
+    else if (is_cap)  // CLC/CSC
+      top_chkaddr = {1'b0, base_chkaddr[31:3], 3'b000};
+    else 
+      top_chkaddr = {1'b0, base_chkaddr};
+
+    // select the bounds: cs2 for CSEAL/CUNSEAL, cs1 otherwise
+    if (cheri_operator_i[CSEAL] | cheri_operator_i[CUNSEAL]) begin
+      top_bound  = rf_fullcap_b.top33;
+      base_bound = rf_fullcap_b.base32;
+    end else if (is_cap) begin // CLC/CSC
+      top_bound  = {rf_fullcap_a.top33[32:3], 3'b000};       // 8-byte aligned access only
+      base_bound = rf_fullcap_a.base32;
+    end else begin
+      top_bound  = rf_fullcap_a.top33;
+      base_bound = rf_fullcap_a.base32;
+    end
+
+    top_vio   = (top_chkaddr  > top_bound);
+    base_vio  = (base_chkaddr < base_bound);
+    top_equal = (top_chkaddr == top_bound);
+
+    // top_equal counts as a violation for CLC/CSC and CSEAL/CUNSEAL, but not for CIS_SUBSET;
+    // no bound violation is raised in debug mode or for any other operation
+    if (debug_mode_i)
+      addr_bound_vio = 1'b0;
+    else if (is_cap) 
+      addr_bound_vio = top_vio | base_vio | top_equal;
+    else if (cheri_operator_i[CIS_SUBSET]) 
+      addr_bound_vio = top_vio | base_vio;
+    else if (cheri_operator_i[CSEAL] | cheri_operator_i[CUNSEAL])
+      addr_bound_vio = top_vio | base_vio | top_equal;
+    else
+      addr_bound_vio = 1'b0;
+
+    // main permission logic
+    perm_vio_vec = 0;
+    perm_vio     = 0;
+    perm_vio_slc = 0;
+    cs2_bad_type = 1'b0;
+    illegal_scr_addr = 1'b0;
+
+    // otype_1: forward sentry; otype_23: forward inherit sentry; otype_45: backward sentry; 
+    cs1_otype_0  = (rf_fullcap_a.otype == 3'h0);
+    cs1_otype_1  = rf_fullcap_a.perms[PERM_EX] & (rf_fullcap_a.otype == 3'h1);  // fwd sentry
+    cs1_otype_45 = rf_fullcap_a.perms[PERM_EX] & ((rf_fullcap_a.otype == 3'h4) || (rf_fullcap_a.otype == 3'h5)); 
+    cs1_otype_23 = rf_fullcap_a.perms[PERM_EX] & ((rf_fullcap_a.otype == 3'h2) || (rf_fullcap_a.otype == 3'h3));
+ 
+    // NOTE(review): cs2_otype_45 is computed but not read anywhere in this block
+    cs2_otype_45 = rf_fullcap_b.perms[PERM_EX] & ((rf_fullcap_b.otype == 3'h4) || (rf_fullcap_b.otype == 3'h5)); 
+
+    // note cseal/unseal/cis_subset don't generate exceptions, 
+    // so for all exceptions, violations can always be attributed to cs1, thus no need to further split
+    // exceptions based on source operands.
+    if (is_load_cap) begin
+      perm_vio_vec[PVIO_TAG]   = ~rf_fullcap_a.valid;
+      perm_vio_vec[PVIO_SEAL]  = is_cap_sealed(rf_fullcap_a);
+      perm_vio_vec[PVIO_LD]    = ~(rf_fullcap_a.perms[PERM_LD]);
+      perm_vio_vec[PVIO_ALIGN] = (cheri_ls_chkaddr[2:0] != 0);
+    end else if (is_store_cap) begin
+      perm_vio_vec[PVIO_TAG]   = (~rf_fullcap_a.valid); 
+      perm_vio_vec[PVIO_SEAL]  = is_cap_sealed(rf_fullcap_a);
+      perm_vio_vec[PVIO_SD]    = ~rf_fullcap_a.perms[PERM_SD];
+      perm_vio_vec[PVIO_SC]    = (~rf_fullcap_a.perms[PERM_MC] && rf_fullcap_b.valid);
+      perm_vio_vec[PVIO_ALIGN] = (cheri_ls_chkaddr[2:0] != 0);
+      // store-local violation: storing a valid, non-global cap through a cap without SL.
+      // Kept out of perm_vio_vec; it is reported separately as perm_vio_slc.
+      perm_vio_slc             = ~rf_fullcap_a.perms[PERM_SL] && rf_fullcap_b.valid && 
+                                (~rf_fullcap_b.perms[PERM_GL]) ;
+    end else if (cheri_operator_i[CSEAL]) begin
+      cs2_bad_type = rf_fullcap_a.perms[PERM_EX] ? 
+                     ((rf_rdata_b[31:3]!=0)||(rf_rdata_b[2:0]==0)) : 
+                     ((|rf_rdata_b[31:4]) || (rf_rdata_b[3:0] <= 8));
+      // cs2.addr check as coded above: ex accepts 1-7, non-ex accepts 9-15
+      perm_vio_vec[PVIO_TAG]   = ~rf_fullcap_b.valid;
+      perm_vio_vec[PVIO_SEAL]  = is_cap_sealed(rf_fullcap_a) || is_cap_sealed(rf_fullcap_b) || 
+                                  (~rf_fullcap_b.perms[PERM_SE]) || cs2_bad_type;
+    end else if (cheri_operator_i[CUNSEAL]) begin
+      perm_vio_vec[PVIO_TAG]   = ~rf_fullcap_b.valid; 
+      perm_vio_vec[PVIO_SEAL]  = (~is_cap_sealed(rf_fullcap_a)) || is_cap_sealed(rf_fullcap_b) ||
+                                 (~rf_fullcap_b.perms[PERM_US]);
+    end else if (cheri_operator_i[CJALR]) begin
+      perm_vio_vec[PVIO_TAG]   = ~rf_fullcap_a.valid;
+      // a sealed cs1 needs a zero immediate, and the (rd, rs1, otype) combination must match
+      // one of the four accepted patterns listed below
+      perm_vio_vec[PVIO_SEAL]  = (is_cap_sealed(rf_fullcap_a) && (cheri_imm12_i != 0)) ||
+                                 ~(((rf_waddr_i == 0) && (rf_raddr_a_i == 5'h1) && cs1_otype_45) || 
+                                   ((rf_waddr_i == 0) && (rf_raddr_a_i != 5'h1) && (cs1_otype_0 || cs1_otype_1)) ||
+                                   ((rf_waddr_i == 5'h1) && (cs1_otype_0 | cs1_otype_23)) ||
+                                   ((rf_waddr_i != 0) && (cs1_otype_0 | cs1_otype_1)));
+                                 
+      perm_vio_vec[PVIO_EX]    = ~rf_fullcap_a.perms[PERM_EX]; 
+    end else if (cheri_operator_i[CCSR_RW]) begin
+      perm_vio_vec[PVIO_ASR]   = ~pcc_cap_i.perms[PERM_SR];
+      // SCR addresses below 27 are flagged as illegal outside debug mode
+      illegal_scr_addr         = ~debug_mode_i & (csr_addr_o < 27);
+    end else begin
+      perm_vio_vec = 0;
+    end
+
+    perm_vio = | perm_vio_vec;
+
+  end
+
+  // Combined CHERI LSU error (qualified by lsu_req later).
+  // A store-local violation normally only clears the tag; it is folded into the error only
+  // when csr_dbg_tclr_fault_i escalates it to a fault for debugging.
+  assign cheri_lsu_err = (addr_bound_vio | perm_vio | (perm_vio_slc & csr_dbg_tclr_fault_i)) &
+                         cheri_pmode_i & ~debug_mode_i;
+
+  //
+  // Fault-case mtval generation (reported to the CSR block as mtval)
+  //
+  logic ls_addr_misaligned_only;
+
+  assign cheri_wb_err_info_o = cheri_wb_err_info_q;
+  assign cheri_ex_err_info_o = 12'h0;           // no ex stage cheri error currently
+
+  assign cheri_wb_err_d      = ~debug_mode_i & cheri_exec_id_i & cheri_ex_valid_raw & cheri_wb_err_raw;
+
+  // addr_bound_vio is the timing-optimized version (it gates data_req).
+  // To match the sail exception priority definition (bound_vio has higher priority than
+  // alignment_error) a full version is generated here as well.
+  // This has less timing impact since it goes to a flop stage.
+  logic addr_bound_vio_ext;
+  logic [32:0] cheri_top_chkaddr_ext;
+
+  // 33-bit sum so that the carry out of bit 31 takes part in the compare
+  assign cheri_top_chkaddr_ext = {1'b0, cheri_ls_chkaddr} + 33'd8;
+  assign addr_bound_vio_ext    = addr_bound_vio |
+                                 (is_cap & (cheri_top_chkaddr_ext > rf_fullcap_a.top33));
+
+  // Encodes the violation causes and selects the next value of the WB-stage error info.
+  // The if/else chain is a priority select; when no condition is active the register holds
+  // its previous value (cheri_wb_err_info_q).
+  always_comb begin : err_cause_comb 
+    cheri_err_cause  = vio_cause_enc(addr_bound_vio_ext, perm_vio_vec);
+    rv32_err_cause   = vio_cause_enc(addr_bound_vio_rv32, perm_vio_vec_rv32);
+
+    
+    // alignment is the only violation: PVIO_ALIGN is set, all lower-indexed perm_vio_vec
+    // bits are clear and there is no (extended) bound violation
+    ls_addr_misaligned_only = perm_vio_vec[PVIO_ALIGN] && (perm_vio_vec[PVIO_ALIGN-1:0] == 0) && ~addr_bound_vio_ext;
+    
+    // cheri_wb_err_raw is already qualified by instr
+    // bit 15:13: reserved
+    // bit 12: illegal_scr_addr
+    // bit 11: alignment error (load/store)
+    // bit 10:0 mtval as defined by CHERIoT arch spec
+    if (cheri_operator_i[CCSR_RW] & cheri_wb_err_raw & illegal_scr_addr & cheri_exec_id_i)
+      // cspecialrw trap, illegal addr, treated as illegal_insn
+      cheri_wb_err_info_d = {3'h0, 1'b1, 12'h0};
+    else if (cheri_operator_i[CCSR_RW] & cheri_wb_err_raw & cheri_exec_id_i)
+      // cspecialrw traps, PERM_SR
+      cheri_wb_err_info_d = {5'h0, 1'b1, cheri_cs2_dec_i, cheri_err_cause};
+    else if (cheri_wb_err_raw  & cheri_exec_id_i)
+      // other non-load/store cheri faults, attributed to cs1 (rf_raddr_a_i)
+      cheri_wb_err_info_d = {5'h0, 1'b0, rf_raddr_a_i, cheri_err_cause};
+    else if ((is_load_cap | is_store_cap) & cheri_lsu_err & cheri_exec_id_i)
+      // capability load/store faults; the alignment-only flag sits above the cause fields
+      cheri_wb_err_info_d = {4'h0, ls_addr_misaligned_only, 1'b0, rf_raddr_a_i, cheri_err_cause};
+    else if (rv32_lsu_req_i & rv32_lsu_err)
+      // RV32 data load/store faults
+      cheri_wb_err_info_d = {5'h0, 1'b0, rf_raddr_a_i, rv32_err_cause};
+    else 
+      cheri_wb_err_info_d = cheri_wb_err_info_q;
+  end 
+
+  // WB-stage error flag and error info registers (asynchronous active-low reset to 0).
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      cheri_wb_err_q      <= 1'b0;
+      cheri_wb_err_info_q <= 'h0;
+    end else begin
+      // Simple flop here works since
+      //  -- cheri_wb_err is gated by cheri_exec_id/ex_valid
+      //  --  all non-load/store cheriot instructions that can generate exceptions 
+      //      only take 1 cycle in ID/EX stage
+      //  -- faulted non-load/store instruction can only stay 1 cycle in wb_stage
+      cheri_wb_err_q      <= cheri_wb_err_d; 
+      cheri_wb_err_info_q <= cheri_wb_err_info_d;
+    end
+  end
+
+  //
+  // LSU request muxing, stage 1: CHERI instruction vs RV32 instruction (CPU side)
+  //
+  assign lsu_req_o         = instr_is_cheri_i ? cheri_lsu_req : rv32_lsu_req_i;
+  assign cpu_lsu_dec_o     = instr_is_rv32lsu_i | (instr_is_cheri_i & is_cap);
+
+  assign cpu_lsu_cheri_err = instr_is_cheri_i ? cheri_lsu_err   : rv32_lsu_err;
+  assign cpu_lsu_addr      = instr_is_cheri_i ? cheri_lsu_addr  : rv32_lsu_addr_i;
+  assign cpu_lsu_we        = instr_is_cheri_i ? cheri_lsu_we    : rv32_lsu_we_i;
+  assign cpu_lsu_wdata     = instr_is_cheri_i ? cheri_lsu_wdata : {1'b0, rv32_lsu_wdata_i};
+  assign cpu_lsu_is_cap    = cheri_lsu_is_cap & instr_is_cheri_i;
+
+  //
+  // stage 2: TBRE vs CPU, selected by lsu_tbre_sel_i (TBRE requests never carry a cheri error)
+  //
+  assign lsu_cheri_err_o   = lsu_tbre_sel_i ? 1'b0              : cpu_lsu_cheri_err;
+  assign lsu_we_o          = lsu_tbre_sel_i ? tbre_lsu_we_i     : cpu_lsu_we;
+  assign lsu_addr_o        = lsu_tbre_sel_i ? tbre_lsu_addr_i   : cpu_lsu_addr;
+  assign lsu_wdata_o       = lsu_tbre_sel_i ? tbre_lsu_wdata_i  : cpu_lsu_wdata;
+  assign lsu_is_cap_o      = lsu_tbre_sel_i ? tbre_lsu_is_cap_i : cpu_lsu_is_cap;
+
+  // CPU-only attributes: forced to their idle value whenever TBRE owns the LSU or the
+  // current instruction is of the other (CHERI / RV32) kind
+  assign lsu_lc_clrperm_o  = (lsu_tbre_sel_i | ~instr_is_cheri_i) ? 0            : cheri_lsu_lc_clrperm;
+  assign lsu_wcap_o        = (lsu_tbre_sel_i | ~instr_is_cheri_i) ? NULL_REG_CAP : cheri_lsu_wcap;
+  assign lsu_type_o        = (lsu_tbre_sel_i |  instr_is_cheri_i) ? 2'b00        : rv32_lsu_type_i;
+  assign lsu_sign_ext_o    = (lsu_tbre_sel_i |  instr_is_cheri_i) ? 1'b0         : rv32_lsu_sign_ext_i;
+
+
+  // rv32 core side signals
+  // request phase: be nice and mux using the current EX instruction to select
+
+  // addr_incr:
+  //  -- must qualify addr_incr, otherwise it goes to the ALU and messes up non-LSU instructions
+  //  -- however, for LEC this must also be gated with cheri_pmode, otherwise illegal_insn will
+  //     feed into the addr logic, since illegal_insn goes into instr_is_rv32lsu
+  // assign rv32_addr_incr_req_o   = instr_is_rv32lsu_i  ?  addr_incr_req_i : 1'b0;   // original
+  assign rv32_addr_incr_req_o   = (~cheri_pmode_i | instr_is_rv32lsu_i)  ?  addr_incr_req_i : 1'b0;
+
+  // addr_last is passed through unchanged
+  assign rv32_addr_last_o       = addr_last_i;
+
+  // req_done, resp_valid, load/store_err will be directly from LSU
+
+  //
+  // Stack high watermark CSR update
+  //
+
+  // Notes,
+  //  - unaligned accesses (which only increase the address) are covered as well,
+  //    although stack accesses should not have any
+  //  - a fault of the previous instruction in WB is harmless: the stall_mem/data_req_allowed
+  //    logic ensures lsu_req is not issued until the memory response/error has come back
+  //  - a later fault of this instruction in WB is harmless too: in the worst case the
+  //    watermark is too aggressive and more stack area than necessary gets zeroed
+
+  // new watermark candidate: the store address rounded down to a 16-byte boundary
+  assign csr_mshwm_new_o = {lsu_addr_o[31:4], 4'h0};
+
+  // update on an error-free store request whose address lies in [mshwmb, mshwm),
+  // compared at 16-byte granularity
+  assign csr_mshwm_set_o = (lsu_addr_o[31:4] <  csr_mshwm_i[31:4])  &
+                           (lsu_addr_o[31:4] >= csr_mshwmb_i[31:4]) &
+                           lsu_we_o & lsu_req_o & ~lsu_cheri_err_o;
+
+
+  //
+  // Stack fast clearing support
+  //
+
+  if (CheriStkZ) begin
+    logic lsu_addr_in_stkz_range, stkz_stall_q;
+    logic at_or_above_base, below_ptr;
+
+    // the CPU access falls into the window [stkz_base, stkz_ptr)
+    assign at_or_above_base       = (cpu_lsu_addr[31:4] >= stkz_base_i[31:4]);
+    assign below_ptr              = (cpu_lsu_addr[31:2] <  stkz_ptr_i[31:2]);
+    assign lsu_addr_in_stkz_range = cpu_lsu_dec_o & at_or_above_base & below_ptr;
+
+    // cpu_lsu_dec_o is an early hint that lets the LSU generate its mux selects for
+    // address/ctrl/wdata (eventually to help timing on those output ports).
+    // - lsu_req is always suppressed while stack clearing is active and the address is in
+    //   the range that is still to be cleared
+    // - in the first cycle cpu_lsu_dec_o is nevertheless asserted speculatively so that the
+    //   LSU picks the CPU address (holding back stkz/tbre_req); in the next cycle it can be
+    //   deasserted to let stkz/tbre_req go through
+    // - lsu_req (after gating by cpu_stkz_stall0) may go from 0 to 1 only once per
+    //   instruction cycle. This holds because
+    //   -- stkz_active_i is asserted synchronously by a write to the new stkz_ptr CSR, so it
+    //      cannot go from '0' to '1' in the middle of a load/store instruction that keeps
+    //      lsu_req high while waiting for lsu_req_done
+    //   -- cpu_lsu_addr only increments (clc/csc/unaligned) and the stkz address only
+    //      decrements, so if lsu_addr_in_range = 0 for the 1st word it stays 0 for the 2nd
+    //   -- the stkz design needs to meet those requirements
+    assign cpu_stall_by_stkz_o = lsu_addr_in_stkz_range & stkz_active_i;
+    assign cpu_grant_to_stkz_o = stkz_stall_q & ~instr_first_cycle_i;
+
+    // stkz_stall_q: the stall condition, delayed by one clock
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        stkz_stall_q <= 1'b0;
+      end else begin
+        stkz_stall_q <= cpu_stall_by_stkz_o;
+      end
+    end
+
+  end else begin
+    // stack clearing not configured: never stall, never grant
+    assign cpu_grant_to_stkz_o = 1'b0;
+    assign cpu_stall_by_stkz_o = 1'b0;
+  end
+
+  //
+  // debug signal for FPGA only
+  //
+  // These vectors are only assigned here; nothing in the visible part of the module reads them.
+  logic [15:0] dbg_status;
+  logic [66:0] dbg_cs1_vec, dbg_cs2_vec, dbg_cd_vec;
+
+  // status flags: 4 zero pad bits, then 12 single-bit flags (MSB first as listed)
+  assign dbg_status = {4'h0,
+                       instr_is_rv32lsu_i, rv32_lsu_req_i, rv32_lsu_we_i,  rv32_lsu_err,
+                       cheri_exec_id_i, cheri_lsu_err, rf_fullcap_a.valid, result_cap_o.valid,
+                       addr_bound_vio, perm_vio, addr_bound_vio_rv32, perm_vio_rv32};
+
+  // cs1 operand: capability fields plus the 32-bit register data
+  assign dbg_cs1_vec = {rf_fullcap_a.top_cor, rf_fullcap_a.base_cor, // 66:64
+                        rf_fullcap_a.exp,                            // 63:59
+                        rf_fullcap_a.top, rf_fullcap_a.base,         // 58:41
+                        rf_fullcap_a.otype, rf_fullcap_a.cperms,     // 40:32
+                        rf_rdata_a};                                 // 31:0
+
+  // cs2 operand, same layout
+  assign dbg_cs2_vec = {rf_fullcap_b.top_cor, rf_fullcap_b.base_cor, // 66:64
+                        rf_fullcap_b.exp,                            // 63:59
+                        rf_fullcap_b.top, rf_fullcap_b.base,         // 58:41
+                        rf_fullcap_b.otype, rf_fullcap_b.cperms,     // 40:32
+                        rf_rdata_b};                                 // 31:0
+
+  // result (cd), same layout
+  assign dbg_cd_vec = {result_cap_o.top_cor, result_cap_o.base_cor,  // 66:64
+                        result_cap_o.exp,                            // 63:59
+                        result_cap_o.top, result_cap_o.base,         // 58:41
+                        result_cap_o.otype, result_cap_o.cperms,     // 40:32
+                        result_data_o};                              // 31:0
+
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheri_pkg.sv b/hw/ip/cheriot-ibex/rtl/cheri_pkg.sv
new file mode 100644
index 0000000..186ce55
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheri_pkg.sv
@@ -0,0 +1,1247 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+package cheri_pkg;
+
+  // bit field widths
+  parameter int unsigned ADDR_W    = 32;
+  parameter int unsigned TOP_W     = 9;    
+  parameter int unsigned TOP8_W    = 8;   // IT8 encoding only
+  parameter int unsigned BOT_W     = 9;
+  parameter int unsigned CEXP_W    = 4;
+  parameter int unsigned CEXP5_W   = 5;   // IT8 encoding only
+  parameter int unsigned EXP_W     = 5;
+  parameter int unsigned OTYPE_W   = 3;
+  parameter int unsigned CPERMS_W  = 6;
+  parameter int unsigned PERMS_W   = 12;
+
+  // total width of reg_cap_t (sum of its field widths)
+  parameter int unsigned REGCAP_W  = 37;
+
+  // exponent value used by the *_RESET_CAP constants
+  parameter bit    [4:0] RESETEXP  = 24;
+  parameter int unsigned UPPER_W   = 24;
+  parameter bit    [4:0] RESETCEXP = 15;   // only used in non-IT8 encoding
+
+  // bit index of PERMS field
+  // U0 SE US EX SR MC LD SL LM SD LG GL   (listed from bit 11 down to bit 0)
+  parameter int unsigned PERM_GL =  0;     // global flag
+  parameter int unsigned PERM_LG =  1;     // load global
+  parameter int unsigned PERM_SD =  2;     // store
+  parameter int unsigned PERM_LM =  3;     // load mutable
+  parameter int unsigned PERM_SL =  4;     // store local
+  parameter int unsigned PERM_LD =  5;     // load
+  parameter int unsigned PERM_MC =  6;     // capability load/store
+  parameter int unsigned PERM_SR =  7;     // access system registers
+  parameter int unsigned PERM_EX =  8;     // execution
+  parameter int unsigned PERM_US =  9;     // unseal
+  parameter int unsigned PERM_SE = 10;     // seal
+  parameter int unsigned PERM_U0 = 11;     //
+
+  // object type (otype) encodings
+  parameter logic [2:0] OTYPE_SENTRY_IE_BKWD = 3'd5;
+  parameter logic [2:0] OTYPE_SENTRY_ID_BKWD = 3'd4;
+  parameter logic [2:0] OTYPE_SENTRY_IE_FWD  = 3'd3;
+  parameter logic [2:0] OTYPE_SENTRY_ID_FWD  = 3'd2;
+  parameter logic [2:0] OTYPE_SENTRY     = 3'd1;
+  parameter logic [2:0] OTYPE_UNSEALED   = 3'd0;
+
+  // Compressed (regFile) capability type
+  // Field widths add up to REGCAP_W (37) bits; the 32-bit address is not part of this struct.
+  typedef struct packed {
+    logic                valid;
+    logic [1:0]          top_cor;
+    logic                base_cor;
+    logic [EXP_W-1   :0] exp;    // expanded
+    logic [TOP_W-1   :0] top;
+    logic [BOT_W-1   :0] base;
+    logic [OTYPE_W-1 :0] otype;
+    logic [CPERMS_W-1:0] cperms;
+    logic                rsvd;
+  } reg_cap_t;
+
+  // Full capability type: carries the compressed fields (top/base/cperms/corrections)
+  // together with decoded ones (top33/base32 bounds, perms).
+  typedef struct packed {
+    logic                valid;
+    logic [EXP_W-1   :0] exp;    // expanded
+    logic [ADDR_W    :0] top33;  // one bit wider than the address
+    logic [ADDR_W-1  :0] base32;
+    logic [OTYPE_W-1 :0] otype;
+    logic [PERMS_W-1: 0] perms;  // indexed with the PERM_* parameters
+    logic [1:0]          top_cor;
+    logic                base_cor;
+    logic [TOP_W-1   :0] top;
+    logic [BOT_W-1   :0] base;
+    logic [CPERMS_W-1:0] cperms;
+    logic [31:0]         maska;
+    logic                rsvd;
+    logic [31:0]         rlen;
+  } full_cap_t;
+
+  // PCC capability type: decoded bounds and permissions, without the compressed
+  // top/base/correction fields.
+  typedef struct packed {
+    logic                valid;
+    logic [EXP_W-1   :0] exp;    // expanded
+    logic [ADDR_W    :0] top33;
+    logic [ADDR_W-1  :0] base32;
+    logic [OTYPE_W-1 :0] otype;
+    logic [PERMS_W-1: 0] perms;
+    logic [CPERMS_W-1:0] cperms;
+    logic                rsvd;
+  } pcc_cap_t;
+
+  // Bounds request record; the functions that produce and consume it are not defined in
+  // this part of the package.
+  typedef struct packed {
+    logic [32:0]      top33req;
+    logic [EXP_W-1:0] exp1;
+    logic [EXP_W-1:0] exp2;
+    logic [EXP_W:0]   explen;
+    logic [EXP_W:0]   expb;   // this can be 32 so must be 6-bit
+    logic             in_bound;
+  } bound_req_t;
+
+  // all-zero (null) capabilities, one positional entry per struct member
+  parameter reg_cap_t  NULL_REG_CAP  = '{0, 0, 0, 0, 0, 0, 0, 0, 0};
+  parameter full_cap_t NULL_FULL_CAP = '{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  parameter pcc_cap_t  NULL_PCC_CAP  = '{0, 0, 0, 0, 0, 0, 0, 0};
+
+  // compressed permission encodings of the three root capabilities
+  parameter logic [5:0] CPERMS_TX = 6'b101111;  // Tx (execution root)
+  parameter logic [5:0] CPERMS_TM = 6'b111111;  // Tm (memory data root)
+  parameter logic [5:0] CPERMS_TS = 6'b100111;  // Ts (seal root)
+
+  // PCC reset value. The perms member is PERMS_W (12) bits wide; 12'h1eb sets
+  // EX|SR|MC|LD|LM|LG|GL, which is what expand_perms() returns for CPERMS_TX.
+  parameter pcc_cap_t PCC_RESET_CAP   = '{1'b1, RESETEXP, 33'h10000_0000, 0, OTYPE_UNSEALED, 12'h1eb, CPERMS_TX, 1'b0};   // Tx (execution root)
+
+  // reset values of the special capability registers, in compressed (reg_cap_t) form:
+  // '{valid, top_cor, base_cor, exp, top, base, otype, cperms, rsvd}
+  parameter reg_cap_t  MTVEC_RESET_CAP     = '{1'b1, 0, 0, RESETEXP, 9'h100, 0, OTYPE_UNSEALED, CPERMS_TX, 1'b0};   // Tx (execution root)
+  parameter reg_cap_t  MTDC_RESET_CAP      = '{1'b1, 0, 0, RESETEXP, 9'h100, 0, OTYPE_UNSEALED, CPERMS_TM, 1'b0};   // Tm
+  parameter reg_cap_t MEPC_RESET_CAP       = '{1'b1, 0, 0, RESETEXP, 9'h100, 0, OTYPE_UNSEALED, CPERMS_TX, 1'b0};   // Tx
+  parameter reg_cap_t  MSCRATCHC_RESET_CAP = '{1'b1, 0, 0, RESETEXP, 9'h100, 0, OTYPE_UNSEALED, CPERMS_TS, 1'b0};   // Ts
+
+
+  // permission bits implied by each compressed-permission format (used by
+  // expand_perms / compress_perms)
+  parameter logic [PERMS_W-1: 0] PERM_MC_IMSK = (1<<PERM_LD) | (1<<PERM_MC) | (1<<PERM_SD);
+  parameter logic [PERMS_W-1: 0] PERM_LC_IMSK = (1<<PERM_LD) | (1<<PERM_MC);
+  parameter logic [PERMS_W-1: 0] PERM_SC_IMSK = (1<<PERM_SD) | (1<<PERM_MC);
+  parameter logic [PERMS_W-1: 0] PERM_DD_IMSK = 0;
+  parameter logic [PERMS_W-1: 0] PERM_EX_IMSK = (1<<PERM_EX) | (1<<PERM_MC) | (1<<PERM_LD);
+  parameter logic [PERMS_W-1: 0] PERM_SE_IMSK = 0;
+
+  // Decode the 6-bit compressed (memory) permission encoding into the 12-bit perms vector.
+  // cperms[5] is always GL. cperms[4:0] selects one of six formats; each format carries up
+  // to three explicit permission bits and implies the bits of its PERM_*_IMSK mask.
+  // An encoding that matches no format yields GL only.
+  function automatic logic [PERMS_W-1:0] expand_perms(logic [CPERMS_W-1:0] cperms);
+    logic [PERMS_W-1:0] expanded;
+
+    expanded          = '0;
+    expanded[PERM_GL] = cperms[5];
+
+    // items are tried top to bottom, so 5'b10000 takes precedence over 5'b100??
+    case (cperms[4:0]) inside
+      5'b11???: begin                       // memory-capability format
+        expanded[PERM_LG] = cperms[0];
+        expanded[PERM_LM] = cperms[1];
+        expanded[PERM_SL] = cperms[2];
+        expanded          = expanded | PERM_MC_IMSK;
+      end
+      5'b101??: begin                       // load-capability format
+        expanded[PERM_LG] = cperms[0];
+        expanded[PERM_LM] = cperms[1];
+        expanded          = expanded | PERM_LC_IMSK;
+      end
+      5'b10000: begin                       // store-capability format
+        expanded          = expanded | PERM_SC_IMSK;
+      end
+      5'b100??: begin                       // data format
+        expanded[PERM_SD] = cperms[0];
+        expanded[PERM_LD] = cperms[1];
+        expanded          = expanded | PERM_DD_IMSK;
+      end
+      5'b01???: begin                       // executable format
+        expanded[PERM_LG] = cperms[0];
+        expanded[PERM_LM] = cperms[1];
+        expanded[PERM_SR] = cperms[2];
+        expanded          = expanded | PERM_EX_IMSK;
+      end
+      5'b00???: begin                       // sealing format
+        expanded[PERM_US] = cperms[0];
+        expanded[PERM_SE] = cperms[1];
+        expanded[PERM_U0] = cperms[2];
+        expanded          = expanded | PERM_SE_IMSK;
+      end
+      default: ;                            // no format matched: GL only
+    endcase
+
+    return expanded;
+  endfunction
+
+  // TEST_IMSK(P, M) is true when every bit set in mask M is also set in P
+  `define TEST_IMSK(P, M) (&((P) | ~(M)))
+
+  // Compress the 12-bit perms vector into the 6-bit memory representation.
+  // unused_qqq is a placeholder kept only for compatibility with the old encoding.
+  function automatic logic [CPERMS_W-1:0] compress_perms (logic [PERMS_W-1:0] perms, logic [1:0] unused_qqq);
+    logic [CPERMS_W-1:0] enc;
+    logic                fits_ex, fits_mc, fits_lc, fits_sc, has_data;
+
+    // which formats could represent this permission set
+    fits_ex  = `TEST_IMSK(perms, PERM_EX_IMSK);
+    fits_mc  = `TEST_IMSK(perms, PERM_MC_IMSK);
+    fits_lc  = `TEST_IMSK(perms, PERM_LC_IMSK);
+    fits_sc  = `TEST_IMSK(perms, PERM_SC_IMSK);
+    has_data = perms[PERM_SD] | perms[PERM_LD];
+
+    // format tag plus explicit bits; the first matching format wins (Robert's priority order)
+    if (fits_ex)
+      enc[4:0] = {2'b01,  perms[PERM_SR], perms[PERM_LM], perms[PERM_LG]};
+    else if (fits_mc)
+      enc[4:0] = {2'b11,  perms[PERM_SL], perms[PERM_LM], perms[PERM_LG]};
+    else if (fits_lc)
+      enc[4:0] = {3'b101, perms[PERM_LM], perms[PERM_LG]};
+    else if (fits_sc)
+      enc[4:0] = 5'b10000;
+    else if (has_data)
+      enc[4:0] = {3'b100, perms[PERM_LD], perms[PERM_SD]};
+    else
+      enc[4:0] = {2'b00,  perms[PERM_U0], perms[PERM_SE], perms[PERM_US]};
+
+    // GL is stored explicitly in every format
+    enc[5] = perms[PERM_GL];
+
+    return enc;
+  endfunction
+
+  // handling cperms in loaded cap based on the loading cap requirement
+  //   cperms_in : compressed permissions of the loaded capability
+  //   clrperm   : clear requests; bit 0 clears GL (and LG when not sealed), bit 1 clears
+  //               SD/LM when not sealed; bits 3:2 are not used by this function
+  //   valid_in  : tag of the loaded capability; nothing is cleared when it is 0
+  //   sealed    : loaded capability is sealed
+  //   returns the adjusted compressed permissions
+  function automatic logic [CPERMS_W-1:0] mask_clcperms (logic [CPERMS_W-1:0] cperms_in, logic [3:0] clrperm,
+                                                   logic valid_in, logic sealed);
+    logic [CPERMS_W-1:0] cperms_out;
+    logic                clr_gl, clr_lg, clr_sdlm;
+
+    clr_gl    = clrperm[0] & valid_in;
+    clr_lg    = clrperm[0] & valid_in & ~sealed;
+    clr_sdlm  = clrperm[1] & valid_in & ~sealed;  // only clear SD/LM if not sealed
+
+    cperms_out    = cperms_in;
+    cperms_out[5] = cperms_in[5] & ~clr_gl;         // GL
+
+    // the format tests mirror the ones in expand_perms
+    if (cperms_in[4:3] == 2'b11) begin
+      cperms_out[0] = cperms_in[0] & ~clr_lg;       // LG
+      cperms_out[1] = cperms_in[1] & ~clr_sdlm;      // LM
+      // clearing SD re-encodes the capability in the 3'b101 format
+      cperms_out[4:2] = clr_sdlm ? 3'b101 : cperms_in[4:2];
+    end else if (cperms_in[4:2] == 3'b101) begin
+      cperms_out[0] = cperms_in[0] & ~clr_lg;       // LG
+      cperms_out[1] = cperms_in[1] & ~clr_sdlm;      // LM
+    end else if (cperms_in[4:0] == 5'b10000) begin
+      cperms_out[4:0] = clr_sdlm? 5'h0 : cperms_in[4:0];   // clearing SD results in NULL permission
+    end else if (cperms_in[4:2] == 3'b100) begin
+      cperms_out[4] = ~(clr_sdlm & ~cperms_in[1]);    // must decode to 5'h0 if both ld/sd are 0.
+      cperms_out[0] = cperms_in[0] & ~clr_sdlm;
+    end else if (cperms_in[4:3] == 2'b01) begin
+      cperms_out[0] = cperms_in[0] & ~clr_lg;       // LG
+      cperms_out[1] = cperms_in[1] & ~clr_sdlm;      // LM
+    end
+
+    return cperms_out;
+  endfunction
+
+  // Length (memory size) of a capability in bytes: top33 - base32.
+  // When bit 32 of the 33-bit difference is set the result saturates to 32'hffff_ffff.
+  function automatic logic[31:0] get_cap_len (full_cap_t full_cap);
+    logic [32:0] len33;
+
+    len33 = full_cap.top33 - full_cap.base32;
+
+    return len33[32] ? 32'hffff_ffff : len33[31:0];
+  endfunction
+
+  // Reconstruct a 33-bit bound from its compressed form.
+  //   top  : TOP_W-bit mantissa of the bound
+  //   cor  : 2-bit signed correction (2'b1x => -1, 2'b01 => +1, 2'b00 => 0)
+  //   exp  : exponent (left shift applied to the mantissa)
+  //   addr : address that supplies the upper bits
+  // Result: address bits above (exp + TOP_W) plus the correction, with the shifted
+  // mantissa ORed in below.
+  function automatic logic[32:0] get_bound33(logic [TOP_W-1:0] top, logic [1:0]  cor,
+                                           logic [EXP_W-1:0] exp, logic [31:0] addr);
+    logic [32:0] cor33, upper_mask, upper33, mant33;
+
+    // sign-extend the correction to 33 bits
+    if (cor[1])
+      cor33 = {33{cor[1]}};
+    else
+      cor33 = {32'h0, (~cor[1]) & cor[0]};
+
+    // align the correction and the keep-mask with the bits above the mantissa
+    cor33      = (cor33 << exp) << TOP_W;
+    upper_mask = (33'h1_ffff_ffff << exp) << TOP_W;
+
+    // truncate the address to its upper bits and apply the correction
+    upper33 = ({1'b0, addr} & upper_mask) + cor33;
+
+    // zero-extend the mantissa to 33 bits and place it at the exponent
+    mant33 = {24'h0, top};
+
+    return upper33 | (mant33 << exp);
+  endfunction
+
+  // Alternative to get_bound33 with the same arguments; the correction is applied on the
+  // upper 24 bits only (address bits 31:9) instead of on the full 33-bit value.
+  // this implementation gives slightly better timing/area results
+  //   top  : TOP_W-bit mantissa of the bound
+  //   cor  : 2-bit signed correction (2'b1x => -1, 2'b01 => +1, 2'b00 => 0)
+  //   exp  : exponent (left shift applied to the mantissa)
+  //   addr : address that supplies the upper bits
+  function automatic logic[32:0] get_bound33_trial(logic [TOP_W-1:0] top, logic [1:0]  cor,
+                                             logic [EXP_W-1:0] exp, logic [31:0] addr);
+    logic [32:0] t33a, result;
+    logic [23:0] t24a, mask24, cor24;
+
+    if (cor[1])
+      cor24 = {24{cor[1]}};         // negative sign extension
+    else
+      cor24 = {23'h0, (~cor[1]) & cor[0]};
+
+    cor24  = (cor24 << exp);
+    mask24 = {24{1'b1}} << exp;
+
+    t24a = ({1'b0, addr[31:9]} & mask24) + cor24;     // apply correction and truncate
+    t33a = {24'h0, top};                              // zero-extend the mantissa to 33 bits
+    result = {t24a, 9'h0} | (t33a << exp);
+
+    return result;
+
+  endfunction
+
+  // Recompute the top/base correction fields of a capability for a new address.
+  //   top, base : compressed bound mantissas
+  //   addrmi    : address bits aligned with the mantissas
+  // Returns {top_cor[1:0], base_cor}.
+  function automatic logic [2:0] update_temp_fields(logic [TOP_W-1:0] top, logic [BOT_W-1:0] base,
+                                                    logic [BOT_W-1:0] addrmi);
+    logic       top_below_base, addr_below_base;
+    logic [1:0] top_cor;
+
+    top_below_base  = (top    < base);
+    addr_below_base = (addrmi < base);
+
+    //  top<base  addr<base | top_cor
+    //     0         0      |  2'b00
+    //     1         1      |  2'b00
+    //     1         0      |  2'b01
+    //     0         1      |  2'b11
+    top_cor = {addr_below_base & ~top_below_base, top_below_base ^ addr_below_base};
+
+    // base_cor is set exactly when the address mantissa is below the base mantissa
+    return {top_cor, addr_below_base};
+  endfunction
+
+  // set address of a capability
+  //   by default we check for representability only. 
+  //   use chktop/chkbase to check explicitly against top33/base32 bounds (pcc updates)
+  //   * note, representability check in most cases (other than exp=24) covers the base32 check 
+  //
+  //   in_cap  : capability whose address is being changed
+  //   newptr  : new address
+  //   chktop  : also clear the tag unless newptr is below the top bound
+  //   chkbase : also clear the tag when newptr is below base32
+  //   returns in_cap with valid cleared on a failed check and top_cor/base_cor recomputed
+
+  function automatic full_cap_t set_address (full_cap_t in_cap, logic [31:0] newptr, logic chktop, logic chkbase);
+    full_cap_t        out_cap;
+    logic [32:0]      tmp33;
+    logic [32-TOP_W:0] tmp24, mask24;
+    logic  [2:0]      tmp3;
+    logic [BOT_W-1:0] ptrmi9;
+    logic             top_lt;
+
+    out_cap = in_cap;
+    mask24  = {(33-TOP_W){1'b1}} << in_cap.exp;          // mask24 = 0 if exp == 24
+
+    // representability: the offset from base must have no bits set above (exp + TOP_W)
+    tmp33   = {1'b0, newptr} - {1'b0, in_cap.base32};  // extend to make sure we can see carry from MSB
+    tmp24   = tmp33[32:TOP_W] & mask24;
+    // NOTE(review): bit 0 of top33 is dropped for this compare - confirm this is intended
+    top_lt  =  ({1'b0, newptr} < {in_cap.top33[32:1], 1'b0});
+
+    // tmp33[32] set means newptr < base32 (the subtraction borrowed)
+    if ((tmp24 != 0) || (chktop & ~top_lt) || (chkbase & tmp33[32]))
+      out_cap.valid = 1'b0;
+
+    // address bits aligned with the mantissas, used to refresh the correction fields
+    ptrmi9           = BOT_W'(newptr >> in_cap.exp);
+    tmp3             = update_temp_fields(out_cap.top, out_cap.base, ptrmi9);
+    out_cap.top_cor  = tmp3[2:1];
+    out_cap.base_cor = tmp3[0];
+
+    return out_cap;
+  endfunction
+
+  //
+  // utility functions
+  //
+
+  // Bit length of din: index of the most significant set bit plus one (0 when din == 0).
+  function automatic logic [5:0] get_size(logic [31:0] din);
+    logic [31:0] smear;     // smear[k] = |din[31:k], a thermometer code of the bit length
+
+    smear[31] = din[31];
+    for (int k = 30; k >= 0; k--) begin
+      smear[k] = smear[k+1] | din[k];
+    end
+
+    // the number of ones in the thermometer code is the bit length
+    return thermo_dec32(smear);
+  endfunction
+
+  // Exponent of a 32-bit input, obtained by counting its trailing zeros.
+  function automatic logic [5:0] count_tz (logic [31:0] din);
+    logic [31:0] seen_one;  // seen_one[k] = |din[k:0]
+
+    seen_one[0] = din[0];
+    for (int k = 1; k < 32; k++) begin
+      seen_one[k] = seen_one[k-1] | din[k];
+    end
+
+    // ~seen_one is a thermometer code with one '1' per trailing zero of din.
+    // An all-zero input is not special-cased: it returns 32.
+    return thermo_dec32(~seen_one);
+  endfunction
+
+  // Count the ones in a thermometer-encoded input vector
+  //    (32-N zeros followed by N ones); the result is N.
+  //    Inputs that are not thermometer codes are not treated specially.
+  function automatic logic [5:0] thermo_dec32(logic [31:0] a32);
+    logic  [5:0] count;
+    logic [15:0] half16;
+    logic  [7:0] half8;
+    logic  [3:0] half4;
+    logic  [1:0] half2;
+
+    if (a32[31]) begin
+      count = 6'd32;                    // all 32 bits set
+    end else begin
+      // Binary search for the 0/1 boundary: the top bit of each lower half is the next
+      // result bit, and it selects which half is searched in the following step.
+      half16 = a32[15]   ? a32[31:16]   : a32[15:0];
+      half8  = half16[7] ? half16[15:8] : half16[7:0];
+      half4  = half8[3]  ? half8[7:4]   : half8[3:0];
+      half2  = half4[1]  ? half4[3:2]   : half4[1:0];
+      count  = {1'b0, a32[15], half16[7], half8[3], half4[1], half2[0]};
+    end
+    return count;
+  endfunction
+
+  // set bounds (top/base/exp/addr) of a capability
+
+  // break up into 2 parts to enable 2-cycle option
+  function automatic bound_req_t prep_bound_req (full_cap_t in_cap, logic [31:0] addr, logic [31:0] length);
+    bound_req_t req;
+    logic [5:0] e_len, e_len_p1;
+
+    req.top33req = {1'b0, addr} + {1'b0, length};          // "requested" top, 33 bits to keep the carry
+    req.expb     = count_tz(addr);                         // trailing zeros of addr
+    req.explen   = get_size({9'h0, length[31:9]});         // length exp without saturation
+
+    // two candidate exponents (explen and explen+1), each replaced by RESETEXP from RESETCEXP upwards
+    e_len        = req.explen;
+    e_len_p1     = e_len + 6'd1;
+    req.exp1     = (e_len    >= 6'(RESETCEXP)) ? EXP_W'(RESETEXP) : EXP_W'(e_len);
+    req.exp2     = (e_len_p1 >= 6'(RESETCEXP)) ? EXP_W'(RESETEXP) : EXP_W'(e_len_p1);
+
+    // computed in this first step so it can be shared with set_bounds_rndn;
+    //   it is a straight compare, so it should fit in cycle 1
+    req.in_bound = ~((req.top33req > in_cap.top33) || (addr < in_cap.base32));
+
+    return req;
+  endfunction
+  
+  function automatic bound_req_t prep_bound_req_it8 (full_cap_t in_cap, logic [31:0] addr, logic [31:0] length);   // IT8 encoding
+    // First step of set-bounds for the IT8 encoding: derives the requested top, the
+    // candidate exponents and the in-bound flag from the source cap, addr and length.
+    // Unlike prep_bound_req, exp1/exp2 are not saturated to RESETEXP.
+    bound_req_t result;
+
+    result.top33req = {1'b0, addr} + {1'b0, length};    // "requested" 33-bit top
+    result.expb     = count_tz(addr);
+    result.explen   = get_size({9'h0, length[31:9]});   // length exp without saturation, max 23
+
+    // since explen <= 23, exp1 and exp2 must be <= 24. No need for saturation logic
+    result.exp1     = result.explen;
+    result.exp2     = result.explen + 1;
+
+    // computed here so it can be shared with set_bounds_rndn;
+    //   should be ok to fit this in cycle 1 since it is a straight compare
+    result.in_bound = ~((result.top33req > in_cap.top33) || (addr < in_cap.base32));
+
+    return result;
+  endfunction
+
+  function automatic full_cap_t set_bounds (full_cap_t in_cap, logic[31:0] addr,
+                                            bound_req_t bound_req, logic req_exact);
+    full_cap_t       out_cap;
+    // Second step of set-bounds: picks exp1 or exp2 from bound_req and derives the new fields.
+    logic [EXP_W-1:0] exp1, exp2;
+    logic [32:0]      top33req;
+    logic [BOT_W:0]   base1, base2, top1, top2, len1, len2;   // one bit wider than the stored fields
+    logic [32:0]      mask1, mask2;
+    logic             ovrflw, topoff1, topoff2, topoff;
+    logic             baseoff1, baseoff2, baseoff;
+    logic             tophi1, tophi2, tophi;
+    logic             in_bound;
+    // The tag is cleared when req_exact is set and rounding occurred, or when in_bound is low.
+    out_cap  = in_cap;
+
+    top33req = bound_req.top33req;
+    exp1     = bound_req.exp1;
+    exp2     = bound_req.exp2;
+    in_bound = bound_req.in_bound;
+
+    // 1st path: encode with exp1
+    mask1    = {33{1'b1}} << exp1;                 // ones at and above bit exp1
+    base1    = (BOT_W+1)'(addr >> exp1);
+    topoff1  = |(top33req & ~mask1);               // requested top has bits set below exp1
+    baseoff1 = |({1'b0, addr} & ~mask1);           // addr has bits set below exp1
+    top1     = (BOT_W+1)'(top33req >> exp1) + (BOT_W+1)'(topoff1);   // round top up when bits were dropped
+    len1     = top1 - base1;
+    tophi1   = (top1[8:0] >= base1[8:0]);          // NOTE(review): [8:0] is hard-coded; equals BOT_W-1:0 only if BOT_W == 9
+
+    // overflow detection based on 1st path (NOTE(review): bit 9 assumes BOT_W == 9 - confirm)
+    ovrflw = len1[9];
+
+    // 2nd path in parallel: same computation with exp2
+    mask2    = {33{1'b1}} << exp2;
+    base2    = (BOT_W+1)'(addr >> exp2);
+    topoff2  = |(top33req & ~mask2);
+    baseoff2 = |({1'b0, addr} & ~mask2);
+    top2     = (BOT_W+1)'(top33req >> exp2) + (BOT_W+1)'(topoff2);
+    len2     = top2 - base2;
+    tophi2   = (top2[8:0] >= base2[8:0]);
+
+    // select results: 1st path unless its length overflowed
+    if (~ovrflw) begin
+      out_cap.exp   = exp1;
+      out_cap.top   = top1[TOP_W-1:0];
+      out_cap.base  = base1[BOT_W-1:0];
+      out_cap.maska = mask1[31:0];
+      out_cap.rlen  = {22'h0, len1} << exp1;
+      topoff        = topoff1;
+      baseoff       = baseoff1;
+      tophi         = tophi1;
+    end else begin
+      out_cap.exp   = exp2;
+      out_cap.top   = top2[TOP_W-1:0];
+      out_cap.base  = base2[BOT_W-1:0];
+      out_cap.maska = mask2[31:0];
+      out_cap.rlen  = {22'h0, len2} << exp2;
+      topoff        = topoff2;
+      baseoff       = baseoff2;
+      tophi         = tophi2;
+    end
+
+`ifdef CHERI_PKG_DEBUG
+
+$display("--- set_bounds: exact = %x, ovrflw = %x, exp1 = %x, exp2 = %x, exp = %x, len = %x", ~(topoff|baseoff), ovrflw, exp1, exp2, out_cap.exp, out_cap.rlen);
+$display("--- set_bounds:  b1 = %x, t1 = %x, b2 = %x, t2 = %x", base1, top1, base2, top2);
+`endif
+
+    // top/base correction values
+    //   Note the new base == addr >> exp, so addr_hi == FALSE, thus base_cor == 0
+    //   as such, top_cor can only be either 0 or +1;
+    out_cap.top_cor  = tophi ? 2'b00 : 2'b01;
+    out_cap.base_cor = 1'b0;
+
+    if (req_exact & (topoff | baseoff)) out_cap.valid = 1'b0;
+
+    // we use the "requested top" to verify the results against the original bounds
+    // also compare address >= old base32 to handle the exp=24 case
+    //   exp = 24 case: can have addr < base (not covered by representability checking);
+    //   other exp cases: always addr >= base when the tag is set
+    if (~in_bound) 
+      out_cap.valid = 1'b0;
+
+    return out_cap;
+  endfunction
+
+  function automatic full_cap_t set_bounds_rndn (full_cap_t in_cap, logic[31:0] addr,
+                                                 bound_req_t bound_req);
+    full_cap_t       out_cap;
+    // Set-bounds variant that limits the exponent by the alignment of addr (expb) and by 14.
+    logic [EXP_W:0]  explen, expb, exp_final;
+    logic [32:0]     top33req;
+    logic            in_bound;
+    logic            el_gt_eb, el_gt_14, eb_gt_14;
+    logic            tophi;
+
+    out_cap  = in_cap;
+
+    top33req = bound_req.top33req;
+    explen   = bound_req.explen;
+    expb     = bound_req.expb;
+    in_bound = bound_req.in_bound;
+
+    el_gt_eb = (explen > expb);
+    el_gt_14 = (explen > 14);
+    eb_gt_14 = (expb   > 14);
+
+    // final exp =  min(14, e_l, e_b)
+    exp_final = (el_gt_eb & !eb_gt_14) ? expb : (el_gt_14 ? 14 : explen);
+
+    // equivalent priority form of the expression above:
+    //   el > eb,  eb > 14   -> 14
+    //   el > eb,  eb <= 14  -> expb
+    //   el <= eb            -> min(14, explen)
+
+    out_cap.exp  = exp_final;
+    out_cap.base = (BOT_W)'(addr >> exp_final);
+
+    // if the exponent was reduced below explen, top becomes base-1 (modulo 2^BOT_W)
+    out_cap.top = (el_gt_eb | el_gt_14) ? ((BOT_W)'(out_cap.base-1)) :
+                                          ((BOT_W)'(top33req >> exp_final));
+
+    if (~in_bound) out_cap.valid = 1'b0;
+
+    // top/base correction values
+    //   Note the new base == addr >> exp, so addr_hi == FALSE, thus base_cor == 0
+    //   as such, top_cor can only be either 0 or +1;
+    tophi = (out_cap.top >= out_cap.base);
+    out_cap.top_cor  = tophi ? 2'b00 : 2'b01;
+    out_cap.base_cor = 1'b0;     // 1-bit literal, matching how set_bounds assigns this field
+    return out_cap;
+  endfunction
+
+
+  function automatic full_cap_t set_bounds_rndn_it8 (full_cap_t in_cap, logic[31:0] addr,         // IT8 encoding
+                                                     bound_req_t bound_req);
+    full_cap_t       out_cap;
+    // IT8 flavour of set_bounds_rndn: exp = min(explen, expb), with no additional cap on exp.
+    logic [EXP_W:0]  explen, expb, exp_final;
+    logic [32:0]     top33req;
+    logic            in_bound;
+    logic            el_gt_eb;
+    logic            tophi;
+
+    out_cap  = in_cap;
+
+    top33req = bound_req.top33req;
+    explen   = bound_req.explen;
+    expb     = bound_req.expb;
+    in_bound = bound_req.in_bound;
+
+    el_gt_eb = (explen > expb);
+
+    exp_final = (el_gt_eb) ? expb :  explen;
+
+    out_cap.exp  = exp_final;
+    out_cap.base = (BOT_W)'(addr >> exp_final);
+
+    // if the exponent was reduced below explen, top becomes base-1 (modulo 2^BOT_W)
+    out_cap.top = (el_gt_eb) ? ((BOT_W)'(out_cap.base-1)) : ((BOT_W)'(top33req >> exp_final));
+
+    if (~in_bound) out_cap.valid = 1'b0;
+
+    // top/base correction values
+    //   Note the new base == addr >> exp, so addr_hi == FALSE, thus base_cor == 0
+    //   as such, top_cor can only be either 0 or +1;
+    tophi = (out_cap.top >= out_cap.base);
+    out_cap.top_cor  = tophi ? 2'b00 : 2'b01;
+    out_cap.base_cor = 1'b0;     // 1-bit literal, matching how set_bounds assigns this field
+    return out_cap;
+  endfunction
+
+
+
+  // seal/unseal related functions
+  function automatic full_cap_t seal_cap (full_cap_t in_cap, logic [OTYPE_W-1:0] new_otype);
+    // Returns a copy of in_cap with only the otype field replaced by new_otype.
+    full_cap_t sealed_cap;
+    sealed_cap       = in_cap;
+    sealed_cap.otype = new_otype;
+    return sealed_cap;
+  endfunction
+
+  function automatic full_cap_t unseal_cap (full_cap_t in_cap);
+    full_cap_t unsealed_cap;      // copy of in_cap with otype reset to OTYPE_UNSEALED
+    unsealed_cap       = in_cap;
+    unsealed_cap.otype = OTYPE_UNSEALED;
+    return unsealed_cap;
+  endfunction
+
+  function automatic logic is_cap_sealed (full_cap_t in_cap);
+    // Sealed means the otype is anything other than OTYPE_UNSEALED; no other field is read.
+    logic is_unsealed;
+    is_unsealed = (in_cap.otype == OTYPE_UNSEALED);
+    return !is_unsealed;
+  endfunction
+
+  //function automatic logic is_cap_sentry (full_cap_t in_cap);
+  //  logic result;
+
+  //  result = (in_cap.perms[PERM_EX]) && ((in_cap.otype == OTYPE_SENTRY) || (in_cap.otype == OTYPE_SENTRY_ID) ||
+  //            (in_cap.otype == OTYPE_SENTRY_IE));
+  //  return result;
+  //endfunction
+
+
+  function automatic logic [3:0] decode_otype (logic [2:0] otype3, logic perm_ex);
+    // Bit 3 is set only for a non-zero otype3 combined with perm_ex == 0.
+    logic msb;
+    msb = (otype3 != 3'h0) & ~perm_ex;
+    return {msb, otype3};
+  endfunction
+
+  // Decompress a register capability (reg_cap) plus its address into a full_cap.
+  function automatic full_cap_t reg2fullcap (reg_cap_t reg_cap, logic [31:0] addr);
+    full_cap_t expanded;
+
+    // fields copied verbatim from the register format
+    expanded.valid    = reg_cap.valid;
+    expanded.rsvd     = reg_cap.rsvd;
+    expanded.cperms   = reg_cap.cperms;
+    expanded.otype    = reg_cap.otype;
+    expanded.exp      = reg_cap.exp;
+    expanded.top      = reg_cap.top;
+    expanded.base     = reg_cap.base;
+    expanded.top_cor  = reg_cap.top_cor;
+    expanded.base_cor = reg_cap.base_cor;
+
+    // derived fields: expanded permissions and decoded bounds
+    //   (bounds are decoded inline; the original notes that calling update_bounds() increased area)
+    expanded.perms    = expand_perms(reg_cap.cperms);
+    expanded.top33    = get_bound33(reg_cap.top, reg_cap.top_cor, reg_cap.exp, addr);
+    expanded.base32   = get_bound33(reg_cap.base, {2{reg_cap.base_cor}}, reg_cap.exp, addr);
+    expanded.maska    = 0;
+    expanded.rlen     = 0;
+    return expanded;
+  endfunction
+
+  // Compress a full_cap into the register format (reg_cap).
+  //   top_cor/base_cor are copied, not recomputed: the address/bounds of a capability
+  //   only change through an explicit instruction (the only exception is PCC).
+  function automatic reg_cap_t full2regcap (full_cap_t full_cap);
+    reg_cap_t rcap;
+
+    rcap          = NULL_REG_CAP;      // starting value; every field listed below is overwritten
+    rcap.valid    = full_cap.valid;
+    rcap.rsvd     = full_cap.rsvd;
+    rcap.cperms   = full_cap.cperms;
+    rcap.otype    = full_cap.otype;
+    rcap.exp      = full_cap.exp;
+    rcap.top      = full_cap.top;
+    rcap.base     = full_cap.base;
+    rcap.top_cor  = full_cap.top_cor;
+    rcap.base_cor = full_cap.base_cor;
+
+    return rcap;
+  endfunction
+
+  // Expand a pcc_cap into a full_cap.
+  //  -- PCC is a special case since its address (the PC) moves around, so the correction
+  //     fields are zeroed here and are expected to be recomputed by set_address().
+  //  -- (an earlier signature also took the PC: pcc2fullcap(pcc_cap, pc_addr))
+  function automatic full_cap_t pcc2fullcap (pcc_cap_t in_pcap);
+    full_cap_t fcap;
+    // fields carried over from pcc_cap as-is
+    fcap.valid    = in_pcap.valid;
+    fcap.rsvd     = in_pcap.rsvd;
+    fcap.perms    = in_pcap.perms;
+    fcap.cperms   = in_pcap.cperms;
+    fcap.otype    = in_pcap.otype;
+    fcap.exp      = in_pcap.exp;
+    fcap.top33    = in_pcap.top33;
+    fcap.base32   = in_pcap.base32;
+    // compressed bound fields re-derived from the decoded bounds
+    fcap.top      = TOP_W'(in_pcap.top33  >> (in_pcap.exp));
+    fcap.base     = BOT_W'(in_pcap.base32 >> (in_pcap.exp));
+    fcap.top_cor  = 2'b0;          // will be updated by set_address()
+    fcap.base_cor = 1'b0;
+    fcap.maska    = 0;             // not used in pcc_cap
+    fcap.rlen     = 0;             // not used in pcc_cap
+    return fcap;
+  endfunction
+
+  // Reduce a full_cap to the fields kept in a pcc_cap (decoded bounds, no correction fields).
+  function automatic pcc_cap_t full2pcap (full_cap_t full_cap);
+    pcc_cap_t pcap;
+
+    pcap.valid    = full_cap.valid;
+    pcap.rsvd     = full_cap.rsvd;
+    pcap.perms    = full_cap.perms;
+    pcap.cperms   = full_cap.cperms;
+    pcap.otype    = full_cap.otype;
+    pcap.exp      = full_cap.exp;
+    pcap.top33    = full_cap.top33;
+    pcap.base32   = full_cap.base32;
+
+    return pcap;
+  endfunction
+
+  function automatic reg_cap_t pcc2mepcc (pcc_cap_t pcc_cap, logic [31:0] address, logic clrtag);
+    // Builds the register-format capability for pcc_cap positioned at address.
+    reg_cap_t  mepcc;
+    full_cap_t pcc_at_addr;
+
+    // set_address() without the explicit top/base checks: the representability check is
+    // still needed to cover the save_pc_if and save_pc_wb cases
+    pcc_at_addr = set_address(pcc2fullcap(pcc_cap), address, 1'b0, 1'b0);
+    mepcc       = full2regcap(pcc_at_addr);
+    // clrtag forces the tag off regardless of the check result
+    if (clrtag) mepcc.valid = 1'b0;
+    return mepcc;
+  endfunction
+
+  //
+  // pack/unpack the cap+addr between reg and memory
+  // format 0: lsw32 = addr, msw33 = cap fields
+  // msw, MSB first: tag[32] rsvd[31] cperms otype E T B
+  //   (field widths come from CPERMS_W/OTYPE_W/CEXP_W/TOP_W/BOT_W; the *_LO values are the LSB positions)
+  localparam integer RSVD_LO   = 31;   // single reserved bit
+  localparam integer CPERMS_LO = 25;   // compressed permissions
+  localparam integer OTYPE_LO  = 22;   // object type
+  localparam integer CEXP_LO   = 18;   // compressed exponent, original encoding
+  localparam integer CEXP5_LO  = 17;   // IT8 encoding only
+  localparam integer TOP_LO    = 9;    // top field (IT8 stores one bit less here, see CEXP5_LO)
+  localparam integer BASE_LO   = 0;    // base field
+ 
+  // Unpack capability metadata from memory into a reg_cap (original bound encoding, memfmt0).
+  function automatic reg_cap_t mem2regcap_fmt0 (logic [32:0] msw, logic [32:0] addr33, logic [3:0] clrperm);
+    reg_cap_t regcap;
+    logic [EXP_W-1:0] tmp5;
+    logic [2:0]  tmp3;
+    logic [CPERMS_W-1:0] cperms_mem;
+    logic [BOT_W-1:0]    addrmi9;
+    logic                sealed;
+    logic                valid_in;
+    // the tag needs bit 32 of both words to be set; clrperm[3] forces it off
+    valid_in      = msw[32] & addr33[32];
+    regcap.valid  = valid_in & ~clrperm[3];   
+    // stored exponent is zero-extended, except the code RESETCEXP which expands to RESETEXP
+    tmp5 = {1'b0, msw[CEXP_LO+:CEXP_W]};
+    if (tmp5 == EXP_W'(RESETCEXP)) tmp5 = RESETEXP;
+    regcap.exp = tmp5;
+
+    regcap.top    = msw[TOP_LO+:TOP_W];
+    regcap.base   = msw[BASE_LO+:BOT_W];
+    regcap.otype  = msw[OTYPE_LO+:OTYPE_W];
+    // stored permissions pass through mask_clcperms() (defined elsewhere) with clrperm, tag and sealed state
+    sealed = (regcap.otype != OTYPE_UNSEALED);
+    cperms_mem      = msw[CPERMS_LO+:CPERMS_W];
+    regcap.cperms   = mask_clcperms(cperms_mem, clrperm, regcap.valid, sealed);
+    addrmi9         = BOT_W'({1'b0, addr33[31:0]} >> regcap.exp); // ignore the tag valid bit
+    tmp3            = update_temp_fields(regcap.top, regcap.base, addrmi9);   // {top_cor, base_cor}
+    regcap.top_cor  = tmp3[2:1];
+    regcap.base_cor = tmp3[0];
+
+    regcap.rsvd     = msw[RSVD_LO];
+
+    return regcap;
+
+  endfunction
+
+  // Unpack capability metadata from memory into a reg_cap (IT8 encoding, memfmt0).
+  function automatic reg_cap_t mem2regcap_it8_fmt0 (logic [32:0] msw, logic [32:0] addr33, logic [3:0] clrperm);   // IT8
+    reg_cap_t regcap;
+    logic [EXP_W-1:0] cexp;
+    logic [TOP_W-2:0] top8, base8;
+    logic [TOP_W-1:0] top9, base9;
+    logic             denorm, ltop, btop, ttop, tcin; 
+    logic             top_hi, addr_hi;
+    logic [2:0]       res3;                  // not used in this function
+
+    logic [CPERMS_W-1:0] cperms_mem;
+    logic [BOT_W-1:0]    addrmi9;
+    logic                sealed;
+    logic                valid_in;
+    // the tag needs bit 32 of both words to be set; clrperm[3] forces it off
+    valid_in      = msw[32] & addr33[32];
+    regcap.valid  = valid_in & ~clrperm[3];   
+    // a stored exponent code of 0 marks the denormal form
+    cexp          = msw[CEXP5_LO+:CEXP5_W];
+    denorm        = (cexp == 0);
+    // base is stored in full; only the low TOP_W-1 bits of top are stored
+    btop          = msw[BASE_LO+BOT_W-1];
+    base8         = msw[BASE_LO+:(BOT_W-1)];
+    top8          = msw[TOP_LO+:(TOP_W-1)];
+    // the missing MSB of top is rebuilt as ltop ^ tcin ^ btop
+    tcin          = (top8 < base8);           // can actually merge it with t_hi in update_temp_fields QQQ
+    ltop          = ~denorm;
+    ttop          = ltop ^ tcin ^ btop;
+
+    regcap.exp    = cexp ^ {5{~denorm}};      // this is the ^0b11111 part (skipped when denorm)
+    top9          = {ttop, top8};
+    base9         = {btop, base8};
+    regcap.top    = top9;
+    regcap.base   = base9;
+
+    regcap.otype  = msw[OTYPE_LO+:OTYPE_W];
+    // stored permissions pass through mask_clcperms() (defined elsewhere) with clrperm, tag and sealed state
+    sealed = (regcap.otype != OTYPE_UNSEALED);
+    cperms_mem      = msw[CPERMS_LO+:CPERMS_W];
+    regcap.cperms   = mask_clcperms(cperms_mem, clrperm, regcap.valid, sealed);
+    addrmi9         = BOT_W'({1'b0, addr33[31:0]} >> regcap.exp); // ignore the tag valid bit
+
+    // update temp fields: same result as update_temp_fields(regcap.top, regcap.base, addrmi9),
+    //   written out here so that the (top9 < base9) compare can reuse tcin:
+    //   if the MSBs differ, top9 < base9 exactly when ttop == 0; otherwise the low bits decide
+    top_hi   = (btop ^ ttop) ? ~ttop : tcin; 
+    addr_hi  = (addrmi9 < base9);
+
+    regcap.top_cor  = (top_hi == addr_hi)? 2'b00 : ((top_hi && (!addr_hi))? 2'b01 : 2'b11);
+    regcap.base_cor = (addr_hi) ? 1'b1 : 1'b0;
+
+    regcap.rsvd     = msw[RSVD_LO];
+
+    return regcap;
+
+  endfunction
+
+  // Pack reg_cap metadata into the 33-bit memory word (original bound encoding, memfmt0).
+  function automatic logic[32:0] reg2memcap_fmt0 (reg_cap_t regcap);
+    logic [32:0]       word;
+    logic [CEXP_W-1:0] cexp;
+
+    // the reset exponent is stored as the code RESETCEXP, any other one as its low CEXP_W bits
+    cexp = (regcap.exp == RESETEXP) ? RESETCEXP : regcap.exp[CEXP_W-1:0];
+
+    word[32]                  = regcap.valid;       // tag
+    word[RSVD_LO]             = regcap.rsvd;
+    word[CPERMS_LO+:CPERMS_W] = regcap.cperms;
+    word[OTYPE_LO+:OTYPE_W]   = regcap.otype;
+    word[CEXP_LO+:CEXP_W]     = cexp;
+    word[TOP_LO+:TOP_W]       = regcap.top;
+    word[BASE_LO+:BOT_W]      = regcap.base;
+    return word;
+  endfunction
+
+  // Pack reg_cap metadata into the 33-bit memory word (IT8 encoding, memfmt0).
+  function automatic logic[32:0] reg2memcap_it8_fmt0 (reg_cap_t regcap);         // IT8
+
+    logic [32:0] msw;
+    logic        denorm, ltop, cor;
+    logic  [9:0] top10, base10, len10;
+    // top - base on the fields, with an extra top bit added when the two corrections differ
+    cor    = (regcap.top_cor == {2{regcap.base_cor}}); 
+    top10  = {~cor, regcap.top};
+    base10 = {1'b0, regcap.base};
+    len10  = top10 - base10; 
+    ltop   = len10[9] | len10[8];              // set when that difference is 256 or more
+    // denormal form: exponent 0 and a difference below 256
+    denorm = (regcap.exp == 0) && ~ltop;
+
+    msw[32] = regcap.valid;
+    // the exponent is stored inverted unless denorm; only the low 8 bits of top are stored
+    msw[CEXP5_LO+:CEXP5_W]   = regcap.exp ^ {5{~denorm}};
+    msw[TOP_LO+:(TOP_W-1)]   = regcap.top[7:0];
+    msw[BASE_LO+:BOT_W]      = regcap.base  ;
+    msw[OTYPE_LO+:OTYPE_W]   = regcap.otype ;
+    msw[CPERMS_LO+:CPERMS_W] = regcap.cperms;
+    msw[RSVD_LO]             = regcap.rsvd;
+
+    return msw;
+
+  endfunction
+
+  //
+  // pack/unpack the cap+addr between reg and memory
+  // format 1: lsw32 = RSVD+EXP+bounds+A9, msw32 = CPERMS+OTYPE+A23 (bounds are B,T in the original encoding and T,B in IT8)
+  //
+
+  // Unpack capability metadata from memory into a reg_cap (original bound encoding, memfmt1).
+  function automatic reg_cap_t mem2regcap_fmt1 (logic [32:0] msw, logic [32:0] lsw, logic [3:0] clrperm);
+    reg_cap_t regcap;
+    logic [2:0]  tmp3;
+    logic        sealed;
+    logic [8:0]  addrmi9;
+    logic [CPERMS_W-1:0] cperms_mem;
+    logic        valid_in;
+
+    // lsw layout: rsvd[31] EXP[30:27] B[26:18] T[17:9] A[8:0]; the tag needs bit 32 of both words
+    valid_in      = msw[32] & lsw[32];
+    regcap.valid  = valid_in & ~clrperm[3];   
+    regcap.exp    = (lsw[30:27] == RESETCEXP) ?  RESETEXP : {1'b0, lsw[30:27]};
+    regcap.base   = lsw[26:18];
+    regcap.top    = lsw[17:9];
+    addrmi9       = (lsw[30:27] == RESETCEXP) ? {1'b0, lsw[8:1]} : lsw[8:0];   // reset exp: lsw[8:0] holds addr[31:23], see reg2mem_fmt1
+
+    regcap.otype  = msw[25:23];
+    sealed        = (regcap.otype != OTYPE_UNSEALED);
+
+    // stored permissions pass through mask_clcperms() (defined elsewhere); lsw[31] is kept as rsvd
+    cperms_mem    = msw[31:26];
+    regcap.cperms = mask_clcperms(cperms_mem, clrperm, regcap.valid, sealed);
+    regcap.rsvd   = lsw[31];
+
+    tmp3 = update_temp_fields(regcap.top, regcap.base, addrmi9);   // {top_cor, base_cor}
+    regcap.top_cor  = tmp3[2:1];
+    regcap.base_cor = tmp3[0];
+
+    return regcap;
+
+  endfunction
+
+ 
+  // Unpack capability metadata from memory into a reg_cap (IT8 encoding, memfmt1).
+  function automatic reg_cap_t mem2regcap_it8_fmt1 (logic [32:0] msw, logic [32:0] lsw, logic [3:0] clrperm);   // IT8
+    reg_cap_t regcap;
+    logic [EXP_W-1:0] cexp;
+    logic [TOP_W-2:0] top8, base8;
+    logic [TOP_W-1:0] top9, base9;
+    logic             denorm, ltop, btop, ttop, tcin; 
+    logic             top_hi, addr_hi;
+    logic [2:0]       res3;                  // not used in this function
+
+    logic        sealed;
+    logic [8:0]  addrmi9;
+    logic [CPERMS_W-1:0] cperms_mem;
+    logic        valid_in;
+
+
+    // lsw layout: rsvd[31] EXP[30:26] T[25:18] B[17:9] A[8:0]; the tag needs bit 32 of both words
+    valid_in      = msw[32] & lsw[32];
+    regcap.valid  = valid_in & ~clrperm[3];  
+    // a stored exponent code of 0 marks the denormal form
+    cexp          = lsw[30:26];
+    denorm        = (cexp == 0);
+    // base is stored in full (9 bits); only the low 8 bits of top are stored
+    btop          = lsw[17];
+    base8         = lsw[16:9];
+    top8          = lsw[25:18];
+    // the missing MSB of top is rebuilt as ltop ^ tcin ^ btop
+    tcin          = (top8 < base8);           // can actually merge it with t_hi in update_temp_fields QQQ
+    ltop          = ~denorm;
+    ttop          = ltop ^ tcin ^ btop;
+
+    regcap.exp    = cexp ^ {5{~denorm}};      // this is the ^0b11111 part (skipped when denorm)
+    top9          = {ttop, top8};
+    base9         = {btop, base8};
+    regcap.top    = top9;
+    regcap.base   = base9;
+
+    // exp[4] & exp[3] means exp >= 24 (written as regcap.exp >= RESETEXP in the original note)
+    addrmi9       = (regcap.exp[4] & regcap.exp[3]) ? {1'b0, lsw[8:1]} : lsw[8:0];
+
+    regcap.otype  = msw[25:23];
+    sealed        = (regcap.otype != OTYPE_UNSEALED);
+
+    // stored permissions pass through mask_clcperms() (defined elsewhere); lsw[31] is kept as rsvd
+    cperms_mem    = msw[31:26];
+    regcap.cperms = mask_clcperms(cperms_mem, clrperm, regcap.valid, sealed);
+    regcap.rsvd   = lsw[31];
+
+    // same result as update_temp_fields(regcap.top, regcap.base, addrmi9), reusing tcin
+    top_hi   = (btop ^ ttop) ? ~ttop : tcin; 
+    addr_hi  = (addrmi9 < base9);
+
+    regcap.top_cor  = (top_hi == addr_hi)? 2'b00 : ((top_hi && (!addr_hi))? 2'b01 : 2'b11);
+    regcap.base_cor = (addr_hi) ? 1'b1 : 1'b0;
+
+    return regcap;
+
+  endfunction
+
+  // Rebuild the 32-bit address (plus tag bit) from a memfmt1 word pair; used by both encodings.
+  function automatic logic[32:0] mem2regaddr_fmt1 (logic [32:0] msw, logic [32:0] lsw, reg_cap_t regcap);
+    logic [32:0] addr33;
+    logic [31:0] addrmi, addrhi, addrlo;
+    logic [31:0] mask1, mask2;
+
+    // exp[4] & exp[3] means exp >= 24: lsw[8:0] is addr[31:23] and msw[22:0] is addr[22:0]
+    if (regcap.exp[4] & regcap.exp[3]) begin
+      addrhi   = 32'h0;
+      addrmi   = {lsw[8:0], 23'h0};
+      addrlo   = {9'h0, msw[22:0]};
+    end else begin
+      addrmi   = {23'h0, lsw[8:0]} << regcap.exp;      // 9 middle bits go back to bit position exp
+      mask1    = {32{1'b1}} << regcap.exp;             // bits at and above exp
+      mask2    = mask1 << 9;                           // bits at and above exp+9
+      addrhi   = ({9'h0, msw[22:0]} << 9) & mask2;     // upper part of msw[22:0], moved back up by 9
+      addrlo   = {9'h0, msw[22:0]} & (~mask1);         // lower part of msw[22:0], below exp
+    end
+
+    addr33 = {lsw[32], addrhi | addrmi | addrlo};      // bit 32 carries lsw[32]
+
+    return addr33;
+  endfunction
+
+  // Pack a reg_cap and its address into the {msw, lsw} pair (original bound encoding, memfmt1).
+  function automatic logic[65:0] reg2mem_fmt1 (reg_cap_t reg_cap, logic[31:0] addr);
+
+    logic [32:0] msw, lsw;
+    logic [31:0] mask1, mask2;
+    // both words carry the tag in bit 32
+    msw[32]    = reg_cap.valid;
+    msw[31:26] = reg_cap.cperms[5:0];
+    msw[25:23] = reg_cap.otype;
+    lsw[32]    = reg_cap.valid ;
+    lsw[31]    = reg_cap.rsvd;
+    lsw[26:18] = reg_cap.base;
+    lsw[17:9]  = reg_cap.top;
+
+    if (reg_cap.exp == RESETEXP) begin
+      msw[22:0]  = addr[22:0];               // reset exponent: plain split of the address
+      lsw[30:27] = RESETCEXP;
+      lsw[8:0]   = addr[31:23];
+    end else begin
+      mask1    = {32{1'b1}} << reg_cap.exp;  // bits at and above exp
+      mask2    = mask1 << 9;                 // bits at and above exp+9
+      // address bits below exp stay in place, bits from exp+9 upwards move down by 9
+      msw[22:0]  = 23'((addr & ~mask1) | ((addr & mask2) >> 9));
+      lsw[30:27] = reg_cap.exp[CEXP_W-1:0];
+      lsw[8:0]   = 9'(addr >> reg_cap.exp);  // the 9 address bits starting at exp
+    end
+
+    return {msw, lsw};
+
+  endfunction
+
+  // Pack a reg_cap and its address into the {msw, lsw} pair (IT8 encoding, memfmt1).
+  function automatic logic[65:0] reg2mem_it8_fmt1 (reg_cap_t regcap, logic[31:0] addr);        // IT8
+
+    logic [32:0] msw, lsw;
+    logic [31:0] mask1, mask2;
+    logic        denorm, ltop, cor;
+    logic  [9:0] top10, base10, len10;
+    // top - base on the fields, with an extra top bit added when the two corrections differ
+    cor    = (regcap.top_cor == {2{regcap.base_cor}}); 
+    top10  = {~cor, regcap.top};
+    base10 = {1'b0, regcap.base};
+    len10  = top10-base10; 
+    ltop   = len10[9] | len10[8];            // set when that difference is 256 or more
+    // denormal form: exponent 0 and a difference below 256
+    denorm = (regcap.exp == 0) && ~ltop;
+
+    msw[32]    = regcap.valid;
+    msw[31:26] = regcap.cperms[5:0];
+    msw[25:23] = regcap.otype;
+    lsw[32]    = regcap.valid ;
+    lsw[31]    = regcap.rsvd;
+    lsw[30:26] = regcap.exp ^ {5{~denorm}} ;   // stored inverted unless denorm
+    lsw[25:18] = regcap.top[7:0];              // only the low 8 bits of top are stored
+    lsw[17:9]  = regcap.base;
+
+    // exp[4] & exp[3] means exp >= 24 (written as regcap.exp >= RESETEXP in the original note)
+    if (regcap.exp[4] & regcap.exp[3]) begin
+      msw[22:0]  = addr[22:0];
+      lsw[8:0]   = addr[31:23];
+    end else begin
+      mask1    = {32{1'b1}} << regcap.exp;     // bits at and above exp
+      mask2    = mask1 << 9;                   // bits at and above exp+9
+      msw[22:0]  = 23'((addr & ~mask1) | ((addr & mask2) >> 9));   // low bits stay, high bits move down by 9
+      lsw[8:0]   = 9'(addr >> regcap.exp);     // the 9 address bits starting at exp
+    end
+
+    return {msw, lsw};
+
+  endfunction
+
+  // Flatten a regcap into a REGCAP_W-bit vector, one explicit slice per field.
+  // A SystemVerilog cast could do this, but the bit positions are spelled out on purpose.
+  function automatic logic [REGCAP_W-1:0] reg2vec (reg_cap_t regcap);
+
+    logic [REGCAP_W-1:0] flat;
+
+    flat[0+:CPERMS_W] = regcap.cperms;
+    flat[6+:1]        = regcap.rsvd;
+    flat[7+:OTYPE_W]  = regcap.otype;
+    flat[10+:BOT_W]   = regcap.base;
+    flat[19+:TOP_W]   = regcap.top;
+    flat[28+:EXP_W]   = regcap.exp;
+    flat[33+:1]       = regcap.base_cor;
+    flat[34+:2]       = regcap.top_cor;
+    flat[REGCAP_W-1]  = regcap.valid;
+
+    return flat;
+  endfunction
+
+  function automatic reg_cap_t vec2reg (logic [REGCAP_W-1:0] vec_in);
+    // Inverse of reg2vec: rebuilds a regcap from the flattened vector, slice by slice.
+    reg_cap_t rcap;
+
+    rcap.cperms   = vec_in[0+:CPERMS_W];
+    rcap.rsvd     = vec_in[6+:1];
+    rcap.otype    = vec_in[7+:OTYPE_W];
+    rcap.base     = vec_in[10+:BOT_W];
+    rcap.top      = vec_in[19+:TOP_W];
+    rcap.exp      = vec_in[28+:EXP_W];
+    rcap.base_cor = vec_in[33+:1];
+    rcap.top_cor  = vec_in[34+:2];
+    rcap.valid    = vec_in[REGCAP_W-1];
+
+    return rcap;
+  endfunction
+
+  // Equality of two caps and their addresses; perms, top33/base32 and the cor fields are not compared.
+  function automatic logic is_equal (full_cap_t cap_a, full_cap_t cap_b, 
+                                     logic [31:0] addra, logic[31:0] addrb);
+    logic same_bounds, same_meta;
+
+    same_bounds = (cap_a.top == cap_b.top) && (cap_a.base == cap_b.base) &&
+                  (cap_a.exp == cap_b.exp);
+    same_meta   = (cap_a.valid == cap_b.valid) && (cap_a.otype == cap_b.otype) &&
+                  (cap_a.cperms == cap_b.cperms) && (cap_a.rsvd == cap_b.rsvd);
+
+    return same_bounds && same_meta && (addra == addrb);
+  endfunction
+ 
+  // Gate the tag of a regcap: the output tag is in_cap.valid AND tag_mask, other fields pass through.
+  function automatic reg_cap_t and_regcap_tag (reg_cap_t in_cap, logic tag_mask);
+    reg_cap_t gated;
+
+    gated       = in_cap;
+    gated.valid = tag_mask & in_cap.valid;
+    return gated;
+
+  endfunction
+
+  // parameters and constants
+
+  parameter logic[6:0] CHERI_INSTR_OPCODE = 7'h5b;   // 7-bit opcode value; presumably the opcode field of CHERI instructions - confirm at the decoder
+  parameter int OPDW = 36;      // Must be >= the number of cheri operators/instructions we support
+
+  typedef enum logic [5:0] {     // CHERI operator identifiers, numbered contiguously from 6'h00
+    CGET_PERM        = 6'h00,
+    CGET_TYPE        = 6'h01,
+    CGET_BASE        = 6'h02,
+    CGET_LEN         = 6'h03,
+    CGET_TAG         = 6'h04,
+    CGET_TOP         = 6'h05,
+    CGET_HIGH        = 6'h06,
+    CGET_ADDR        = 6'h07,
+    CSEAL            = 6'h08,
+    CUNSEAL          = 6'h09,
+    CAND_PERM        = 6'h0a,
+    CSET_ADDR        = 6'h0b,
+    CINC_ADDR        = 6'h0c,
+    CINC_ADDR_IMM    = 6'h0d,
+    CSET_BOUNDS      = 6'h0e,
+    CSET_BOUNDS_EX   = 6'h0f,
+    CSET_BOUNDS_IMM  = 6'h10,
+    CIS_SUBSET       = 6'h11,
+    CIS_EQUAL        = 6'h12,
+    CMOVE_CAP        = 6'h13,
+    CSUB_CAP         = 6'h14,
+    CCLEAR_TAG       = 6'h15,
+    CLOAD_CAP        = 6'h16,
+    CSET_HIGH        = 6'h17,
+    CSTORE_CAP       = 6'h18,
+    CCSR_RW          = 6'h19,
+    CJALR            = 6'h1a,
+    CJAL             = 6'h1b,
+    CAUIPCC          = 6'h1c,
+    CAUICGP          = 6'h1d,
+    CRRL             = 6'h1e,
+    CRAM             = 6'h1f,
+    CSET_BOUNDS_RNDN = 6'h20      // last entry: 33 operators in total, within OPDW = 36
+  } cheri_op_e;
+
+  typedef enum logic [4:0] {     // no explicit values: CHERI_CSR_NULL = 0, CHERI_CSR_RW = 1
+    CHERI_CSR_NULL,
+    CHERI_CSR_RW
+  } cheri_csr_op_e;
+
+  parameter logic [4:0] CHERI_SCR_MEPCC      = 5'd31;   // 5-bit SCR indices, assigned downwards from 31 to 24
+  parameter logic [4:0] CHERI_SCR_MSCRATCHC  = 5'd30;
+  parameter logic [4:0] CHERI_SCR_MTDC       = 5'd29;
+  parameter logic [4:0] CHERI_SCR_MTCC       = 5'd28;
+  parameter logic [4:0] CHERI_SCR_ZTOPC      = 5'd27;
+  parameter logic [4:0] CHERI_SCR_DSCRATCHC1 = 5'd26;
+  parameter logic [4:0] CHERI_SCR_DSCRATCHC0 = 5'd25;
+  parameter logic [4:0] CHERI_SCR_DEPCC      = 5'd24;
+
+  // permission violations: W_PVIO is the width of the violation vector, PVIO_* are bit indices into it
+  parameter int unsigned W_PVIO = 8;
+
+  parameter logic [2:0] PVIO_TAG   = 3'h0;
+  parameter logic [2:0] PVIO_SEAL  = 3'h1;
+  parameter logic [2:0] PVIO_EX    = 3'h2;
+  parameter logic [2:0] PVIO_LD    = 3'h3;
+  parameter logic [2:0] PVIO_SD    = 3'h4;
+  parameter logic [2:0] PVIO_SC    = 3'h5;
+  parameter logic [2:0] PVIO_ASR   = 3'h6;
+  parameter logic [2:0] PVIO_ALIGN = 3'h7;   // not looked at by vio_cause_enc
+  
+
+  function automatic logic [4:0] vio_cause_enc (logic bound_vio, logic[W_PVIO-1:0] perm_vio_vec);
+    // Priority-encode the violation flags into a 5-bit cause code.
+    //   Permission violations win over bound_vio, in the order listed below.
+    //   perm_vio_vec[PVIO_ALIGN] is not encoded here.
+    //   With no flag set the result is 5'h0.
+    logic [4:0] cause;
+
+    // reverse case: items are tried top to bottom, the first flag equal to 1 selects the code
+    case (1'b1)
+      perm_vio_vec[PVIO_TAG]  : cause = 5'h2;
+      perm_vio_vec[PVIO_SEAL] : cause = 5'h3;
+      perm_vio_vec[PVIO_EX]   : cause = 5'h11;
+      perm_vio_vec[PVIO_LD]   : cause = 5'h12;
+      perm_vio_vec[PVIO_SD]   : cause = 5'h13;
+      perm_vio_vec[PVIO_SC]   : cause = 5'h15;
+      perm_vio_vec[PVIO_ASR]  : cause = 5'h18;
+      bound_vio               : cause = 5'h1;
+      default                 : cause = 5'h0;
+    endcase
+
+    return cause;
+
+  endfunction
+ 
+ 
+endpackage
diff --git a/hw/ip/cheriot-ibex/rtl/cheri_regfile.sv b/hw/ip/cheriot-ibex/rtl/cheri_regfile.sv
new file mode 100644
index 0000000..27c636a
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheri_regfile.sv
@@ -0,0 +1,384 @@
+
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+module cheri_regfile import cheri_pkg::*; #(
+  parameter int unsigned NREGS      = 32,     // data registers implemented (register 0 has no flops)
+  parameter int unsigned NCAPS      = 32,     // registers with capability metadata; must be <= NREGS
+  parameter bit          RegFileECC = 1'b0,   // 1: keep 7 parity bits per register and run fault checks
+  parameter int unsigned DataWidth  = 32,     // data port width; top 7 bits carry parity when RegFileECC
+  parameter bit          CheriPPLBC = 1'b0,   // 1: track tag reservation/revocation per register
+  parameter bit          TRVKBypass = 1'b1    // 1: forward a revocation to the outputs in the same cycle
+) (
+   // Clock and Reset
+  input  logic                  clk_i,
+  input  logic                  rst_ni,
+  input  logic                  par_rst_ni,     // resets parity, shadow and checker state
+
+  //Read port R1
+  input  logic           [4:0]  raddr_a_i,
+  output logic [DataWidth-1:0]  rdata_a_o,
+  output reg_cap_t              rcap_a_o,
+
+  //Read port R2
+  input  logic           [4:0]  raddr_b_i,
+  output logic [DataWidth-1:0]  rdata_b_o,
+  output reg_cap_t              rcap_b_o,
+
+  // Write port W1
+  input  logic          [4:0]   waddr_a_i,
+  input  logic [DataWidth-1:0]  wdata_a_i,
+  input  reg_cap_t              wcap_a_i,
+  input  logic                  we_a_i,         // we always write both cap & data in parallel
+
+  // Tag reservation and revocation port
+  output logic          [31:0]  reg_rdy_o,
+  input  logic          [4:0]   trvk_addr_i,
+  input  logic                  trvk_en_i,
+  input  logic                  trvk_clrtag_i,
+  input  logic          [6:0]   trvk_par_i,     // make sure this is included in lockstep compare
+  input  logic          [4:0]   trsv_addr_i,
+  input  logic                  trsv_en_i,
+  input  logic          [6:0]   trsv_par_i,
+
+  output logic                  alert_o
+);
+
+  // Register file: 32 data bits plus one reg_cap_t of capability metadata per register.
+  //   - Register 0 has no flops; it reads as zero data with NULL_REG_CAP.
+  //   - W1 writes data and capability together; R1/R2 read combinationally.
+  //   - CheriPPLBC: trsv_* clears a register's ready bit; trvk_* sets it and can clear the tag.
+  //   - RegFileECC: requests are registered and re-checked a cycle later; errors drive alert_o.
+
+  // Parity reset value per register; also the constant parity returned for flop-less entries.
+  localparam logic [6:0] DefParBits[0:31] = '{7'h27,7'h0d,7'h6b,7'h41,7'h62,7'h48,7'h2e,7'h04,
+                                              7'h1f,7'h35,7'h53,7'h79,7'h5a,7'h70,7'h16,7'h3c,
+                                              7'h6e,7'h44,7'h22,7'h08,7'h2b,7'h01,7'h67,7'h4d,
+                                              7'h56,7'h7c,7'h1a,7'h30,7'h13,7'h39,7'h5f,7'h75};
+
+  localparam logic [6:0] TrvkParIncr = 7'h15;           // XORed into parity when a revocation clears a tag
+  localparam logic [6:0] NullParBits = 7'h2a;           // 7-bit parity for 32'h0
+
+  logic [31:0] rf_reg   [31:0];
+  logic [31:0] rf_reg_q [NREGS-1:1];
+
+  logic [6:0]  rf_reg_par   [31:0];
+  logic [6:0]  rf_reg_par_q [NREGS-1:1];      // register 0 has no parity flops
+
+  reg_cap_t         rf_cap   [31:0];
+  reg_cap_t         rf_cap_q [NCAPS-1:1];
+
+  reg_cap_t         rcap_a, rcap_b;
+
+  logic [NREGS-1:1] we_a_dec;
+  logic [NREGS-1:1] trvk_dec, trsv_dec;
+  logic [31:0]      reg_rdy_vec;
+
+  logic             pplbc_alert;
+
+  // One-hot decode of the write, revocation and reservation addresses (register 0 excluded).
+  always_comb begin : we_a_decoder
+    for (int unsigned i = 1; i < NREGS; i++) begin
+      we_a_dec[i] = (waddr_a_i == 5'(i)) ? we_a_i : 1'b0;
+      trvk_dec[i] = CheriPPLBC ? (trvk_addr_i == 5'(i)) : 1'b0;
+      trsv_dec[i] = CheriPPLBC ? (trsv_addr_i == 5'(i)) : 1'b0;
+    end
+  end
+
+  // No flops for R0 as it's hard-wired to 0
+  for (genvar i = 1; i < NREGS; i++) begin : g_rf_flops
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        rf_reg_q[i] <= 32'h0;
+      end else if (we_a_dec[i]) begin
+        rf_reg_q[i] <= wdata_a_i[31:0];
+      end
+    end
+
+    if (RegFileECC) begin : g_reg_par
+      logic [6:0] wdata_par;
+      logic       trvk_clr_we;
+
+      assign trvk_clr_we = CheriPPLBC & trvk_dec[i] & trvk_en_i & trvk_clrtag_i;
+      assign wdata_par   = wdata_a_i[DataWidth-1:DataWidth-7];
+
+      // split reset of data and parity to detect spurious reset (fault protection)
+      always_ff @(posedge clk_i or negedge par_rst_ni) begin
+        if (!par_rst_ni) begin
+          rf_reg_par_q[i] <= DefParBits[i];
+        end else if (trvk_clr_we && we_a_dec[i]) begin
+          rf_reg_par_q[i] <= wdata_par ^ TrvkParIncr;
+        end else if (trvk_clr_we) begin
+          // update parity bits
+          rf_reg_par_q[i] <= rf_reg_par_q[i] ^ TrvkParIncr;
+        end else if (we_a_dec[i]) begin
+          rf_reg_par_q[i] <= wdata_par;
+        end
+      end
+    end else begin : g_no_reg_par
+      assign rf_reg_par_q[i] = 7'h0;
+    end  // gen reg_par
+
+  end // g_rf_flops
+
+  // Expand to 32 read entries: register 0 and unimplemented registers read as constants.
+  assign rf_reg[0]     = 32'h0;
+  assign rf_reg_par[0] = DefParBits[0];
+  for (genvar i=1; i<32 ; i++) begin
+    if (i < NREGS) begin
+      assign rf_reg[i]     = rf_reg_q[i];
+      assign rf_reg_par[i] = rf_reg_par_q[i];
+    end else begin
+      assign rf_reg[i]     = 0;
+      assign rf_reg_par[i] = DefParBits[i];
+    end
+  end
+
+  assign rdata_a_o = DataWidth'({rf_reg_par[raddr_a_i], rf_reg[raddr_a_i]});
+  assign rdata_b_o = DataWidth'({rf_reg_par[raddr_b_i], rf_reg[raddr_b_i]});
+
+  // capability meta data (MSW)
+  for (genvar i = 1; i < NCAPS; i++) begin : g_cap_flops
+    logic trvk_clr_we;
+
+    assign trvk_clr_we = CheriPPLBC & trvk_dec[i] & trvk_en_i & trvk_clrtag_i;
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        rf_cap_q[i] <= NULL_REG_CAP;
+      end else if (trvk_clr_we && we_a_dec[i]) begin
+        rf_cap_q[i] <= and_regcap_tag(wcap_a_i, 1'b0);
+      end else if (trvk_clr_we) begin
+        // prioritize revocation (later in pipeline)
+        rf_cap_q[i] <= and_regcap_tag(rf_cap_q[i], 1'b0);
+      end else if (we_a_dec[i]) begin
+        rf_cap_q[i] <= wcap_a_i;
+      end
+    end
+  end
+
+  assign rf_cap[0] = NULL_REG_CAP;
+  for (genvar i=1; i<32 ; i++) begin
+    if (i < NCAPS) begin
+      assign rf_cap[i] = rf_cap_q[i];
+    end else begin
+      assign rf_cap[i] = NULL_REG_CAP;
+    end
+  end
+
+  assign rcap_a = rf_cap[raddr_a_i];
+  assign rcap_b = rf_cap[raddr_b_i];
+
+  if (CheriPPLBC) begin : g_regrdy
+    // ready vector: bit i is cleared by a reservation (trsv) and set again by a revocation (trvk)
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni)
+        reg_rdy_vec[0] <= 1'b1;
+    end
+
+    for (genvar i=1; i<NCAPS; i++) begin
+      always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (!rst_ni)
+          reg_rdy_vec[i] <= 1'b1;
+        else if (trsv_dec[i] & trsv_en_i)   // prioritize trsv over trvk
+          reg_rdy_vec[i] <= 1'b0;
+        else if (trvk_dec[i] & trvk_en_i)
+          reg_rdy_vec[i] <= 1'b1;
+      end  // always_ff
+    end
+
+    // unused bits
+    for (genvar i=NCAPS; i<32; i++) begin
+      always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (!rst_ni)
+          reg_rdy_vec[i] <= 1'b1;
+      end
+    end
+
+    // build the shadow copy of reg_rdy_vec for fault protection
+    if (RegFileECC) begin : gen_shdw
+      logic  [4:0] trvk_addr_q;
+      logic        trvk_en_q;
+      logic        trvk_clrtag_q;
+      logic  [6:0] trvk_par_q;
+      logic  [4:0] trsv_addr_q;
+      logic        trsv_en_q;
+      logic  [6:0] trsv_par_q;
+
+      logic      [31:0] reg_rdy_vec_shdw, reg_rdy_vec_q;
+      logic [NREGS-1:1] trvk_dec_shdw, trsv_dec_shdw;
+      logic             shdw_mismatch_err, cap_rvk_err;
+
+      always_comb begin
+        for (int unsigned i = 1; i < NREGS; i++) begin
+          trvk_dec_shdw[i] = (trvk_addr_q == 5'(i));
+          trsv_dec_shdw[i] = (trsv_addr_q == 5'(i));
+        end
+      end
+
+      always_ff @(posedge clk_i or negedge par_rst_ni) begin
+        if (!par_rst_ni) begin
+           trvk_addr_q   <= 5'h0;
+           trvk_en_q     <= 1'b0;
+           trvk_clrtag_q <= 1'b0;
+           trvk_par_q    <= NullParBits;
+           trsv_addr_q   <= 5'h0;
+           trsv_en_q     <= 1'b0;
+           trsv_par_q    <= NullParBits;
+           reg_rdy_vec_q <= {32{1'b1}};
+        end else begin
+           trvk_addr_q   <= trvk_addr_i;
+           trvk_en_q     <= trvk_en_i;
+           trvk_clrtag_q <= trvk_clrtag_i;
+           trvk_par_q    <= trvk_par_i;
+           trsv_addr_q   <= trsv_addr_i;
+           trsv_en_q     <= trsv_en_i;
+           trsv_par_q    <= trsv_par_i;
+           reg_rdy_vec_q <= reg_rdy_vec;
+        end
+      end
+
+      for (genvar i = 0; i < 32; i++) begin
+        if ((i == 0) || (i >= NCAPS)) begin
+          assign reg_rdy_vec_shdw[i] = 1'b1;
+        end else begin
+          always_ff @(posedge clk_i or negedge par_rst_ni) begin
+            if (!par_rst_ni)
+              reg_rdy_vec_shdw[i] <= 1'b1;
+            else if (trsv_dec_shdw[i] & trsv_en_q)
+              reg_rdy_vec_shdw[i] <= 1'b0;
+            else if (trvk_dec_shdw[i] & trvk_en_q)
+              reg_rdy_vec_shdw[i] <= 1'b1;
+          end  // always_ff
+        end
+      end
+
+      // generate alert
+      assign shdw_mismatch_err = (reg_rdy_vec_shdw != reg_rdy_vec_q);
+
+      // readback revoked cap to make sure the valid bit is actually cleared
+      always_comb begin
+        cap_rvk_err = 0;
+        for (int unsigned i = 1; i < NCAPS; i++) begin
+          cap_rvk_err = cap_rvk_err | (trvk_en_q & trvk_clrtag_q & trvk_dec_shdw[i] & rf_cap_q[i].valid);
+        end
+      end
+
+      // check parity of trsv and trvk requests
+      logic [1:0] trsv_ecc_err, trvk_ecc_err;
+
+      prim_secded_inv_39_32_dec trsv_ecc_i (
+        .data_i    ({trsv_par_q, 26'h0, trsv_en_q, trsv_addr_q}),
+        .data_o    (),
+        .syndrome_o(),
+        .err_o     (trsv_ecc_err)
+      );
+
+      // NOTE: this instance checks the trvk request; the name "trsk_ecc_i" (sic) is kept for path stability
+      prim_secded_inv_39_32_dec trsk_ecc_i (
+        .data_i    ({trvk_par_q, 25'h0, trvk_en_q, trvk_clrtag_q, trvk_addr_q}),
+        .data_o    (),
+        .syndrome_o(),
+        .err_o     (trvk_ecc_err)
+      );
+
+      assign pplbc_alert = shdw_mismatch_err | cap_rvk_err | (|trsv_ecc_err) | (|trvk_ecc_err);
+
+    end else begin : gen_no_shdw // no ECC or shdw checking
+      assign pplbc_alert = 1'b0;
+    end
+
+  end else begin : g_no_regrdy
+    assign reg_rdy_vec = {32{1'b1}};
+    assign pplbc_alert = 1'b0;
+  end  // not pplbc
+
+  //
+  //  read back last-written register for fault protection
+  //
+  logic reg_rdbk_err;
+
+  if (RegFileECC) begin : gen_fault_rdbk
+    logic       [4:0] waddr_a_q;
+    logic      [31:0] wdata_a_q;
+    logic       [6:0] wpar_a_q;
+    logic      [37:0] wcap_vec_q;
+    logic             we_a_q;
+    logic      [31:0] wdata_tmp;
+    logic       [6:0] rpar_tmp;
+    logic       [1:0] wreq_ecc_err;
+    logic             rdbk_cmp_err;
+
+    // flop the write request and check parity
+    //   need all fields to compute parity bits
+    always_ff @(posedge clk_i or negedge par_rst_ni) begin
+      if (!par_rst_ni) begin
+        waddr_a_q   <= 5'h0;
+        wdata_a_q   <= 32'h0;
+        wpar_a_q    <= NullParBits;
+        wcap_vec_q  <= 38'h0;
+        we_a_q      <= 1'b0;
+      end else begin
+        waddr_a_q   <= waddr_a_i;
+        wdata_a_q   <= wdata_a_i[31:0];
+        wpar_a_q    <= wdata_a_i[DataWidth-1:DataWidth-7];
+        wcap_vec_q  <= reg2vec(wcap_a_i);
+        we_a_q      <= we_a_i;
+      end
+    end
+
+    assign wdata_tmp    = wdata_a_q ^ wcap_vec_q[31:0] ^ {20'h0, we_a_q, waddr_a_q, wcap_vec_q[37:32]};
+
+    prim_secded_inv_39_32_dec wdata_ecc_i (
+      .data_i    ({wpar_a_q, wdata_tmp}),
+      .data_o    (),
+      .syndrome_o(),
+      .err_o     (wreq_ecc_err)
+    );
+
+    // read back the stored parity of the register written in the previous cycle (parity bits only)
+    assign rpar_tmp     = rf_reg_par[waddr_a_q];
+
+    // NOTE(review): a write that coincides with a tag-clearing revocation of the same register
+    //   stores wdata_par ^ TrvkParIncr, which would not equal wpar_a_q below - confirm the
+    //   pipeline never presents both to one register in the same cycle.
+    assign rdbk_cmp_err = (rpar_tmp != wpar_a_q) && (waddr_a_q != 0) && we_a_q;
+
+    assign reg_rdbk_err = (|wreq_ecc_err) | rdbk_cmp_err;
+
+  end else begin : gen_no_fault_rdbk
+    assign reg_rdbk_err = 1'b0;
+  end
+
+  assign alert_o   = pplbc_alert | reg_rdbk_err;
+
+  reg_cap_t rcap_a_rvkd, rcap_b_rvkd;
+
+  if (TRVKBypass) begin
+    // Bypass the register update cycle and directly update the read ports
+    always_comb begin
+      reg_rdy_o = reg_rdy_vec | ({NREGS{trvk_en_i}} & {trvk_dec, 1'b0});
+
+      rcap_a_rvkd = rcap_a;
+      if (trvk_en_i && trvk_clrtag_i && (trvk_addr_i == raddr_a_i))
+        rcap_a_rvkd.valid = 1'b0;
+      rcap_a_o = rcap_a_rvkd;
+
+      rcap_b_rvkd = rcap_b;
+      if (trvk_en_i && trvk_clrtag_i && (trvk_addr_i == raddr_b_i))
+        rcap_b_rvkd.valid = 1'b0;
+      rcap_b_o = rcap_b_rvkd;
+
+    end
+  end else begin
+    assign reg_rdy_o = reg_rdy_vec;
+
+    assign rcap_a_rvkd = rcap_a;
+    assign rcap_a_o    = rcap_a_rvkd;
+    assign rcap_b_rvkd = rcap_b;
+    assign rcap_b_o  = rcap_b_rvkd;
+  end
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheri_stkz.sv b/hw/ip/cheriot-ibex/rtl/cheri_stkz.sv
new file mode 100644
index 0000000..ba6ce15
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheri_stkz.sv
@@ -0,0 +1,161 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+
+module cheri_stkz import cheri_pkg::*; (
+   // Clock and Reset
+  input  logic          clk_i,
+  input  logic          rst_ni,
+
+  // CSR register interface
+  input  logic          ztop_wr_i,
+  input  logic [31:0]   ztop_wdata_i,
+  input  full_cap_t     ztop_wfcap_i,
+  output logic [31:0]   ztop_rdata_o,
+  output reg_cap_t      ztop_rcap_o,
+
+  input  logic          unmasked_intr_i,
+
+  output logic          stkz_active_o,
+  output logic          stkz_abort_o,
+  output logic [31:0]   stkz_ptr_o,
+  output logic [31:0]   stkz_base_o,
+  output logic          stkz_err_o,
+
+  // LSU req/resp interface (to be multiplexed/qualified)
+  input  logic          lsu_stkz_resp_valid_i,
+  input  logic          lsu_stkz_resp_err_i,
+  input  logic          lsu_stkz_req_done_i,
+  output logic          stkz_lsu_req_o,
+  output logic          stkz_lsu_we_o,
+  output logic          stkz_lsu_is_cap_o,
+  output logic [31:0]   stkz_lsu_addr_o,
+  output logic [32:0]   stkz_lsu_wdata_o
+);
+
+  // FSM: IDLE -> ACTIVE on stkz_start; ACTIVE -> IDLE when a request completes with stkz_done
+  //   or stkz_stop set; ACTIVE -> ABORT on stkz_stop otherwise; ABORT -> IDLE on request done.
+  typedef enum logic [1:0] {STKZ_IDLE, STKZ_ACTIVE, STKZ_ABORT} stkz_fsm_t;
+
+  stkz_fsm_t    stkz_fsm_d, stkz_fsm_q;
+
+  logic  [29:0] stkz_ptrw, stkz_ptrw_nxt;   // word addresses: read-back progress / next store
+  logic  [29:0] stkz_basew;                 // word address of the last word to clear
+  logic         stkz_start, stkz_done, stkz_stop, stkz_active;
+  logic         stkz_step;                  // a request completed while the engine is active
+  reg_cap_t     ztop_rcap, ztop_rcap_nxt;
+  logic  [32:0] ztop_wtop33;
+  logic  [31:0] ztop_wbase32;
+  logic         waddr_eq_base;
+  logic         cmd_cap_good;
+  reg_cap_t     cmd_wcap;
+  logic         cmd_new_cap, cmd_new_null;
+  logic         cmd_is_n2z;
+
+  // decode of the value being written to ztop
+  assign ztop_wbase32  = ztop_wfcap_i.base32;
+  assign ztop_wtop33   = ztop_wfcap_i.top33;
+
+  assign cmd_new_null  = ztop_wr_i && (ztop_wfcap_i == NULL_FULL_CAP) && (ztop_wdata_i == 32'h0);
+  assign cmd_new_cap   = ztop_wr_i && ~cmd_new_null;
+  assign cmd_cap_good  = ztop_wfcap_i.valid && ztop_wfcap_i.perms[PERM_SD] &&
+                         (ztop_wtop33[32:2] >= ztop_wdata_i[31:2]);
+  assign cmd_is_n2z    = cmd_cap_good && (ztop_wdata_i[31:2] == ztop_wbase32[31:2]);
+
+  // engine control
+  assign stkz_start    = cmd_new_cap && cmd_cap_good && (ztop_wdata_i[31:2] > ztop_wbase32[31:2]);
+  assign stkz_done     = (stkz_ptrw_nxt <= stkz_basew);
+  assign stkz_stop     = unmasked_intr_i | cmd_new_null;
+  assign stkz_active   = (stkz_fsm_q != STKZ_IDLE);
+  assign stkz_step     = stkz_active && lsu_stkz_req_done_i;
+
+  // status and ztop read-back outputs
+  assign stkz_active_o = stkz_active;
+  assign stkz_abort_o  = (stkz_fsm_q == STKZ_ABORT);
+  assign stkz_ptr_o    = {stkz_ptrw, 2'h0};
+  assign stkz_base_o   = {stkz_basew, 2'h0};
+  assign ztop_rdata_o  = {stkz_ptrw, 2'h0};
+  assign ztop_rcap_o   = ztop_rcap;
+
+  // LSU request: a non-capability store of zero to the word at stkz_ptrw_nxt
+  assign stkz_lsu_req_o    = stkz_active;
+  assign stkz_lsu_we_o     = 1'b1;
+  assign stkz_lsu_is_cap_o = 1'b0;        // this means we are really writing 33'h0 to memory
+  assign stkz_lsu_addr_o   = {stkz_ptrw_nxt, 2'h0};
+  assign stkz_lsu_wdata_o  = 33'h0;
+
+  // FSM next state; holds the current state unless one of the transitions below fires
+  always_comb begin
+    stkz_fsm_d = stkz_fsm_q;
+    case (stkz_fsm_q)
+      STKZ_IDLE: if (stkz_start) stkz_fsm_d = STKZ_ACTIVE;
+      STKZ_ACTIVE:
+        if (lsu_stkz_req_done_i && (stkz_done || stkz_stop))
+          stkz_fsm_d = STKZ_IDLE;     // "normal" completion, or abort as the request completes
+        else if (stkz_stop)
+          stkz_fsm_d = STKZ_ABORT;    // pending abort, wait till lsu_req_done
+      STKZ_ABORT: if (lsu_stkz_req_done_i) stkz_fsm_d = STKZ_IDLE;  // self clear by any further load/store activity
+      default: ;
+    endcase
+  end
+
+  // value stored on a ztop write, and the read-back capability after the next completed store
+  always_comb begin
+    logic [8:0] addr_mid9;
+    logic [2:0] cor_bits;
+    // clear tag if writing a ztop value with address == base
+    cmd_wcap = full2regcap(ztop_wfcap_i);
+    if (cmd_is_n2z) cmd_wcap.valid = 1'b0;
+
+    // we are doing this in lieu of a full set_address.
+    //   note we only start a zeroization if addr > base32 so no need for representability check
+    addr_mid9 = {stkz_ptrw_nxt, 2'b00} >> ztop_rcap.exp;
+    cor_bits  = update_temp_fields(ztop_rcap.top, ztop_rcap.base, addr_mid9);
+
+    ztop_rcap_nxt          = ztop_rcap;
+    ztop_rcap_nxt.top_cor  = cor_bits[2:1];
+    ztop_rcap_nxt.base_cor = cor_bits[0];
+    ztop_rcap_nxt.valid    = ztop_rcap.valid & ~stkz_done;
+  end
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      stkz_fsm_q    <= STKZ_IDLE;
+      stkz_ptrw     <= 30'h0;
+      stkz_ptrw_nxt <= 30'h0;
+      stkz_basew    <= 30'h0;
+      stkz_err_o    <= 1'b0;
+      ztop_rcap     <= NULL_REG_CAP;
+    end else begin
+      stkz_fsm_q <= stkz_fsm_d;
+
+      // zcap is a WARL SCR
+      //   - if active: readback returns current progress; allow writing NULL to stop (readback NULL)
+      //   - if not active: only allow writing tagged cap (legalized), which starts zeroization;
+      //     special case: a tagged cap with addr == base will NOT start zeroization, tag reads clear
+      if (ztop_wr_i) begin
+        stkz_ptrw <= ztop_wdata_i[31:2];
+        ztop_rcap <= cmd_wcap;
+      end else if (stkz_step) begin
+        stkz_ptrw <= stkz_ptrw_nxt;
+        ztop_rcap <= ztop_rcap_nxt;
+      end
+
+      // this is the captured hardware zeroization context, only updated for valid zeroization runs
+      if (stkz_start) begin
+        stkz_ptrw_nxt <= ztop_wdata_i[31:2] - 1;
+        stkz_basew    <= ztop_wbase32[31:2];
+      end else if (stkz_step && ~(stkz_done | stkz_stop)) begin
+        stkz_ptrw_nxt <= stkz_ptrw_nxt - 1;
+      end
+
+      // sticky LSU error flag: set by an error response, cleared when a run starts from idle
+      if (~stkz_active && stkz_start)
+        stkz_err_o <= 1'b0;
+      else if (lsu_stkz_resp_valid_i && lsu_stkz_resp_err_i)
+        stkz_err_o <= 1'b1;
+    end
+  end
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheri_tbre.sv b/hw/ip/cheriot-ibex/rtl/cheri_tbre.sv
new file mode 100644
index 0000000..eb5df17
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheri_tbre.sv
@@ -0,0 +1,269 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+
+module cheri_tbre #(
+  parameter int unsigned FifoSize = 4,        // must be power-of-2
+  parameter int unsigned AddrHi = 31          // highest address bit kept per request
+) (
+   // Clock and Reset
+  input  logic          clk_i,
+  input  logic          rst_ni,
+
+  // MMIO register interface 
+  input  logic [65:0]   tbre_ctrl_vec_i,      // {add1wait, go, end_addr[31:0], start_addr[31:0]}
+  output logic          tbre_stat_o,          // high while the engine is not idle
+  output logic          tbre_err_o,           // sticky LSU response error, cleared when a run starts
+
+  // LSU req/resp interface (to be multiplexed/qualified)
+  input  logic          lsu_tbre_resp_valid_i,
+  input  logic          lsu_tbre_resp_err_i,
+  input  logic          lsu_tbre_resp_is_wr_i,
+  input  logic [32:0]   lsu_tbre_raw_lsw_i,   
+  input  logic          lsu_tbre_req_done_i,   
+  input  logic          lsu_tbre_addr_incr_i,
+  output logic          tbre_lsu_req_o,
+  output logic          tbre_lsu_is_cap_o,
+  output logic          tbre_lsu_we_o,
+  output logic [31:0]   tbre_lsu_addr_o,
+  output logic [32:0]   tbre_lsu_wdata_o,
+
+  // LSU snoop interface
+  input  logic          snoop_lsu_req_done_i,   
+  input  logic          snoop_lsu_req_i,
+  input  logic          snoop_lsu_is_cap_i,
+  input  logic          snoop_lsu_we_i,
+  input  logic          snoop_lsu_cheri_err_i,
+  input  logic [31:0]   snoop_lsu_addr_i,
+
+  // trvk interface
+  input  logic          trvk_en_i,
+  input  logic          trvk_clrtag_i
+);
+
+  // Background engine: walks start_addr..end_addr in 8-byte steps, loading each location as a
+  //   capability. If the loaded tag is set and the trvk result in the same FIFO slot asks for a tag
+  //   clear, the data is written back with bit 32 zero. Entries hit by a snooped store are dropped.
+
+  localparam int          FifoPtrW  = $clog2(FifoSize);
+  localparam int          CapFifoDW = 33+1;             // {resp_err, 33-bit raw LSW}
+  localparam int unsigned ReqFifoDW = AddrHi-1;         // {valid, addr[AddrHi:3]}
+
+  logic        load_stop_cond, load_gnt;
+  logic        store_gnt;
+  logic        store_req_valid;
+  logic [31:0] load_addr, store_addr;
+  logic        wait_resp_q;
+
+  logic        req_fifo_wr_en, cap_fifo_wr_en, shdw_fifo_wr_en, fifo_rd_en;
+
+  logic [AddrHi-3:0]     cur_load_addr8, load_addr8_p1;
+  logic [FifoPtrW:0]     req_fifo_ext_wr_ptr, cap_fifo_ext_wr_ptr, shdw_fifo_ext_wr_ptr;
+  logic [FifoPtrW:0]     os_req_cnt;
+  logic [FifoPtrW:0]     fifo_ext_rd_ptr;    
+  logic [FifoPtrW-1:0]   req_fifo_wr_ptr, cap_fifo_wr_ptr, shdw_fifo_wr_ptr;
+  logic [FifoPtrW-1:0]   fifo_rd_ptr;    
+  logic                  shdw_fifo_wr_data;
+  logic [CapFifoDW-1:0]  cap_fifo_wr_data;
+  logic [ReqFifoDW-1:0]  req_fifo_wr_data;
+  logic                  fifo_rd_shdw, fifo_rd_tag, fifo_rd_valid, fifo_rd_err;
+  logic [31:0]           fifo_rd_data;
+  logic [AddrHi-3:0]     fifo_rd_addr8;
+  logic                  fifo_not_empty;
+
+  typedef enum logic [1:0] {TBRE_IDLE, TBRE_LOAD, TBRE_WAIT} tbre_fsm_t;
+  tbre_fsm_t tbre_fsm_q, tbre_fsm_d;
+
+  typedef enum logic [1:0] {SCH_NONE, SCH_LOAD, SCH_STORE} tbre_sch_t;
+  tbre_sch_t tbre_sch_q, tbre_sch_d;
+
+  typedef struct packed {
+    logic         go;
+    logic         add1wait;
+    logic [31:0]  start_addr;
+    logic [31:0]  end_addr;
+  } tbre_ctrl_t;
+
+  tbre_ctrl_t tbre_ctrl;
+
+  // register interface
+  assign tbre_ctrl.go         = tbre_ctrl_vec_i[64];
+  assign tbre_ctrl.add1wait   = tbre_ctrl_vec_i[65];
+  assign tbre_ctrl.start_addr = tbre_ctrl_vec_i[31:0];
+  assign tbre_ctrl.end_addr   = tbre_ctrl_vec_i[63:32];
+  assign tbre_stat_o          = (tbre_fsm_q != TBRE_IDLE);
+
+  //  note having resp_valid here improves performance but making timing a bit worse 
+  //     (data_rvalid --> tbre_lsu_req --> core/tbre mux select --> data_wdata_o
+  assign tbre_lsu_req_o    = ((tbre_sch_q == SCH_LOAD) | ((tbre_sch_q == SCH_STORE) && store_req_valid)) & (~wait_resp_q |  (lsu_tbre_resp_valid_i & ~tbre_ctrl.add1wait));
+  assign tbre_lsu_is_cap_o = (tbre_sch_q == SCH_LOAD);
+  assign tbre_lsu_we_o     = (tbre_sch_q == SCH_STORE);
+  assign tbre_lsu_addr_o   = (tbre_sch_q == SCH_LOAD) ? load_addr + {lsu_tbre_addr_incr_i, 2'b00} : store_addr;
+  assign tbre_lsu_wdata_o  = {1'b0, fifo_rd_data};
+
+  assign load_addr8_p1     = cur_load_addr8 + 1;      // wraps to zero at the top of the range
+  // Stop once the current location is the last one. Testing cur_load_addr8 rather than
+  //   load_addr8_p1 keeps the comparison correct when the increment wraps around.
+  assign load_stop_cond  = (cur_load_addr8 >= tbre_ctrl.end_addr[AddrHi:3]);
+  assign load_gnt        = (tbre_sch_q == SCH_LOAD) & lsu_tbre_req_done_i;
+  assign store_gnt       = (tbre_sch_q == SCH_STORE) & lsu_tbre_req_done_i;
+
+  // expand load/store address by concatenating the MSB from start_address (save some area)
+  assign load_addr       = (AddrHi >= 31) ? {cur_load_addr8, 3'b000} : 
+                           {tbre_ctrl.start_addr[31:AddrHi+1], cur_load_addr8, 3'b000}; 
+  assign store_addr      = (AddrHi >= 31) ? {fifo_rd_addr8, 3'b000} : 
+                           {tbre_ctrl.start_addr[31:AddrHi+1], fifo_rd_addr8, 3'b000}; 
+
+  always_comb begin
+    logic load_stall, req_fifo_full;
+
+    // state machine tracking the progress of memory walk
+    if ((tbre_fsm_q == TBRE_IDLE) && tbre_ctrl.go)
+      tbre_fsm_d = TBRE_LOAD;
+    else if ((tbre_fsm_q == TBRE_LOAD) && load_gnt & load_stop_cond)
+      tbre_fsm_d = TBRE_WAIT;
+    else if ((tbre_fsm_q == TBRE_WAIT) &&  (os_req_cnt == 0))
+      tbre_fsm_d = TBRE_IDLE;
+    else
+      tbre_fsm_d = tbre_fsm_q;
+
+    // arbitration between load/store requests, throttle if too many outstanding load requests
+    //   TBRE assumes a non-buffered memory model (new req won't be gnt'd if the prev response
+    //   still outstanding). If not, we have to change this to throttle on resp as well since
+    //   the load_store_unit can't handle multiple outstanding requests.
+ 
+    load_stall    = (os_req_cnt >= FifoSize-1);
+    req_fifo_full = (os_req_cnt >= FifoSize);
+
+    tbre_sch_d = tbre_sch_q;   // default
+    case (tbre_sch_q) 
+      SCH_NONE: 
+        if ((tbre_fsm_q == TBRE_LOAD) && !req_fifo_full)
+          tbre_sch_d = SCH_LOAD;
+        else if (store_req_valid)
+          tbre_sch_d = SCH_STORE;
+      SCH_LOAD:
+        if (load_gnt & (load_stall || (tbre_fsm_d == TBRE_WAIT)) & store_req_valid)
+          tbre_sch_d = SCH_STORE;
+        else if (load_gnt & (load_stall || (tbre_fsm_d == TBRE_WAIT)))
+          tbre_sch_d = SCH_NONE;
+      SCH_STORE:
+        if ((store_gnt | ~store_req_valid) & (tbre_fsm_q == TBRE_LOAD))
+          tbre_sch_d = SCH_LOAD;     // no need to check req_fifo_full, since we are dequeuing from it
+        else if (store_gnt|~store_req_valid)    // go back to NONE to allow reading fifo further
+          tbre_sch_d = SCH_NONE;     // no bandwidth loss here since the load req will move ahead anyway
+      default:;
+    endcase
+  end
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      tbre_fsm_q      <= TBRE_IDLE;
+      tbre_sch_q      <= SCH_NONE;
+      cur_load_addr8  <= 'h0;
+      wait_resp_q     <= 1'b0;
+      tbre_err_o      <= 1'b0;
+    end else begin
+
+      tbre_fsm_q <= tbre_fsm_d;
+      tbre_sch_q <= tbre_sch_d;
+
+      if (tbre_ctrl.go & (tbre_fsm_q == TBRE_IDLE))
+        cur_load_addr8 <= tbre_ctrl.start_addr[AddrHi:3];
+      else if (load_gnt)
+        cur_load_addr8 <= load_addr8_p1;
+
+      if (load_gnt | store_gnt)
+        wait_resp_q <= 1'b1;
+      else if (lsu_tbre_resp_valid_i)
+        wait_resp_q <= 1'b0;
+
+      // for now just capture/latch errors and flag it to firmware
+      if ((tbre_fsm_q == TBRE_IDLE) && tbre_ctrl.go)
+        tbre_err_o <= 1'b0;
+      else if (lsu_tbre_resp_valid_i && lsu_tbre_resp_err_i)
+        tbre_err_o <= 1'b1;
+    end
+  end
+
+  // FIFOs to buffer caps read from the data memory and shadow bits from the shadow map
+  // count of outstanding load requests in the pipeline
+  assign os_req_cnt = req_fifo_ext_wr_ptr - fifo_ext_rd_ptr;
+
+  assign req_fifo_wr_ptr  = req_fifo_ext_wr_ptr[FifoPtrW-1:0];
+  assign cap_fifo_wr_ptr  = cap_fifo_ext_wr_ptr[FifoPtrW-1:0];
+  assign shdw_fifo_wr_ptr = shdw_fifo_ext_wr_ptr[FifoPtrW-1:0];
+  assign fifo_rd_ptr      = fifo_ext_rd_ptr[FifoPtrW-1:0];
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      fifo_ext_rd_ptr      <= 'h0;
+      req_fifo_ext_wr_ptr  <= 'h0;
+      cap_fifo_ext_wr_ptr  <= 'h0;
+      shdw_fifo_ext_wr_ptr <= 'h0;
+    end else begin
+      // FIFO size is power-of-2
+      if (fifo_rd_en) fifo_ext_rd_ptr <= fifo_ext_rd_ptr + 1;
+
+      if (req_fifo_wr_en)  req_fifo_ext_wr_ptr  <= req_fifo_ext_wr_ptr + 1;
+      if (cap_fifo_wr_en)  cap_fifo_ext_wr_ptr  <= cap_fifo_ext_wr_ptr + 1;      
+      if (shdw_fifo_wr_en) shdw_fifo_ext_wr_ptr <= shdw_fifo_ext_wr_ptr + 1;
+    end
+  end
+
+  logic [FifoSize-1:0][ReqFifoDW-1:0] req_fifo_mem;          // packed entry: valid, addr[AddrHi:3]
+  logic [FifoSize-1:0][CapFifoDW-1:0] cap_fifo_mem;          // packed entry: resp_err, tag, 32-bit data
+  logic [FifoSize-1:0]                shdw_fifo_mem;         // single shadow bit per entry
+
+  for (genvar i= 0; i < FifoSize; i++) begin : gen_fifo_mem
+    logic [28:0] req_fifo_item_addr8;
+    assign req_fifo_item_addr8 = (AddrHi >= 31) ? req_fifo_mem[i][AddrHi-3:0] : 
+                           {tbre_ctrl.start_addr[31:AddrHi+1], req_fifo_mem[i][AddrHi-3:0]}; 
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        req_fifo_mem[i]  <= 0;
+        cap_fifo_mem[i]  <= 0;
+        shdw_fifo_mem[i] <= 1'b0;
+      end else begin
+        // monitor completed LSU writes: a snooped store to this entry's address clears its valid bit
+        // TODO (open question): what about a collision between write request and head of the FIFO?
+        if (req_fifo_wr_en && (i == req_fifo_wr_ptr)) 
+          req_fifo_mem[i] <= req_fifo_wr_data;
+        else if ((req_fifo_item_addr8 == snoop_lsu_addr_i[31:3]) && snoop_lsu_req_done_i && snoop_lsu_we_i)
+          req_fifo_mem[i] <= req_fifo_mem[i] & {1'b0, {(AddrHi-2){1'b1}}};
+
+        if (cap_fifo_wr_en && (i == cap_fifo_wr_ptr)) cap_fifo_mem[i] <= cap_fifo_wr_data;
+        if (shdw_fifo_wr_en && (i == shdw_fifo_wr_ptr)) shdw_fifo_mem[i] <= shdw_fifo_wr_data;
+      end
+    end  // always
+  end  // generate
+
+  // peek into the current FIFO head
+  assign fifo_rd_addr8  = req_fifo_mem[fifo_rd_ptr][AddrHi-3:0];
+  assign fifo_rd_valid  = req_fifo_mem[fifo_rd_ptr][AddrHi-2];
+  assign fifo_rd_data   = cap_fifo_mem[fifo_rd_ptr][31:0];
+  assign fifo_rd_tag    = cap_fifo_mem[fifo_rd_ptr][32];
+  assign fifo_rd_err    = cap_fifo_mem[fifo_rd_ptr][33];
+  assign fifo_rd_shdw   = shdw_fifo_mem[fifo_rd_ptr];
+
+  // only issue invalidation store requests if
+  //   valid cap returned && no write collision on the address && shadow_bit == 1 
+  assign store_req_valid = fifo_not_empty & fifo_rd_tag & fifo_rd_shdw & fifo_rd_valid & ~fifo_rd_err;
+
+  assign fifo_not_empty = (req_fifo_ext_wr_ptr  != fifo_ext_rd_ptr) && 
+                          (cap_fifo_ext_wr_ptr  != fifo_ext_rd_ptr) &&
+                          (shdw_fifo_ext_wr_ptr != fifo_ext_rd_ptr);
+
+  assign fifo_rd_en = fifo_not_empty & (((tbre_sch_q == SCH_STORE) & store_gnt) | ~store_req_valid);
+
+  assign req_fifo_wr_en   = (tbre_sch_q == SCH_LOAD) & load_gnt;
+  assign req_fifo_wr_data = {1'b1, cur_load_addr8};
+
+  assign cap_fifo_wr_en   = lsu_tbre_resp_valid_i & ~lsu_tbre_resp_is_wr_i;
+  assign cap_fifo_wr_data = {lsu_tbre_resp_err_i, lsu_tbre_raw_lsw_i};
+
+  assign shdw_fifo_wr_en   = trvk_en_i;
+  assign shdw_fifo_wr_data = trvk_clrtag_i;
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheri_tbre_wrapper.sv b/hw/ip/cheriot-ibex/rtl/cheri_tbre_wrapper.sv
new file mode 100644
index 0000000..de1693f
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheri_tbre_wrapper.sv
@@ -0,0 +1,248 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+
+// Wrapper for the TBRE engine (cheri_tbre) and the stack fast-clearing engine (cheri_stkz).
+// Instantiates or ties off each engine and arbitrates their LSU requests onto one shared port
+// with strict priority: stkz (master 0) wins over tbre (master 1).
+module cheri_tbre_wrapper import cheri_pkg::*; #(
+  parameter bit          CHERIoTEn   = 1'b1,
+  parameter bit          CheriTBRE   = 1'b1,
+  parameter bit          CheriStkZ   = 1'b1,
+  parameter bit          StkZIntrOK  = 1'b0,
+  parameter int unsigned MMRegDinW   = 128,
+  parameter int unsigned MMRegDoutW  = 64
+) (
+  // Clock and Reset
+  input  logic          clk_i,
+  input  logic          rst_ni,
+
+  // MMIO register interface
+  input  logic [MMRegDinW-1:0]  mmreg_corein_i,
+  output logic [MMRegDoutW-1:0] mmreg_coreout_o,
+
+  // LSU req/resp interface (to be multiplexed/qualified)
+  input  logic          lsu_tbre_resp_valid_i,
+  input  logic          lsu_tbre_resp_err_i,
+  input  logic          lsu_tbre_resp_is_wr_i,
+  input  logic [32:0]   lsu_tbre_raw_lsw_i,
+  input  logic          lsu_tbre_req_done_i,
+  input  logic          lsu_tbre_addr_incr_i,
+  input  logic          lsu_tbre_sel_i,
+  output logic          tbre_lsu_req_o,
+  output logic          tbre_lsu_is_cap_o,
+  output logic          tbre_lsu_we_o,
+  output logic [31:0]   tbre_lsu_addr_o,
+  output logic [32:0]   tbre_lsu_wdata_o,
+
+  // LSU snoop interface
+  input  logic          snoop_lsu_req_done_i,
+  input  logic          snoop_lsu_req_i,
+  input  logic          snoop_lsu_is_cap_i,
+  input  logic          snoop_lsu_we_i,
+  input  logic          snoop_lsu_cheri_err_i,
+  input  logic [31:0]   snoop_lsu_addr_i,
+
+  // trvk interface
+  input  logic          trvk_en_i,
+  input  logic          trvk_clrtag_i,
+
+  // Stack fast-clearing signals
+  input  logic          ztop_wr_i,
+  input  logic [31:0]   ztop_wdata_i,
+  input  full_cap_t     ztop_wfcap_i,
+  output logic [31:0]   ztop_rdata_o,
+  output reg_cap_t      ztop_rcap_o,
+
+  input  logic          unmasked_intr_i,
+
+  output logic          stkz_active_o,
+  output logic          stkz_abort_o,
+  output logic [31:0]   stkz_ptr_o,
+  output logic [31:0]   stkz_base_o
+);
+
+  localparam int unsigned nMSTR = 2;   // number of LSU masters: [0] = stkz (blk0), [1] = tbre (blk1)
+
+  logic          lsu_blk1_resp_valid;
+  logic          lsu_blk1_req_done;
+  logic          blk1_lsu_req;
+  logic          blk1_lsu_is_cap;
+  logic          blk1_lsu_we;
+  logic [31:0]   blk1_lsu_addr;
+  logic [32:0]   blk1_lsu_wdata;
+
+  logic          lsu_blk0_resp_valid;
+  logic          lsu_blk0_req_done;
+  logic          blk0_lsu_req;
+  logic          blk0_lsu_is_cap;
+  logic          blk0_lsu_we;
+  logic [31:0]   blk0_lsu_addr;
+  logic [32:0]   blk0_lsu_wdata;
+
+  logic          tbre_stat, tbre_err, stkz_err;
+  // Status word: bit0 tbre_stat, bit1 tbre_err, bit4 stkz_active_o, bit5 stkz_err, others zero.
+  assign mmreg_coreout_o = {{(MMRegDoutW-10){1'b0}}, 2'b00, 2'b00, stkz_err, stkz_active_o,
+                                                    2'b00,  tbre_err, tbre_stat};
+
+  if (CHERIoTEn & CheriTBRE) begin : g_tbre
+    logic [65:0] tbre_ctrl_vec;
+
+    assign tbre_ctrl_vec      = mmreg_corein_i[65:0];   // requires MMRegDinW >= 66
+
+    cheri_tbre #(
+      .FifoSize (4),
+      .AddrHi   (23)
+    ) cheri_tbre_i (
+     // Clock and Reset
+      .clk_i                   (clk_i),
+      .rst_ni                  (rst_ni),
+      .tbre_ctrl_vec_i         (tbre_ctrl_vec),
+      .tbre_stat_o             (tbre_stat),
+      .tbre_err_o              (tbre_err),
+      .lsu_tbre_resp_valid_i   (lsu_blk1_resp_valid),
+      .lsu_tbre_resp_err_i     (lsu_tbre_resp_err_i),
+      .lsu_tbre_resp_is_wr_i   (lsu_tbre_resp_is_wr_i),
+      .lsu_tbre_raw_lsw_i      (lsu_tbre_raw_lsw_i),
+      .lsu_tbre_req_done_i     (lsu_blk1_req_done),
+      .lsu_tbre_addr_incr_i    (lsu_tbre_addr_incr_i),
+      .tbre_lsu_req_o          (blk1_lsu_req),
+      .tbre_lsu_is_cap_o       (blk1_lsu_is_cap),
+      .tbre_lsu_we_o           (blk1_lsu_we),
+      .tbre_lsu_addr_o         (blk1_lsu_addr),
+      .tbre_lsu_wdata_o        (blk1_lsu_wdata),
+      .snoop_lsu_req_done_i    (snoop_lsu_req_done_i),
+      .snoop_lsu_req_i         (snoop_lsu_req_i),
+      .snoop_lsu_is_cap_i      (snoop_lsu_is_cap_i),
+      .snoop_lsu_we_i          (snoop_lsu_we_i),
+      .snoop_lsu_cheri_err_i   (snoop_lsu_cheri_err_i),
+      .snoop_lsu_addr_i        (snoop_lsu_addr_i),
+      .trvk_en_i               (trvk_en_i),
+      .trvk_clrtag_i           (trvk_clrtag_i)
+      );
+  end else begin : g_no_tbre
+    assign tbre_stat       = 1'b0;
+    assign tbre_err        = 1'b0;
+    assign blk1_lsu_req    = 1'b0;
+    assign blk1_lsu_is_cap = 1'b0;
+    assign blk1_lsu_we     = 1'b0;
+    assign blk1_lsu_addr   = 32'h0;
+    assign blk1_lsu_wdata  = 33'h0;
+  end
+
+  if (CHERIoTEn & CheriStkZ) begin : g_stkz
+    logic unmasked_intr;
+    assign unmasked_intr = StkZIntrOK & unmasked_intr_i;  // interrupt is masked off unless StkZIntrOK
+
+    cheri_stkz cheri_stkz_i (
+      .clk_i                  (clk_i             ),
+      .rst_ni                 (rst_ni            ),
+      .ztop_wr_i              (ztop_wr_i),
+      .ztop_wdata_i           (ztop_wdata_i),
+      .ztop_wfcap_i           (ztop_wfcap_i),
+      .ztop_rdata_o           (ztop_rdata_o),
+      .ztop_rcap_o            (ztop_rcap_o),
+      .unmasked_intr_i        (unmasked_intr    ),
+      .stkz_active_o          (stkz_active_o        ),
+      .stkz_abort_o           (stkz_abort_o         ),
+      .stkz_ptr_o             (stkz_ptr_o           ),
+      .stkz_base_o            (stkz_base_o          ),
+      .stkz_err_o             (stkz_err             ),
+      .lsu_stkz_resp_valid_i  (lsu_blk0_resp_valid    ),
+      .lsu_stkz_resp_err_i    (lsu_tbre_resp_err_i    ),
+      .lsu_stkz_req_done_i    (lsu_blk0_req_done      ),
+      .stkz_lsu_req_o         (blk0_lsu_req           ),
+      .stkz_lsu_we_o          (blk0_lsu_we            ),
+      .stkz_lsu_is_cap_o      (blk0_lsu_is_cap        ),
+      .stkz_lsu_addr_o        (blk0_lsu_addr          ),
+      .stkz_lsu_wdata_o       (blk0_lsu_wdata         )
+      );
+
+  end else begin : g_no_stkz
+    assign stkz_active_o = 1'b0;
+    assign stkz_abort_o  = 1'b0;
+    assign stkz_ptr_o    = 32'h3;     // use this to flag stkz feature doesn't exist
+    assign stkz_base_o   = 32'h0;
+    assign stkz_err      = 1'b0;
+
+    assign ztop_rcap_o  = NULL_REG_CAP;
+    assign ztop_rdata_o = 32'h0000_aa55;
+
+    assign blk0_lsu_req    = 1'b0;
+    assign blk0_lsu_is_cap = 1'b0;
+    assign blk0_lsu_we     = 1'b0;
+    assign blk0_lsu_addr   = 32'h0;
+    assign blk0_lsu_wdata  = 33'h0;
+  end
+
+  //
+  // Arbitration for LSU interface between tbre and stkz engines
+  //  reuse the obimux logic
+  //
+  logic [nMSTR-1:0] mstr_arbit, mstr_arbit_q, mstr_arbit_comb;
+  logic [nMSTR-1:0] mstr_req;
+  logic             req_pending, req_pending_q;
+  logic             slv_req, slv_gnt;
+
+  assign slv_req = |mstr_req;
+
+  // arbitration by strict priority assignment - mstr_req[0] == highest priority
+  for (genvar i = 0; i < nMSTR; i++) begin : g_mstr_arbit
+    logic [7:0] pri_mask;
+    assign pri_mask = 8'hff >> (8-i);      // ones in bits [i-1:0] (higher-priority masters); max 8
+    assign mstr_arbit[i] = mstr_req[i] & ~(|(mstr_req & pri_mask[nMSTR-1:0]));
+  end
+
+  // Handling delayed-gnt case.
+  // Make the next arbitration decision immediately if any master_req is active.
+  // If slv_gnt doesn't happen in the same cycle, register the decision until
+  // slv_gnt so that the address/wdata/ctrl can be held steady when presenting
+  // the next request to the slave.
+  // Corner case:
+  // -- adding the lsu_tbre_sel term to req_pending (allows the arbitration to
+  //    change when LSU is handling CPU requests).
+  //    This is needed since TBRE could cancel write requests in the case of
+  //    a pipeline hazard (cpu write to the same location TBRE is working on)
+
+  assign mstr_arbit_comb = req_pending_q ? mstr_arbit_q : mstr_arbit;
+  assign req_pending = |mstr_req & ~slv_gnt & ~req_pending_q & lsu_tbre_sel_i;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      req_pending_q  <= 1'b0;
+      mstr_arbit_q   <= '0;
+    end else begin
+      if (slv_gnt) req_pending_q <= 1'b0;
+      else if (req_pending) req_pending_q <= 1'b1;
+      if (req_pending) mstr_arbit_q <= mstr_arbit;   // latch the decision when it starts pending
+    end
+  end
+
+  // muxing the outgoing control signals
+  assign slv_gnt  = lsu_tbre_req_done_i;
+  assign mstr_req = {blk1_lsu_req,  blk0_lsu_req};
+
+  assign tbre_lsu_req_o     = slv_req;
+  assign tbre_lsu_is_cap_o  = mstr_arbit_comb[1] ? blk1_lsu_is_cap : blk0_lsu_is_cap;
+  assign tbre_lsu_we_o      = mstr_arbit_comb[1] ? blk1_lsu_we : blk0_lsu_we;
+  assign tbre_lsu_addr_o    = mstr_arbit_comb[1] ? blk1_lsu_addr : blk0_lsu_addr;
+  assign tbre_lsu_wdata_o   = mstr_arbit_comb[1] ? blk1_lsu_wdata : blk0_lsu_wdata;
+
+  assign lsu_blk1_req_done  = mstr_arbit_comb[1] & lsu_tbre_req_done_i;
+  assign lsu_blk0_req_done  = mstr_arbit_comb[0] & lsu_tbre_req_done_i;
+
+  // Record which master's request was accepted last so the response is routed back to it.
+  logic resp_sel_q;
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      resp_sel_q <= 1'b0;
+    end else if (lsu_tbre_req_done_i) begin
+      resp_sel_q <= (mstr_arbit_comb[1]);
+    end
+  end
+
+  assign lsu_blk0_resp_valid = ~resp_sel_q & lsu_tbre_resp_valid_i;
+  assign lsu_blk1_resp_valid =  resp_sel_q & lsu_tbre_resp_valid_i;
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheri_trvk_stage.sv b/hw/ip/cheriot-ibex/rtl/cheri_trvk_stage.sv
new file mode 100644
index 0000000..b22ce70
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheri_trvk_stage.sv
@@ -0,0 +1,131 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Three-stage pipeline that looks up the TSMAP bit for a capability loaded by the CPU or TBRE
+// and, when the bit is set, requests that the tag be cleared (rf_trvk_* / tbre_trvk_* outputs).
+module cheri_trvk_stage #(
+  parameter int unsigned HeapBase  = 32'h2001_0000,
+  parameter int unsigned TSMapSize = 1024
+) (
+  // Clock and Reset
+  input  logic                clk_i,
+  input  logic                rst_ni,
+
+  input  logic                rf_trsv_en_i,
+  input  logic [4:0]          rf_trsv_addr_i,
+
+  // from LSU
+  input  logic                lsu_resp_valid_i,
+  input  logic                lsu_load_err_i,
+  input  logic [31:0]         rf_wdata_lsu_i,
+  input  cheri_pkg::reg_cap_t rf_wcap_lsu_i,
+
+  input  logic                lsu_resp_is_wr_i,
+
+  input  logic                lsu_tbre_resp_valid_i,
+  input  logic                lsu_tbre_resp_err_i,
+
+  output logic [4:0]          rf_trvk_addr_o,
+  output logic                rf_trvk_en_o,
+  output logic                rf_trvk_clrtag_o,
+
+  output logic                tbre_trvk_en_o,
+  output logic                tbre_trvk_clrtag_o,
+
+  output logic                tsmap_cs_o,
+  output logic [15:0]         tsmap_addr_o,
+  input  logic [31:0]         tsmap_rdata_i
+);
+
+  import cheri_pkg::*;
+
+  reg_cap_t    in_cap_q;
+  logic [31:0] in_data_q;
+
+  logic        cpu_op_active;
+  logic [2:0]  cpu_op_valid_q, tbre_op_valid_q, cap_good_q;
+  logic        cpu_op_valid, tbre_op_valid, cap_good;
+  logic [4:0]  trsv_addr;
+  logic [4:0]  trsv_addr_q[2:0];
+  logic        trvk_status;
+
+  logic [31:0] base32;
+  logic [31:0] tsmap_ptr;
+  logic  [4:0] bitpos_q; // bit index in a 32-bit word
+  logic        range_ok;
+  logic  [2:1] range_ok_q;
+
+  assign base32    = get_bound33(in_cap_q.base, {2{in_cap_q.base_cor}}, in_cap_q.exp, in_data_q);
+  assign tsmap_ptr = (base32 - HeapBase) >> 3;   // index in 8-byte units relative to HeapBase
+  assign tsmap_addr_o  = {5'h0, tsmap_ptr[15:5]};  // 32-bit word index, explicitly zero-extended
+
+  // not a sealing cap and pointing to valid TSMAP range
+  // NOTE(review): '<=' also admits word index == TSMapSize; confirm against the TSMAP depth
+  assign range_ok      = (tsmap_ptr[31:5] <= TSMapSize) &&
+                         ~((in_cap_q.cperms[4:3]==2'b00) && (|in_cap_q.cperms[2:0]));
+  assign tsmap_cs_o    = (cpu_op_valid_q[0] | tbre_op_valid_q[0]) & cap_good_q[0];
+
+  assign rf_trvk_en_o     =  cpu_op_valid_q[2];
+  assign rf_trvk_clrtag_o =  trvk_status & cap_good_q[2] & range_ok_q[2];
+  assign rf_trvk_addr_o   =  trsv_addr_q[2];
+
+  assign tbre_trvk_en_o      = tbre_op_valid_q[2];
+  assign tbre_trvk_clrtag_o  = trvk_status & cap_good_q[2] & range_ok_q[2];
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      cpu_op_active <= 1'b0;
+      trsv_addr  <= 5'h0;
+    end else begin
+      if (rf_trsv_en_i) cpu_op_active <= 1'b1;          // set when rf_trsv_en_i is asserted
+      else if (lsu_resp_valid_i) cpu_op_active <= 1'b0; // cleared by the next LSU response
+
+      if (rf_trsv_en_i)  trsv_addr <= rf_trsv_addr_i;
+    end
+  end
+
+  assign cpu_op_valid  =  cpu_op_active & lsu_resp_valid_i;    // CPU op only active when Load cap
+  assign tbre_op_valid =  lsu_tbre_resp_valid_i & ~lsu_resp_is_wr_i;    // TBRE Load
+  assign cap_good      =  (cpu_op_active & lsu_resp_valid_i & ~lsu_load_err_i & rf_wcap_lsu_i.valid) |
+                          (lsu_tbre_resp_valid_i & ~lsu_tbre_resp_err_i &  rf_wcap_lsu_i.valid);
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      cpu_op_valid_q  <= '0;
+      tbre_op_valid_q <= '0;
+      cap_good_q      <= '0;
+      in_cap_q        <= NULL_REG_CAP;
+      in_data_q       <= 32'h0;
+      bitpos_q        <= '0;
+      trvk_status     <= 1'b0;
+      range_ok_q      <= '0;
+      trsv_addr_q[0]  <= 5'b0;
+      trsv_addr_q[1]  <= 5'b0;
+      trsv_addr_q[2]  <= 5'b0;
+    end else begin
+      // control signal per stage
+      cpu_op_valid_q  <= {cpu_op_valid_q[1:0], cpu_op_valid};
+      tbre_op_valid_q <= {tbre_op_valid_q[1:0], tbre_op_valid};
+      cap_good_q      <= {cap_good_q[1:0], cap_good};
+      trsv_addr_q[0]  <= trsv_addr;
+      trsv_addr_q[1]  <= trsv_addr_q[0];
+      trsv_addr_q[2]  <= trsv_addr_q[1];
+
+      // stage 0 status: register loaded cap
+      if ((cpu_op_valid & ~lsu_load_err_i) | (tbre_op_valid & ~lsu_tbre_resp_err_i)) begin
+        in_cap_q    <= rf_wcap_lsu_i;
+        in_data_q   <= rf_wdata_lsu_i;
+      end
+
+      // stage 1 status:
+      bitpos_q      <= tsmap_ptr[4:0];
+      range_ok_q[1] <= range_ok;
+
+      // stage 2: index map data
+      range_ok_q[2] <= range_ok_q[1];
+      trvk_status   <= tsmap_rdata_i[bitpos_q];
+    end
+  end
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_alu.sv b/hw/ip/cheriot-ibex/rtl/cheriot_alu.sv
new file mode 100644
index 0000000..32d2fe7
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_alu.sv
@@ -0,0 +1,1400 @@
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Arithmetic logic unit
+ */
+module cheriot_alu #(
+  parameter cheriot_pkg::rv32b_e RV32B = cheriot_pkg::RV32BNone
+) (
+  input  cheriot_pkg::alu_op_e operator_i,
+  input  logic [31:0]       operand_a_i,
+  input  logic [31:0]       operand_b_i,
+
+  input  logic              instr_first_cycle_i,
+
+  input  logic [32:0]       multdiv_operand_a_i,
+  input  logic [32:0]       multdiv_operand_b_i,
+
+  input  logic              multdiv_sel_i,
+
+  input  logic [31:0]       imd_val_q_i[2],
+  output logic [31:0]       imd_val_d_o[2],
+  output logic [1:0]        imd_val_we_o,
+
+  output logic [31:0]       adder_result_o,
+  output logic [33:0]       adder_result_ext_o,
+
+  output logic [31:0]       result_o,
+  output logic              comparison_result_o,
+  output logic              is_equal_result_o
+);
+  import cheriot_pkg::*;
+
+  logic [31:0] operand_a_rev;
+  logic [32:0] operand_b_neg;
+
+  // bit reverse operand_a for left shifts and bit counting
+  for (genvar k = 0; k < 32; k++) begin : gen_rev_operand_a
+    assign operand_a_rev[k] = operand_a_i[31-k];
+  end
+
+  ///////////
+  // Adder //
+  ///////////
+
+  logic        adder_op_a_shift1;
+  logic        adder_op_a_shift2;
+  logic        adder_op_a_shift3;
+  logic        adder_op_b_negate;
+  logic [32:0] adder_in_a, adder_in_b;
+  logic [31:0] adder_result;
+
+  always_comb begin
+    adder_op_a_shift1 = 1'b0;
+    adder_op_a_shift2 = 1'b0;
+    adder_op_a_shift3 = 1'b0;
+    adder_op_b_negate = 1'b0;
+    unique case (operator_i)
+      // Adder OPs
+      ALU_SUB,
+
+      // Comparator OPs
+      ALU_EQ,   ALU_NE,
+      ALU_GE,   ALU_GEU,
+      ALU_LT,   ALU_LTU,
+      ALU_SLT,  ALU_SLTU,
+
+      // MinMax OPs (RV32B Ops)
+      ALU_MIN,  ALU_MINU,
+      ALU_MAX,  ALU_MAXU: adder_op_b_negate = 1'b1;
+
+      // Address Calculation OPs (RV32B Ops)
+      ALU_SH1ADD: if (RV32B != RV32BNone) adder_op_a_shift1 = 1'b1;
+      ALU_SH2ADD: if (RV32B != RV32BNone) adder_op_a_shift2 = 1'b1;
+      ALU_SH3ADD: if (RV32B != RV32BNone) adder_op_a_shift3 = 1'b1;
+
+      default:;
+    endcase
+  end
+
+  // prepare operand a
+  always_comb begin
+    unique case(1'b1)
+      multdiv_sel_i:     adder_in_a = multdiv_operand_a_i;
+      adder_op_a_shift1: adder_in_a = {operand_a_i[30:0],2'b01};
+      adder_op_a_shift2: adder_in_a = {operand_a_i[29:0],3'b001};
+      adder_op_a_shift3: adder_in_a = {operand_a_i[28:0],4'b0001};
+      default:           adder_in_a = {operand_a_i,1'b1};
+    endcase
+  end
+
+  // prepare operand b
+  assign operand_b_neg = {operand_b_i,1'b0} ^ {33{1'b1}};
+  always_comb begin
+    unique case (1'b1)
+      multdiv_sel_i:     adder_in_b = multdiv_operand_b_i;
+      adder_op_b_negate: adder_in_b = operand_b_neg;
+      default:           adder_in_b = {operand_b_i, 1'b0};
+    endcase
+  end
+
+  // actual adder
+  assign adder_result_ext_o = $unsigned(adder_in_a) + $unsigned(adder_in_b);
+
+  assign adder_result       = adder_result_ext_o[32:1];
+
+  assign adder_result_o     = adder_result;
+
+  ////////////////
+  // Comparison //
+  ////////////////
+
+  logic is_equal;
+  logic is_greater_equal;  // handles both signed and unsigned forms
+  logic cmp_signed;
+
+  always_comb begin
+    unique case (operator_i)
+      ALU_GE,
+      ALU_LT,
+      ALU_SLT,
+      // RV32B only
+      ALU_MIN,
+      ALU_MAX: cmp_signed = 1'b1;
+
+      default: cmp_signed = 1'b0;
+    endcase
+  end
+
+  assign is_equal = (adder_result == 32'b0);
+  assign is_equal_result_o = is_equal;
+
+  // Is greater equal
+  always_comb begin
+    if ((operand_a_i[31] ^ operand_b_i[31]) == 1'b0) begin
+      is_greater_equal = (adder_result[31] == 1'b0);
+    end else begin
+      is_greater_equal = operand_a_i[31] ^ (cmp_signed);
+    end
+  end
+
+  // GTE unsigned:
+  // (a[31] == 1 && b[31] == 1) => adder_result[31] == 0
+  // (a[31] == 0 && b[31] == 0) => adder_result[31] == 0
+  // (a[31] == 1 && b[31] == 0) => 1
+  // (a[31] == 0 && b[31] == 1) => 0
+
+  // GTE signed:
+  // (a[31] == 1 && b[31] == 1) => adder_result[31] == 0
+  // (a[31] == 0 && b[31] == 0) => adder_result[31] == 0
+  // (a[31] == 1 && b[31] == 0) => 0
+  // (a[31] == 0 && b[31] == 1) => 1
+
+  // generate comparison result
+  logic cmp_result;
+
+  always_comb begin
+    unique case (operator_i)
+      ALU_EQ:             cmp_result =  is_equal;
+      ALU_NE:             cmp_result = ~is_equal;
+      ALU_GE,   ALU_GEU,
+      ALU_MAX,  ALU_MAXU: cmp_result = is_greater_equal; // RV32B only
+      ALU_LT,   ALU_LTU,
+      ALU_MIN,  ALU_MINU, //RV32B only
+      ALU_SLT,  ALU_SLTU: cmp_result = ~is_greater_equal;
+
+      default: cmp_result = is_equal;
+    endcase
+  end
+
+  assign comparison_result_o = cmp_result;
+
+  ///////////
+  // Shift //
+  ///////////
+
+  // The shifter structure consists of a 33-bit shifter: 32-bit operand + 1 bit extension for
+  // arithmetic shifts and one-shift support.
+  // Rotations and funnel shifts are implemented as multi-cycle instructions.
+  // The shifter is also used for single-bit instructions and bit-field place as detailed below.
+  //
+  // Standard Shifts
+  // ===============
+  // For standard shift instructions, the direction of the shift is to the right by default. For
+  // left shifts, the signal shift_left signal is set. If so, the operand is initially reversed,
+  // shifted to the right by the specified amount and shifted back again. For arithmetic- and
+  // one-shifts the 33rd bit of the shifter operand is set accordingly.
+  //
+  // Multicycle Shifts
+  // =================
+  //
+  // Rotation
+  // --------
+  // For rotations, the operand signals operand_a_i and operand_b_i are kept constant to rs1 and
+  // rs2 respectively.
+  //
+  // Rotation pseudocode:
+  //   shift_amt = rs2 & 31;
+  //   multicycle_result = (rs1 >> shift_amt) | (rs1 << (32 - shift_amt));
+  //                       ^-- cycle 0 -----^ ^-- cycle 1 --------------^
+  //
+  // Funnel Shifts
+  // -------------
+  // For funnel shifts, operand_a_i is tied to rs1 in the first cycle and rs3 in the
+  // second cycle. operand_b_i is always tied to rs2. The order of applying the shift amount or
+  // its complement is determined by bit [5] of shift_amt.
+  //
+  // Funnel shift Pseudocode: (fsl)
+  //  shift_amt = rs2 & 63;
+  //  shift_amt_compl = 32 - shift_amt[4:0]
+  //  if (shift_amt >=33):
+  //     multicycle_result = (rs1 >> shift_amt_compl[4:0]) | (rs3 << shift_amt[4:0]);
+  //                         ^-- cycle 0 ----------------^ ^-- cycle 1 ------------^
+  //  else if (shift_amt <= 31 && shift_amt > 0):
+  //     multicycle_result = (rs1 << shift_amt[4:0]) | (rs3 >> shift_amt_compl[4:0]);
+  //                         ^-- cycle 0 ----------^ ^-- cycle 1 -------------------^
+  //  For shift_amt == 0, 32, both shift_amt[4:0] and shift_amt_compl[4:0] == '0.
+  //  these cases need to be handled separately outside the shifting structure:
+  //  else if (shift_amt == 32):
+  //     multicycle_result = rs3
+  //  else if (shift_amt == 0):
+  //     multicycle_result = rs1.
+  //
+  // Single-Bit Instructions
+  // =======================
+  // Single bit instructions operate on bit operand_b_i[4:0] of operand_a_i.
+
+  // The operations bset, bclr and binv are implemented by generation of a bit-mask using the
+  // shifter structure. This is done by left-shifting the operand 32'h1 by the required amount.
+  // The signal shift_sbmode multiplexes the shifter input and sets the signal shift_left.
+  // Further processing is taken care of by a separate structure.
+  //
+  // For bext, the bit defined by operand_b_i[4:0] is to be returned. This is done by simply
+  // shifting operand_a_i to the right by the required amount and returning bit [0] of the result.
+  //
+  // Bit-Field Place
+  // ===============
+  // The shifter structure is shared to compute bfp_mask << bfp_off.
+
+  logic       shift_left;
+  logic       shift_ones;
+  logic       shift_arith;
+  logic       shift_funnel;
+  logic       shift_sbmode;
+  logic [5:0] shift_amt;
+  logic [5:0] shift_amt_compl; // complementary shift amount (32 - shift_amt)
+
+  logic        [31:0] shift_operand;
+  logic signed [32:0] shift_result_ext_signed;
+  logic        [32:0] shift_result_ext;
+  logic               unused_shift_result_ext;
+  logic        [31:0] shift_result;
+  logic        [31:0] shift_result_rev;
+
+  // zbf
+  logic bfp_op;
+  logic [4:0]  bfp_len;
+  logic [4:0]  bfp_off;
+  logic [31:0] bfp_mask;
+  logic [31:0] bfp_mask_rev;
+  logic [31:0] bfp_result;
+
+  // bfp: shares the shifter structure to compute bfp_mask << bfp_off
+  assign bfp_op = (RV32B != RV32BNone) ? (operator_i == ALU_BFP) : 1'b0;
+  assign bfp_len = {~(|operand_b_i[27:24]), operand_b_i[27:24]}; // len = 0 encodes for len = 16
+  assign bfp_off = operand_b_i[20:16];
+  assign bfp_mask = (RV32B != RV32BNone) ? ~(32'hffff_ffff << bfp_len) : '0;
+  for (genvar i = 0; i < 32; i++) begin : gen_rev_bfp_mask
+    assign bfp_mask_rev[i] = bfp_mask[31-i];
+  end
+
+  assign bfp_result =(RV32B != RV32BNone) ?
+      (~shift_result & operand_a_i) | ((operand_b_i & bfp_mask) << bfp_off) : '0;
+
+  // bit shift_amt[5]: word swap bit: only considered for FSL/FSR.
+  // if set, reverse operations in first and second cycle.
+  assign shift_amt[5] = operand_b_i[5] & shift_funnel;
+  assign shift_amt_compl = 32 - operand_b_i[4:0];
+
+  always_comb begin
+    if (bfp_op) begin
+      shift_amt[4:0] = bfp_off;  // length field of bfp control word
+    end else begin
+      shift_amt[4:0] = instr_first_cycle_i ?
+          (operand_b_i[5] && shift_funnel ? shift_amt_compl[4:0] : operand_b_i[4:0]) :
+          (operand_b_i[5] && shift_funnel ? operand_b_i[4:0] : shift_amt_compl[4:0]);
+    end
+  end
+
+  // single-bit mode: shift
+  assign shift_sbmode = (RV32B != RV32BNone) ?
+      (operator_i == ALU_BSET) | (operator_i == ALU_BCLR) | (operator_i == ALU_BINV) : 1'b0;
+
+  // left shift if this is:
+  // * a standard left shift (slo, sll)
+  // * a rol in the first cycle
+  // * a ror in the second cycle
+  // * fsl: without word-swap bit: first cycle, else: second cycle
+  // * fsr: without word-swap bit: second cycle, else: first cycle
+  // * a single-bit instruction: bclr, bset, binv (excluding bext)
+  // * bfp: bfp_mask << bfp_off
+  always_comb begin
+    unique case (operator_i)
+      ALU_SLL: shift_left = 1'b1;
+      ALU_SLO: shift_left = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? 1'b1 : 1'b0;
+      ALU_BFP: shift_left = (RV32B != RV32BNone) ? 1'b1 : 1'b0;
+      ALU_ROL: shift_left = (RV32B != RV32BNone) ? instr_first_cycle_i : 0;
+      ALU_ROR: shift_left = (RV32B != RV32BNone) ? ~instr_first_cycle_i : 0;
+      ALU_FSL: shift_left = (RV32B != RV32BNone) ?
+        (shift_amt[5] ? ~instr_first_cycle_i : instr_first_cycle_i) : 1'b0;
+      ALU_FSR: shift_left = (RV32B != RV32BNone) ?
+          (shift_amt[5] ? instr_first_cycle_i : ~instr_first_cycle_i) : 1'b0;
+      default: shift_left = 1'b0;
+    endcase
+    if (shift_sbmode) begin
+      shift_left = 1'b1;
+    end
+  end
+
+  assign shift_arith  = (operator_i == ALU_SRA);
+  assign shift_ones   = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ?
+      (operator_i == ALU_SLO) | (operator_i == ALU_SRO) : 1'b0;
+  assign shift_funnel = (RV32B != RV32BNone) ?
+      (operator_i == ALU_FSL) | (operator_i == ALU_FSR) : 1'b0;
+
+  // shifter structure.
+  always_comb begin
+    // select shifter input
+    // for bfp, sbmode and shift_left the corresponding bit-reversed input is chosen.
+    if (RV32B == RV32BNone) begin
+      shift_operand = shift_left ? operand_a_rev : operand_a_i;
+    end else begin
+      unique case (1'b1)
+        bfp_op:       shift_operand = bfp_mask_rev;
+        shift_sbmode: shift_operand = 32'h8000_0000;
+        default:      shift_operand = shift_left ? operand_a_rev : operand_a_i;
+      endcase
+    end
+
+    shift_result_ext_signed =
+        $signed({shift_ones | (shift_arith & shift_operand[31]), shift_operand}) >>> shift_amt[4:0];
+    shift_result_ext = $unsigned(shift_result_ext_signed);
+
+    shift_result            = shift_result_ext[31:0];
+    unused_shift_result_ext = shift_result_ext[32];
+
+    for (int unsigned i = 0; i < 32; i++) begin
+      shift_result_rev[i] = shift_result[31-i];
+    end
+
+    shift_result = shift_left ? shift_result_rev : shift_result;
+
+  end
+
+  ///////////////////
+  // Bitwise Logic //
+  ///////////////////
+
+  logic bwlogic_or;
+  logic bwlogic_and;
+  logic [31:0] bwlogic_operand_b;
+  logic [31:0] bwlogic_or_result;
+  logic [31:0] bwlogic_and_result;
+  logic [31:0] bwlogic_xor_result;
+  logic [31:0] bwlogic_result;
+
+  logic bwlogic_op_b_negate;
+
+  always_comb begin
+    unique case (operator_i)
+      // Logic-with-negate OPs (RV32B Ops)
+      ALU_XNOR,
+      ALU_ORN,
+      ALU_ANDN: bwlogic_op_b_negate = (RV32B != RV32BNone) ? 1'b1 : 1'b0;
+      ALU_CMIX: bwlogic_op_b_negate = (RV32B != RV32BNone) ? ~instr_first_cycle_i : 1'b0;
+      default:  bwlogic_op_b_negate = 1'b0;
+    endcase
+  end
+
+  assign bwlogic_operand_b = bwlogic_op_b_negate ? operand_b_neg[32:1] : operand_b_i;
+
+  assign bwlogic_or_result  = operand_a_i | bwlogic_operand_b;
+  assign bwlogic_and_result = operand_a_i & bwlogic_operand_b;
+  assign bwlogic_xor_result = operand_a_i ^ bwlogic_operand_b;
+
+  assign bwlogic_or  = (operator_i == ALU_OR)  | (operator_i == ALU_ORN);
+  assign bwlogic_and = (operator_i == ALU_AND) | (operator_i == ALU_ANDN);
+
+  always_comb begin
+    unique case (1'b1)
+      bwlogic_or:  bwlogic_result = bwlogic_or_result;
+      bwlogic_and: bwlogic_result = bwlogic_and_result;
+      default:     bwlogic_result = bwlogic_xor_result;
+    endcase
+  end
+
+  logic [5:0]  bitcnt_result;
+  logic [31:0] minmax_result;
+  logic [31:0] pack_result;
+  logic [31:0] sext_result;
+  logic [31:0] singlebit_result;
+  logic [31:0] rev_result;
+  logic [31:0] shuffle_result;
+  logic [31:0] xperm_result;
+  logic [31:0] butterfly_result;
+  logic [31:0] invbutterfly_result;
+  logic [31:0] clmul_result;
+  logic [31:0] multicycle_result;
+
+  if (RV32B != RV32BNone) begin : g_alu_rvb
+
+    /////////////////
+    // Bitcounting //
+    /////////////////
+
+    // The bit-counter structure computes the number of set bits in its operand. Partial results
+    // (from left to right) are needed to compute the control masks for computation of
+    // bcompress/bdecompress by the butterfly network, if implemented.
+    // For cpop, clz and ctz, only the end result is used.
+
+    logic        zbe_op;
+    logic        bitcnt_ctz;
+    logic        bitcnt_clz;
+    logic        bitcnt_cz;
+    logic [31:0] bitcnt_bits;
+    logic [31:0] bitcnt_mask_op;
+    logic [31:0] bitcnt_bit_mask;
+    logic [ 5:0] bitcnt_partial [32];
+    logic [31:0] bitcnt_partial_lsb_d;
+    logic [31:0] bitcnt_partial_msb_d;
+
+
+    assign bitcnt_ctz    = operator_i == ALU_CTZ;
+    assign bitcnt_clz    = operator_i == ALU_CLZ;
+    assign bitcnt_cz     = bitcnt_ctz | bitcnt_clz;
+    assign bitcnt_result = bitcnt_partial[31];
+
+    // Bit-mask generation for clz and ctz:
+    // The bit mask is generated by spreading the lowest-order set bit in the operand to all
+    // higher order bits. The resulting mask is inverted to cover the lowest order zeros. In order
+    // to create the bit mask for leading zeros, the input operand needs to be reversed.
+    assign bitcnt_mask_op = bitcnt_clz ? operand_a_rev : operand_a_i;
+
+    always_comb begin
+      bitcnt_bit_mask = bitcnt_mask_op;
+      bitcnt_bit_mask |= bitcnt_bit_mask << 1;
+      bitcnt_bit_mask |= bitcnt_bit_mask << 2;
+      bitcnt_bit_mask |= bitcnt_bit_mask << 4;
+      bitcnt_bit_mask |= bitcnt_bit_mask << 8;
+      bitcnt_bit_mask |= bitcnt_bit_mask << 16;
+      bitcnt_bit_mask = ~bitcnt_bit_mask;
+    end
+
+    assign zbe_op = (operator_i == ALU_BCOMPRESS) | (operator_i == ALU_BDECOMPRESS);
+
+    always_comb begin
+      case (1'b1)
+        zbe_op:      bitcnt_bits = operand_b_i;
+        bitcnt_cz:   bitcnt_bits = bitcnt_bit_mask & ~bitcnt_mask_op; // clz / ctz
+        default:     bitcnt_bits = operand_a_i; // cpop
+      endcase
+    end
+
+    // The parallel prefix counter is of the structure of a Brent-Kung Adder. In the first
+    // log2(width) stages, the sum of the n preceding bit lines is computed for the bit lines at
+    // positions 2**n-1 (power-of-two positions) where n denotes the current stage.
+    // In stage n=log2(width), the count for position width-1 (the MSB) is finished.
+    // For the intermediate values, an inverse adder tree then computes the bit counts for the bit
+    // lines at positions
+    // m = 2**(n-1) + i*2**(n-2), where i = [1 ... width / 2**(n-1)-1] and n = [log2(width) ... 2].
+    // Thus, at every subsequent stage the result of two previously unconnected sub-trees is
+    // summed, starting at the node summing bits [width/2-1 : 0] and [3*width/4-1: width/2]
+    // and moving to iteratively sum up all the sub-trees.
+    // The inverse adder tree thus features log2(width) - 1 stages the first of these stages is a
+    // single addition at position 3*width/4 - 1. It does not interfere with the last
+    // stage of the primary adder tree. These stages can thus be folded together, resulting in a
+    // total of 2*log2(width)-2 stages.
+    // For more details refer to R. Brent, H. T. Kung, "A Regular Layout for Parallel Adders",
+    // (1982).
+    // For a bitline at position p, only bits
+    // bitcnt_partial[max(i, such that p % log2(i) == 0)-1 : 0] are needed for generation of the
+    // butterfly network control signals. The adders in the intermediate value adder tree thus need
+    // not be full 5-bit adders. We leave the optimization to the synthesis tools.
+    //
+    // Consider the following 8-bit example for illustration.
+    //
+    // let bitcnt_bits = 8'babcdefgh.
+    //
+    //                   a  b  c  d  e  f  g  h
+    //                   | /:  | /:  | /:  | /:
+    //                   |/ :  |/ :  |/ :  |/ :
+    // stage 1:          +  :  +  :  +  :  +  :
+    //                   |  : /:  :  |  : /:  :
+    //                   |,--+ :  :  |,--+ :  :
+    // stage 2:          +  :  :  :  +  :  :  :
+    //                   |  :  |  : /:  :  :  :
+    //                   |,-----,--+ :  :  :  : ^-primary adder tree
+    // stage 3:          +  :  +  :  :  :  :  : -------------------------
+    //                   :  | /| /| /| /| /|  : ,-intermediate adder tree
+    //                   :  |/ |/ |/ |/ |/ :  :
+    // stage 4           :  +  +  +  +  +  :  :
+    //                   :  :  :  :  :  :  :  :
+    // bitcnt_partial[i] 7  6  5  4  3  2  1  0
+
+    always_comb begin
+      bitcnt_partial = '{default: '0};
+      // stage 1: pairwise sums, held at every odd position
+      for (int unsigned pos = 1; pos < 32; pos += 2) begin
+        bitcnt_partial[pos] = {5'h0, bitcnt_bits[pos-1]} + {5'h0, bitcnt_bits[pos]};
+      end
+      // stage 2: sums over 4 bits at positions 3, 7, 11, ...
+      for (int unsigned pos = 3; pos < 32; pos += 4) begin
+        bitcnt_partial[pos] += bitcnt_partial[pos-2];
+      end
+      // stage 3: sums over 8 bits at positions 7, 15, 23, 31
+      for (int unsigned pos = 7; pos < 32; pos += 8) begin
+        bitcnt_partial[pos] += bitcnt_partial[pos-4];
+      end
+      // stage 4: sums over 16 bits at positions 15 and 31
+      for (int unsigned pos = 15; pos < 32; pos += 16) begin
+        bitcnt_partial[pos] += bitcnt_partial[pos-8];
+      end
+      // stage 5: position 31 receives the count over all 32 bits
+      bitcnt_partial[31] += bitcnt_partial[15];
+      // ^- primary adder tree
+      // -------------------------------
+      // ,-intermediate value adder tree
+      bitcnt_partial[23] += bitcnt_partial[15];
+
+      // stage 6: finish the prefix counts at positions 11, 19, 27
+      for (int unsigned pos = 11; pos < 32; pos += 8) begin
+        bitcnt_partial[pos] += bitcnt_partial[pos-4];
+      end
+
+      // stage 7: finish the prefix counts at positions 5, 9, 13, ..., 29
+      for (int unsigned pos = 5; pos < 32; pos += 4) begin
+        bitcnt_partial[pos] += bitcnt_partial[pos-2];
+      end
+      // stage 8: position 0 and all remaining even positions
+      bitcnt_partial[0] = {5'h0, bitcnt_bits[0]};
+      for (int unsigned pos = 2; pos < 32; pos += 2) begin
+        bitcnt_partial[pos] = {5'h0, bitcnt_bits[pos]} + bitcnt_partial[pos-1];
+      end
+    end
+
+    ///////////////
+    // Min / Max //
+    ///////////////
+    // Pass on operand_a_i when cmp_result is set, operand_b_i otherwise.
+    assign minmax_result = cmp_result ? operand_a_i : operand_b_i;
+
+    //////////
+    // Pack //
+    //////////
+    // Concatenate parts of operand_b_i (upper) and operand_a_i (lower), selected by the operator.
+    logic packu;
+    logic packh;
+    assign packu = operator_i == ALU_PACKU;
+    assign packh = operator_i == ALU_PACKH;
+
+    always_comb begin
+      unique case (1'b1)
+        packu:   pack_result = {operand_b_i[31:16], operand_a_i[31:16]};     // upper halfwords
+        packh:   pack_result = {16'h0, operand_b_i[7:0], operand_a_i[7:0]};  // low bytes
+        default: pack_result = {operand_b_i[15:0], operand_a_i[15:0]};       // lower halfwords
+      endcase
+    end
+
+    //////////
+    // Sext //
+    //////////
+    // Sign-extend the low byte for ALU_SEXTB; any other operator sign-extends the low halfword.
+    assign sext_result = (operator_i == ALU_SEXTB) ?
+        { {24{operand_a_i[7]}}, operand_a_i[7:0]} : { {16{operand_a_i[15]}}, operand_a_i[15:0]};
+
+    /////////////////////////////
+    // Single-bit Instructions //
+    /////////////////////////////
+    // Combine operand_a_i with shift_result; the default arm returns bit 0 of shift_result.
+    always_comb begin
+      unique case (operator_i)
+        ALU_BSET: singlebit_result = operand_a_i | shift_result;
+        ALU_BCLR: singlebit_result = operand_a_i & ~shift_result;
+        ALU_BINV: singlebit_result = operand_a_i ^ shift_result;
+        default:  singlebit_result = {31'h0, shift_result[0]}; // ALU_BEXT
+      endcase
+    end
+
+    ////////////////////////////////////
+    // General Reverse and Or-combine //
+    ////////////////////////////////////
+
+    // Only a subset of the general reverse and or-combine instructions are implemented in the
+    // balanced version of the B extension. Currently rev8 (shift_amt = 5'b11000) and orc.b
+    // (shift_amt = 5'b00111) are supported in the base extension.
+    // Outside OTEarlGrey / Full, shift_amt[0] enables stages 0-2 and shift_amt[3] stages 3-4.
+    logic [4:0] zbp_shift_amt;
+    logic gorc_op;
+
+    assign gorc_op = (operator_i == ALU_GORC);
+    assign zbp_shift_amt[2:0] =
+        (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? shift_amt[2:0] : {3{shift_amt[0]}};
+    assign zbp_shift_amt[4:3] =
+        (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? shift_amt[4:3] : {2{shift_amt[3]}};
+
+    always_comb begin
+      rev_result = operand_a_i;
+      // Swap adjacent bits. When gorc_op is set, the unswapped value is ORed into the result.
+      if (zbp_shift_amt[0]) begin
+        rev_result = (gorc_op ? rev_result : 32'h0)       |
+                     ((rev_result & 32'h5555_5555) <<  1) |
+                     ((rev_result & 32'haaaa_aaaa) >>  1);
+      end
+      // Swap adjacent bit pairs.
+      if (zbp_shift_amt[1]) begin
+        rev_result = (gorc_op ? rev_result : 32'h0)       |
+                     ((rev_result & 32'h3333_3333) <<  2) |
+                     ((rev_result & 32'hcccc_cccc) >>  2);
+      end
+      // Swap adjacent nibbles.
+      if (zbp_shift_amt[2]) begin
+        rev_result = (gorc_op ? rev_result : 32'h0)       |
+                     ((rev_result & 32'h0f0f_0f0f) <<  4) |
+                     ((rev_result & 32'hf0f0_f0f0) >>  4);
+      end
+      // Swap adjacent bytes. The or-combine term is only kept for OTEarlGrey / Full.
+      if (zbp_shift_amt[3]) begin
+        rev_result = ((RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) &&
+                      gorc_op ? rev_result : 32'h0) |
+                     ((rev_result & 32'h00ff_00ff) <<  8) |
+                     ((rev_result & 32'hff00_ff00) >>  8);
+      end
+      // Swap the two halfwords. The or-combine term is only kept for OTEarlGrey / Full.
+      if (zbp_shift_amt[4]) begin
+        rev_result = ((RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) &&
+                      gorc_op ? rev_result : 32'h0) |
+                     ((rev_result & 32'h0000_ffff) << 16) |
+                     ((rev_result & 32'hffff_0000) >> 16);
+      end
+    end
+
+    logic crc_hmode;                // CRC halfword mode; driven in the generate block below
+    logic crc_bmode;                // CRC byte mode; driven in the generate block below
+    logic [31:0] clmul_result_rev;  // bit-reversed clmul result, or '0 when not implemented
+
+    if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin : gen_alu_rvb_otearlgrey_full
+
+      /////////////////////////
+      // Shuffle / Unshuffle //
+      /////////////////////////
+
+      localparam logic [31:0] SHUFFLE_MASK_L [4] =
+          '{32'h00ff_0000, 32'h0f00_0f00, 32'h3030_3030, 32'h4444_4444};
+      localparam logic [31:0] SHUFFLE_MASK_R [4] =
+          '{32'h0000_ff00, 32'h00f0_00f0, 32'h0c0c_0c0c, 32'h2222_2222};
+      // FLIP_MASK_*: masks of the fixed permutation applied around the stages when unshuffling.
+      localparam logic [31:0] FLIP_MASK_L [4] =
+          '{32'h2200_1100, 32'h0044_0000, 32'h4411_0000, 32'h1100_0000};
+      localparam logic [31:0] FLIP_MASK_R [4] =
+          '{32'h0088_0044, 32'h0000_2200, 32'h0000_8822, 32'h0000_0088};
+      // SHUFFLE_MASK_NOT[i]: bits that stay in place in shuffle stage i.
+      logic [31:0] SHUFFLE_MASK_NOT [4];
+      for(genvar i = 0; i < 4; i++) begin : gen_shuffle_mask_not
+        assign SHUFFLE_MASK_NOT[i] = ~(SHUFFLE_MASK_L[i] | SHUFFLE_MASK_R[i]);
+      end
+
+      logic shuffle_flip;
+      assign shuffle_flip = operator_i == ALU_UNSHFL;
+
+      logic [3:0] shuffle_mode;
+
+      always_comb begin
+        shuffle_result = operand_a_i;
+        // For unshuffle the stage-enable bits of shift_amt are applied in reversed order.
+        if (shuffle_flip) begin
+          shuffle_mode[3] = shift_amt[0];
+          shuffle_mode[2] = shift_amt[1];
+          shuffle_mode[1] = shift_amt[2];
+          shuffle_mode[0] = shift_amt[3];
+        end else begin
+          shuffle_mode = shift_amt[3:0];
+        end
+        // Pre-permutation, unshuffle only.
+        if (shuffle_flip) begin
+          shuffle_result = (shuffle_result & 32'h8822_4411) |
+              ((shuffle_result << 6)  & FLIP_MASK_L[0]) |
+              ((shuffle_result >> 6)  & FLIP_MASK_R[0]) |
+              ((shuffle_result << 9)  & FLIP_MASK_L[1]) |
+              ((shuffle_result >> 9)  & FLIP_MASK_R[1]) |
+              ((shuffle_result << 15) & FLIP_MASK_L[2]) |
+              ((shuffle_result >> 15) & FLIP_MASK_R[2]) |
+              ((shuffle_result << 21) & FLIP_MASK_L[3]) |
+              ((shuffle_result >> 21) & FLIP_MASK_R[3]);
+        end
+        // Four conditional swap stages with distances 8, 4, 2 and 1.
+        if (shuffle_mode[3]) begin
+          shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[0]) |
+              (((shuffle_result << 8) & SHUFFLE_MASK_L[0]) |
+              ((shuffle_result >> 8) & SHUFFLE_MASK_R[0]));
+        end
+        if (shuffle_mode[2]) begin
+          shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[1]) |
+              (((shuffle_result << 4) & SHUFFLE_MASK_L[1]) |
+              ((shuffle_result >> 4) & SHUFFLE_MASK_R[1]));
+        end
+        if (shuffle_mode[1]) begin
+          shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[2]) |
+              (((shuffle_result << 2) & SHUFFLE_MASK_L[2]) |
+              ((shuffle_result >> 2) & SHUFFLE_MASK_R[2]));
+        end
+        if (shuffle_mode[0]) begin
+          shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[3]) |
+              (((shuffle_result << 1) & SHUFFLE_MASK_L[3]) |
+              ((shuffle_result >> 1) & SHUFFLE_MASK_R[3]));
+        end
+        // Post-permutation, unshuffle only; same expression as the pre-permutation.
+        if (shuffle_flip) begin
+          shuffle_result = (shuffle_result & 32'h8822_4411) |
+              ((shuffle_result << 6)  & FLIP_MASK_L[0]) |
+              ((shuffle_result >> 6)  & FLIP_MASK_R[0]) |
+              ((shuffle_result << 9)  & FLIP_MASK_L[1]) |
+              ((shuffle_result >> 9)  & FLIP_MASK_R[1]) |
+              ((shuffle_result << 15) & FLIP_MASK_L[2]) |
+              ((shuffle_result >> 15) & FLIP_MASK_R[2]) |
+              ((shuffle_result << 21) & FLIP_MASK_L[3]) |
+              ((shuffle_result >> 21) & FLIP_MASK_R[3]);
+        end
+      end
+
+      //////////////
+      // Crossbar //
+      //////////////
+      // The crossbar permutation instructions xperm.[nbh] (Zbp) can be implemented using 8
+      // parallel 4-bit-wide, 8-input crossbars. Basically, we permute the 8 nibbles of operand_a_i
+      // based on operand_b_i.
+
+      // Generate selector indices and valid signals.
+      // - sel_n[x] indicates which nibble of operand_a_i is selected for output nibble x.
+      // - vld_n[x] indicates if the selection is valid.
+      logic  [7:0][2:0] sel_n; // nibbles
+      logic  [7:0]      vld_n; // nibbles
+      logic  [3:0][1:0] sel_b; // bytes
+      logic  [3:0]      vld_b; // bytes
+      logic  [1:0][0:0] sel_h; // half words
+      logic  [1:0]      vld_h; // half words
+
+      // Per nibble, 3 bits are needed for the selection. Other bits must be zero.
+      // sel_n bit mask: 32'b0111_0111_0111_0111_0111_0111_0111_0111
+      // vld_n bit mask: 32'b1000_1000_1000_1000_1000_1000_1000_1000
+      for (genvar i = 0; i < 8; i++) begin : gen_sel_vld_n
+        assign sel_n[i] =   operand_b_i[i*4     +: 3];
+        assign vld_n[i] = ~|operand_b_i[i*4 + 3 +: 1];
+      end
+
+      // Per byte, 2 bits are needed for the selection. Other bits must be zero.
+      // sel_b bit mask: 32'b0000_0011_0000_0011_0000_0011_0000_0011
+      // vld_b bit mask: 32'b1111_1100_1111_1100_1111_1100_1111_1100
+      for (genvar i = 0; i < 4; i++) begin : gen_sel_vld_b
+        assign sel_b[i] =   operand_b_i[i*8     +: 2];
+        assign vld_b[i] = ~|operand_b_i[i*8 + 2 +: 6];
+      end
+
+      // Per half word, 1 bit is needed for the selection only. All other bits must be zero.
+      // sel_h bit mask: 32'b0000_0000_0000_0001_0000_0000_0000_0001
+      // vld_h bit mask: 32'b1111_1111_1111_1110_1111_1111_1111_1110
+      for (genvar i = 0; i < 2; i++) begin : gen_sel_vld_h
+        assign sel_h[i] =   operand_b_i[i*16     +: 1];
+        assign vld_h[i] = ~|operand_b_i[i*16 + 1 +: 15];
+      end
+
+      // Convert selector indices and valid signals to control the nibble-based
+      // crossbar logic.
+      logic [7:0][2:0] sel;
+      logic [7:0]      vld;
+      always_comb begin
+        unique case (operator_i)
+          ALU_XPERM_N: begin
+            // No conversion needed.
+            sel = sel_n;
+            vld = vld_n;
+          end
+
+          ALU_XPERM_B: begin
+            // Convert byte to nibble indices: both nibbles of output byte b use the byte selector.
+            for (int b = 0; b < 4; b++) begin
+              sel[b*2 +  0] =   {sel_b[b], 1'b0};
+              sel[b*2 +  1] =   {sel_b[b], 1'b1};
+              vld[b*2 +: 2] = {2{vld_b[b]}};
+            end
+          end
+
+          ALU_XPERM_H: begin
+            // Convert half-word to nibble indices.
+            for (int h = 0; h < 2; h++) begin
+              sel[h*4 +  0] =   {sel_h[h], 2'b00};
+              sel[h*4 +  1] =   {sel_h[h], 2'b01};
+              sel[h*4 +  2] =   {sel_h[h], 2'b10};
+              sel[h*4 +  3] =   {sel_h[h], 2'b11};
+              vld[h*4 +: 4] = {4{vld_h[h]}};
+            end
+          end
+
+          default: begin
+            // Tie valid to zero to disable the crossbar unless we need it.
+            sel = sel_n;
+            vld = '0;
+          end
+        endcase
+      end
+
+      // The actual nibble-based crossbar logic. Invalid selections yield a zero nibble.
+      logic [7:0][3:0] val_n;
+      logic [7:0][3:0] xperm_n;
+      assign val_n = operand_a_i;
+      for (genvar i = 0; i < 8; i++) begin : gen_xperm_n
+        assign xperm_n[i] = vld[i] ? val_n[sel[i]] : '0;
+      end
+      assign xperm_result = xperm_n;
+
+      ///////////////////////////////////////////////////
+      // Carry-less Multiply + Cyclic Redundancy Check //
+      ///////////////////////////////////////////////////
+
+      // Carry-less multiplication can be understood as multiplication based on
+      // the addition interpreted as the bit-wise xor operation.
+      //
+      // Example: 1101 X 1011 = 1111111:
+      //
+      //       1011 X 1101
+      //       -----------
+      //              1101
+      //         xor 1101
+      //         ---------
+      //             10111
+      //        xor 0000
+      //        ----------
+      //            010111
+      //       xor 1101
+      //       -----------
+      //           1111111
+      //
+      // Architectural details:
+      //         A 32 x 32-bit array
+      //         [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ]
+      //         is generated. The entries of the array are pairwise 'xor-ed'
+      //         together in a 5-stage binary tree.
+      //
+      //
+      // Cyclic Redundancy Check:
+      //
+      // CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI) are directly implemented. For
+      // documentation of the crc configuration (crc-polynomials, initialization, reflection, etc.)
+      // see http://reveng.sourceforge.net/crc-catalogue/all.htm
+      // A useful guide to crc arithmetic and algorithms is given here:
+      // http://www.piclist.com/techref/method/math/crcguide.html.
+      //
+      // The CRC operation solves the following equation using binary polynomial arithmetic:
+      //
+      // rev(rd)(x) = rev(rs1)(x) * x**n mod {1, P}(x)
+      //
+      // where P denotes lower 32 bits of the corresponding CRC polynomial, rev(a) the bit reversal
+      // of a, n = 8,16, or 32 for .b, .h, .w -variants. {a, b} denotes bit concatenation.
+      //
+      // Using Barrett reduction, one can show that
+      //
+      // M(x) mod P(x) = R(x) =
+      //          (M(x) * x**n) & {deg(P(x)'{1'b1}}) ^ (M(x) x**-(deg(P(x) - n)) cx mu(x) cx P(x),
+      //
+      // Where mu(x) = polydiv(x**64, {1,P}) & 0xffffffff. Here, 'cx' refers to carry-less
+      // multiplication. Substituting rev(rd)(x) for R(x) and rev(rs1)(x) for M(x) and solving for
+      // rd(x) with P(x) a crc32 polynomial (deg(P(x)) = 32), we get
+      //
+      // rd = rev( (rev(rs1) << n)  ^ ((rev(rs1) >> (32-n)) cx mu cx P)
+      //    = (rs1 >> n) ^ rev(rev( (rs1 << (32-n)) cx rev(mu)) cx P)
+      //                       ^-- cycle 0--------------------^
+      //      ^- cycle 1 -------------------------------------------^
+      //
+      // In the last step we used the fact that carry-less multiplication is bit-order agnostic:
+      // rev(a cx b) = rev(a) cx rev(b).
+
+      logic clmul_rmode;
+      logic clmul_hmode;
+      logic [31:0] clmul_op_a;
+      logic [31:0] clmul_op_b;
+      logic [31:0] operand_b_rev;
+      logic [31:0] clmul_and_stage[32];
+      logic [31:0] clmul_xor_stage1[16];
+      logic [31:0] clmul_xor_stage2[8];
+      logic [31:0] clmul_xor_stage3[4];
+      logic [31:0] clmul_xor_stage4[2];
+
+      logic [31:0] clmul_result_raw;
+      // Bit-reversed copy of operand_b_i.
+      for (genvar i = 0; i < 32; i++) begin : gen_rev_operand_b
+        assign operand_b_rev[i] = operand_b_i[31-i];
+      end
+
+      assign clmul_rmode = operator_i == ALU_CLMULR;
+      assign clmul_hmode = operator_i == ALU_CLMULH;
+
+      // CRC: per-variant constants (POLYNOMIAL / MU_REV pairs for CRC32 and CRC32C)
+      localparam logic [31:0] CRC32_POLYNOMIAL = 32'h04c1_1db7;
+      localparam logic [31:0] CRC32_MU_REV = 32'hf701_1641;
+
+      localparam logic [31:0] CRC32C_POLYNOMIAL = 32'h1edc_6f41;
+      localparam logic [31:0] CRC32C_MU_REV = 32'hdea7_13f1;
+
+      logic crc_op;
+
+      logic crc_cpoly;
+
+      logic [31:0] crc_operand;
+      logic [31:0] crc_poly;
+      logic [31:0] crc_mu_rev;
+
+      assign crc_op = (operator_i == ALU_CRC32C_W) | (operator_i == ALU_CRC32_W) |
+                      (operator_i == ALU_CRC32C_H) | (operator_i == ALU_CRC32_H) |
+                      (operator_i == ALU_CRC32C_B) | (operator_i == ALU_CRC32_B);
+
+      assign crc_cpoly = (operator_i == ALU_CRC32C_W) |
+                         (operator_i == ALU_CRC32C_H) |
+                         (operator_i == ALU_CRC32C_B);
+
+      assign crc_hmode = (operator_i == ALU_CRC32_H) | (operator_i == ALU_CRC32C_H);
+      assign crc_bmode = (operator_i == ALU_CRC32_B) | (operator_i == ALU_CRC32C_B);
+
+      assign crc_poly   = crc_cpoly ? CRC32C_POLYNOMIAL : CRC32_POLYNOMIAL;
+      assign crc_mu_rev = crc_cpoly ? CRC32C_MU_REV : CRC32_MU_REV;
+      // Byte / halfword operands are moved to the top of the word; otherwise operand_a_i is used.
+      always_comb begin
+        unique case (1'b1)
+          crc_bmode: crc_operand = {operand_a_i[7:0], 24'h0};
+          crc_hmode: crc_operand = {operand_a_i[15:0], 16'h0};
+          default:   crc_operand = operand_a_i;
+        endcase
+      end
+      // Select clmul input. CRC: crc_operand and crc_mu_rev in the first cycle, imd_val_q_i[0]
+      // and crc_poly afterwards. clmulr / clmulh use the bit-reversed operands, clmul the plain ones.
+      always_comb begin
+        if (crc_op) begin
+          clmul_op_a = instr_first_cycle_i ? crc_operand : imd_val_q_i[0];
+          clmul_op_b = instr_first_cycle_i ? crc_mu_rev : crc_poly;
+        end else begin
+          clmul_op_a = clmul_rmode | clmul_hmode ? operand_a_rev : operand_a_i;
+          clmul_op_b = clmul_rmode | clmul_hmode ? operand_b_rev : operand_b_i;
+        end
+      end
+
+      for (genvar k = 0; k < 32; k++) begin : gen_clmul_and_op
+        assign clmul_and_stage[k] = clmul_op_b[k] ? (clmul_op_a << k) : '0;  // partial product k
+      end
+      // Pairwise XOR reduction of the 32 partial products down to two words.
+      for (genvar k = 0; k < 16; k++) begin : gen_clmul_xor_op_l1
+        assign clmul_xor_stage1[k] = clmul_and_stage[2*k+1] ^ clmul_and_stage[2*k];
+      end
+
+      for (genvar k = 0; k < 8; k++) begin : gen_clmul_xor_op_l2
+        assign clmul_xor_stage2[k] = clmul_xor_stage1[2*k+1] ^ clmul_xor_stage1[2*k];
+      end
+
+      for (genvar k = 0; k < 4; k++) begin : gen_clmul_xor_op_l3
+        assign clmul_xor_stage3[k] = clmul_xor_stage2[2*k+1] ^ clmul_xor_stage2[2*k];
+      end
+
+      for (genvar k = 0; k < 2; k++) begin : gen_clmul_xor_op_l4
+        assign clmul_xor_stage4[k] = clmul_xor_stage3[2*k+1] ^ clmul_xor_stage3[2*k];
+      end
+      // Last XOR level gives the raw product.
+      assign clmul_result_raw = clmul_xor_stage4[1] ^ clmul_xor_stage4[0];
+
+      for (genvar k = 0; k < 32; k++) begin : gen_rev_clmul_result
+        assign clmul_result_rev[k] = clmul_result_raw[31-k];
+      end
+
+      always_comb begin
+        if (clmul_rmode) begin
+          clmul_result = clmul_result_rev;                // clmulr = rev(clmul(rev(a), rev(b)))
+        end else if (clmul_hmode) begin
+          clmul_result = {1'b0, clmul_result_rev[31:1]};  // clmulh = clmulr result >> 1
+        end else begin
+          clmul_result = clmul_result_raw;
+        end
+      end
+    end else begin : gen_alu_rvb_not_otearlgrey_full
+      assign shuffle_result       = '0;
+      assign xperm_result         = '0;
+      assign clmul_result         = '0;
+      // support signals (read by the multicycle result logic, so they need a driver here too)
+      assign clmul_result_rev     = '0;
+      assign crc_bmode            = '0;
+      assign crc_hmode            = '0;
+    end
+
+    if (RV32B == RV32BFull) begin : gen_alu_rvb_full
+
+      ///////////////
+      // Butterfly //
+      ///////////////
+
+      // The butterfly / inverse butterfly network executing bcompress/bdecompress (zbe)
+      // instructions. For bdecompress, the control bits mask of a local left region is generated
+      // by the inverse of a n-bit left rotate and complement upon wrap (LROTC) operation by the
+      // number of ones in the deposit bitmask to the right of the segment. n hereby denotes the
+      // width of the according segment. The bitmask for a pertaining local right region is equal
+      // to the corresponding local left region. Bcompress uses an analogous inverse process.
+      // Consider the following 8-bit example.  For details, see Hilewitz et al. "Fast Bit Gather,
+      // Bit Scatter and Bit Permutation Instructions for Commodity Microprocessors", (2008).
+      //
+      // The bcompress/bdecompress instructions are completed in 2 cycles. In the first cycle, the
+      // control bitmask is prepared by executing the parallel prefix bit count. In the second
+      // cycle, the bit swapping is executed according to the control masks.
+
+      // 8-bit example:  (Hilewitz et al.)
+      // Consider the instruction bdecompress operand_a_i deposit_mask
+      // Let operand_a_i = 8'babcd_efgh
+      //    deposit_mask = 8'b1010_1101
+      //
+      // control bitmask for stage 1:
+      //  - number of ones in the right half of the deposit bitmask: 3
+      //  - width of the segment: 4
+      //  - control bitmask = ~LROTC(4'b0, 3)[3:0] = 4'b1000
+      //
+      // control bitmask:   c3 c2  c1 c0  c3 c2  c1 c0
+      //                    1  0   0  0   1  0   0  0
+      //                    <- L ----->   <- R ----->
+      // operand_a_i        a  b   c  d   e  f   g  h
+      //                    :\ |   |  |  /:  |   |  |
+      //                    : +|---|--|-+ :  |   |  |
+      //                    :/ |   |  |  \:  |   |  |
+      // stage 1            e  b   c  d   a  f   g  h
+      //                    <L->   <R->   <L->   <R->
+      // control bitmask:   c3 c2  c3 c2  c1 c0  c1 c0
+      //                    1  1   1  1   1  0   1  0
+      //                    :\ :\ /: /:   :\ |  /:  |
+      //                    : +:-+-:+ :   : +|-+ :  |
+      //                    :/ :/ \: \:   :/ |  \:  |
+      // stage 2            c  d   e  b   g  f   a  h
+      //                    L  R   L  R   L  R   L  R
+      // control bitmask:   c3 c3  c2 c2  c1 c1  c0 c0
+      //                    1  1   0  0   1  1   0  0
+      //                    :\/:   |  |   :\/:   |  |
+      //                    :  :   |  |   :  :   |  |
+      //                    :/\:   |  |   :/\:   |  |
+      // stage 3            d  c   e  b   f  g   a  h
+      // & deposit bitmask: 1  0   1  0   1  1   0  1
+      // result:            d  0   e  0   f  g   0  h
+
+      logic [ 5:0] bitcnt_partial_q [32];
+      // bitcnt_partial_q is rebuilt from imd_val_q_i; bits that are not stored read back as zero.
+      // first cycle
+      // Store partial bitcnts
+      for (genvar i = 0; i < 32; i++) begin : gen_bitcnt_reg_in_lsb
+        assign bitcnt_partial_lsb_d[i] = bitcnt_partial[i][0];
+      end
+      // Bit 1 of the counts at odd positions (2*i+1) -> msb_d[15:0]
+      for (genvar i = 0; i < 16; i++) begin : gen_bitcnt_reg_in_b1
+        assign bitcnt_partial_msb_d[i] = bitcnt_partial[2*i+1][1];
+      end
+      // Bit 2 of the counts at positions 4*i+3 -> msb_d[23:16]
+      for (genvar i = 0; i < 8; i++) begin : gen_bitcnt_reg_in_b2
+        assign bitcnt_partial_msb_d[16+i] = bitcnt_partial[4*i+3][2];
+      end
+      // Bit 3 of the counts at positions 8*i+7 -> msb_d[27:24]
+      for (genvar i = 0; i < 4; i++) begin : gen_bitcnt_reg_in_b3
+        assign bitcnt_partial_msb_d[24+i] = bitcnt_partial[8*i+7][3];
+      end
+      // Bit 4 of the counts at positions 15 and 31 -> msb_d[29:28]
+      for (genvar i = 0; i < 2; i++) begin : gen_bitcnt_reg_in_b4
+        assign bitcnt_partial_msb_d[28+i] = bitcnt_partial[16*i+15][4];
+      end
+      // Bit 5 is only kept for position 31.
+      assign bitcnt_partial_msb_d[30] = bitcnt_partial[31][5];
+      assign bitcnt_partial_msb_d[31] = 1'b0; // unused
+
+      // Second cycle
+      // Load partial bitcnts
+      always_comb begin
+        bitcnt_partial_q = '{default: '0};
+        // Mirror of the first-cycle packing: [0] carries the LSBs, [1] the selected upper bits.
+        for (int unsigned i = 0; i < 32; i++) begin : gen_bitcnt_reg_out_lsb
+          bitcnt_partial_q[i][0] = imd_val_q_i[0][i];
+        end
+
+        for (int unsigned i = 0; i < 16; i++) begin : gen_bitcnt_reg_out_b1
+          bitcnt_partial_q[2*i+1][1] = imd_val_q_i[1][i];
+        end
+
+        for (int unsigned i = 0; i < 8; i++) begin : gen_bitcnt_reg_out_b2
+          bitcnt_partial_q[4*i+3][2] = imd_val_q_i[1][16+i];
+        end
+
+        for (int unsigned i = 0; i < 4; i++) begin : gen_bitcnt_reg_out_b3
+          bitcnt_partial_q[8*i+7][3] = imd_val_q_i[1][24+i];
+        end
+
+        for (int unsigned i = 0; i < 2; i++) begin : gen_bitcnt_reg_out_b4
+          bitcnt_partial_q[16*i+15][4] = imd_val_q_i[1][28+i];
+        end
+
+        bitcnt_partial_q[31][5] = imd_val_q_i[1][30];
+      end
+
+      logic [31:0] butterfly_mask_l[5];
+      logic [31:0] butterfly_mask_r[5];
+      logic [31:0] butterfly_mask_not[5];
+      logic [31:0] lrotc_stage [5]; // left rotate and complement upon wrap
+
+      // number of bits in local r = 32 / 2**(stage + 1) = 16/2**stage
+      `define _N(stg) (16 >> stg)
+
+      // bcompress / bdecompress control bit generation
+      for (genvar stg = 0; stg < 5; stg++) begin : gen_butterfly_ctrl_stage
+        // number of segs: 2** stg
+        for (genvar seg=0; seg<2**stg; seg++) begin : gen_butterfly_ctrl
+          // {N zeros, N ones} shifted left by the partial count at the top of the lower half.
+          assign lrotc_stage[stg][2*`_N(stg)*(seg+1)-1 : 2*`_N(stg)*seg] =
+              {{`_N(stg){1'b0}},{`_N(stg){1'b1}}} <<
+                bitcnt_partial_q[`_N(stg)*(2*seg+1)-1][$clog2(`_N(stg)):0];
+          // Upper half of the segment: inverted upper half of the shifted pattern.
+          assign butterfly_mask_l[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)]
+                   = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)];
+          // Lower half of the segment: same bits as the mask for the upper half.
+          assign butterfly_mask_r[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)]
+                   = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)];
+          // The opposite half of each mask is cleared.
+          assign butterfly_mask_l[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)]   = '0;
+          assign butterfly_mask_r[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)] = '0;
+        end
+      end
+      `undef _N
+      // Bits selected by neither mask stay in place in the butterfly stages.
+      for (genvar stg = 0; stg < 5; stg++) begin : gen_butterfly_not
+        assign butterfly_mask_not[stg] =
+            ~(butterfly_mask_l[stg] | butterfly_mask_r[stg]);
+      end
+
+      always_comb begin
+        butterfly_result = operand_a_i;
+        // Five stages, distances 16, 8, 4, 2, 1: mask_l bits shift right, mask_r bits shift left.
+        butterfly_result = butterfly_result & butterfly_mask_not[0] |
+            ((butterfly_result & butterfly_mask_l[0]) >> 16)|
+            ((butterfly_result & butterfly_mask_r[0]) << 16);
+
+        butterfly_result = butterfly_result & butterfly_mask_not[1] |
+            ((butterfly_result & butterfly_mask_l[1]) >> 8)|
+            ((butterfly_result & butterfly_mask_r[1]) << 8);
+
+        butterfly_result = butterfly_result & butterfly_mask_not[2] |
+            ((butterfly_result & butterfly_mask_l[2]) >> 4)|
+            ((butterfly_result & butterfly_mask_r[2]) << 4);
+
+        butterfly_result = butterfly_result & butterfly_mask_not[3] |
+            ((butterfly_result & butterfly_mask_l[3]) >> 2)|
+            ((butterfly_result & butterfly_mask_r[3]) << 2);
+
+        butterfly_result = butterfly_result & butterfly_mask_not[4] |
+            ((butterfly_result & butterfly_mask_l[4]) >> 1)|
+            ((butterfly_result & butterfly_mask_r[4]) << 1);
+        // Finally keep only the positions selected by operand_b_i.
+        butterfly_result = butterfly_result & operand_b_i;
+      end
+
+      always_comb begin
+        invbutterfly_result = operand_a_i & operand_b_i;
+        // Same stages in reverse order (distances 1, 2, 4, 8, 16) on the pre-masked operand.
+        invbutterfly_result = invbutterfly_result & butterfly_mask_not[4] |
+            ((invbutterfly_result & butterfly_mask_l[4]) >> 1)|
+            ((invbutterfly_result & butterfly_mask_r[4]) << 1);
+
+        invbutterfly_result = invbutterfly_result & butterfly_mask_not[3] |
+            ((invbutterfly_result & butterfly_mask_l[3]) >> 2)|
+            ((invbutterfly_result & butterfly_mask_r[3]) << 2);
+
+        invbutterfly_result = invbutterfly_result & butterfly_mask_not[2] |
+            ((invbutterfly_result & butterfly_mask_l[2]) >> 4)|
+            ((invbutterfly_result & butterfly_mask_r[2]) << 4);
+
+        invbutterfly_result = invbutterfly_result & butterfly_mask_not[1] |
+            ((invbutterfly_result & butterfly_mask_l[1]) >> 8)|
+            ((invbutterfly_result & butterfly_mask_r[1]) << 8);
+
+        invbutterfly_result = invbutterfly_result & butterfly_mask_not[0] |
+            ((invbutterfly_result & butterfly_mask_l[0]) >> 16)|
+            ((invbutterfly_result & butterfly_mask_r[0]) << 16);
+      end
+    end else begin : gen_alu_rvb_not_full
+      logic [31:0] unused_imd_val_q_1;
+      assign unused_imd_val_q_1   = imd_val_q_i[1];  // sink: imd_val_q_i[1] is not read in this branch
+      assign butterfly_result     = '0;
+      assign invbutterfly_result  = '0;
+      // support signals (used to build imd_val_d_o for bcompress / bdecompress)
+      assign bitcnt_partial_lsb_d = '0;
+      assign bitcnt_partial_msb_d = '0;
+    end
+
+    //////////////////////////////////////
+    // Multicycle Bitmanip Instructions //
+    //////////////////////////////////////
+    // Ternary instructions + Shift Rotations + Bit Compress/Decompress + CRC
+    // For ternary instructions (zbt), operand_a_i is tied to rs1 in the first cycle and rs3 in the
+    // second cycle. operand_b_i is always tied to rs2.
+    // NOTE(review): imd_val_q_i presumably returns what imd_val_d_o stored earlier - confirm.
+    always_comb begin
+      unique case (operator_i)
+        ALU_CMOV: begin  // operand_a_i when operand_b_i is zero, otherwise imd_val_q_i[0]
+          multicycle_result = (operand_b_i == 32'h0) ? operand_a_i : imd_val_q_i[0];
+          imd_val_d_o = '{operand_a_i, 32'h0};
+          if (instr_first_cycle_i) begin
+            imd_val_we_o = 2'b01;
+          end else begin
+            imd_val_we_o = 2'b00;
+          end
+        end
+        // cmix: imd_val_q_i[0] ORed with the current bwlogic_and_result.
+        ALU_CMIX: begin
+          multicycle_result = imd_val_q_i[0] | bwlogic_and_result;
+          imd_val_d_o = '{bwlogic_and_result, 32'h0};
+          if (instr_first_cycle_i) begin
+            imd_val_we_o = 2'b01;
+          end else begin
+            imd_val_we_o = 2'b00;
+          end
+        end
+        // Funnel shifts / rotates: OR of two shift results; shift_amt[4:0] == 0 bypasses the OR.
+        ALU_FSR, ALU_FSL,
+        ALU_ROL, ALU_ROR: begin
+          if (shift_amt[4:0] == 5'h0) begin
+            multicycle_result = shift_amt[5] ? operand_a_i : imd_val_q_i[0];
+          end else begin
+            multicycle_result = imd_val_q_i[0] | shift_result;
+          end
+          imd_val_d_o = '{shift_result, 32'h0};
+          if (instr_first_cycle_i) begin
+            imd_val_we_o = 2'b01;
+          end else begin
+            imd_val_we_o = 2'b00;
+          end
+        end
+        // CRC: clmul_result_rev, XORed with operand_a_i >> 8 / >> 16 in byte / halfword mode.
+        ALU_CRC32_W, ALU_CRC32C_W,
+        ALU_CRC32_H, ALU_CRC32C_H,
+        ALU_CRC32_B, ALU_CRC32C_B: begin
+          if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin
+            unique case (1'b1)
+              crc_bmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 8);
+              crc_hmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 16);
+              default:   multicycle_result = clmul_result_rev;
+            endcase
+            imd_val_d_o = '{clmul_result_rev, 32'h0};
+            if (instr_first_cycle_i) begin
+              imd_val_we_o = 2'b01;
+            end else begin
+              imd_val_we_o = 2'b00;
+            end
+          end else begin
+            imd_val_d_o = '{operand_a_i, 32'h0};
+            imd_val_we_o = 2'b00;
+            multicycle_result = '0;
+          end
+        end
+        // bcompress / bdecompress: both imd_val entries are written with the partial bit counts.
+        ALU_BCOMPRESS, ALU_BDECOMPRESS: begin
+          if (RV32B == RV32BFull) begin
+            multicycle_result = (operator_i == ALU_BDECOMPRESS) ? butterfly_result :
+                                                                  invbutterfly_result;
+            imd_val_d_o = '{bitcnt_partial_lsb_d, bitcnt_partial_msb_d};
+            if (instr_first_cycle_i) begin
+              imd_val_we_o = 2'b11;
+            end else begin
+              imd_val_we_o = 2'b00;
+            end
+          end else begin
+            imd_val_d_o = '{operand_a_i, 32'h0};
+            imd_val_we_o = 2'b00;
+            multicycle_result = '0;
+          end
+        end
+        // Any other operator: no intermediate value is written and the result is zero.
+        default: begin
+          imd_val_d_o = '{operand_a_i, 32'h0};
+          imd_val_we_o = 2'b00;
+          multicycle_result = '0;
+        end
+      endcase
+    end
+
+
+  end else begin : g_no_alu_rvb
+    logic [31:0] unused_imd_val_q[2];
+    assign unused_imd_val_q           = imd_val_q_i;
+    logic [31:0] unused_butterfly_result;
+    assign unused_butterfly_result    = butterfly_result;
+    logic [31:0] unused_invbutterfly_result;
+    assign unused_invbutterfly_result = invbutterfly_result;
+    // RV32B result signals
+    assign bitcnt_result       = '0;
+    assign minmax_result       = '0;
+    assign pack_result         = '0;
+    assign sext_result         = '0;
+    assign singlebit_result    = '0;
+    assign rev_result          = '0;
+    assign shuffle_result      = '0;
+    assign xperm_result        = '0;
+    assign butterfly_result    = '0;
+    assign invbutterfly_result = '0;
+    assign clmul_result        = '0;
+    assign multicycle_result   = '0;
+    // RV32B support signals
+    assign imd_val_d_o         = '{default: '0};
+    assign imd_val_we_o        = '{default: '0};
+  end
+
+  ////////////////
+  // Result mux //
+  ////////////////
+
+  always_comb begin
+    result_o   = '0;
+
+    unique case (operator_i)
+      // Bitwise Logic Operations (negate: RV32B)
+      ALU_XOR,  ALU_XNOR,
+      ALU_OR,   ALU_ORN,
+      ALU_AND,  ALU_ANDN: result_o = bwlogic_result;
+
+      // Adder Operations
+      ALU_ADD,  ALU_SUB,
+      // RV32B
+      ALU_SH1ADD, ALU_SH2ADD,
+      ALU_SH3ADD: result_o = adder_result;
+
+      // Shift Operations
+      ALU_SLL,  ALU_SRL,
+      ALU_SRA,
+      // RV32B
+      ALU_SLO,  ALU_SRO: result_o = shift_result;
+
+      // Shuffle Operations (RV32B)
+      ALU_SHFL, ALU_UNSHFL: result_o = shuffle_result;
+
+      // Crossbar Permutation Operations (RV32B)
+      ALU_XPERM_N, ALU_XPERM_B, ALU_XPERM_H: result_o = xperm_result;
+
+      // Comparison Operations
+      ALU_EQ,   ALU_NE,
+      ALU_GE,   ALU_GEU,
+      ALU_LT,   ALU_LTU,
+      ALU_SLT,  ALU_SLTU: result_o = {31'h0,cmp_result};
+
+      // MinMax Operations (RV32B)
+      ALU_MIN,  ALU_MAX,
+      ALU_MINU, ALU_MAXU: result_o = minmax_result;
+
+      // Bitcount Operations (RV32B)
+      ALU_CLZ, ALU_CTZ,
+      ALU_CPOP: result_o = {26'h0, bitcnt_result};
+
+      // Pack Operations (RV32B)
+      ALU_PACK, ALU_PACKH,
+      ALU_PACKU: result_o = pack_result;
+
+      // Sign-Extend (RV32B)
+      ALU_SEXTB, ALU_SEXTH: result_o = sext_result;
+
+      // Ternary Bitmanip Operations (RV32B)
+      ALU_CMIX, ALU_CMOV,
+      ALU_FSL,  ALU_FSR,
+      // Rotate Shift (RV32B)
+      ALU_ROL, ALU_ROR,
+      // Cyclic Redundancy Checks (RV32B)
+      ALU_CRC32_W, ALU_CRC32C_W,
+      ALU_CRC32_H, ALU_CRC32C_H,
+      ALU_CRC32_B, ALU_CRC32C_B,
+      // Bit Compress / Decompress (RV32B)
+      ALU_BCOMPRESS, ALU_BDECOMPRESS: result_o = multicycle_result;
+
+      // Single-Bit Bitmanip Operations (RV32B)
+      ALU_BSET, ALU_BCLR,
+      ALU_BINV, ALU_BEXT: result_o = singlebit_result;
+
+      // General Reverse / Or-combine (RV32B)
+      ALU_GREV, ALU_GORC: result_o = rev_result;
+
+      // Bit Field Place (RV32B)
+      ALU_BFP: result_o = bfp_result;
+
+      // Carry-less Multiply Operations (RV32B)
+      ALU_CLMUL, ALU_CLMULR,
+      ALU_CLMULH: result_o = clmul_result;
+
+      default: ;
+    endcase
+  end
+
+  logic unused_shift_amt_compl;
+  assign unused_shift_amt_compl = shift_amt_compl[5];
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_branch_predict.sv b/hw/ip/cheriot-ibex/rtl/cheriot_branch_predict.sv
new file mode 100644
index 0000000..e99089b
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_branch_predict.sv
@@ -0,0 +1,100 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Branch Predictor
+ *
+ * This implements static branch prediction. It takes an instruction and its PC and determines if
+ * it's a branch or a jump and calculates its target. For jumps it will always predict taken. For
+ * branches it will predict taken if the PC offset is negative.
+ *
+ * This handles both compressed and uncompressed instructions. Compressed instructions must be in
+ * the lower 16-bits of instr.
+ *
+ * The predictor is entirely combinational but takes clk/rst_n signals for use by assertions.
+ */
+
+`include "prim_assert.sv"
+
+module cheriot_branch_predict (
+  input  logic clk_i,   // not used by the logic below; per the header, for assertions only
+  input  logic rst_ni,  // not used by the logic below; per the header, for assertions only
+
+  // Instruction from fetch stage
+  input  logic [31:0] fetch_rdata_i,  // instruction bits; compressed instrs sit in [15:0]
+  input  logic [31:0] fetch_pc_i,     // PC of the supplied instruction
+  input  logic        fetch_valid_i,  // gates the taken prediction and the one-hot assertion
+
+  // Prediction for supplied instruction
+  output logic        predict_branch_taken_o,  // 1 = predict taken (never set if !fetch_valid_i)
+  output logic [31:0] predict_branch_pc_o      // fetch_pc_i + selected offset, always computed
+);
+  import cheriot_pkg::*;
+
+  logic [31:0] imm_j_type;   // offset of an uncompressed jump (OPCODE_JAL)
+  logic [31:0] imm_b_type;   // offset of an uncompressed branch (OPCODE_BRANCH)
+  logic [31:0] imm_cj_type;  // offset of a compressed jump
+  logic [31:0] imm_cb_type;  // offset of a compressed branch
+
+  logic [31:0] branch_imm;   // offset selected for the target calculation
+
+  logic [31:0] instr;
+
+  logic instr_j;
+  logic instr_b;
+  logic instr_cj;
+  logic instr_cb;
+
+  logic instr_b_taken;
+
+  // Provide a short internal name for fetch_rdata_i to reduce line wrapping
+  assign instr = fetch_rdata_i;
+
+  // Extract and sign-extend to 32-bit the various immediates that may be used to calculate the
+  // target
+
+  // Uncompressed immediates (sign bit is instr[31]; bit 0 of the offset is always 0)
+  assign imm_j_type = { {12{instr[31]}}, instr[19:12], instr[20], instr[30:21], 1'b0 };
+  assign imm_b_type = { {19{instr[31]}}, instr[31], instr[7], instr[30:25], instr[11:8], 1'b0 };
+
+  // Compressed immediates (sign bit is instr[12]; bit 0 of the offset is always 0)
+  assign imm_cj_type = { {20{instr[12]}}, instr[12], instr[8], instr[10:9], instr[6], instr[7],
+    instr[2], instr[11], instr[5:3], 1'b0 };
+
+  assign imm_cb_type = { {23{instr[12]}}, instr[12], instr[6:5], instr[2], instr[11:10],
+    instr[4:3], 1'b0};
+
+  // Determine if the instruction is a branch or a jump
+
+  // Uncompressed branch/jump: matched on the 7-bit major opcode instr[6:0]
+  assign instr_b = opcode_e'(instr[6:0]) == OPCODE_BRANCH;
+  assign instr_j = opcode_e'(instr[6:0]) == OPCODE_JAL;
+
+  // Compressed branch/jump: instr[1:0] = 01 with instr[15:13] = 110/111 (branch), 101/001 (jump)
+  assign instr_cb = (instr[1:0] == 2'b01) & ((instr[15:13] == 3'b110) | (instr[15:13] == 3'b111));
+  assign instr_cj = (instr[1:0] == 2'b01) & ((instr[15:13] == 3'b101) | (instr[15:13] == 3'b001));
+
+  // Select out the branch offset for target calculation based upon the instruction type
+  always_comb begin
+    branch_imm = imm_b_type;  // default; overridden below by the matching instruction type
+
+    unique case (1'b1)
+      instr_j  : branch_imm = imm_j_type;
+      instr_b  : branch_imm = imm_b_type;
+      instr_cj : branch_imm = imm_cj_type;
+      instr_cb : branch_imm = imm_cb_type;
+      default : ;  // no type matched: default offset kept, predict_branch_taken_o stays 0
+    endcase
+  end
+
+  `ASSERT_IF(BranchInsTypeOneHot, $onehot0({instr_j, instr_b, instr_cj, instr_cb}), fetch_valid_i)
+
+  // Determine branch prediction, taken if offset is negative (sign bit of the immediate is set)
+  assign instr_b_taken = (instr_b & imm_b_type[31]) | (instr_cb & imm_cb_type[31]);
+
+  // Always predict jumps taken otherwise take prediction from `instr_b_taken`
+  assign predict_branch_taken_o = fetch_valid_i & (instr_j | instr_cj | instr_b_taken);
+  // Calculate target
+  assign predict_branch_pc_o    = fetch_pc_i + branch_imm;
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_compressed_decoder.sv b/hw/ip/cheriot-ibex/rtl/cheriot_compressed_decoder.sv
new file mode 100644
index 0000000..1ebcf3c
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_compressed_decoder.sv
@@ -0,0 +1,362 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Compressed instruction decoder
+ *
+ * Decodes RISC-V compressed instructions into their RV32 equivalent.
+ * This module is fully combinatorial, clock and reset are used for
+ * assertions only.
+ */
+
+`include "prim_assert.sv"
+
+module cheriot_compressed_decoder # (
+  parameter bit CHERIoTEn  = 1'b1  // with cheri_pmode_i, selects the CHERI expansions below
+) (
+  input  logic        clk_i,            // not used by the logic below (assertions only)
+  input  logic        rst_ni,           // not used by the logic below (assertions only)
+  input  logic        valid_i,          // instr_i is valid; used for assertions only
+  input  logic [31:0] instr_i,          // fetched instruction; only [15:0] are decoded here
+  input  logic        cheri_pmode_i,    // with CHERIoTEn, selects the CHERI expansions below
+  output logic [31:0] instr_o,          // expanded instruction; defaults to instr_i
+  output logic        is_compressed_o,  // set when instr_i[1:0] != 2'b11
+  output logic        illegal_instr_o   // set when the compressed encoding is rejected
+);
+  import cheriot_pkg::*;
+
+  // valid_i indicates if instr_i is valid and is used for assertions only.
+  // The following signal is used to avoid possible lint errors.
+  logic unused_valid;
+  assign unused_valid = valid_i;
+
+  ////////////////////////
+  // Compressed decoder //
+  ////////////////////////
+
+  always_comb begin
+    // By default, forward incoming instruction, mark it as legal.
+    instr_o         = instr_i;
+    illegal_instr_o = 1'b0;
+
+    // Check if incoming instruction is compressed.
+    unique case (instr_i[1:0])
+      // C0
+      2'b00: begin
+        unique case (instr_i[15:13])
+          3'b000: begin
+            if (CHERIoTEn & cheri_pmode_i)
+              // c.incaddr4cspn -> cincoffsetimm cd', csp, imm
+              instr_o = {2'b0, instr_i[10:7], instr_i[12:11], instr_i[5],
+                        instr_i[6], 2'b00, 5'h02, 3'b001, 2'b01, instr_i[4:2], {OPCODE_CHERI}};
+            else
+              // c.addi4spn -> addi rd', x2, imm
+              instr_o = {2'b0, instr_i[10:7], instr_i[12:11], instr_i[5],
+                         instr_i[6], 2'b00, 5'h02, 3'b000, 2'b01, instr_i[4:2], {OPCODE_OP_IMM}};
+            if (instr_i[12:5] == 8'b0)  illegal_instr_o = 1'b1;  // all-zero immediate field
+          end
+
+          3'b010: begin
+            // c.lw -> lw rd', imm(rs1')
+            instr_o = {5'b0, instr_i[5], instr_i[12:10], instr_i[6],
+                       2'b00, 2'b01, instr_i[9:7], 3'b010, 2'b01, instr_i[4:2], {OPCODE_LOAD}};
+          end
+
+          3'b011: begin
+            if (CHERIoTEn & cheri_pmode_i) begin
+              // CHERI: c.clc -> clc rd', imm(rs1'); reuses the c.ld encoding (imm scaled by 8)
+              instr_o = {4'b0, instr_i[6:5], instr_i[12:10],
+                         3'b000, 2'b01, instr_i[9:7], 3'b011, 2'b01, instr_i[4:2], {OPCODE_LOAD}};
+             end else begin
+              instr_o = instr_i;
+              illegal_instr_o = 1'b1;
+            end
+          end
+
+          3'b110: begin
+            // c.sw -> sw rs2', imm(rs1')
+            instr_o = {5'b0, instr_i[5], instr_i[12], 2'b01, instr_i[4:2],
+                       2'b01, instr_i[9:7], 3'b010, instr_i[11:10], instr_i[6],
+                       2'b00, {OPCODE_STORE}};
+          end
+
+          3'b001,
+          3'b100,
+          3'b101: begin
+            illegal_instr_o = 1'b1;
+          end
+
+          3'b111: begin
+            if (CHERIoTEn & cheri_pmode_i) begin
+              // CHERI: c.csc -> csc rs2', imm(rs1'); reuses the c.sd encoding (imm scaled by 8)
+              instr_o = {4'b0, instr_i[6:5], instr_i[12], 2'b01, instr_i[4:2],
+                         2'b01, instr_i[9:7], 3'b011, instr_i[11:10], 3'b000, {OPCODE_STORE}};
+            end else begin
+              instr_o = instr_i;
+              illegal_instr_o = 1'b1;
+            end
+
+          end
+
+          default: begin
+            illegal_instr_o = 1'b1;
+          end
+        endcase
+      end
+
+      // C1
+      //
+      // Register address checks for RV32E are performed in the regular instruction decoder.
+      // If this check fails, an illegal instruction exception is triggered and the controller
+      // writes the actual faulting instruction to mtval.
+      2'b01: begin
+        unique case (instr_i[15:13])
+          3'b000: begin
+            // c.addi -> addi rd, rd, nzimm
+            // c.nop
+            instr_o = {{6 {instr_i[12]}}, instr_i[12], instr_i[6:2],
+                       instr_i[11:7], 3'b0, instr_i[11:7], {OPCODE_OP_IMM}};
+          end
+
+          3'b001, 3'b101: begin
+            // 001: c.jal -> jal x1, imm
+            // 101: c.j   -> jal x0, imm
+            instr_o = {instr_i[12], instr_i[8], instr_i[10:9], instr_i[6],
+                       instr_i[7], instr_i[2], instr_i[11], instr_i[5:3],
+                       {9 {instr_i[12]}}, 4'b0, ~instr_i[15], {OPCODE_JAL}};
+          end
+
+          3'b010: begin
+            // c.li -> addi rd, x0, nzimm
+            // (c.li hints are translated into an addi hint)
+            instr_o = {{6 {instr_i[12]}}, instr_i[12], instr_i[6:2], 5'b0,
+                       3'b0, instr_i[11:7], {OPCODE_OP_IMM}};
+          end
+
+          3'b011: begin
+            // c.lui -> lui rd, imm
+            // (c.lui hints are translated into a lui hint)
+            instr_o = {{15 {instr_i[12]}}, instr_i[6:2], instr_i[11:7], {OPCODE_LUI}};
+
+            // c.incaddr16csp -> cincoffsetimm csp, csp, nzimm (rd == x2 in CHERI mode)
+            if (CHERIoTEn & cheri_pmode_i &&  (instr_i[11:7] == 5'h02))  begin
+              instr_o = {{3 {instr_i[12]}}, instr_i[4:3], instr_i[5], instr_i[2],
+                         instr_i[6], 4'b0, 5'h02, 3'b001,  5'h02, {OPCODE_CHERI}};
+            end else if (instr_i[11:7] == 5'h02)  begin
+              // c.addi16sp -> addi x2, x2, nzimm
+              instr_o = {{3 {instr_i[12]}}, instr_i[4:3], instr_i[5], instr_i[2],
+                         instr_i[6], 4'b0, 5'h02, 3'b000, 5'h02, {OPCODE_OP_IMM}};
+            end
+
+            if ({instr_i[12], instr_i[6:2]} == 6'b0) illegal_instr_o = 1'b1;  // zero immediate
+          end
+
+          3'b100: begin
+            unique case (instr_i[11:10])
+              2'b00,
+              2'b01: begin
+                // 00: c.srli -> srli rd, rd, shamt
+                // 01: c.srai -> srai rd, rd, shamt
+                // (c.srli/c.srai hints are translated into a srli/srai hint)
+                instr_o = {1'b0, instr_i[10], 5'b0, instr_i[6:2], 2'b01, instr_i[9:7],
+                           3'b101, 2'b01, instr_i[9:7], {OPCODE_OP_IMM}};
+                if (instr_i[12] == 1'b1)  illegal_instr_o = 1'b1;
+              end
+
+              2'b10: begin
+                // c.andi -> andi rd, rd, imm
+                instr_o = {{6 {instr_i[12]}}, instr_i[12], instr_i[6:2], 2'b01, instr_i[9:7],
+                           3'b111, 2'b01, instr_i[9:7], {OPCODE_OP_IMM}};
+              end
+
+              2'b11: begin
+                unique case ({instr_i[12], instr_i[6:5]})
+                  3'b000: begin
+                    // c.sub -> sub rd', rd', rs2'
+                    instr_o = {2'b01, 5'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7],
+                               3'b000, 2'b01, instr_i[9:7], {OPCODE_OP}};
+                  end
+
+                  3'b001: begin
+                    // c.xor -> xor rd', rd', rs2'
+                    instr_o = {7'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b100,
+                               2'b01, instr_i[9:7], {OPCODE_OP}};
+                  end
+
+                  3'b010: begin
+                    // c.or  -> or  rd', rd', rs2'
+                    instr_o = {7'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b110,
+                               2'b01, instr_i[9:7], {OPCODE_OP}};
+                  end
+
+                  3'b011: begin
+                    // c.and -> and rd', rd', rs2'
+                    instr_o = {7'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b111,
+                               2'b01, instr_i[9:7], {OPCODE_OP}};
+                  end
+
+                  3'b100,
+                  3'b101,
+                  3'b110,
+                  3'b111: begin
+                    // 100: c.subw (flagged illegal here, as are 110 and 111)
+                    // 101: c.addw (flagged illegal here)
+                    illegal_instr_o = 1'b1;
+                  end
+
+                  default: begin
+                    illegal_instr_o = 1'b1;
+                  end
+                endcase
+              end
+
+              default: begin
+                illegal_instr_o = 1'b1;
+              end
+            endcase
+          end
+
+          3'b110, 3'b111: begin
+            // 110: c.beqz -> beq rs1', x0, imm
+            // 111: c.bnez -> bne rs1', x0, imm   (instr_i[13] becomes funct3[0])
+            instr_o = {{4 {instr_i[12]}}, instr_i[6:5], instr_i[2], 5'b0, 2'b01,
+                       instr_i[9:7], 2'b00, instr_i[13], instr_i[11:10], instr_i[4:3],
+                       instr_i[12], {OPCODE_BRANCH}};
+          end
+
+          default: begin
+            illegal_instr_o = 1'b1;
+          end
+        endcase
+      end
+
+      // C2
+      //
+      // Register address checks for RV32E are performed in the regular instruction decoder.
+      // If this check fails, an illegal instruction exception is triggered and the controller
+      // writes the actual faulting instruction to mtval.
+      2'b10: begin
+        unique case (instr_i[15:13])
+          3'b000: begin
+            // c.slli -> slli rd, rd, shamt
+            // (c.slli hints are translated into a slli hint)
+            instr_o = {7'b0, instr_i[6:2], instr_i[11:7], 3'b001, instr_i[11:7], {OPCODE_OP_IMM}};
+            if (instr_i[12] == 1'b1)  illegal_instr_o = 1'b1; // reserved for custom extensions
+          end
+
+          3'b010: begin
+            // c.lwsp -> lw rd, imm(x2)
+            instr_o = {4'b0, instr_i[3:2], instr_i[12], instr_i[6:4], 2'b00, 5'h02,
+                       3'b010, instr_i[11:7], OPCODE_LOAD};
+            if (instr_i[11:7] == 5'b0)  illegal_instr_o = 1'b1;  // rd == x0 is flagged illegal
+          end
+
+          3'b011: begin
+            if (CHERIoTEn & cheri_pmode_i) begin
+              // c.clcsp -> clc cd, imm(c2); reuses the c.ldsp encoding (imm scaled by 8)
+              instr_o = {3'b0, instr_i[4:2], instr_i[12], instr_i[6:5], 3'b000, 5'h02,
+                         3'b011, instr_i[11:7], OPCODE_LOAD};
+              if (instr_i[11:7] == 5'b0)  illegal_instr_o = 1'b1;  // cd == 0 is flagged illegal
+            end else begin
+              instr_o = instr_i;
+              illegal_instr_o = 1'b1;
+            end 
+          end
+
+          3'b100: begin
+            if (instr_i[12] == 1'b0) begin
+              if (instr_i[6:2] != 5'b0) begin
+                // c.mv -> add rd/rs1, x0, rs2
+                // (c.mv hints are translated into an add hint)
+                instr_o = {7'b0, instr_i[6:2], 5'b0, 3'b0, instr_i[11:7], {OPCODE_OP}};
+              end else begin
+                // c.jr -> jalr x0, rd/rs1, 0
+                instr_o = {12'b0, instr_i[11:7], 3'b0, 5'b0, {OPCODE_JALR}};
+                if (instr_i[11:7] == 5'b0) illegal_instr_o = 1'b1;
+              end
+            end else begin
+              if (instr_i[6:2] != 5'b0) begin
+                // c.add -> add rd, rd, rs2
+                // (c.add hints are translated into an add hint)
+                instr_o = {7'b0, instr_i[6:2], instr_i[11:7], 3'b0, instr_i[11:7], {OPCODE_OP}};
+              end else begin
+                if (instr_i[11:7] == 5'b0) begin
+                  // c.ebreak -> ebreak
+                  instr_o = {32'h00_10_00_73};
+                end else begin
+                  // c.jalr -> jalr x1, rs1, 0
+                  instr_o = {12'b0, instr_i[11:7], 3'b000, 5'b00001, {OPCODE_JALR}};
+                end
+              end
+            end
+          end
+
+          3'b110: begin
+            // c.swsp -> sw rs2, imm(x2)
+            instr_o = {4'b0, instr_i[8:7], instr_i[12], instr_i[6:2], 5'h02, 3'b010,
+                       instr_i[11:9], 2'b00, {OPCODE_STORE}};
+          end
+
+          3'b111: begin
+            if (CHERIoTEn & cheri_pmode_i) begin
+              // c.cscsp -> csc cs2, imm(c2); reuses the c.sdsp encoding (imm scaled by 8)
+              instr_o = {3'b0, instr_i[9:7], instr_i[12], instr_i[6:2], 5'h02, 3'b011,
+                         instr_i[11:10], 3'b000, {OPCODE_STORE}};
+            end else begin 
+              instr_o = instr_i;
+              illegal_instr_o = 1'b1;
+            end
+          end
+
+
+          3'b001,
+          3'b101: begin
+            illegal_instr_o = 1'b1;
+          end
+
+          default: begin
+            illegal_instr_o = 1'b1;
+          end
+        endcase
+      end
+
+      // Incoming instruction is not compressed: keep the defaults (instr_o = instr_i, legal).
+      2'b11:;
+
+      default: begin
+        illegal_instr_o = 1'b1;
+      end
+    endcase
+  end
+
+  assign is_compressed_o = (instr_i[1:0] != 2'b11);
+
+  ////////////////
+  // Assertions //
+  ////////////////
+
+  // The valid_i signal used to gate below assertions must be known.
+  `ASSERT_KNOWN(IbexInstrValidKnown, valid_i)
+
+  // Selectors must be known/valid.
+  `ASSERT(IbexInstrLSBsKnown, valid_i |->
+      !$isunknown(instr_i[1:0]))
+  `ASSERT(IbexC0Known1, (valid_i && (instr_i[1:0] == 2'b00)) |->
+      !$isunknown(instr_i[15:13]))
+  `ASSERT(IbexC1Known1, (valid_i && (instr_i[1:0] == 2'b01)) |->
+      !$isunknown(instr_i[15:13]))
+  `ASSERT(IbexC1Known2, (valid_i && (instr_i[1:0] == 2'b01) && (instr_i[15:13] == 3'b100)) |->
+      !$isunknown(instr_i[11:10]))
+  `ASSERT(IbexC1Known3, (valid_i &&
+      (instr_i[1:0] == 2'b01) && (instr_i[15:13] == 3'b100) && (instr_i[11:10] == 2'b11)) |->
+      !$isunknown({instr_i[12], instr_i[6:5]}))
+  `ASSERT(IbexC2Known1, (valid_i && (instr_i[1:0] == 2'b10)) |->
+      !$isunknown(instr_i[15:13]))
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_controller.sv b/hw/ip/cheriot-ibex/rtl/cheriot_controller.sv
new file mode 100644
index 0000000..6e2109e
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_controller.sv
@@ -0,0 +1,962 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Main controller of the processor
+ */
+
+`include "prim_assert.sv"
+`include "dv_fcov_macros.svh"
+
+module cheriot_controller #(
+  parameter bit CHERIoTEn       = 1'b1,
+  parameter bit WritebackStage  = 0,
+  parameter bit BranchPredictor = 0
+ ) (
+  input  logic                  clk_i,
+  input  logic                  rst_ni,
+  input  logic                  cheri_pmode_i,
+
+  output logic                  ctrl_busy_o,             // core is busy processing instrs
+
+  // decoder related signals
+  input  logic                  illegal_insn_i,          // decoder has an invalid instr
+  input  logic                  ecall_insn_i,            // decoder has ECALL instr
+  input  logic                  mret_insn_i,             // decoder has MRET instr
+  input  logic                  dret_insn_i,             // decoder has DRET instr
+  input  logic                  wfi_insn_i,              // decoder has WFI instr
+  input  logic                  ebrk_insn_i,             // decoder has EBREAK instr
+  input  logic                  csr_pipe_flush_i,        // do CSR-related pipeline flush
+  input  logic                  csr_access_i,            // decoder has CSR access instr
+  input  logic                  csr_cheri_always_ok_i,   // cheri safe-listed CSR registers
+
+  // instr from IF-ID pipeline stage
+  input  logic                  instr_valid_i,           // instr is valid
+  input  logic [31:0]           instr_i,                 // uncompressed instr data for mtval
+  input  logic [15:0]           instr_compressed_i,      // instr compressed data for mtval
+  input  logic                  instr_is_compressed_i,   // instr is compressed
+  input  logic                  instr_bp_taken_i,        // instr was predicted taken branch
+  input  logic                  instr_fetch_err_i,       // instr has error
+  input  logic                  instr_fetch_err_plus2_i, // instr error is x32
+  input  logic                  instr_fetch_cheri_acc_vio_i,         
+  input  logic                  instr_fetch_cheri_bound_vio_i,         
+
+  input  logic [31:0]           pc_id_i,                 // instr address
+
+  // to IF-ID pipeline stage
+  output logic                  instr_valid_clear_o,     // kill instr in IF-ID reg
+  output logic                  id_in_ready_o,           // ID stage is ready for new instr
+  output logic                  controller_run_o,        // Controller is in standard instruction
+                                                         // run mode
+
+  // to prefetcher
+  output logic                  instr_req_o,             // start fetching instructions
+  output logic                  pc_set_o,                // jump to address set by pc_mux
+  output cheriot_pkg::pc_sel_e     pc_mux_o,                // IF stage fetch address selector
+                                                         // (boot, normal, exception...)
+  output logic                  nt_branch_mispredict_o,  // Not-taken branch in ID/EX was
+                                                         // mispredicted (predicted taken)
+  output cheriot_pkg::exc_pc_sel_e exc_pc_mux_o,            // IF stage selector for exception PC
+  output cheriot_pkg::exc_cause_e  exc_cause_o,             // for IF stage, CSRs
+
+  // LSU
+  input  logic [31:0]           lsu_addr_last_i,         // for mtval
+  input  logic                  load_err_i,
+  input  logic                  store_err_i,
+  input  logic                  lsu_err_is_cheri_i,
+  output logic                  wb_exception_o,          // Instruction in WB taking an exception
+  output logic                  id_exception_o,          // Instruction in ID taking an exception
+  output logic                  id_exception_nc_o,       // no-cheri
+
+  // jump/branch signals
+  input  logic                  branch_set_i,            // branch set signal (branch definitely
+                                                         // taken)
+  input  logic                  branch_not_set_i,        // branch is definitely not taken
+  input  logic                  jump_set_i,              // jump taken set signal
+
+  // interrupt signals
+  input  logic                  csr_mstatus_mie_i,       // M-mode interrupt enable bit
+  input  logic                  irq_pending_i,           // interrupt request pending
+  input  cheriot_pkg::irqs_t       irqs_i,                  // interrupt requests qualified with
+                                                         // mie CSR
+  input  logic                  irq_nm_i,                // non-maskeable interrupt
+  output logic                  nmi_mode_o,              // core executing NMI handler
+
+  // debug signals
+  input  logic                  debug_req_i,
+  output cheriot_pkg::dbg_cause_e  debug_cause_o,
+  output logic                  debug_csr_save_o,
+  output logic                  debug_mode_o,
+  input  logic                  debug_single_step_i,
+  input  logic                  debug_ebreakm_i,
+  input  logic                  debug_ebreaku_i,
+  input  logic                  trigger_match_i,
+
+  output logic                  csr_save_if_o,
+  output logic                  csr_save_id_o,
+  output logic                  csr_save_wb_o,
+  output logic                  csr_restore_mret_id_o,
+  output logic                  csr_restore_dret_id_o,
+  output logic                  csr_save_cause_o,
+  output logic                  csr_mepcc_clrtag_o,
+
+  output logic [31:0]           csr_mtval_o,
+  input  cheriot_pkg::priv_lvl_e   priv_mode_i,
+  input  logic                  csr_mstatus_tw_i,
+  input  logic                  csr_pcc_perm_sr_i,
+
+  // stall & flush signals
+  input  logic                  stall_id_i,
+  input  logic                  stall_wb_i,
+  output logic                  flush_id_o,
+  input  logic                  ready_wb_i,
+
+  // performance monitors
+  output logic                  perf_jump_o,             // we are executing a jump
+                                                         // instruction (j, jr, jal, jalr)
+  output logic                  perf_tbranch_o,          // we are executing a taken branch
+                                                         // instruction
+  input  logic                  instr_is_cheri_i,        // from decoder
+  input  logic                  cheri_ex_valid_i,        // from cheri EX
+  input  logic                  cheri_ex_err_i,
+  input  logic                  cheri_wb_err_i,
+  input  logic  [11:0]          cheri_ex_err_info_i,
+  input  logic  [15:0]          cheri_wb_err_info_i,
+  input  logic                  cheri_branch_req_i,
+  input  logic [31:0]           cheri_branch_target_i
+);
+  import cheriot_pkg::*;
+
+  // FSM state encoding
+  //typedef enum logic [3:0] {
+  //  RESET, BOOT_SET, WAIT_SLEEP, SLEEP, FIRST_FETCH, DECODE, FLUSH,
+  //  IRQ_TAKEN, DBG_TAKEN_IF, DBG_TAKEN_ID
+  //} ctrl_fsm_e;
+
+  ctrl_fsm_e ctrl_fsm_cs, ctrl_fsm_ns;
+
+  logic nmi_mode_q, nmi_mode_d;
+  logic debug_mode_q, debug_mode_d;
+  logic load_err_q, load_err_d;
+  logic store_err_q, store_err_d;
+  logic lsu_err_is_cheri_q;
+  logic exc_req_q, exc_req_d, exc_req_nc, exc_req_wb;
+  logic illegal_insn_q, illegal_insn_d;
+  logic cheri_ex_err_q, cheri_ex_err_d;
+  logic cheri_wb_err_q;
+  logic cheri_asr_err_q, cheri_asr_err_d;
+
+  // Of the various exception/fault signals, which one takes priority in FLUSH and hence controls
+  // what happens next (setting exc_cause, csr_mtval etc)
+  logic instr_fetch_err_prio;
+  logic illegal_insn_prio;
+  logic ecall_insn_prio;
+  logic ebrk_insn_prio;
+  logic store_err_prio;
+  logic load_err_prio;
+  logic cheri_ex_err_prio;
+  logic cheri_wb_err_prio;
+  logic cheri_asr_err_prio;
+
+  logic stall;
+  logic halt_if;
+  logic retain_id;
+  logic flush_id;
+  logic illegal_dret;
+  logic illegal_umode;
+  logic exc_req_lsu;
+  logic special_req;
+  logic special_req_pc_change;
+  logic special_req_flush_only;
+  logic do_single_step_d;
+  logic do_single_step_q;
+  logic enter_debug_mode_prio_d;
+  logic enter_debug_mode_prio_q;
+  logic enter_debug_mode;
+  logic ebreak_into_debug;
+  logic handle_irq;
+  logic id_wb_pending;
+
+  logic [3:0] mfip_id;
+  logic       unused_irq_timer;
+
+  logic ecall_insn;
+  logic mret_insn;
+  logic dret_insn;
+  logic wfi_insn;
+  logic ebrk_insn;
+  logic csr_pipe_flush;
+  logic instr_fetch_err;
+  logic cheri_ex_err;
+  logic mret_cheri_asr_err;
+  logic csr_cheri_asr_err;
+
+`ifndef SYNTHESIS
+`ifndef DII_SIM
+  // synopsys translate_off
+  // make sure we are called later so that we do not generate messages for
+  // glitches
+  always_ff @(negedge clk_i) begin
+    // print warning in case of decoding errors
+    if ((ctrl_fsm_cs == DECODE) && instr_valid_i && !instr_fetch_err_i && !wb_exception_o && illegal_insn_d) begin
+      $display("%t: Illegal instruction (hart %0x) at PC 0x%h: 0x%h", $time, cheriot_core.hart_id_i,
+               cheriot_id_stage.pc_id_i, 
+               (instr_is_compressed_i ? instr_compressed_i : instr_i));
+      // cheriot_id_stage.instr_rdata_i);
+    end
+  end
+  // synopsys translate_on
+`endif
+`endif
+
+  ////////////////
+  // Exceptions //
+  ////////////////
+
+  assign load_err_d  = load_err_i;
+  assign store_err_d = store_err_i;
+
+  // Decoder doesn't take instr_valid into account, factor it in here.
+  assign ecall_insn      = ecall_insn_i      & instr_valid_i;
+  assign mret_insn       = mret_insn_i       & instr_valid_i;
+  assign dret_insn       = dret_insn_i       & instr_valid_i;
+  assign wfi_insn        = wfi_insn_i        & instr_valid_i;
+  assign ebrk_insn       = ebrk_insn_i       & instr_valid_i;
+  assign csr_pipe_flush  = csr_pipe_flush_i  & instr_valid_i;
+  assign instr_fetch_err = instr_fetch_err_i & instr_valid_i;
+  assign cheri_ex_err    = cheri_ex_err_i & instr_is_cheri_i & instr_valid_i;
+
+  // "Executing DRET outside of Debug Mode causes an illegal instruction exception."
+  // [Debug Spec v0.13.2, p.41]
+  assign illegal_dret = dret_insn & ~debug_mode_q;
+
+  // Some instructions can only be executed in M-Mode
+  assign illegal_umode = (priv_mode_i != PRIV_LVL_M) &
+                         // MRET must be in M-Mode. TW means trap WFI to M-Mode.
+                         (mret_insn | (csr_mstatus_tw_i & wfi_insn));
+
+  assign mret_cheri_asr_err = CHERIoTEn & cheri_pmode_i & ~csr_pcc_perm_sr_i & mret_insn;
+  assign csr_cheri_asr_err  = CHERIoTEn & cheri_pmode_i & ~csr_pcc_perm_sr_i & instr_valid_i & 
+                              csr_access_i & ~illegal_insn_i & ~csr_cheri_always_ok_i;
+
+  // This is recorded in the illegal_insn_q flop to help timing.  Specifically
+  // it is needed to break the path from cheriot_cs_registers/illegal_csr_insn_o
+  // to pc_set_o.  NOTE(review): unlike cheri_ex_err_d and exc_req_d below, this
+  // term is not gated with (ctrl_fsm_cs != FLUSH) -- confirm that is intended.
+  // All terms in this expression are qualified by instr_valid_i
+  assign illegal_insn_d = illegal_insn_i | illegal_dret | illegal_umode;
+  assign cheri_ex_err_d = cheri_pmode_i & cheri_ex_err & (ctrl_fsm_cs != FLUSH);
+
+  assign cheri_asr_err_d = (~illegal_insn_i & csr_cheri_asr_err) | mret_cheri_asr_err;
+
+  // exception requests
+  // requests are flopped in exc_req_q.  The request is forced low while the
+  // controller is in the FLUSH state, so in the following cycle exc_req_q does
+  // not remain set for an exception request that has just been handled.
+  // All terms in this expression are qualified by instr_valid_i
+  assign exc_req_d = (ecall_insn | ebrk_insn | illegal_insn_d | instr_fetch_err | (cheri_pmode_i & cheri_ex_err) | 
+                      cheri_asr_err_d) & (ctrl_fsm_cs != FLUSH);
+  assign exc_req_nc = (ecall_insn | ebrk_insn | illegal_insn_d | instr_fetch_err | cheri_asr_err_d) &
+                      (ctrl_fsm_cs != FLUSH);
+
+  // LSU exception requests
+  assign exc_req_lsu = store_err_i | load_err_i;
+  assign exc_req_wb  = exc_req_lsu | (cheri_pmode_i & cheri_wb_err_i);
+
+  assign id_exception_o = exc_req_d;
+  assign id_exception_nc_o = exc_req_nc;
+
+  // special requests: special instructions, pipeline flushes, exceptions...
+  // All terms in these expressions are qualified by instr_valid_i except exc_req_lsu which can come
+  // from the Writeback stage with no instr_valid_i from the ID stage
+
+  // These special requests only cause a pipeline flush and in particular don't cause a PC change
+  // that is outside the normal execution flow
+  assign special_req_flush_only = wfi_insn | csr_pipe_flush;
+
+  // These special requests cause a change in PC
+  assign special_req_pc_change = mret_insn | dret_insn | exc_req_d | exc_req_wb;
+
+  // generic special request signal, applies to all instructions
+  assign special_req = special_req_pc_change | special_req_flush_only;
+
+  // Is there an instruction in ID or WB that has yet to complete?
+  assign id_wb_pending = instr_valid_i | ~ready_wb_i;
+
+  // Exception/fault prioritisation is taken from Table 3.7 of Privileged Spec v1.11
+  if (WritebackStage) begin : g_wb_exceptions
+    always_comb begin
+      instr_fetch_err_prio = 0;
+      illegal_insn_prio    = 0;
+      ecall_insn_prio      = 0;
+      ebrk_insn_prio       = 0;
+      store_err_prio       = 0;
+      load_err_prio        = 0;
+      cheri_ex_err_prio    = 0;
+      cheri_wb_err_prio    = 0;
+      cheri_asr_err_prio   = 0;
+
+      // Note that with the writeback stage store/load errors occur on the instruction in writeback,
+      // all other exception/faults occur on the instruction in ID/EX. The faults from writeback
+      // must take priority as that instruction is architecturally ordered before the one in ID/EX.
+      if (store_err_q) begin
+        store_err_prio = 1'b1;
+      end else if (load_err_q) begin
+        load_err_prio  = 1'b1;
+      end else if (cheri_pmode_i & cheri_wb_err_q) begin
+        cheri_wb_err_prio  = 1'b1;
+      end else if (instr_fetch_err) begin
+        instr_fetch_err_prio = 1'b1;
+      end else if (illegal_insn_q) begin
+        illegal_insn_prio = 1'b1;
+      end else if (ecall_insn) begin
+        ecall_insn_prio = 1'b1;
+      end else if (ebrk_insn) begin
+        ebrk_insn_prio = 1'b1;
+      end else if (cheri_pmode_i & cheri_ex_err_q) begin
+        cheri_ex_err_prio = 1'b1;
+      end else if (cheri_asr_err_q) begin
+        cheri_asr_err_prio = 1'b1;
+      end
+    end
+
+    // Instruction in writeback is generating an exception so instruction in ID must not execute
+    assign wb_exception_o = load_err_q | store_err_q | load_err_i | store_err_i | (cheri_pmode_i & cheri_wb_err_i);
+  end else begin : g_no_wb_exceptions
+    always_comb begin
+      instr_fetch_err_prio = 0;
+      illegal_insn_prio    = 0;
+      ecall_insn_prio      = 0;
+      ebrk_insn_prio       = 0;
+      store_err_prio       = 0;
+      load_err_prio        = 0;
+      cheri_wb_err_prio    = 0;
+      cheri_ex_err_prio    = 0;
+      cheri_asr_err_prio   = 0;
+
+      if (instr_fetch_err) begin
+        instr_fetch_err_prio = 1'b1;
+      end else if (illegal_insn_q) begin
+        illegal_insn_prio = 1'b1;
+      end else if (ecall_insn) begin
+        ecall_insn_prio = 1'b1;
+      end else if (ebrk_insn) begin
+        ebrk_insn_prio = 1'b1;
+      end else if (cheri_ex_err_q) begin
+        cheri_ex_err_prio  = 1'b1;
+      end else if (store_err_q) begin
+        store_err_prio = 1'b1;
+      end else if (load_err_q) begin
+        load_err_prio  = 1'b1;
+      end else if (cheri_wb_err_q) begin
+        cheri_wb_err_prio  = 1'b1;
+      end else if (cheri_asr_err_q) begin
+        cheri_asr_err_prio = 1'b1;
+      end
+    end
+    assign wb_exception_o = 1'b0;  // WritebackStage == 0: constant in this configuration
+  end
+
+  `ASSERT_IF(IbexExceptionPrioOnehot,
+             $onehot({instr_fetch_err_prio,
+                      illegal_insn_prio,
+                      ecall_insn_prio,
+                      ebrk_insn_prio,
+                      store_err_prio,
+                      load_err_prio,
+                      cheri_wb_err_prio,
+                      cheri_ex_err_prio,
+                      cheri_asr_err_prio}),
+             (ctrl_fsm_cs == FLUSH) & csr_save_cause_o)
+
+  ////////////////
+  // Interrupts //
+  ////////////////
+
+  // Debug mode can be entered for three reasons: an external debug_req_i,
+  // single stepping (dcsr.step == 1) or a trigger match (hardware breakpoint).
+  // Single stepping has to be qualified with a valid instruction: right after
+  // an IF flush, or while an instruction is still returning from memory, there
+  // is nothing to single step yet and the core must not enter debug mode
+  // immediately.
+
+  // `do_single_step_q` records, at each valid instruction, whether the core is in
+  // single step mode outside of debug mode, so the first valid instruction after
+  // debug mode entry clears it. With no valid instruction the flopped value is
+  // held, which keeps `do_single_step_d` asserted until debug mode is entered.
+  assign do_single_step_d = instr_valid_i ? (debug_single_step_i & ~debug_mode_q) :
+                                            do_single_step_q;
+  // Requests to enter debug mode:
+  // * `debug_req_i` and `do_single_step_d` are priority requests: debug mode
+  //   is entered even when the controller is heading to `FLUSH` to deal with
+  //   an exception or interrupt.
+  // * `trigger_match_i` is deliberately not a priority request, because it
+  //   has to be ignored when a control flow change means the triggering
+  //   instruction is no longer being executed.
+  // None of them has any effect while the core is already in debug mode.
+  //
+  assign enter_debug_mode_prio_d = ~debug_mode_q & (do_single_step_d | debug_req_i);
+  assign enter_debug_mode = enter_debug_mode_prio_d | (~debug_mode_q & trigger_match_i);
+
+  // Whether an EBREAK enters debug mode instead of jumping to the exception
+  // handler; selected by the current privilege level.
+  assign ebreak_into_debug = (priv_mode_i == PRIV_LVL_M) ? debug_ebreakm_i :
+                             (priv_mode_i == PRIV_LVL_U) ? debug_ebreaku_i :
+                                                           1'b0;
+
+  // Interrupts (NMI included) are not taken
+  // - in debug mode [Debug Spec v0.13.2, p.39],
+  // - in NMI mode (nested NMIs are not supported; the NMI has highest priority
+  //   and regular interrupts cannot interrupt it).
+  assign handle_irq = ~(debug_mode_q | nmi_mode_q) &
+      (irq_nm_i | (csr_mstatus_mie_i & irq_pending_i));
+
+  // Fast interrupt ID generation: the lowest-numbered pending fast interrupt
+  // has the highest priority; the ID is 0 when none is pending.
+  always_comb begin : gen_mfip_id
+    mfip_id = 4'd0;
+
+    for (int idx = 14; idx >= 0; idx--) begin  // descending: lowest set bit wins
+      if (irqs_i.irq_fast[idx]) mfip_id = idx[3:0];
+    end
+  end
+
+  // IRQ_TAKEN treats the timer interrupt as its fall-through case and does not read irq_timer
+  assign unused_irq_timer = irqs_i.irq_timer;
+
+  /////////////////////
+  // Core controller //
+  /////////////////////
+
+  always_comb begin
+    // Default values; the active FSM state below overrides only what it needs
+    instr_req_o           = 1'b1;
+
+    csr_save_if_o         = 1'b0;
+    csr_save_id_o         = 1'b0;
+    csr_save_wb_o         = 1'b0;
+    csr_restore_mret_id_o = 1'b0;
+    csr_restore_dret_id_o = 1'b0;
+    csr_save_cause_o      = 1'b0;
+    csr_mepcc_clrtag_o    = 1'b0;
+    csr_mtval_o           = '0;
+
+    // The values of pc_mux and exc_pc_mux are only relevant if pc_set is set. Some of the states
+    // below always set pc_mux and exc_pc_mux but only set pc_set if certain conditions are met.
+    // This avoids having to factor those conditions into the pc_mux and exc_pc_mux select signals
+    // helping timing.
+    pc_mux_o               = PC_BOOT;
+    pc_set_o               = 1'b0;
+    nt_branch_mispredict_o = 1'b0;
+
+    exc_pc_mux_o           = EXC_PC_IRQ;
+    exc_cause_o            = EXC_CAUSE_INSN_ADDR_MISA; // = 6'h00
+
+    ctrl_fsm_ns            = ctrl_fsm_cs;
+
+    ctrl_busy_o            = 1'b1;
+
+    halt_if                = 1'b0;
+    retain_id              = 1'b0;
+    flush_id               = 1'b0;
+
+    debug_csr_save_o       = 1'b0;
+    debug_cause_o          = DBG_CAUSE_EBREAK;
+    debug_mode_d           = debug_mode_q;
+    nmi_mode_d             = nmi_mode_q;
+
+    perf_tbranch_o         = 1'b0;
+    perf_jump_o            = 1'b0;
+
+    controller_run_o       = 1'b0;
+
+    unique case (ctrl_fsm_cs)
+      RESET: begin
+        instr_req_o   = 1'b0;
+        pc_mux_o      = PC_BOOT;
+        pc_set_o      = 1'b1;
+        ctrl_fsm_ns   = BOOT_SET;
+      end
+
+      BOOT_SET: begin
+        // copy boot address to instr fetch address
+        instr_req_o   = 1'b1;
+        pc_mux_o      = PC_BOOT;
+        pc_set_o      = 1'b1;
+
+        ctrl_fsm_ns = FIRST_FETCH;
+      end
+
+      WAIT_SLEEP: begin
+        ctrl_busy_o   = 1'b0;
+        instr_req_o   = 1'b0;
+        halt_if       = 1'b1;
+        flush_id      = 1'b1;
+        ctrl_fsm_ns   = SLEEP;
+      end
+
+      SLEEP: begin
+        // instruction in IF stage is already valid
+        // we begin execution when an interrupt has arrived
+        instr_req_o   = 1'b0;
+        halt_if       = 1'b1;
+        flush_id      = 1'b1;
+
+        // normal execution flow
+        // in debug mode or single step mode we leave immediately (wfi=nop)
+        if (irq_nm_i || irq_pending_i || debug_req_i || debug_mode_q || debug_single_step_i) begin
+          ctrl_fsm_ns = FIRST_FETCH;
+        end else begin
+          // Make sure clock remains disabled.
+          ctrl_busy_o = 1'b0;
+        end
+      end
+
+      FIRST_FETCH: begin
+        // Stall because of IF miss
+        if (id_in_ready_o) begin
+          ctrl_fsm_ns = DECODE;
+        end
+
+        // handle interrupts
+        if (handle_irq) begin
+          // We are handling an interrupt. Set halt_if to tell IF not to give
+          // us any more instructions before it redirects to the handler, but
+          // don't set flush_id: we must allow this instruction to complete
+          // (since it might have outstanding loads or stores).
+          ctrl_fsm_ns = IRQ_TAKEN;
+          halt_if     = 1'b1;
+        end
+
+        // enter debug mode
+        if (enter_debug_mode) begin
+          ctrl_fsm_ns = DBG_TAKEN_IF;
+          // Halt IF only for now, ID will be flushed in DBG_TAKEN_IF as the
+          // ID state is needed for correct debug mode entry
+          halt_if     = 1'b1;
+        end
+      end
+
+      DECODE: begin
+        // normal operating mode of the ID stage, in case of debug and interrupt requests,
+        // priorities are as follows (lower number == higher priority)
+        // 1. currently running (multicycle) instructions and exceptions caused by these
+        // 2. debug requests
+        // 3. interrupt requests
+
+        controller_run_o = 1'b1;
+
+        // Set PC mux for branch and jump here to ease timing. Value is only relevant if pc_set_o is
+        // also set. Setting the mux value here avoids factoring in special_req and instr_valid_i
+        // which helps timing.
+        pc_mux_o = PC_JUMP;
+
+
+        // Get ready for special instructions, exceptions, pipeline flushes
+        if (special_req) begin
+          // Halt IF but don't flush ID. This leaves a valid instruction in
+          // ID so controller can determine appropriate action in the
+          // FLUSH state.
+          retain_id = 1'b1;
+
+          // Wait for the writeback stage to either be ready for a new instruction or raise its own
+          // exception before going to FLUSH. If the instruction in writeback raises an exception it
+          // must take priority over any exception from an instruction in ID/EX. Only once the
+          // writeback stage is ready can we be certain that won't happen. Without a writeback
+          // stage ready_wb_i == 1 so the FSM will always go directly to FLUSH.
+
+          if (ready_wb_i | wb_exception_o) begin
+            ctrl_fsm_ns = FLUSH;
+          end
+        end
+
+        if (branch_set_i || jump_set_i || (cheri_pmode_i & cheri_branch_req_i)) begin
+          // Only set the PC if the branch predictor hasn't already done the branch for us
+          pc_set_o       = BranchPredictor ? ~instr_bp_taken_i : 1'b1;
+
+          perf_tbranch_o = branch_set_i;
+          perf_jump_o    = jump_set_i;
+        end
+
+        if (BranchPredictor) begin
+          if (instr_bp_taken_i & branch_not_set_i) begin
+            // If the instruction is a branch that was predicted to be taken but was not taken
+            // signal a mispredict.
+            nt_branch_mispredict_o = 1'b1;
+          end
+        end
+
+        // If entering debug mode or handling an IRQ the core needs to wait until any instruction in
+        // ID or WB has finished executing. Stall IF during that time.
+        if ((enter_debug_mode || handle_irq) && (stall || id_wb_pending)) begin
+          halt_if = 1'b1;
+        end
+
+        if (!stall && !special_req && !id_wb_pending) begin
+          if (enter_debug_mode) begin
+            // enter debug mode
+            ctrl_fsm_ns = DBG_TAKEN_IF;
+            // Halt IF only for now, ID will be flushed in DBG_TAKEN_IF as the
+            // ID state is needed for correct debug mode entry
+            halt_if     = 1'b1;
+          end else if (handle_irq) begin
+            // handle interrupt (not in debug mode)
+            ctrl_fsm_ns = IRQ_TAKEN;
+            // We are handling an interrupt (not in debug mode). Set halt_if to
+            // tell IF not to give us any more instructions before it redirects
+            // to the handler, but don't set flush_id: we must allow this
+            // instruction to complete (since it might have outstanding loads
+            // or stores).
+            halt_if     = 1'b1;
+          end
+        end
+
+      end // DECODE
+
+      IRQ_TAKEN: begin
+        pc_mux_o     = PC_EXC;
+        exc_pc_mux_o = EXC_PC_IRQ;
+
+        if (handle_irq) begin
+          pc_set_o         = 1'b1;
+
+          csr_save_if_o    = 1'b1;
+          csr_save_cause_o = 1'b1;
+
+          // interrupt priorities according to Privileged Spec v1.11 p.31
+          if (irq_nm_i && !nmi_mode_q) begin
+            exc_cause_o = EXC_CAUSE_IRQ_NM;
+            nmi_mode_d  = 1'b1; // enter NMI mode
+          end else if (irqs_i.irq_fast != 15'b0) begin
+            // generate exception cause ID from fast interrupt ID:
+            // - first bit distinguishes interrupts from exceptions,
+            // - second bit adds 16 to fast interrupt ID
+            // for example EXC_CAUSE_IRQ_FAST_0 = {1'b1, 5'd16}
+            exc_cause_o = exc_cause_e'({2'b11, mfip_id});
+          end else if (irqs_i.irq_external) begin
+            exc_cause_o = EXC_CAUSE_IRQ_EXTERNAL_M;
+          end else if (irqs_i.irq_software) begin
+            exc_cause_o = EXC_CAUSE_IRQ_SOFTWARE_M;
+          end else begin // irqs_i.irq_timer
+            exc_cause_o = EXC_CAUSE_IRQ_TIMER_M;
+          end
+        end
+
+        ctrl_fsm_ns = DECODE;
+      end
+
+      DBG_TAKEN_IF: begin
+        pc_mux_o     = PC_EXC;
+        exc_pc_mux_o = EXC_PC_DBD;
+
+        // enter debug mode and save PC in IF to dpc
+        // jump to debug exception handler in debug memory
+        flush_id         = 1'b1;
+        pc_set_o         = 1'b1;
+
+        csr_save_if_o    = 1'b1;
+        debug_csr_save_o = 1'b1;
+
+        csr_save_cause_o = 1'b1;
+        if (trigger_match_i) begin
+          debug_cause_o = DBG_CAUSE_TRIGGER;
+        end else if (debug_single_step_i) begin
+          debug_cause_o = DBG_CAUSE_STEP;
+        end else begin
+          debug_cause_o = DBG_CAUSE_HALTREQ;
+        end
+
+        // enter debug mode
+        debug_mode_d = 1'b1;
+
+        ctrl_fsm_ns  = DECODE;
+      end
+
+      DBG_TAKEN_ID: begin
+        // enter debug mode and save PC in ID to dpc, used when encountering
+        // 1. EBREAK during debug mode
+        // 2. EBREAK with forced entry into debug mode (ebreakm or ebreaku set).
+        // regular ebreaks go through FLUSH.
+        //
+        // for 1. do not update dcsr and dpc, for 2. do so [Debug Spec v0.13.2, p.39]
+        // jump to debug exception handler in debug memory
+        flush_id      = 1'b1;
+        pc_mux_o      = PC_EXC;
+        pc_set_o      = 1'b1;
+        exc_pc_mux_o  = EXC_PC_DBD;
+
+        // update dcsr and dpc
+        if (ebreak_into_debug && !debug_mode_q) begin // ebreak with forced entry
+
+          // dpc (set to the address of the EBREAK, i.e. set to PC in ID stage)
+          csr_save_cause_o = 1'b1;
+          csr_save_id_o    = 1'b1;
+
+          // dcsr
+          debug_csr_save_o = 1'b1;
+          debug_cause_o    = DBG_CAUSE_EBREAK;
+        end
+
+        // enter debug mode
+        debug_mode_d = 1'b1;
+
+        ctrl_fsm_ns  = DECODE;
+      end
+
+      FLUSH: begin
+        // flush the pipeline
+        halt_if     = 1'b1;
+        flush_id    = 1'b1;
+        ctrl_fsm_ns = DECODE;
+
+        // As pc_mux and exc_pc_mux can take various values in this state they aren't set early
+        // here.
+
+        // exceptions: set exception PC, save PC and exception cause
+        // exc_req_lsu is high for one clock cycle only (in DECODE)
+        if (exc_req_q || store_err_q || load_err_q || (cheri_pmode_i & cheri_wb_err_q)) begin
+          pc_set_o         = 1'b1;
+          pc_mux_o         = PC_EXC;
+          exc_pc_mux_o     = debug_mode_q ? EXC_PC_DBG_EXC : EXC_PC_EXC;
+
+          if (WritebackStage) begin : g_writeback_mepc_save
+            // With the writeback stage present whether an instruction accessing memory will cause
+            // an exception is only known when it is in writeback. So when taking such an exception
+            // epc must come from writeback.
+            csr_save_id_o  = ~(store_err_q | load_err_q | (cheri_pmode_i & cheri_wb_err_q));
+            csr_save_wb_o  = store_err_q | load_err_q | (cheri_pmode_i & cheri_wb_err_q);
+          end else begin : g_no_writeback_mepc_save
+            csr_save_id_o  = 1'b0;
+          end
+
+          csr_save_cause_o = 1'b1;
+
+          // Exception/fault prioritisation logic will have set exactly 1 X_prio signal
+          unique case (1'b1)
+            instr_fetch_err_prio: begin
+              if (instr_fetch_cheri_acc_vio_i) begin  // tag violation
+                exc_cause_o = EXC_CAUSE_CHERI_FAULT;
+                csr_mtval_o = {21'h0, 1'b1, 5'h0, 5'h2};   // s=1, cap_idx=0
+              end else if (instr_fetch_cheri_bound_vio_i) begin  // bound violation
+                exc_cause_o = EXC_CAUSE_CHERI_FAULT;
+                csr_mtval_o = {21'h0, 1'b1, 5'h0, 5'h1};   // s=1, cap_idx=0
+                csr_mepcc_clrtag_o = 1'b1;
+              end else begin                            // ext memory error
+                exc_cause_o = EXC_CAUSE_INSTR_ACCESS_FAULT;
+                csr_mtval_o = instr_fetch_err_plus2_i ? (pc_id_i + 32'd2) : pc_id_i;
+              end
+            end
+            illegal_insn_prio: begin
+              exc_cause_o = EXC_CAUSE_ILLEGAL_INSN;
+              csr_mtval_o = (CHERIoTEn & cheri_pmode_i) ? 32'h0 : 
+                            (instr_is_compressed_i ? {16'b0, instr_compressed_i} : instr_i);
+            end
+            ecall_insn_prio: begin
+              exc_cause_o = (priv_mode_i == PRIV_LVL_M) ? EXC_CAUSE_ECALL_MMODE :
+                                                          EXC_CAUSE_ECALL_UMODE;
+            end
+            ebrk_insn_prio: begin
+              if (debug_mode_q | ebreak_into_debug) begin
+                /*
+                 * EBREAK in debug mode re-enters debug mode
+                 *
+                 * "The only exception is EBREAK. When that is executed in Debug
+                 * Mode, it halts the hart again but without updating dpc or
+                 * dcsr." [Debug Spec v0.13.2, p.39]
+                 */
+
+                /*
+                 * dcsr.ebreakm == 1:
+                 * "EBREAK instructions in M-mode enter Debug Mode."
+                 * [Debug Spec v0.13.2, p.42]
+                 */
+                pc_set_o         = 1'b0;
+                csr_save_id_o    = 1'b0;
+                csr_save_cause_o = 1'b0;
+                ctrl_fsm_ns      = DBG_TAKEN_ID;
+                flush_id         = 1'b0;
+              end else begin
+                /*
+                 * "The EBREAK instruction is used by debuggers to cause control
+                 * to be transferred back to a debugging environment. It
+                 * generates a breakpoint exception and performs no other
+                 * operation. [...] ECALL and EBREAK cause the receiving
+                 * privilege mode's epc register to be set to the address of the
+                 * ECALL or EBREAK instruction itself, not the address of the
+                 * following instruction." [Privileged Spec v1.11, p.40]
+                 */
+                exc_cause_o      = EXC_CAUSE_BREAKPOINT;
+                if (CHERIoTEn && cheri_pmode_i) csr_mtval_o = pc_id_i;     // kliu added to match sail
+              end
+            end
+            store_err_prio: begin
+              if (cheri_pmode_i & lsu_err_is_cheri_q) begin
+                if (cheri_wb_err_info_i[11]) begin
+                  exc_cause_o = EXC_CAUSE_STORE_ADDR_MISALIGN;
+                  csr_mtval_o = lsu_addr_last_i;
+                end else begin
+                  exc_cause_o = EXC_CAUSE_CHERI_FAULT; 
+                  csr_mtval_o = {21'h0, cheri_wb_err_info_i[10:0]};
+                end
+              end else begin
+                exc_cause_o = EXC_CAUSE_STORE_ACCESS_FAULT;
+                csr_mtval_o = lsu_addr_last_i;
+              end
+            end
+            load_err_prio: begin
+              if (cheri_pmode_i & lsu_err_is_cheri_q) begin
+                if (cheri_wb_err_info_i[11]) begin
+                  exc_cause_o = EXC_CAUSE_LOAD_ADDR_MISALIGN;
+                  csr_mtval_o = lsu_addr_last_i;
+                end else begin
+                  exc_cause_o = EXC_CAUSE_CHERI_FAULT;
+                  csr_mtval_o = {21'h0, cheri_wb_err_info_i[10:0]};
+                end
+              end else begin
+                exc_cause_o = EXC_CAUSE_LOAD_ACCESS_FAULT;
+                csr_mtval_o = lsu_addr_last_i;
+              end
+            end
+            cheri_ex_err_prio: begin
+              if (cheri_pmode_i) begin
+                exc_cause_o = EXC_CAUSE_CHERI_FAULT;
+                csr_mtval_o = {21'h0, cheri_ex_err_info_i[10:0]};
+              end
+            end
+            cheri_wb_err_prio: begin
+              if (cheri_pmode_i) begin
+                if (cheri_wb_err_info_i[12]) begin  // illegal SCR addr 
+                  exc_cause_o = EXC_CAUSE_ILLEGAL_INSN;
+                  csr_mtval_o = {21'h0, cheri_wb_err_info_i[10:0]};
+                end else begin
+                  exc_cause_o = EXC_CAUSE_CHERI_FAULT;
+                  csr_mtval_o = {21'h0, cheri_wb_err_info_i[10:0]};
+                end
+              end
+            end
+            cheri_asr_err_prio: begin
+              exc_cause_o = EXC_CAUSE_CHERI_FAULT;
+              //csr_mtval_o = instr_is_compressed_i ? {16'b0, instr_compressed_i} : instr_i;
+              csr_mtval_o = {21'b0, 1'b1, 5'h0, 5'h18};  // S=1, cap_idx=0 (pcc), err=0x18
+            end
+
+            default: ;
+          endcase
+        end else begin
+          // special instructions and pipeline flushes
+          if (mret_insn) begin
+            pc_mux_o              = PC_ERET;
+            pc_set_o              = 1'b1;
+            csr_restore_mret_id_o = 1'b1;
+            if (nmi_mode_q) begin
+              nmi_mode_d          = 1'b0; // exit NMI mode
+            end
+          end else if (dret_insn) begin
+            pc_mux_o              = PC_DRET;
+            pc_set_o              = 1'b1;
+            debug_mode_d          = 1'b0;
+            csr_restore_dret_id_o = 1'b1;
+          end else if (wfi_insn) begin
+            ctrl_fsm_ns           = WAIT_SLEEP;
+          end else if (csr_pipe_flush && handle_irq) begin
+            // start handling IRQs when doing CSR-related pipeline flushes
+            ctrl_fsm_ns           = IRQ_TAKEN;
+          end
+        end // exc_req_q
+
+        // Entering debug mode due to either single step or debug_req. Ensure
+        // registers are set for exception but then enter debug handler rather
+        // than exception handler [Debug Spec v0.13.2, p.44]
+        // Leave all other signals as is to ensure CSRs and PC get set as if
+        // core was entering exception handler, entry to debug mode will then
+        // see the appropriate state and setup dpc correctly.
+        // If an EBREAK instruction is causing us to enter debug mode on the
+        // same cycle as a debug_req or single step, honor the EBREAK and
+        // proceed to DBG_TAKEN_ID.
+        if (enter_debug_mode_prio_q && !(ebrk_insn_prio && ebreak_into_debug)) begin
+          ctrl_fsm_ns = DBG_TAKEN_IF;
+        end
+      end // FLUSH
+
+      default: begin
+        instr_req_o = 1'b0;
+        ctrl_fsm_ns = RESET;
+      end
+    endcase
+  end
+
+  assign flush_id_o = flush_id;
+
+  // tell the CSRs whether the core is in debug mode
+  assign debug_mode_o = debug_mode_q;
+
+  // tell the CSRs whether an NMI handler is running (for nested exception handling)
+  assign nmi_mode_o = nmi_mode_q;
+
+  ///////////////////
+  // Stall control //
+  ///////////////////
+
+  // The instruction in ID cannot complete this cycle when it either needs more
+  // cycles (stall_id_i) or the writeback stage cannot take it yet (stall_wb_i).
+  // stall_wb_i is a constant 0 when there is no writeback stage.
+  assign stall = stall_wb_i | stall_id_i;
+
+  // ID is ready for the next instruction from IF only when nothing holds it back
+  assign id_in_ready_o = ~(stall | halt_if | retain_id);
+
+  // Clear the valid bit of the IF-ID pipeline register once its instruction is
+  // done, or unconditionally on a flush (e.g. a multicycle instruction raising
+  // an exception). retain_id acts as another kind of stall that keeps the valid
+  // bit set unless flush_id is set too. It is kept separate from stall because
+  // factoring it in directly causes a combinational loop.
+  assign instr_valid_clear_o = flush_id | ~(stall | retain_id);
+
+  // State registers: cleared on the asynchronous active-low reset, otherwise loaded every cycle
+  always_ff @(posedge clk_i or negedge rst_ni) begin : update_regs
+    if (!rst_ni) begin
+      ctrl_fsm_cs             <= RESET;
+      nmi_mode_q              <= 1'b0;
+      do_single_step_q        <= 1'b0;
+      debug_mode_q            <= 1'b0;
+      enter_debug_mode_prio_q <= 1'b0;
+      load_err_q              <= 1'b0;
+      store_err_q             <= 1'b0;
+      lsu_err_is_cheri_q      <= 1'b0;
+      exc_req_q               <= 1'b0;
+      illegal_insn_q          <= 1'b0;
+      cheri_ex_err_q          <= 1'b0;
+      cheri_wb_err_q          <= 1'b0;
+      cheri_asr_err_q         <= 1'b0;
+    end else begin
+      ctrl_fsm_cs             <= ctrl_fsm_ns;
+      nmi_mode_q              <= nmi_mode_d;
+      do_single_step_q        <= do_single_step_d;
+      debug_mode_q            <= debug_mode_d;
+      enter_debug_mode_prio_q <= enter_debug_mode_prio_d;
+      load_err_q              <= load_err_d;
+      store_err_q             <= store_err_d;
+      lsu_err_is_cheri_q      <= lsu_err_is_cheri_i;  // registered straight from the input
+      exc_req_q               <= exc_req_d;
+      illegal_insn_q          <= illegal_insn_d;
+      cheri_ex_err_q          <= cheri_ex_err_d;
+      cheri_wb_err_q          <= cheri_wb_err_i;      // registered straight from the input
+      cheri_asr_err_q         <= cheri_asr_err_d;
+    end
+  end
+
+  `ifdef RVFI
+    // Computing this directly in cheriot_core through a hierarchical reference
+    // triggers an internal Verilator error, so it is derived locally instead.
+    logic rvfi_flush_next;
+
+    assign rvfi_flush_next = (ctrl_fsm_ns == FLUSH);
+  `endif
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_core.sv b/hw/ip/cheriot-ibex/rtl/cheriot_core.sv
new file mode 100644
index 0000000..7069082
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_core.sv
@@ -0,0 +1,2255 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+`ifdef RISCV_FORMAL
+  `define RVFI
+`endif
+
+`include "prim_assert.sv"
+
+/**
+ * Top level module of the ibex RISC-V core
+ */
+
+//import cheri_pkg::*;
+
+module cheriot_core import cheriot_pkg::*; import cheri_pkg::*; #(
+  parameter bit          PMPEnable         = 1'b0,
+  parameter int unsigned PMPGranularity    = 0,
+  parameter int unsigned PMPNumRegions     = 4,
+  parameter int unsigned MHPMCounterNum    = 0,
+  parameter int unsigned MHPMCounterWidth  = 40,
+  parameter bit          RV32E             = 1'b0,
+  parameter rv32m_e      RV32M             = RV32MFast,
+  parameter rv32b_e      RV32B             = RV32BNone,
+  parameter bit          BranchTargetALU   = 1'b0,
+  parameter bit          WritebackStage    = 1'b0,
+  parameter bit          ICache            = 1'b0,
+  parameter bit          ICacheECC         = 1'b0,
+  parameter int unsigned BusSizeECC        = BUS_SIZE,
+  parameter int unsigned TagSizeECC        = IC_TAG_SIZE,
+  parameter int unsigned LineSizeECC       = IC_LINE_SIZE,
+  parameter bit          BranchPredictor   = 1'b0,
+  parameter bit          DbgTriggerEn      = 1'b0,
+  parameter int unsigned DbgHwBreakNum     = 1,
+  parameter bit          ResetAll          = 1'b0,
+  parameter lfsr_seed_t  RndCnstLfsrSeed   = RndCnstLfsrSeedDefault,
+  parameter lfsr_perm_t  RndCnstLfsrPerm   = RndCnstLfsrPermDefault,
+  parameter bit          SecureIbex        = 1'b0,
+  parameter bit          DummyInstructions = 1'b0,
+  parameter bit          RegFileECC        = 1'b0,
+  parameter int unsigned RegFileDataWidth  = 32,
+  parameter int unsigned DmHaltAddr        = 32'h1A110800,
+  parameter int unsigned DmExceptionAddr   = 32'h1A110808,
+  // CHERIoT parameters
+  parameter bit          CHERIoTEn         = 1'b1,
+  parameter int unsigned DataWidth         = 33,
+  parameter int unsigned HeapBase          = 32'h2001_0000,
+  parameter int unsigned TSMapBase         = 32'h2002_f000,
+  parameter int unsigned TSMapSize         = 1024,
+  parameter bit          MemCapFmt         = 1'b0,
+  parameter bit          CheriPPLBC        = 1'b1,
+  parameter bit          CheriSBND2        = 1'b0,
+  parameter bit          CheriTBRE         = 1'b1,
+  parameter bit          CheriStkZ         = 1'b1,
+  parameter int unsigned MMRegDinW         = 128,
+  parameter int unsigned MMRegDoutW        = 64,
+  parameter bit          CheriCapIT8       = 1'b0
+) (
+  // Clock and Reset
+  input  logic                         clk_i,
+  input  logic                         rst_ni,
+
+  input  logic [31:0]                  hart_id_i,
+  input  logic [31:0]                  boot_addr_i,
+  input  logic                         cheri_pmode_i,
+  input  logic                         cheri_tsafe_en_i,
+
+  // Instruction memory interface
+  output logic                         instr_req_o,
+  input  logic                         instr_gnt_i,
+  input  logic                         instr_rvalid_i,
+  output logic [31:0]                  instr_addr_o,
+  input  logic [31:0]                  instr_rdata_i,
+  input  logic                         instr_err_i,
+
+  // Data memory interface
+  output logic                         data_req_o,
+  output logic                         data_is_cap_o,
+  input  logic                         data_gnt_i,
+  input  logic                         data_rvalid_i,
+  output logic                         data_we_o,
+  output logic [3:0]                   data_be_o,
+  output logic [31:0]                  data_addr_o,
+  output logic [DataWidth-1:0]         data_wdata_o,
+  input  logic [DataWidth-1:0]         data_rdata_i,
+  input  logic                         data_err_i,
+
+  // Register file interface
+  output logic                         dummy_instr_id_o,
+  output logic [4:0]                   rf_raddr_a_o,
+  output logic [4:0]                   rf_raddr_b_o,
+  output logic [4:0]                   rf_waddr_wb_o,
+  output logic                         rf_we_wb_o,
+  output logic [RegFileDataWidth-1:0]  rf_wdata_wb_ecc_o,
+  output reg_cap_t                     rf_wcap_wb_o,
+  input  logic [RegFileDataWidth-1:0]  rf_rdata_a_ecc_i,
+  input  logic [RegFileDataWidth-1:0]  rf_rdata_b_ecc_i,
+  input  reg_cap_t                     rf_rcap_a_i,
+  input  reg_cap_t                     rf_rcap_b_i,
+  input  logic [31:0]                  rf_reg_rdy_i,
+
+  output logic                         rf_trsv_en_o,
+  output logic [4:0]                   rf_trsv_addr_o,
+  output logic [6:0]                   rf_trsv_par_o,
+  output logic [4:0]                   rf_trvk_addr_o,
+  output logic                         rf_trvk_en_o,
+  output logic                         rf_trvk_clrtag_o,
+  output logic [6:0]                   rf_trvk_par_o,
+  output logic                         tsmap_cs_o,
+  output logic [15:0]                  tsmap_addr_o,
+  input  logic [31:0]                  tsmap_rdata_i,
+  input  logic [MMRegDinW-1:0]         mmreg_corein_i,
+  output logic [MMRegDoutW-1:0]        mmreg_coreout_o,
+  output logic                         cheri_fatal_err_o,
+
+  // RAMs interface
+  output logic [IC_NUM_WAYS-1:0]       ic_tag_req_o,
+  output logic                         ic_tag_write_o,
+  output logic [IC_INDEX_W-1:0]        ic_tag_addr_o,
+  output logic [TagSizeECC-1:0]        ic_tag_wdata_o,
+  input  logic [TagSizeECC-1:0]        ic_tag_rdata_i [IC_NUM_WAYS],
+  output logic [IC_NUM_WAYS-1:0]       ic_data_req_o,
+  output logic                         ic_data_write_o,
+  output logic [IC_INDEX_W-1:0]        ic_data_addr_o,
+  output logic [LineSizeECC-1:0]       ic_data_wdata_o,
+  input  logic [LineSizeECC-1:0]       ic_data_rdata_i [IC_NUM_WAYS],
+  input  logic                         ic_scr_key_valid_i,
+
+  // Interrupt inputs
+  input  logic                         irq_software_i,
+  input  logic                         irq_timer_i,
+  input  logic                         irq_external_i,
+  input  logic [14:0]                  irq_fast_i,
+  input  logic                         irq_nm_i,       // non-maskable interrupt
+  output logic                         irq_pending_o,
+
+  // Debug Interface
+  input  logic                         debug_req_i,
+  output crash_dump_t                  crash_dump_o,
+  output logic                         double_fault_seen_o,
+
+  // RISC-V Formal Interface
+  // Does not comply with the coding standards of _i/_o suffixes, but follows
+  // the convention of RISC-V Formal Interface Specification.
+`ifdef RVFI
+  output logic                         rvfi_valid,
+  output logic [63:0]                  rvfi_order,
+  output logic [31:0]                  rvfi_insn,
+  output logic                         rvfi_trap,
+  output logic                         rvfi_halt,
+  output logic                         rvfi_intr,
+  output logic [ 1:0]                  rvfi_mode,
+  output logic [ 1:0]                  rvfi_ixl,
+  output logic [ 4:0]                  rvfi_rs1_addr,
+  output logic [ 4:0]                  rvfi_rs2_addr,
+  output logic [ 4:0]                  rvfi_rs3_addr,
+  output logic [31:0]                  rvfi_rs1_rdata,
+  output reg_cap_t                     rvfi_rs1_rcap,
+  output logic [31:0]                  rvfi_rs2_rdata,
+  output reg_cap_t                     rvfi_rs2_rcap,
+  output logic [31:0]                  rvfi_rs3_rdata,
+  output logic [ 4:0]                  rvfi_rd_addr,
+  output logic [31:0]                  rvfi_rd_wdata,
+  output reg_cap_t                     rvfi_rd_wcap,
+  output logic [31:0]                  rvfi_pc_rdata,
+  output logic [31:0]                  rvfi_pc_wdata,
+  output logic                         rvfi_mem_is_cap,
+  output logic [31:0]                  rvfi_mem_addr,
+  output logic [ 3:0]                  rvfi_mem_rmask,
+  output logic [ 3:0]                  rvfi_mem_wmask,
+  output logic [DataWidth-1:0]         rvfi_mem_rdata,
+  output reg_cap_t                     rvfi_mem_rcap,
+  output logic [DataWidth-1:0]         rvfi_mem_wdata,
+  output reg_cap_t                     rvfi_mem_wcap,
+  output logic [31:0]                  rvfi_ext_mip,
+  output logic                         rvfi_ext_nmi,
+  output logic                         rvfi_ext_debug_req,
+  output logic [63:0]                  rvfi_ext_mcycle,
+`endif
+
+  // CPU Control Signals
+  input  fetch_enable_t                fetch_enable_i,
+  output logic                         alert_minor_o,
+  output logic                         alert_major_o,
+  output logic                         icache_inval_o,
+  output logic                         core_busy_o
+);
+
+  localparam int unsigned PMP_NUM_CHAN      = 3;
+  localparam bit          DataIndTiming     = SecureIbex;
+  localparam bit          PCIncrCheck       = SecureIbex;
+  localparam bit          ShadowCSR         = 1'b0;
+
+  // IF/ID signals
+  logic        dummy_instr_id;
+  logic        instr_valid_id;
+  logic        instr_executing_id;
+  logic        instr_new_id;
+  logic [31:0] instr_rdata_id;                 // Instruction sampled inside IF stage
+  logic [31:0] instr_rdata_alu_id;             // Instruction sampled inside IF stage (replicated to
+                                               // ease fan-out)
+  logic [15:0] instr_rdata_c_id;               // Compressed instruction sampled inside IF stage
+  logic        instr_is_compressed_id;
+  logic        instr_perf_count_id;
+  logic        instr_bp_taken_id;
+  logic        instr_fetch_err;                // Bus error on instr fetch
+  logic        instr_fetch_err_plus2;          // Instruction error is misaligned
+  logic        instr_fetch_cheri_acc_vio;         
+  logic        instr_fetch_cheri_bound_vio;         
+  logic        illegal_c_insn_id;              // Illegal compressed instruction sent to ID stage
+
+  logic [31:0] pc_if;                          // Program counter in IF stage
+  logic [31:0] pc_id;                          // Program counter in ID stage
+  logic [31:0] pc_wb;                          // Program counter in WB stage
+  logic [33:0] imd_val_d_ex[2];                // Intermediate register for multicycle Ops
+  logic [33:0] imd_val_q_ex[2];                // Intermediate register for multicycle Ops
+  logic [1:0]  imd_val_we_ex;
+
+  logic        data_ind_timing;
+  logic        dummy_instr_en;
+  logic [2:0]  dummy_instr_mask;
+  logic        dummy_instr_seed_en;
+  logic [31:0] dummy_instr_seed;
+  logic        icache_enable;
+  logic        icache_inval;
+  logic        pc_mismatch_alert;
+  logic        csr_shadow_err;
+
+  logic        instr_first_cycle_id;
+  logic        instr_valid_clear;
+  logic        pc_set;
+  logic        nt_branch_mispredict;
+  logic [31:0] nt_branch_addr;
+  pc_sel_e     pc_mux_id;                      // Mux selector for next PC
+  exc_pc_sel_e exc_pc_mux_id;                  // Mux selector for exception PC
+  exc_cause_e  exc_cause;                      // Exception cause
+
+  logic        lsu_load_err;
+  logic        lsu_store_err;
+  logic        lsu_err_is_cheri;
+
+  // LSU signals
+  logic        lsu_addr_incr_req;
+  logic [31:0] lsu_addr_last;
+  logic [31:0] lsu_addr;
+
+  // Jump and branch target and decision (EX->IF)
+  logic [31:0] branch_target_ex_rv32;
+  logic [31:0] branch_target_ex_cheri;
+  logic [31:0] branch_target_ex;
+  logic        branch_decision;
+
+  // Core busy signals
+  logic        ctrl_busy;
+  logic        if_busy;
+  logic        lsu_busy;
+
+  logic        lsu_busy_tbre;
+
+  // Register File
+  logic [4:0]  rf_raddr_a;
+  logic [31:0] rf_rdata_a;
+  logic [4:0]  rf_raddr_b;
+  logic [31:0] rf_rdata_b;
+  logic        rf_ren_a;
+  logic        rf_ren_b;
+  logic [4:0]  rf_waddr_wb;
+  logic [31:0] rf_wdata_wb;
+
+  reg_cap_t    rf_wcap_wb;
+
+  // Writeback register write data that can be used on the forwarding path (doesn't factor in memory
+  // read data as this is too late for the forwarding path)
+  logic [31:0] rf_wdata_fwd_wb;
+
+  reg_cap_t    rf_wcap_fwd_wb;
+
+  logic [32:0] rf_wdata_lsu;
+  reg_cap_t    rf_wcap_lsu;
+  logic        rf_we_wb;
+  logic        rf_we_lsu;
+  logic        rf_ecc_err_comb;
+
+  logic [4:0]  rf_waddr_id;
+  logic [31:0] rf_wdata_id;
+  logic        rf_we_id;
+  logic        rf_rd_a_wb_match;
+  logic        rf_rd_b_wb_match;
+
+  // ALU Control
+  alu_op_e     alu_operator_ex;
+  logic [31:0] alu_operand_a_ex;
+  logic [31:0] alu_operand_b_ex;
+
+  logic [31:0] bt_a_operand;
+  logic [31:0] bt_b_operand;
+
+  logic [31:0] alu_adder_result_ex;    // Used to forward computed address to LSU
+  logic [31:0] result_ex;
+
+  // Multiplier Control
+  logic        mult_en_ex;
+  logic        div_en_ex;
+  logic        mult_sel_ex;
+  logic        div_sel_ex;
+  md_op_e      multdiv_operator_ex;
+  logic [1:0]  multdiv_signed_mode_ex;
+  logic [31:0] multdiv_operand_a_ex;
+  logic [31:0] multdiv_operand_b_ex;
+  logic        multdiv_ready_id;
+
+  // CSR control
+  logic        csr_access;
+  csr_op_e     csr_op;
+  logic        csr_op_en;
+  csr_num_e    csr_addr;
+  logic [31:0] csr_rdata;
+  logic [31:0] csr_wdata;
+  logic        illegal_csr_insn_id;    // CSR access to non-existent register,
+                                       // with wrong privilege level,
+                                       // or missing write permissions
+
+  // Data Memory Control
+  logic        lsu_we;
+  logic [1:0]  lsu_type;
+  logic        lsu_sign_ext;
+  logic        lsu_req;
+  logic [32:0] lsu_wdata;
+  reg_cap_t    lsu_wcap;
+  logic        lsu_req_done;
+
+  // stall control
+  logic        id_in_ready;
+  logic        ex_valid;
+
+  logic        lsu_resp_valid;
+  logic        lsu_resp_err;
+
+  // Signals between instruction core interface and pipe (if and id stages)
+  logic        instr_req_int;          // Id stage asserts a req to instruction core interface
+  logic        instr_req_gated;
+
+  // Writeback stage
+  logic           en_wb;
+  wb_instr_type_e instr_type_wb;
+  logic           ready_wb;
+  logic           rf_write_wb;
+  logic           outstanding_load_wb;
+  logic           outstanding_store_wb;
+
+  // Interrupts
+  logic        nmi_mode;
+  irqs_t       irqs;
+  logic        csr_mstatus_mie;
+  logic [31:0] csr_mepc, csr_depc;
+
+  // PMP signals
+  logic [33:0]  csr_pmp_addr [PMPNumRegions];
+  pmp_cfg_t     csr_pmp_cfg  [PMPNumRegions];
+  pmp_mseccfg_t csr_pmp_mseccfg;
+  logic         pmp_req_err  [PMP_NUM_CHAN];
+  logic         data_req_out;
+
+  logic        csr_save_if;
+  logic        csr_save_id;
+  logic        csr_save_wb;
+  logic        csr_restore_mret_id;
+  logic        csr_restore_dret_id;
+  logic        csr_save_cause;
+  logic        csr_mepcc_clrtag;
+  logic        csr_mtvec_init;
+  logic [31:0] csr_mtvec;
+  logic [31:0] csr_mtval;
+  logic        csr_mstatus_tw;
+  priv_lvl_e   priv_mode_id;
+  priv_lvl_e   priv_mode_lsu;
+
+  // debug mode and dcsr configuration
+  logic        debug_mode;
+  dbg_cause_e  debug_cause;
+  logic        debug_csr_save;
+  logic        debug_single_step;
+  logic        debug_ebreakm;
+  logic        debug_ebreaku;
+  logic        trigger_match;
+
+  // signals relating to instruction movements between pipeline stages
+  // used by performance counters and RVFI
+  logic        instr_id_done;
+  logic        instr_done_wb;
+
+  logic        perf_instr_ret_wb;
+  logic        perf_instr_ret_compressed_wb;
+  logic        perf_instr_ret_wb_spec;
+  logic        perf_instr_ret_compressed_wb_spec;
+  logic        perf_iside_wait;
+  logic        perf_dside_wait;
+  logic        perf_mul_wait;
+  logic        perf_div_wait;
+  logic        perf_jump;
+  logic        perf_branch;
+  logic        perf_tbranch;
+  logic        perf_load;
+  logic        perf_store;
+
+  // for RVFI
+  logic        illegal_insn_id, unused_illegal_insn_id; // ID stage sees an illegal instruction
+
+  pcc_cap_t     pcc_cap_r, pcc_cap_w;
+
+  logic          cheri_branch_req;
+  logic          cheri_branch_req_spec;
+  logic          instr_is_cheri_id;
+  logic          instr_is_rv32lsu_id;
+  logic          cheri_exec_id;
+  logic [11:0]   cheri_imm12;
+  logic [19:0]   cheri_imm20;
+  logic [20:0]   cheri_imm21;
+  logic  [4:0]   cheri_cs2_dec;
+  logic          cheri_load_id;
+  logic          cheri_store_id;
+  logic          cheri_rf_we;
+  logic [31:0]   cheri_result_data;
+  reg_cap_t      cheri_result_cap;
+  logic          cheri_ex_valid;
+  logic          cheri_ex_err;
+  logic [11:0]   cheri_ex_err_info;
+  logic          cheri_wb_err;
+  logic [15:0]   cheri_wb_err_info;
+  logic [OPDW-1:0] cheri_operator;
+
+  logic          rv32_lsu_req;
+  logic          rv32_lsu_we;
+  logic [1:0]    rv32_lsu_type;
+  logic [31:0]   rv32_lsu_wdata;
+  logic          rv32_lsu_sign_ext;
+  logic          rv32_lsu_addr_incr_req;
+  logic [31:0]   rv32_lsu_addr_last;
+  logic          rv32_lsu_resp_valid;
+
+  logic          cheri_csr_access;
+  logic [4:0]    cheri_csr_addr;
+  logic [31:0]   cheri_csr_wdata;
+  reg_cap_t      cheri_csr_wcap;
+  cheri_csr_op_e cheri_csr_op;
+  logic          cheri_csr_op_en;
+  logic [31:0]   cheri_csr_rdata;
+  reg_cap_t      cheri_csr_rcap;
+  logic          cheri_csr_set_mie;
+  logic          cheri_csr_clr_mie;
+
+  logic          lsu_is_cap, lsu_cheri_err;
+  logic [3:0]    lsu_lc_clrperm;
+
+  logic          csr_dbg_tclr_fault;
+
+  logic [31:0]   csr_mshwm;
+  logic [31:0]   csr_mshwmb;
+  logic          csr_mshwm_set;
+  logic [31:0]   csr_mshwm_new;
+  logic          ztop_wr;
+  logic [31:0]   ztop_wdata;
+  full_cap_t     ztop_wfcap;
+  logic [31:0]   ztop_rdata;
+  reg_cap_t      ztop_rcap;
+
+  logic          stkz_active;
+  logic          stkz_abort;
+  logic [31:0]   stkz_ptr;
+  logic [31:0]   stkz_base;
+
+  logic          lsu_tbre_resp_valid;
+  logic          lsu_tbre_resp_err;
+  logic          lsu_resp_is_wr;
+  logic [32:0]   lsu_tbre_raw_lsw;   
+  logic          lsu_tbre_req_done;   
+  logic          lsu_tbre_addr_incr;
+  logic          tbre_lsu_req;
+  logic          tbre_lsu_is_cap;
+  logic          tbre_lsu_we;
+  logic [31:0]   tbre_lsu_addr;
+  logic [32:0]   tbre_lsu_wdata;
+  logic          tbre_trvk_en;
+  logic          tbre_trvk_clrtag;
+
+  logic          lsu_tbre_sel, cpu_lsu_dec;
+  logic          rf_trsv_en;
+
+  logic          cpu_stall_by_stkz, cpu_grant_to_stkz;
+
+
+  //////////////////////
+  // Clock management //
+  //////////////////////
+
+  // Before going to sleep, wait for I- and D-side
+  // interfaces to finish ongoing operations.
+  assign core_busy_o = ctrl_busy | if_busy | lsu_busy;
+
+  //////////////
+  // IF stage //
+  //////////////
+
+  cheriot_if_stage #(
+    .DmHaltAddr       (DmHaltAddr),
+    .DmExceptionAddr  (DmExceptionAddr),
+    .DummyInstructions(DummyInstructions),
+    .ICache           (ICache),
+    .ICacheECC        (ICacheECC),
+    .BusSizeECC       (BusSizeECC),
+    .TagSizeECC       (TagSizeECC),
+    .LineSizeECC      (LineSizeECC),
+    .PCIncrCheck      (PCIncrCheck),
+    .ResetAll         (ResetAll          ),
+    .RndCnstLfsrSeed  (RndCnstLfsrSeed   ),
+    .RndCnstLfsrPerm  (RndCnstLfsrPerm   ),
+    .BranchPredictor  (BranchPredictor),
+    .CHERIoTEn        (CHERIoTEn)
+  ) if_stage_i (
+    .clk_i (clk_i),
+    .rst_ni(rst_ni),
+
+    .cheri_pmode_i  (cheri_pmode_i),
+    .boot_addr_i    (boot_addr_i),
+    .req_i          (instr_req_gated),  // instruction request control
+    .debug_mode_i   (debug_mode),
+
+    // instruction cache interface
+    .instr_req_o    (instr_req_o),
+    .instr_addr_o   (instr_addr_o),
+    .instr_gnt_i    (instr_gnt_i),
+    .instr_rvalid_i (instr_rvalid_i),
+    .instr_rdata_i  (instr_rdata_i),
+    .instr_err_i    (instr_err_i),
+
+    .ic_tag_req_o      (ic_tag_req_o),
+    .ic_tag_write_o    (ic_tag_write_o),
+    .ic_tag_addr_o     (ic_tag_addr_o),
+    .ic_tag_wdata_o    (ic_tag_wdata_o),
+    .ic_tag_rdata_i    (ic_tag_rdata_i),
+    .ic_data_req_o     (ic_data_req_o),
+    .ic_data_write_o   (ic_data_write_o),
+    .ic_data_addr_o    (ic_data_addr_o),
+    .ic_data_wdata_o   (ic_data_wdata_o),
+    .ic_data_rdata_i   (ic_data_rdata_i),
+    .ic_scr_key_valid_i(ic_scr_key_valid_i),
+
+    // outputs to ID stage
+    .instr_valid_id_o        (instr_valid_id),
+    .instr_new_id_o          (instr_new_id),
+    .instr_rdata_id_o        (instr_rdata_id),
+    .instr_rdata_alu_id_o    (instr_rdata_alu_id),
+    .instr_rdata_c_id_o      (instr_rdata_c_id),
+    .instr_is_compressed_id_o(instr_is_compressed_id),
+    .instr_bp_taken_o        (instr_bp_taken_id),
+    .instr_fetch_err_o       (instr_fetch_err),
+    .instr_fetch_err_plus2_o (instr_fetch_err_plus2),
+    .instr_fetch_cheri_acc_vio_o   (instr_fetch_cheri_acc_vio),       
+    .instr_fetch_cheri_bound_vio_o (instr_fetch_cheri_bound_vio),       
+
+    .illegal_c_insn_id_o     (illegal_c_insn_id),
+    .dummy_instr_id_o        (dummy_instr_id),
+    .pc_if_o                 (pc_if),
+    .pc_id_o                 (pc_id),
+    .pmp_err_if_i            (pmp_req_err[PMP_I]),
+    .pmp_err_if_plus2_i      (pmp_req_err[PMP_I2]),
+
+    // control signals
+    .instr_valid_clear_i   (instr_valid_clear),
+    .pc_set_i              (pc_set),
+    .pc_mux_i              (pc_mux_id),
+    .nt_branch_mispredict_i(nt_branch_mispredict),
+    .exc_pc_mux_i          (exc_pc_mux_id),
+    .exc_cause             (exc_cause),
+    .dummy_instr_en_i      (dummy_instr_en),
+    .dummy_instr_mask_i    (dummy_instr_mask),
+    .dummy_instr_seed_en_i (dummy_instr_seed_en),
+    .dummy_instr_seed_i    (dummy_instr_seed),
+    .icache_enable_i       (icache_enable),
+    .icache_inval_i        (icache_inval),
+
+    // branch targets
+    .branch_target_ex_i(branch_target_ex),
+    .nt_branch_addr_i  (nt_branch_addr),
+
+    // CSRs
+    .csr_mepc_i      (csr_mepc),  // exception return address
+    .csr_depc_i      (csr_depc),  // debug return address
+    .csr_mtvec_i     (csr_mtvec),  // trap-vector base address
+    .csr_mtvec_init_o(csr_mtvec_init),
+
+    // pipeline stalls
+    .id_in_ready_i(id_in_ready),
+
+    .pc_mismatch_alert_o(pc_mismatch_alert),
+    .if_busy_o          (if_busy),
+    .pcc_cap_i          (pcc_cap_r)
+  );
+
+  // Core is waiting for the ISide when ID/EX stage is ready for a new instruction but none are
+  // available
+  assign perf_iside_wait = id_in_ready & ~instr_valid_id;
+
+  // Multi-bit fetch enable used when SecureIbex == 1. When SecureIbex == 0 only use the bottom-bit
+  // of fetch_enable_i. Ensure the multi-bit encoding has the bottom bit set for on and unset for
+  // off so FetchEnableOn/FetchEnableOff can be used without needing to know the value of
+  // SecureIbex.
+  `ASSERT_INIT(FetchEnableSecureOnBottomBitSet,    FetchEnableOn[0] == 1'b1)
+  `ASSERT_INIT(FetchEnableSecureOffBottomBitClear, FetchEnableOff[0] == 1'b0)
+
+  // fetch_enable_i can be used to stop the core fetching new instructions
+  if (SecureIbex) begin : g_instr_req_gated_secure
+    // For secure Ibex fetch_enable_i must be a specific multi-bit pattern to enable instruction
+    // fetch
+    assign instr_req_gated = instr_req_int & (fetch_enable_i == FetchEnableOn);  // exact match
+  end else begin : g_instr_req_gated_non_secure
+    // For non secure Ibex only the bottom bit of fetch enable is considered
+    logic unused_fetch_enable;
+    assign unused_fetch_enable = ^fetch_enable_i[$bits(fetch_enable_t)-1:1];  // upper bits unused
+
+    assign instr_req_gated = instr_req_int & fetch_enable_i[0];
+  end
+
+  //////////////
+  // ID stage //
+  //////////////
+
+  cheriot_id_stage #(
+    .RV32E          (RV32E),
+    .RV32M          (RV32M),
+    .RV32B          (RV32B),
+    .BranchTargetALU(BranchTargetALU),
+    .DataIndTiming  (DataIndTiming),
+    .WritebackStage (WritebackStage),
+    .BranchPredictor(BranchPredictor),
+    .CHERIoTEn      (CHERIoTEn),
+    .CheriPPLBC     (CheriPPLBC),
+    .CheriSBND2     (CheriSBND2)
+  ) id_stage_i (
+    .clk_i (clk_i),
+    .rst_ni(rst_ni),
+
+    .cheri_pmode_i        (cheri_pmode_i),
+    .cheri_tsafe_en_i     (cheri_tsafe_en_i),
+
+    // Processor Enable
+    .ctrl_busy_o   (ctrl_busy),
+    .illegal_insn_o(illegal_insn_id),
+
+    // from/to IF-ID pipeline register
+    .instr_valid_i        (instr_valid_id),
+    .instr_rdata_i        (instr_rdata_id),
+    .instr_rdata_alu_i    (instr_rdata_alu_id),
+    .instr_rdata_c_i      (instr_rdata_c_id),
+    .instr_is_compressed_i(instr_is_compressed_id),
+    .instr_bp_taken_i     (instr_bp_taken_id),
+
+    // Jumps and branches
+    .branch_decision_i(branch_decision),
+
+    // IF and ID control signals
+    .instr_first_cycle_id_o(instr_first_cycle_id),
+    .instr_valid_clear_o   (instr_valid_clear),
+    .id_in_ready_o         (id_in_ready),
+    .instr_req_o           (instr_req_int),
+    .pc_set_o              (pc_set),
+    .pc_mux_o              (pc_mux_id),
+    .nt_branch_mispredict_o(nt_branch_mispredict),
+    .nt_branch_addr_o      (nt_branch_addr),
+    .exc_pc_mux_o          (exc_pc_mux_id),
+    .exc_cause_o           (exc_cause),
+    .icache_inval_o        (icache_inval),
+
+    .instr_fetch_err_i      (instr_fetch_err),
+    .instr_fetch_err_plus2_i(instr_fetch_err_plus2),
+    .instr_fetch_cheri_acc_vio_i  (instr_fetch_cheri_acc_vio),       
+    .instr_fetch_cheri_bound_vio_i (instr_fetch_cheri_bound_vio),       
+
+    .illegal_c_insn_i       (illegal_c_insn_id),
+
+    .pc_id_i(pc_id),
+
+    // Stalls
+    .ex_valid_i      (ex_valid),
+    .lsu_resp_valid_i(lsu_resp_valid),
+
+    .alu_operator_ex_o (alu_operator_ex),
+    .alu_operand_a_ex_o(alu_operand_a_ex),
+    .alu_operand_b_ex_o(alu_operand_b_ex),
+
+    .imd_val_q_ex_o (imd_val_q_ex),
+    .imd_val_d_ex_i (imd_val_d_ex),
+    .imd_val_we_ex_i(imd_val_we_ex),
+
+    .bt_a_operand_o(bt_a_operand),
+    .bt_b_operand_o(bt_b_operand),
+
+    .mult_en_ex_o            (mult_en_ex),
+    .div_en_ex_o             (div_en_ex),
+    .mult_sel_ex_o           (mult_sel_ex),
+    .div_sel_ex_o            (div_sel_ex),
+    .multdiv_operator_ex_o   (multdiv_operator_ex),
+    .multdiv_signed_mode_ex_o(multdiv_signed_mode_ex),
+    .multdiv_operand_a_ex_o  (multdiv_operand_a_ex),
+    .multdiv_operand_b_ex_o  (multdiv_operand_b_ex),
+    .multdiv_ready_id_o      (multdiv_ready_id),
+
+    // CSR ID/EX
+    .csr_access_o         (csr_access),
+    .csr_op_o             (csr_op),
+    .csr_op_en_o          (csr_op_en),
+    .csr_save_if_o        (csr_save_if),  // control signal to save PC
+    .csr_save_id_o        (csr_save_id),  // control signal to save PC
+    .csr_save_wb_o        (csr_save_wb),  // control signal to save PC
+    .csr_restore_mret_id_o(csr_restore_mret_id),  // restore mstatus upon MRET
+    .csr_restore_dret_id_o(csr_restore_dret_id),  // restore state upon DRET
+    .csr_save_cause_o     (csr_save_cause),
+    .csr_mepcc_clrtag_o   (csr_mepcc_clrtag),
+    .csr_mtval_o          (csr_mtval),
+    .priv_mode_i          (priv_mode_id),
+    .csr_mstatus_tw_i     (csr_mstatus_tw),
+    .illegal_csr_insn_i   (illegal_csr_insn_id),
+    .data_ind_timing_i    (data_ind_timing),
+    .csr_pcc_perm_sr_i    (pcc_cap_r.perms[PERM_SR]),
+
+    // LSU
+    .lsu_req_o        (rv32_lsu_req),  // to load store unit
+    .lsu_we_o         (rv32_lsu_we),  // to load store unit
+    .lsu_type_o       (rv32_lsu_type),  // to load store unit
+    .lsu_sign_ext_o   (rv32_lsu_sign_ext),  // to load store unit
+    .lsu_wdata_o      (rv32_lsu_wdata),  // to load store unit
+    .lsu_req_done_i   (lsu_req_done),  // from load store unit
+
+    .lsu_addr_incr_req_i(rv32_lsu_addr_incr_req),
+    .lsu_addr_last_i    (rv32_lsu_addr_last),
+
+    .lsu_load_err_i (lsu_load_err),
+    .lsu_store_err_i(lsu_store_err),
+    .lsu_err_is_cheri_i(lsu_err_is_cheri),
+
+    // Interrupt Signals
+    .csr_mstatus_mie_i(csr_mstatus_mie),
+    .irq_pending_i    (irq_pending_o),
+    .irqs_i           (irqs),
+    .irq_nm_i         (irq_nm_i),
+    .nmi_mode_o       (nmi_mode),
+
+    // Debug Signal
+    .debug_mode_o       (debug_mode),
+    .debug_cause_o      (debug_cause),
+    .debug_csr_save_o   (debug_csr_save),
+    .debug_req_i        (debug_req_i),
+    .debug_single_step_i(debug_single_step),
+    .debug_ebreakm_i    (debug_ebreakm),
+    .debug_ebreaku_i    (debug_ebreaku),
+    .trigger_match_i    (trigger_match),
+
+    // write data to commit in the register file
+    .result_ex_i(result_ex),
+    .csr_rdata_i(csr_rdata),
+
+    .rf_raddr_a_o      (rf_raddr_a),
+    .rf_rdata_a_i      (rf_rdata_a),
+    .rf_raddr_b_o      (rf_raddr_b),
+    .rf_rdata_b_i      (rf_rdata_b),
+    .rf_ren_a_o        (rf_ren_a),
+    .rf_ren_b_o        (rf_ren_b),
+    .rf_waddr_id_o     (rf_waddr_id),
+    .rf_wdata_id_o     (rf_wdata_id),
+    .rf_we_id_o        (rf_we_id),
+    .rf_rd_a_wb_match_o(rf_rd_a_wb_match),
+    .rf_rd_b_wb_match_o(rf_rd_b_wb_match),
+
+    .rf_waddr_wb_i    (rf_waddr_wb),
+    .rf_wdata_fwd_wb_i(rf_wdata_fwd_wb),
+    .rf_write_wb_i    (rf_write_wb),
+    .rf_reg_rdy_i     (rf_reg_rdy_i),
+
+    .en_wb_o               (en_wb),
+    .instr_type_wb_o       (instr_type_wb),
+    .instr_perf_count_id_o (instr_perf_count_id),
+    .ready_wb_i            (ready_wb),
+    .outstanding_load_wb_i (outstanding_load_wb),
+    .outstanding_store_wb_i(outstanding_store_wb),
+
+    // Performance Counters
+    .perf_jump_o      (perf_jump),
+    .perf_branch_o    (perf_branch),
+    .perf_tbranch_o   (perf_tbranch),
+    .perf_dside_wait_o(perf_dside_wait),
+    .perf_mul_wait_o  (perf_mul_wait),
+    .perf_div_wait_o  (perf_div_wait),
+    .instr_id_done_o  (instr_id_done),
+
+    .cheri_exec_id_o       (cheri_exec_id),
+    .instr_is_cheri_id_o   (instr_is_cheri_id),
+    .instr_is_rv32lsu_id_o (instr_is_rv32lsu_id),
+    .cheri_imm12_o         (cheri_imm12),
+    .cheri_imm20_o         (cheri_imm20),
+    .cheri_imm21_o         (cheri_imm21),
+    .cheri_operator_o      (cheri_operator),
+    .cheri_cs2_dec_o       (cheri_cs2_dec),
+    .cheri_load_o          (cheri_load_id),
+    .cheri_store_o         (cheri_store_id),
+    .cheri_ex_valid_i      (cheri_ex_valid),
+    .cheri_ex_err_i        (cheri_ex_err),
+    .cheri_ex_err_info_i   (cheri_ex_err_info),
+    .cheri_wb_err_i        (cheri_wb_err),
+    .cheri_wb_err_info_i   (cheri_wb_err_info),
+    .cheri_branch_req_i    (cheri_branch_req_spec),
+    .cheri_branch_target_i (branch_target_ex_cheri)
+  );
+
+
+  assign icache_inval_o = icache_inval;
+  // for RVFI only
+  assign unused_illegal_insn_id = illegal_insn_id;
+
+  cheriot_ex_block #(
+    .RV32M          (RV32M),
+    .RV32B          (RV32B),
+    .BranchTargetALU(BranchTargetALU)
+  ) ex_block_i (
+    .clk_i (clk_i),
+    .rst_ni(rst_ni),
+
+    // ALU signal from ID stage
+    .alu_operator_i         (alu_operator_ex),
+    .alu_operand_a_i        (alu_operand_a_ex),
+    .alu_operand_b_i        (alu_operand_b_ex),
+    .alu_instr_first_cycle_i(instr_first_cycle_id),
+
+    // Branch target ALU signal from ID stage
+    .bt_a_operand_i(bt_a_operand),
+    .bt_b_operand_i(bt_b_operand),
+
+    // Multiplier/Divider signal from ID stage
+    .multdiv_operator_i   (multdiv_operator_ex),
+    .mult_en_i            (mult_en_ex),
+    .div_en_i             (div_en_ex),
+    .mult_sel_i           (mult_sel_ex),
+    .div_sel_i            (div_sel_ex),
+    .multdiv_signed_mode_i(multdiv_signed_mode_ex),
+    .multdiv_operand_a_i  (multdiv_operand_a_ex),
+    .multdiv_operand_b_i  (multdiv_operand_b_ex),
+    .multdiv_ready_id_i   (multdiv_ready_id),
+    .data_ind_timing_i    (data_ind_timing),
+
+    // Intermediate value register (ID stage drives imd_val_q_ex and takes d/we back)
+    .imd_val_we_o(imd_val_we_ex),
+    .imd_val_d_o (imd_val_d_ex),
+    .imd_val_q_i (imd_val_q_ex),
+
+    // Outputs
+    .alu_adder_result_ex_o(alu_adder_result_ex),  // to LSU
+    .result_ex_o          (result_ex),  // to ID
+
+    .branch_target_o  (branch_target_ex_rv32),  // RV32 target (IF stage takes branch_target_ex)
+    .branch_decision_o(branch_decision),  // to ID
+
+    .ex_valid_o(ex_valid)  // to ID (ex_valid_i)
+  );
+
+  //////////////
+  // cheri EX //
+  //////////////
+  if (CHERIoTEn) begin : g_cheri_ex
+    cheri_ex #(
+      .WritebackStage       (WritebackStage),
+      .MemCapFmt            (MemCapFmt),
+      .HeapBase             (HeapBase),
+      .TSMapBase            (TSMapBase),
+      .TSMapSize            (TSMapSize),
+      .CheriPPLBC           (CheriPPLBC),
+      .CheriSBND2           (CheriSBND2),
+      .CheriStkZ            (CheriStkZ),
+      .CheriCapIT8          (CheriCapIT8)
+    ) u_cheri_ex (
+      .clk_i                (clk_i),
+      .rst_ni               (rst_ni),
+      .cheri_pmode_i        (cheri_pmode_i),
+      .cheri_tsafe_en_i     (cheri_tsafe_en_i),
+      .debug_mode_i         (debug_mode),
+      .fwd_we_i             (rf_write_wb),        // forwarding inputs come from wb_stage_i
+      .fwd_waddr_i          (rf_waddr_wb),
+      .fwd_wdata_i          (rf_wdata_fwd_wb),
+      .fwd_wcap_i           (rf_wcap_fwd_wb),
+      .rf_raddr_a_i         (rf_raddr_a),
+      .rf_rdata_a_i         (rf_rdata_a),
+      .rf_rcap_a_i          (rf_rcap_a_i),
+      .rf_raddr_b_i         (rf_raddr_b),
+      .rf_rdata_b_i         (rf_rdata_b),
+      .rf_rcap_b_i          (rf_rcap_b_i),
+      .rf_trsv_en_o         (rf_trsv_en),
+      .rf_waddr_i           (rf_waddr_id),
+      .pcc_cap_i            (pcc_cap_r),          // PCC read from / written back to cs_registers_i
+      .pcc_cap_o            (pcc_cap_w),
+      .pc_id_i              (pc_id),
+      .branch_req_o         (cheri_branch_req),
+      .branch_req_spec_o    (cheri_branch_req_spec),
+      .branch_target_o      (branch_target_ex_cheri),
+      .cheri_exec_id_i      (cheri_exec_id),
+      .instr_valid_i        (instr_valid_id),
+      .instr_first_cycle_i  (instr_first_cycle_id),
+      .instr_is_cheri_i     (instr_is_cheri_id),
+      .instr_is_rv32lsu_i   (instr_is_rv32lsu_id),
+      .instr_is_compressed_i(instr_is_compressed_id),
+      .cheri_imm12_i        (cheri_imm12),
+      .cheri_imm20_i        (cheri_imm20),
+      .cheri_imm21_i        (cheri_imm21),
+      .cheri_operator_i     (cheri_operator),
+      .cheri_cs2_dec_i      (cheri_cs2_dec),
+      .cheri_rf_we_o        (cheri_rf_we),        // result write-back goes to wb_stage_i
+      .result_data_o        (cheri_result_data),
+      .result_cap_o         (cheri_result_cap),
+      .cheri_ex_valid_o     (cheri_ex_valid),
+      .cheri_ex_err_o       (cheri_ex_err),
+      .cheri_ex_err_info_o  (cheri_ex_err_info),
+      .cheri_wb_err_o       (cheri_wb_err),
+      .cheri_wb_err_info_o  (cheri_wb_err_info),
+      .lsu_req_o            (lsu_req),            // request side of load_store_unit_i
+      .lsu_is_cap_o         (lsu_is_cap),
+      .lsu_lc_clrperm_o     (lsu_lc_clrperm),
+      .lsu_cheri_err_o      (lsu_cheri_err),
+      .lsu_we_o             (lsu_we),
+      .lsu_addr_o           (lsu_addr),
+      .lsu_type_o           (lsu_type),
+      .lsu_wdata_o          (lsu_wdata),
+      .lsu_wcap_o           (lsu_wcap),
+      .cpu_stall_by_stkz_o  (cpu_stall_by_stkz),
+      .cpu_grant_to_stkz_o  (cpu_grant_to_stkz),
+      .lsu_sign_ext_o       (lsu_sign_ext),
+      .addr_incr_req_i      (lsu_addr_incr_req),
+      .addr_last_i          (lsu_addr_last),
+      .lsu_req_done_i       (lsu_req_done),
+      .lsu_rdata_i          (rf_wdata_lsu),
+      .lsu_rcap_i           (rf_wcap_lsu),
+      .rv32_lsu_req_i       (rv32_lsu_req),       // RV32 load/store request inputs
+      .rv32_lsu_we_i        (rv32_lsu_we),
+      .rv32_lsu_type_i      (rv32_lsu_type),
+      .rv32_lsu_wdata_i     (rv32_lsu_wdata),
+      .rv32_lsu_sign_ext_i  (rv32_lsu_sign_ext),
+      .rv32_lsu_addr_i      (alu_adder_result_ex),
+      .rv32_addr_incr_req_o (rv32_lsu_addr_incr_req),
+      .rv32_addr_last_o     (rv32_lsu_addr_last),
+      .lsu_tbre_sel_i       (lsu_tbre_sel),
+      .tbre_lsu_req_i       (tbre_lsu_req),       // tbre_lsu_* are driven by cheri_tbre_wrapper_i
+      .tbre_lsu_is_cap_i    (tbre_lsu_is_cap),
+      .tbre_lsu_we_i        (tbre_lsu_we),
+      .tbre_lsu_addr_i      (tbre_lsu_addr),
+      .tbre_lsu_wdata_i     (tbre_lsu_wdata),
+      .cpu_lsu_dec_o        (cpu_lsu_dec),
+      .csr_rdata_i          (cheri_csr_rdata),    // CHERI CSR port of cs_registers_i
+      .csr_rcap_i           (cheri_csr_rcap),
+      .csr_mstatus_mie_i    (csr_mstatus_mie),
+      .csr_access_o         (cheri_csr_access),
+      .csr_addr_o           (cheri_csr_addr),
+      .csr_wdata_o          (cheri_csr_wdata),
+      .csr_wcap_o           (cheri_csr_wcap),
+      .csr_op_o             (cheri_csr_op),
+      .csr_op_en_o          (cheri_csr_op_en),
+      .csr_set_mie_o        (cheri_csr_set_mie),
+      .csr_clr_mie_o        (cheri_csr_clr_mie),
+      .csr_mshwm_i          (csr_mshwm),
+      .csr_mshwmb_i         (csr_mshwmb),
+      .csr_mshwm_set_o      (csr_mshwm_set),
+      .csr_mshwm_new_o      (csr_mshwm_new),
+      .stkz_active_i        (stkz_active),        // stkz_* are driven by cheri_tbre_wrapper_i
+      .stkz_abort_i         (stkz_abort),
+      .stkz_ptr_i           (stkz_ptr),
+      .stkz_base_i          (stkz_base),
+      .ztop_wr_o            (ztop_wr),            // ztop_w* go to cheri_tbre_wrapper_i
+      .ztop_wdata_o         (ztop_wdata),
+      .ztop_wfcap_o         (ztop_wfcap),
+      .ztop_rdata_i         (ztop_rdata),
+      .ztop_rcap_i          (ztop_rcap),
+      .csr_dbg_tclr_fault_i (csr_dbg_tclr_fault)
+    );
+    // Export the reservation request; use the CHERI target only for a valid CHERI instruction in ID.
+    assign rf_trsv_en_o     = rf_trsv_en;
+    assign rf_trsv_addr_o   = rf_waddr_id;
+    assign branch_target_ex = (instr_valid_id & instr_is_cheri_id) ? branch_target_ex_cheri : branch_target_ex_rv32;
+  end else begin : gen_no_cheri_ex
+    // No cheri_ex instance: tie off its outputs and pass the RV32 LSU signals straight through.
+    assign rf_trsv_en_o           = 1'b0;
+    assign rf_trsv_addr_o         = 5'h0;
+    assign cheri_branch_req       = 1'b0;
+    assign cheri_branch_req_spec  = 1'b0;
+    assign branch_target_ex_cheri = '0;  // was undriven; still feeds cs_registers_i
+    assign branch_target_ex       = branch_target_ex_rv32;
+    assign pcc_cap_w              = NULL_PCC_CAP;
+    assign cheri_rf_we            = 1'b0;
+    assign cheri_result_data      = 32'h0;
+    assign cheri_result_cap       = NULL_REG_CAP;
+    assign cheri_ex_valid         = 1'b0;
+    assign cheri_ex_err           = 1'b0;
+    assign cheri_ex_err_info      = 11'h0;
+    assign cheri_wb_err           = 1'b0;
+    assign cheri_wb_err_info      = 16'h0;
+    assign lsu_req                = rv32_lsu_req;  // LSU request is the RV32 request unchanged
+    assign lsu_is_cap             = 1'b0;
+    assign lsu_lc_clrperm         = 4'h0;
+    assign lsu_cheri_err          = 1'b0;
+    assign lsu_we                 = rv32_lsu_we;
+    assign lsu_addr               = alu_adder_result_ex;
+    assign lsu_type               = rv32_lsu_type;
+    assign lsu_wdata              = rv32_lsu_wdata;
+    assign lsu_wcap               = NULL_REG_CAP;
+    assign lsu_sign_ext           = rv32_lsu_sign_ext;
+    assign rv32_lsu_addr_incr_req = lsu_addr_incr_req;
+    assign rv32_lsu_addr_last     = lsu_addr_last;
+    assign cpu_stall_by_stkz      = 1'b0;  // was undriven; feeds load_store_unit_i
+    assign cpu_grant_to_stkz      = 1'b0;  // was undriven; feeds load_store_unit_i
+    assign cpu_lsu_dec            = 1'b0;
+    assign cheri_csr_access       = 1'b0;
+    assign cheri_csr_addr         = 5'h0;
+    assign cheri_csr_wdata        = 32'h0;
+    assign cheri_csr_wcap         = NULL_REG_CAP;
+    assign cheri_csr_op           = CHERI_CSR_NULL;
+    assign cheri_csr_op_en        = 1'b0;
+    assign cheri_csr_set_mie      = 1'b0;
+    assign cheri_csr_clr_mie      = 1'b0;
+    assign csr_mshwm_set          = 1'b0;
+    assign csr_mshwm_new          = 1'b0;
+    assign ztop_wr                = 1'b0;  // was undriven; feeds cheri_tbre_wrapper_i
+    assign ztop_wdata             = '0;    // was undriven; feeds cheri_tbre_wrapper_i
+    assign ztop_wfcap             = '0;    // was undriven; assumes a packed type - TODO confirm
+  end
+
+  /////////////////////////////
+  // cheri TS pipeline stage //
+  /////////////////////////////
+  if (CHERIoTEn & CheriPPLBC) begin : g_trvk_stage
+    cheri_trvk_stage #(
+      .HeapBase  (HeapBase),
+      .TSMapSize (TSMapSize)
+    ) cheri_trvk_stage_i (
+      .clk_i                 (clk_i),
+      .rst_ni                (rst_ni),
+      .rf_trsv_en_i          (rf_trsv_en),          // reservation request from u_cheri_ex
+      .rf_trsv_addr_i        (rf_trsv_addr_o),
+      .lsu_resp_valid_i      (lsu_resp_valid),      // LSU response side
+      .lsu_load_err_i        (lsu_load_err),
+      .rf_wdata_lsu_i        (rf_wdata_lsu[31:0]),
+      .rf_wcap_lsu_i         (rf_wcap_lsu),
+      .lsu_resp_is_wr_i      (lsu_resp_is_wr),
+      .lsu_tbre_resp_valid_i (lsu_tbre_resp_valid),
+      .lsu_tbre_resp_err_i   (lsu_tbre_resp_err),
+      .rf_trvk_addr_o        (rf_trvk_addr_o),      // rf_trvk_* are core outputs
+      .rf_trvk_en_o          (rf_trvk_en_o),
+      .rf_trvk_clrtag_o      (rf_trvk_clrtag_o),
+      .tbre_trvk_en_o        (tbre_trvk_en),        // consumed by cheri_tbre_wrapper_i
+      .tbre_trvk_clrtag_o    (tbre_trvk_clrtag),
+      .tsmap_cs_o            (tsmap_cs_o),          // tsmap_* are core-level ports
+      .tsmap_addr_o          (tsmap_addr_o),
+      .tsmap_rdata_i         (tsmap_rdata_i)
+    );
+  end else begin : g_no_trvk_stage
+    assign rf_trvk_addr_o   = 0;     // stage absent: tie off everything it would drive
+    assign rf_trvk_en_o     = 1'b0;
+    assign rf_trvk_clrtag_o = 1'b0;
+    assign tsmap_cs_o       = 1'b0;
+    assign tsmap_addr_o     = 0;
+    assign tbre_trvk_en     = 1'b0;  // was undriven; still read by cheri_tbre_wrapper_i
+    assign tbre_trvk_clrtag = 1'b0;  // was undriven; still read by cheri_tbre_wrapper_i
+  end
+
+  //////////////////////////////////////////
+  // cheri TS background revocation engine//
+  //////////////////////////////////////////
+  // The wrapper's ports also carry the stack-zeroing (stkz_*/ztop_*) signals used by cheri_ex.
+  logic snoop_lsu_req_done;
+  logic unmasked_intr;
+
+  assign snoop_lsu_req_done = lsu_req_done;  // plain alias of the LSU request-done flag
+  assign unmasked_intr = irq_pending_o & csr_mstatus_mie;  // pending IRQ while mstatus.MIE is set
+
+  cheri_tbre_wrapper #(
+    .CHERIoTEn   (CHERIoTEn),
+    .CheriTBRE   (CheriTBRE),
+    .CheriStkZ   (CheriStkZ),
+    .MMRegDinW   (MMRegDinW),
+    .MMRegDoutW  (MMRegDoutW)
+  ) cheri_tbre_wrapper_i (
+   // Clock and Reset
+    .clk_i                   (clk_i),
+    .rst_ni                  (rst_ni),
+    .mmreg_corein_i          (mmreg_corein_i),
+    .mmreg_coreout_o         (mmreg_coreout_o),
+    .lsu_tbre_resp_valid_i   (lsu_tbre_resp_valid),  // lsu_tbre_* come from load_store_unit_i
+    .lsu_tbre_resp_err_i     (lsu_tbre_resp_err),
+    .lsu_tbre_resp_is_wr_i   (lsu_resp_is_wr),       // uses the LSU's common lsu_resp_is_wr flag
+    .lsu_tbre_raw_lsw_i      (lsu_tbre_raw_lsw),
+    .lsu_tbre_req_done_i     (lsu_tbre_req_done),
+    .lsu_tbre_addr_incr_i    (lsu_tbre_addr_incr),
+    .lsu_tbre_sel_i          (lsu_tbre_sel),
+    .tbre_lsu_req_o          (tbre_lsu_req),         // tbre_lsu_* feed u_cheri_ex / the LSU
+    .tbre_lsu_is_cap_o       (tbre_lsu_is_cap),
+    .tbre_lsu_we_o           (tbre_lsu_we),
+    .tbre_lsu_addr_o         (tbre_lsu_addr),
+    .tbre_lsu_wdata_o        (tbre_lsu_wdata),
+    .snoop_lsu_req_done_i    (snoop_lsu_req_done),   // snoop_* observe the LSU request signals
+    .snoop_lsu_req_i         (lsu_req),
+    .snoop_lsu_is_cap_i      (lsu_is_cap),
+    .snoop_lsu_we_i          (lsu_we),
+    .snoop_lsu_cheri_err_i   (lsu_cheri_err),
+    .snoop_lsu_addr_i        (lsu_addr),
+    .trvk_en_i               (tbre_trvk_en),         // driven by cheri_trvk_stage_i when present
+    .trvk_clrtag_i           (tbre_trvk_clrtag),
+    .ztop_wr_i               (ztop_wr),              // ztop_w* are driven by u_cheri_ex
+    .ztop_wdata_i            (ztop_wdata),
+    .ztop_wfcap_i            (ztop_wfcap),
+    .ztop_rdata_o            (ztop_rdata),
+    .ztop_rcap_o             (ztop_rcap),
+    .unmasked_intr_i         (unmasked_intr),
+    .stkz_active_o           (stkz_active),          // stkz_* go to u_cheri_ex
+    .stkz_abort_o            (stkz_abort),
+    .stkz_ptr_o              (stkz_ptr),
+    .stkz_base_o             (stkz_base)
+  ) ;
+   
+   
+  /////////////////////
+  // Load/store unit //
+  /////////////////////
+  logic [32:0] data_wdata33, data_rdata33;
+  // The LSU always sees 33-bit data; the external bus is DataWidth bits wide.
+  assign data_req_o   = data_req_out & ~pmp_req_err[PMP_D];  // suppress the request on a PMP error
+  assign lsu_resp_err = lsu_load_err | lsu_store_err;
+  assign data_wdata_o = data_wdata33[DataWidth-1:0];         // keep only the low DataWidth bits
+  // Widen the read data to 33 bits: pass through when already 33 bits, else prepend a zero bit.
+  if (DataWidth == 33) begin : g_data_rdata_33b
+    assign data_rdata33 = data_rdata_i;
+  end else begin : g_data_rdata_zext
+    assign data_rdata33 = {1'b0, data_rdata_i};
+  end
+
+  cheriot_load_store_unit #(
+    .CHERIoTEn(CHERIoTEn),
+    .MemCapFmt(MemCapFmt),
+    .CheriTBRE(CheriTBRE),
+    .CheriCapIT8(CheriCapIT8)
+    ) load_store_unit_i (
+    .clk_i (clk_i),
+    .rst_ni(rst_ni),
+
+    .cheri_pmode_i (cheri_pmode_i),
+    // data interface
+    .data_req_o    (data_req_out),  // gated with the PMP data error to form data_req_o
+    .data_is_cap_o (data_is_cap_o),
+    .data_gnt_i    (data_gnt_i),
+    .data_rvalid_i (data_rvalid_i),
+    .data_err_i    (data_err_i),
+    .data_pmp_err_i(pmp_req_err[PMP_D]),
+
+    .data_addr_o (data_addr_o),
+    .data_we_o   (data_we_o),
+    .data_be_o   (data_be_o),
+    .data_wdata_o(data_wdata33),  // 33-bit internal view; low DataWidth bits drive data_wdata_o
+    .data_rdata_i(data_rdata33),  // data_rdata_i widened to 33 bits
+
+    // signals to/from ID/EX stage
+    .lsu_we_i      (lsu_we),
+    .lsu_type_i    (lsu_type),
+    .lsu_wdata_i   (lsu_wdata),
+    .lsu_wcap_i    (lsu_wcap),
+    .lsu_sign_ext_i(lsu_sign_ext),
+    .cpu_stall_by_stkz_i  (cpu_stall_by_stkz),
+    .cpu_grant_to_stkz_i  (cpu_grant_to_stkz),
+
+    .lsu_rdata_o      (rf_wdata_lsu),
+    .lsu_rcap_o       (rf_wcap_lsu),
+    .lsu_rdata_valid_o(rf_we_lsu),
+    .lsu_req_i        (lsu_req),
+    .lsu_is_cap_i     (lsu_is_cap),
+    .lsu_lc_clrperm_i (lsu_lc_clrperm),
+    .lsu_cheri_err_i  (lsu_cheri_err),
+    .lsu_addr_i       (lsu_addr),
+
+    .lsu_addr_incr_req_o(lsu_addr_incr_req),
+    .addr_last_o    (lsu_addr_last),  // also exported as crash_dump_o.last_data_addr
+
+    .lsu_req_done_o      (lsu_req_done),
+    .lsu_resp_valid_o (lsu_resp_valid),
+    .lsu_resp_is_wr_o      (lsu_resp_is_wr),
+    // TBRE-related signals: tbre_lsu_req and the lsu_tbre_* nets connect to cheri_tbre_wrapper_i
+    .tbre_lsu_req_i        (tbre_lsu_req),
+    .cpu_lsu_dec_i         (cpu_lsu_dec),
+    .lsu_tbre_sel_o        (lsu_tbre_sel),
+    .lsu_tbre_raw_lsw_o    (lsu_tbre_raw_lsw),
+    .lsu_tbre_req_done_o   (lsu_tbre_req_done),
+    .lsu_tbre_resp_valid_o (lsu_tbre_resp_valid),
+    .lsu_tbre_resp_err_o   (lsu_tbre_resp_err),
+    .lsu_tbre_addr_incr_req_o(lsu_tbre_addr_incr),
+
+    // exception signals
+    .load_err_o (lsu_load_err),   // OR-ed with store_err_o to form lsu_resp_err
+    .store_err_o(lsu_store_err),
+    .lsu_err_is_cheri_o(lsu_err_is_cheri),
+
+    .busy_o(lsu_busy),
+
+    .busy_tbre_o(lsu_busy_tbre),
+
+    .perf_load_o (perf_load),     // perf_* go to the performance counters in cs_registers_i
+    .perf_store_o(perf_store)
+  );
+
+  cheriot_wb_stage #(  // write-back stage: selects the register-file write source
+    .ResetAll       ( ResetAll       ),
+    .WritebackStage(WritebackStage)
+  ) wb_stage_i (
+    .clk_i                   (clk_i),
+    .rst_ni                  (rst_ni),
+    .en_wb_i                 (en_wb),
+    .instr_type_wb_i         (instr_type_wb),
+    .pc_id_i                 (pc_id),
+    .instr_is_compressed_id_i(instr_is_compressed_id),
+    .instr_perf_count_id_i   (instr_perf_count_id),
+    .instr_is_cheri_i        (instr_is_cheri_id),
+    .cheri_load_i            (cheri_load_id),
+    .cheri_store_i           (cheri_store_id),
+
+    .ready_wb_o                         (ready_wb),
+    .rf_write_wb_o                      (rf_write_wb),
+    .outstanding_load_wb_o              (outstanding_load_wb),
+    .outstanding_store_wb_o             (outstanding_store_wb),
+    .pc_wb_o                            (pc_wb),
+    .perf_instr_ret_wb_o                (perf_instr_ret_wb),
+    .perf_instr_ret_compressed_wb_o     (perf_instr_ret_compressed_wb),
+    .perf_instr_ret_wb_spec_o           (perf_instr_ret_wb_spec),
+    .perf_instr_ret_compressed_wb_spec_o(perf_instr_ret_compressed_wb_spec),
+    // Write request from the ID stage
+    .rf_waddr_id_i(rf_waddr_id),
+    .rf_wdata_id_i(rf_wdata_id),
+    .rf_we_id_i   (rf_we_id),
+    // Write request from cheri_ex (tied to zero / NULL when CHERIoTEn is 0)
+    .cheri_rf_we_i    (cheri_rf_we),
+    .cheri_rf_wdata_i (cheri_result_data),
+    .cheri_rf_wcap_i  (cheri_result_cap),
+    // Load data from the LSU; only the low 32 bits of rf_wdata_lsu are used here
+    .rf_wdata_lsu_i(rf_wdata_lsu[31:0]),
+    .rf_wcap_lsu_i(rf_wcap_lsu),
+    .rf_we_lsu_i   (rf_we_lsu),
+    // Forwarding values, consumed by u_cheri_ex (fwd_wdata_i / fwd_wcap_i)
+    .rf_wdata_fwd_wb_o(rf_wdata_fwd_wb),
+    .rf_wcap_fwd_wb_o (rf_wcap_fwd_wb),
+    // Final register-file write port
+    .rf_waddr_wb_o(rf_waddr_wb),
+    .rf_wdata_wb_o(rf_wdata_wb),
+    .rf_wcap_wb_o (rf_wcap_wb),
+    .rf_we_wb_o   (rf_we_wb),
+
+    .lsu_resp_valid_i(lsu_resp_valid),
+    .lsu_resp_err_i  (lsu_resp_err),
+
+    .instr_done_wb_o(instr_done_wb)
+  );
+
+  /////////////////////////////
+  // Register file interface //
+  /////////////////////////////
+  // The register file is outside this module; forward the internal address/enable nets to ports.
+  assign dummy_instr_id_o = dummy_instr_id;
+  assign rf_raddr_a_o     = rf_raddr_a;
+  assign rf_waddr_wb_o    = rf_waddr_wb;
+  assign rf_we_wb_o       = rf_we_wb;
+  assign rf_raddr_b_o     = rf_raddr_b;
+  // Capability part of the write-back value; the data part leaves via rf_wdata_wb_ecc_o.
+  assign rf_wcap_wb_o = rf_wcap_wb;
+
+  if (RegFileECC & CHERIoTEn) begin : gen_ecc_cheriot  // ECC that also covers the capability bits
+    logic [37:0] rf_wcap_vec, rf_rcap_a_vec, rf_rcap_b_vec;
+    logic  [1:0] rf_ecc_err_a, rf_ecc_err_b;
+    logic        rf_ecc_err_a_id, rf_ecc_err_b_id;
+    logic [31:0] wdata_tmp, rdata_a_tmp, rdata_b_tmp;
+    logic [31:0] unused_sig32_0, unused_sig32_1;
+    logic [38:0] wdata_ecc_tmp;
+
+    assign rf_wcap_vec  = reg2vec(rf_wcap_wb);
+
+    // ECC checkbit generation
+    // -- for simplicity just linearly add the parity bits together.
+    //    this is not as good as the full secded implementation (some double errors won't be detected)
+    //    but probably ok for protection against random fault injection
+
+    // include waddr and we in the ECC calculation
+    assign wdata_tmp    = rf_wdata_wb ^ rf_wcap_vec[31:0] ^ {20'h0, rf_we_wb, rf_waddr_wb, rf_wcap_vec[37:32]};
+    // assign wdata_tmp         = rf_wdata_wb ^ rf_wcap_vec[31:0] ^ {26'h0, rf_wcap_vec[37:32]};
+    assign rf_wdata_wb_ecc_o = {wdata_ecc_tmp[38:32], rf_wdata_wb};  // checkbits of the fold + raw data
+    prim_secded_inv_39_32_enc regfile_ecc_enc (
+      .data_i(wdata_tmp),
+      .data_o(wdata_ecc_tmp)
+    );
+
+    // generate parity bits for the TRSV/TRVK interface
+    prim_secded_inv_39_32_enc trsv_ecc_enc (
+      .data_i({26'h0, rf_trsv_en_o, rf_trsv_addr_o}),
+      .data_o({rf_trsv_par_o, unused_sig32_0})
+    );
+
+    prim_secded_inv_39_32_enc trvk_ecc_enc (
+      .data_i({25'h0, rf_trvk_en_o, rf_trvk_clrtag_o, rf_trvk_addr_o}),
+      .data_o({rf_trvk_par_o, unused_sig32_1})
+    );
+
+    // ECC checking on register file rdata
+    assign rf_rcap_a_vec = reg2vec(rf_rcap_a_i);
+    assign rf_rcap_b_vec = reg2vec(rf_rcap_b_i);
+    // Rebuild the write-side fold from the read data, using we = 1'b1 and the read address.
+    assign rdata_a_tmp = rf_rdata_a_ecc_i[31:0] ^ rf_rcap_a_vec[31:0] ^ {20'h0, 1'b1, rf_raddr_a, rf_rcap_a_vec[37:32]};
+    assign rdata_b_tmp = rf_rdata_b_ecc_i[31:0] ^ rf_rcap_b_vec[31:0] ^ {20'h0, 1'b1, rf_raddr_b, rf_rcap_b_vec[37:32]};
+
+    //assign rdata_a_tmp = rf_rdata_a_ecc_i[31:0] ^ rf_rcap_a_vec[31:0] ^ {26'h0, rf_rcap_a_vec[37:32]};
+    //assign rdata_b_tmp = rf_rdata_b_ecc_i[31:0] ^ rf_rcap_b_vec[31:0] ^ {26'h0, rf_rcap_b_vec[37:32]};
+    prim_secded_inv_39_32_dec regfile_ecc_dec_a (
+      .data_i    ({rf_rdata_a_ecc_i[38:32], rdata_a_tmp}),
+      .data_o    (),
+      .syndrome_o(),
+      .err_o     (rf_ecc_err_a)
+    );
+    prim_secded_inv_39_32_dec regfile_ecc_dec_b (
+      .data_i    ({rf_rdata_b_ecc_i[38:32], rdata_b_tmp}),
+      .data_o    (),
+      .syndrome_o(),
+      .err_o     (rf_ecc_err_b)
+    );
+
+    // Assign read outputs - no error correction, just trigger an alert
+    assign rf_rdata_a = rf_rdata_a_ecc_i[31:0];
+    assign rf_rdata_b = rf_rdata_b_ecc_i[31:0];
+
+    // Calculate errors - qualify with WB forwarding to avoid xprop into the alert signal
+    assign rf_ecc_err_a_id = |rf_ecc_err_a & rf_ren_a & ~rf_rd_a_wb_match;
+    assign rf_ecc_err_b_id = |rf_ecc_err_b & rf_ren_b & ~rf_rd_b_wb_match;
+
+    // Combined error
+    assign rf_ecc_err_comb = instr_valid_id & (rf_ecc_err_a_id | rf_ecc_err_b_id);
+
+  end else if (RegFileECC) begin : gen_regfile_ecc
+    // ECC over the 32-bit data only; the TRSV/TRVK parity outputs are tied to zero in this branch.
+    logic [1:0] rf_ecc_err_a, rf_ecc_err_b;
+    logic       rf_ecc_err_a_id, rf_ecc_err_b_id;
+
+    // ECC checkbit generation for register file wdata
+    prim_secded_inv_39_32_enc regfile_ecc_enc (
+      .data_i(rf_wdata_wb),
+      .data_o(rf_wdata_wb_ecc_o)
+    );
+
+    // ECC checking on register file rdata
+    prim_secded_inv_39_32_dec regfile_ecc_dec_a (
+      .data_i    (rf_rdata_a_ecc_i),
+      .data_o    (),
+      .syndrome_o(),
+      .err_o     (rf_ecc_err_a)
+    );
+    prim_secded_inv_39_32_dec regfile_ecc_dec_b (
+      .data_i    (rf_rdata_b_ecc_i),
+      .data_o    (),
+      .syndrome_o(),
+      .err_o     (rf_ecc_err_b)
+    );
+
+    // Assign read outputs - no error correction, just trigger an alert
+    assign rf_rdata_a = rf_rdata_a_ecc_i[31:0];
+    assign rf_rdata_b = rf_rdata_b_ecc_i[31:0];
+
+    // Calculate errors - qualify with WB forwarding to avoid xprop into the alert signal
+    assign rf_ecc_err_a_id = |rf_ecc_err_a & rf_ren_a & ~rf_rd_a_wb_match;
+    assign rf_ecc_err_b_id = |rf_ecc_err_b & rf_ren_b & ~rf_rd_b_wb_match;
+
+    // Combined error
+    assign rf_ecc_err_comb = instr_valid_id & (rf_ecc_err_a_id | rf_ecc_err_b_id);
+
+    assign rf_trvk_par_o = 7'h0;
+    assign rf_trsv_par_o = 7'h0;
+
+  end else begin : gen_no_regfile_ecc
+    // No ECC: pass the data through unchanged and never raise an ECC error.
+    logic unused_rf_ren_a, unused_rf_ren_b;
+    logic unused_rf_rd_a_wb_match, unused_rf_rd_b_wb_match;
+
+    assign unused_rf_ren_a         = rf_ren_a;
+    assign unused_rf_ren_b         = rf_ren_b;
+    assign unused_rf_rd_a_wb_match = rf_rd_a_wb_match;
+    assign unused_rf_rd_b_wb_match = rf_rd_b_wb_match;
+    assign rf_wdata_wb_ecc_o       = rf_wdata_wb;
+    assign rf_rdata_a              = rf_rdata_a_ecc_i;
+    assign rf_rdata_b              = rf_rdata_b_ecc_i;
+    assign rf_ecc_err_comb         = 1'b0;
+
+    assign rf_trvk_par_o = 7'h0;
+    assign rf_trsv_par_o = 7'h0;
+end  // gen_no_regfile_ecc
+
+  ///////////////////////
+  // Crash dump output //
+  ///////////////////////
+  // Snapshot of PC and address state, built purely from existing internal nets.
+  assign crash_dump_o.current_pc     = pc_id;
+  assign crash_dump_o.next_pc        = pc_if;
+  assign crash_dump_o.last_data_addr = lsu_addr_last;  // addr_last_o of load_store_unit_i
+  assign crash_dump_o.exception_addr = csr_mepc;       // csr_mepc_o of cs_registers_i
+
+  ///////////////////
+  // Alert outputs //
+  ///////////////////
+
+  // Minor alert - core is in a recoverable state (currently never raised: constant 0)
+  // TODO add I$ ECC errors here
+  assign alert_minor_o = 1'b0;
+
+  // Major alert - core is unrecoverable: register-file ECC error, PC mismatch or CSR shadow error
+  assign alert_major_o = rf_ecc_err_comb | pc_mismatch_alert | csr_shadow_err;
+
+  ////////////////////////
+  // RF (Register File) //
+  ////////////////////////
+`ifdef RVFI
+`endif
+
+
+  /////////////////////////////////////////
+  // CSRs (Control and Status Registers) //
+  /////////////////////////////////////////
+  // Write data is ALU operand A; the address is operand B[11:0], forced to 0 while csr_access is low.
+  assign csr_wdata  = alu_operand_a_ex;
+  assign csr_addr   = csr_num_e'(csr_access ? alu_operand_b_ex[11:0] : 12'b0);
+
+  cheriot_cs_registers #(
+    .DbgTriggerEn     (DbgTriggerEn),
+    .DbgHwBreakNum    (DbgHwBreakNum),
+    .DataIndTiming    (DataIndTiming),
+    .DummyInstructions(DummyInstructions),
+    .ShadowCSR        (ShadowCSR),
+    .ICache           (ICache),
+    .MHPMCounterNum   (MHPMCounterNum),
+    .MHPMCounterWidth (MHPMCounterWidth),
+    .PMPEnable        (PMPEnable),
+    .PMPGranularity   (PMPGranularity),
+    .PMPNumRegions    (PMPNumRegions),
+    .RV32E            (RV32E),
+    .RV32M            (RV32M),
+    .RV32B            (RV32B),
+    .CHERIoTEn        (CHERIoTEn)
+  ) cs_registers_i (
+    .clk_i (clk_i),
+    .rst_ni(rst_ni),
+
+    .cheri_pmode_i  (cheri_pmode_i),
+
+    // Hart ID from outside
+    .hart_id_i      (hart_id_i),
+    .priv_mode_id_o (priv_mode_id),
+    .priv_mode_lsu_o(priv_mode_lsu),
+
+    // mtvec
+    .csr_mtvec_o     (csr_mtvec),
+    .csr_mtvec_init_i(csr_mtvec_init),
+    .boot_addr_i     (boot_addr_i),
+
+    // Interface to CSRs (SRAM like)
+    .csr_access_i(csr_access),
+    .csr_addr_i  (csr_addr),
+    .csr_wdata_i (csr_wdata),
+    .csr_op_i    (csr_op),
+    .csr_op_en_i (csr_op_en),
+    .csr_rdata_o (csr_rdata),
+    // CHERI CSR access port, driven by u_cheri_ex (tied off when CHERIoTEn is 0)
+    .cheri_csr_access_i   (cheri_csr_access),
+    .cheri_csr_addr_i     (cheri_csr_addr),
+    .cheri_csr_wdata_i    (cheri_csr_wdata),
+    .cheri_csr_wcap_i     (cheri_csr_wcap),
+    .cheri_csr_op_i       (cheri_csr_op),
+    .cheri_csr_op_en_i    (cheri_csr_op_en),
+    .cheri_csr_set_mie_i  (cheri_csr_set_mie),
+    .cheri_csr_clr_mie_i  (cheri_csr_clr_mie),
+    .cheri_csr_rdata_o    (cheri_csr_rdata),
+    .cheri_csr_rcap_o     (cheri_csr_rcap),
+    // mshwm / mshwmb values out to u_cheri_ex, and its update request back in
+    .csr_mshwm_o          (csr_mshwm),
+    .csr_mshwmb_o         (csr_mshwmb),
+    .csr_mshwm_set_i      (csr_mshwm_set),
+    .csr_mshwm_new_i      (csr_mshwm_new),
+
+    // Interrupt related control signals
+    .irq_software_i   (irq_software_i),
+    .irq_timer_i      (irq_timer_i),
+    .irq_external_i   (irq_external_i),
+    .irq_fast_i       (irq_fast_i),
+    .nmi_mode_i       (nmi_mode),
+    .irq_pending_o    (irq_pending_o),
+    .irqs_o           (irqs),
+    .csr_mstatus_mie_o(csr_mstatus_mie),
+    .csr_mstatus_tw_o (csr_mstatus_tw),
+    .csr_mepc_o       (csr_mepc),
+
+    // PMP
+    .csr_pmp_cfg_o    (csr_pmp_cfg),
+    .csr_pmp_addr_o   (csr_pmp_addr),
+    .csr_pmp_mseccfg_o(csr_pmp_mseccfg),
+
+    // debug
+    .csr_depc_o         (csr_depc),
+    .debug_mode_i       (debug_mode),
+    .debug_cause_i      (debug_cause),
+    .debug_csr_save_i   (debug_csr_save),
+    .debug_single_step_o(debug_single_step),
+    .debug_ebreakm_o    (debug_ebreakm),
+    .debug_ebreaku_o    (debug_ebreaku),
+    .trigger_match_o    (trigger_match),
+
+    .pc_if_i(pc_if),
+    .pc_id_i(pc_id),
+    .pc_wb_i(pc_wb),
+
+    .data_ind_timing_o    (data_ind_timing),
+    .dummy_instr_en_o     (dummy_instr_en),
+    .dummy_instr_mask_o   (dummy_instr_mask),
+    .dummy_instr_seed_en_o(dummy_instr_seed_en),
+    .dummy_instr_seed_o   (dummy_instr_seed),
+    .icache_enable_o      (icache_enable),
+    .csr_shadow_err_o     (csr_shadow_err),  // contributes to alert_major_o
+
+    .csr_save_if_i     (csr_save_if),
+    .csr_save_id_i     (csr_save_id),
+    .csr_save_wb_i     (csr_save_wb),
+    .csr_restore_mret_i(csr_restore_mret_id),
+    .csr_restore_dret_i(csr_restore_dret_id),
+    .csr_save_cause_i  (csr_save_cause),
+    .csr_mepcc_clrtag_i   (csr_mepcc_clrtag),
+    .csr_mcause_i      (exc_cause),
+    .csr_mtval_i       (csr_mtval),
+    .illegal_csr_insn_o(illegal_csr_insn_id),
+
+    .double_fault_seen_o,  // implicit named connection to the identically named signal
+
+    // performance counter related signals
+    .instr_ret_i                (perf_instr_ret_wb),
+    .instr_ret_compressed_i     (perf_instr_ret_compressed_wb),
+    .instr_ret_spec_i           (perf_instr_ret_wb_spec),
+    .instr_ret_compressed_spec_i(perf_instr_ret_compressed_wb_spec),
+    .iside_wait_i               (perf_iside_wait),
+    .jump_i                     (perf_jump),
+    .branch_i                   (perf_branch),
+    .branch_taken_i             (perf_tbranch),
+    .mem_load_i                 (perf_load),
+    .mem_store_i                (perf_store),
+    .dside_wait_i               (perf_dside_wait),
+    .mul_wait_i                 (perf_mul_wait),
+    .div_wait_i                 (perf_div_wait),
+    // CHERI branch request/target and PCC exchange with u_cheri_ex
+    .cheri_branch_req_i     (cheri_branch_req),
+    .cheri_branch_target_i  (branch_target_ex_cheri),
+    .pcc_cap_i              (pcc_cap_w),
+    .pcc_cap_o              (pcc_cap_r),
+    .csr_dbg_tclr_fault_o   (csr_dbg_tclr_fault),
+    .cheri_fatal_err_o      (cheri_fatal_err_o)
+  );
+
+
+  if (PMPEnable) begin : g_pmp  // one PMP check channel each for PMP_I, PMP_I2 and PMP_D
+    logic [33:0] pmp_req_addr [PMP_NUM_CHAN];
+    pmp_req_e    pmp_req_type [PMP_NUM_CHAN];
+    priv_lvl_e   pmp_priv_lvl [PMP_NUM_CHAN];
+
+    assign pmp_req_addr[PMP_I]  = {2'b00, pc_if};  // 32-bit addresses are zero-extended to 34 bits
+    assign pmp_req_type[PMP_I]  = PMP_ACC_EXEC;
+    assign pmp_priv_lvl[PMP_I]  = priv_mode_id;
+    assign pmp_req_addr[PMP_I2] = {2'b00, (pc_if + 32'd2)};  // second exec channel checks pc_if + 2
+    assign pmp_req_type[PMP_I2] = PMP_ACC_EXEC;
+    assign pmp_priv_lvl[PMP_I2] = priv_mode_id;
+    assign pmp_req_addr[PMP_D]  = {2'b00, data_addr_o[31:0]};
+    assign pmp_req_type[PMP_D]  = data_we_o ? PMP_ACC_WRITE : PMP_ACC_READ;
+    assign pmp_priv_lvl[PMP_D]  = priv_mode_lsu;
+
+    cheriot_pmp #(
+      .PMPGranularity(PMPGranularity),
+      .PMPNumChan    (PMP_NUM_CHAN),
+      .PMPNumRegions (PMPNumRegions)
+    ) pmp_i (
+      .clk_i            (clk_i),
+      .rst_ni           (rst_ni),
+      // Interface to CSRs
+      .csr_pmp_cfg_i    (csr_pmp_cfg),
+      .csr_pmp_addr_i   (csr_pmp_addr),
+      .csr_pmp_mseccfg_i(csr_pmp_mseccfg),
+      .priv_mode_i      (pmp_priv_lvl),
+      // Access checking channels
+      .pmp_req_addr_i   (pmp_req_addr),
+      .pmp_req_type_i   (pmp_req_type),
+      .pmp_req_err_o    (pmp_req_err)  // pmp_req_err[PMP_D] gates data_req_o and feeds the LSU
+    );
+  end else begin : g_no_pmp
+    // Unused signal tieoff: sink the PMP-only CSR outputs into unused_* nets
+    priv_lvl_e unused_priv_lvl_ls;
+    logic [33:0] unused_csr_pmp_addr [PMPNumRegions];
+    pmp_cfg_t    unused_csr_pmp_cfg  [PMPNumRegions];
+    pmp_mseccfg_t unused_csr_pmp_mseccfg;
+    assign unused_priv_lvl_ls = priv_mode_lsu;
+    assign unused_csr_pmp_addr = csr_pmp_addr;
+    assign unused_csr_pmp_cfg = csr_pmp_cfg;
+    assign unused_csr_pmp_mseccfg = csr_pmp_mseccfg;
+
+    // Output tieoff: without a PMP no channel ever reports an error
+    assign pmp_req_err[PMP_I]  = 1'b0;
+    assign pmp_req_err[PMP_I2] = 1'b0;
+    assign pmp_req_err[PMP_D]  = 1'b0;
+  end
+
+`ifdef RVFI
+  // When writeback stage is present RVFI information is emitted when instruction is finished in
+  // third stage but some information must be captured whilst the instruction is in the second
+  // stage. Without writeback stage RVFI information is all emitted when instruction retires in
+  // second stage. RVFI outputs are all straight from flops. So 2 stage pipeline requires a single
+  // set of flops (instr_info => RVFI_out), 3 stage pipeline requires two sets (instr_info => wb
+  // => RVFI_out)
+  localparam int RVFI_STAGES = WritebackStage ? 2 : 1;
+
+  logic        rvfi_stage_valid     [RVFI_STAGES];
+  logic [63:0] rvfi_stage_order     [RVFI_STAGES];
+  logic [31:0] rvfi_stage_insn      [RVFI_STAGES];
+  logic        rvfi_stage_trap      [RVFI_STAGES];
+  logic        rvfi_stage_halt      [RVFI_STAGES];
+  logic        rvfi_stage_intr      [RVFI_STAGES];
+  logic [ 1:0] rvfi_stage_mode      [RVFI_STAGES];
+  logic [ 1:0] rvfi_stage_ixl       [RVFI_STAGES];
+  logic [ 4:0] rvfi_stage_rs1_addr  [RVFI_STAGES];
+  logic [ 4:0] rvfi_stage_rs2_addr  [RVFI_STAGES];
+  logic [ 4:0] rvfi_stage_rs3_addr  [RVFI_STAGES];
+  logic [31:0] rvfi_stage_rs1_rdata [RVFI_STAGES];
+  logic [31:0] rvfi_stage_rs2_rdata [RVFI_STAGES];
+  logic [31:0] rvfi_stage_rs3_rdata [RVFI_STAGES];
+  reg_cap_t    rvfi_stage_rs1_rcap [RVFI_STAGES];
+  reg_cap_t    rvfi_stage_rs2_rcap [RVFI_STAGES];
+  logic [ 4:0] rvfi_stage_rd_addr   [RVFI_STAGES];
+  logic [31:0] rvfi_stage_rd_wdata  [RVFI_STAGES];
+  reg_cap_t    rvfi_stage_rd_wcap [RVFI_STAGES];
+  logic [31:0] rvfi_stage_pc_rdata  [RVFI_STAGES];
+  logic [31:0] rvfi_stage_pc_wdata  [RVFI_STAGES];
+  logic [31:0] rvfi_stage_mem_addr  [RVFI_STAGES];
+  logic [ 3:0] rvfi_stage_mem_rmask [RVFI_STAGES];
+  logic [ 3:0] rvfi_stage_mem_wmask [RVFI_STAGES];
+  logic [DataWidth-1:0] rvfi_stage_mem_rdata [RVFI_STAGES];
+  reg_cap_t    rvfi_stage_mem_rcap  [RVFI_STAGES];
+  logic [DataWidth-1:0] rvfi_stage_mem_wdata [RVFI_STAGES];
+  reg_cap_t    rvfi_stage_mem_wcap  [RVFI_STAGES];
+  logic        rvfi_stage_mem_is_cap [RVFI_STAGES];
+
+  logic        rvfi_instr_new_wb;  // new instruction in WB (or in ID/EX without writeback stage)
+  logic        rvfi_intr_d;
+  logic        rvfi_intr_q;
+  logic        rvfi_set_trap_pc_d;
+  logic        rvfi_set_trap_pc_q;
+  logic [31:0] rvfi_insn_id;       // instruction word reported for the instruction in ID
+  logic [4:0]  rvfi_rs1_addr_d;
+  logic [4:0]  rvfi_rs1_addr_q;
+  logic [4:0]  rvfi_rs2_addr_d;
+  logic [4:0]  rvfi_rs2_addr_q;
+  logic [4:0]  rvfi_rs3_addr_d;    // rs3 has no registered (_q) copy
+  logic [31:0] rvfi_rs1_data_d;
+  logic [31:0] rvfi_rs1_data_q;
+  logic [31:0] rvfi_rs2_data_d;
+  logic [31:0] rvfi_rs2_data_q;
+  reg_cap_t    rvfi_rs1_cap_d;
+  reg_cap_t    rvfi_rs1_cap_q;
+  reg_cap_t    rvfi_rs2_cap_d;
+  reg_cap_t    rvfi_rs2_cap_q;
+  reg_cap_t    rvfi_rd_cap_d;
+  reg_cap_t    rvfi_rd_cap_q;
+  logic [31:0] rvfi_rs3_data_d;
+  logic [4:0]  rvfi_rd_addr_wb;    // register file write port taps (see assigns below)
+  logic [4:0]  rvfi_rd_addr_q;
+  logic [4:0]  rvfi_rd_addr_d;
+  logic [31:0] rvfi_rd_wdata_wb;
+  logic [31:0] rvfi_rd_wdata_d;
+  logic [31:0] rvfi_rd_wdata_q;
+  logic        rvfi_rd_we_wb;
+  logic [3:0]  rvfi_mem_mask_int;  // byte mask decoded from lsu_type
+  logic [DataWidth-1:0] rvfi_mem_rdata_d;
+  logic [DataWidth-1:0] rvfi_mem_rdata_q;
+  logic [DataWidth-1:0] rvfi_mem_wdata_d;
+  logic [DataWidth-1:0] rvfi_mem_wdata_q;
+  logic [31:0] rvfi_mem_addr_d;
+  logic [31:0] rvfi_mem_addr_q;
+  logic        rvfi_mem_is_cap_d;
+  logic        rvfi_mem_is_cap_q;
+  reg_cap_t    rvfi_mem_rcap_d;
+  reg_cap_t    rvfi_mem_rcap_q;
+  reg_cap_t    rvfi_mem_wcap_d;
+  reg_cap_t    rvfi_mem_wcap_q;
+  logic        rvfi_trap_id;
+  logic        rvfi_trap_wb;
+  logic [63:0] rvfi_stage_order_d; // next value of the retirement order counter
+  logic        rvfi_id_done;       // advances the first RVFI tracking stage
+  logic        rvfi_wb_done;       // advances the later RVFI tracking stage(s)
+
+  logic            new_debug_req;      // debug_req/nmi/irq inputs qualified with their enable terms
+  logic            new_nmi;
+  logic            new_irq;
+  cheriot_pkg::irqs_t captured_mip;    // irq/nmi/debug_req state latched while ID is empty
+  logic            captured_nmi;
+  logic            captured_debug_req;
+  logic            captured_valid;     // set while the captured_X values above are valid
+
+  // RVFI extension for co-simulation support
+  // debug_req and MIP captured at IF -> ID transition so one extra stage
+  cheriot_pkg::irqs_t rvfi_ext_stage_mip          [RVFI_STAGES+1];
+  logic            rvfi_ext_stage_nmi          [RVFI_STAGES+1];
+  logic            rvfi_ext_stage_debug_req    [RVFI_STAGES+1];
+  logic [63:0]     rvfi_ext_stage_mcycle       [RVFI_STAGES];
+
+  logic        rvfi_stage_valid_d   [RVFI_STAGES];  // next state of rvfi_stage_valid
+
+  logic        insn_c_hint;        // c.srli64/c.srai64 HINT detected on the RVFI output
+
+  assign rvfi_valid     = rvfi_stage_valid    [RVFI_STAGES-1];  // last tracking stage drives RVFI
+  assign rvfi_order     = rvfi_stage_order    [RVFI_STAGES-1];
+  assign rvfi_insn      = rvfi_stage_insn     [RVFI_STAGES-1];
+  assign rvfi_trap      = rvfi_stage_trap     [RVFI_STAGES-1];
+  assign rvfi_halt      = rvfi_stage_halt     [RVFI_STAGES-1];
+  assign rvfi_intr      = rvfi_stage_intr     [RVFI_STAGES-1];
+  assign rvfi_mode      = rvfi_stage_mode     [RVFI_STAGES-1];
+  assign rvfi_ixl       = rvfi_stage_ixl      [RVFI_STAGES-1];
+  assign rvfi_rs1_addr  = rvfi_stage_rs1_addr [RVFI_STAGES-1];
+  assign rvfi_rs2_addr  = rvfi_stage_rs2_addr [RVFI_STAGES-1];
+  assign rvfi_rs3_addr  = rvfi_stage_rs3_addr [RVFI_STAGES-1];
+  assign rvfi_rs1_rdata = rvfi_stage_rs1_rdata[RVFI_STAGES-1];
+  assign rvfi_rs2_rdata = rvfi_stage_rs2_rdata[RVFI_STAGES-1];
+  assign rvfi_rs1_rcap  = rvfi_stage_rs1_rcap [RVFI_STAGES-1];  // capability view of rs1
+  assign rvfi_rs2_rcap  = rvfi_stage_rs2_rcap [RVFI_STAGES-1];  // capability view of rs2
+  assign rvfi_rs3_rdata = rvfi_stage_rs3_rdata[RVFI_STAGES-1];
+  assign rvfi_rd_wdata  = rvfi_stage_rd_wdata [RVFI_STAGES-1];
+  assign rvfi_rd_wcap   = rvfi_stage_rd_wcap  [RVFI_STAGES-1];  // capability view of rd
+  assign rvfi_pc_rdata  = rvfi_stage_pc_rdata [RVFI_STAGES-1];
+  assign rvfi_pc_wdata  = rvfi_stage_pc_wdata [RVFI_STAGES-1];
+  assign rvfi_mem_addr  = rvfi_stage_mem_addr [RVFI_STAGES-1];
+  assign rvfi_mem_rmask = rvfi_stage_mem_rmask[RVFI_STAGES-1];
+  assign rvfi_mem_wmask = rvfi_stage_mem_wmask[RVFI_STAGES-1];
+  assign rvfi_mem_rdata = rvfi_stage_mem_rdata[RVFI_STAGES-1];
+  assign rvfi_mem_wdata = rvfi_stage_mem_wdata[RVFI_STAGES-1];
+  assign rvfi_mem_is_cap = rvfi_stage_mem_is_cap[RVFI_STAGES-1];  // memory access was a capability
+  assign rvfi_mem_rcap  = rvfi_stage_mem_rcap[RVFI_STAGES-1];
+  assign rvfi_mem_wcap  = rvfi_stage_mem_wcap[RVFI_STAGES-1];
+
+  // For the HINT encodings c.srli64/c.srai64 (c.srli/c.srai with a zero shift amount), force the
+  // rvfi_rd_addr output to 0 to match the sail implementation.
+  assign rvfi_rd_addr   = insn_c_hint ? 0 : rvfi_stage_rd_addr [RVFI_STAGES-1];
+
+  // Detect a retiring c.srli64/c.srai64: quadrant 01, funct3 == 100, bits [11:10] == 00 (srli) or
+  // 01 (srai), shift amount {insn[12], insn[6:2]} == 0, and rd == rs1 with unchanged data.
+  // The rd comparison must use the raw RVFI stage value: rvfi_rd_addr is itself a function of
+  // insn_c_hint, so comparing against it forms a combinational loop that cannot settle (the
+  // comparison flips as soon as the address is forced to 0).
+  always_comb begin
+    insn_c_hint = 1'b0;
+    if ((rvfi_insn[1:0] == 2'b01) && (rvfi_insn[15:13] == 3'b100) &&
+        ((rvfi_insn[11:10] == 2'b00) || (rvfi_insn[11:10] == 2'b01)) &&   // c.srli64 / c.srai64
+        ({rvfi_insn[12], rvfi_insn[6:2]} == 6'h0) &&
+        (rvfi_rs1_addr == rvfi_stage_rd_addr[RVFI_STAGES-1]) && (rvfi_rs1_rdata == rvfi_rd_wdata))
+      insn_c_hint = 1'b1;
+  end
+
+  assign rvfi_rd_addr_wb  = rf_waddr_wb;
+  assign rvfi_rd_wdata_wb = rf_we_wb ? rf_wdata_wb : rf_wdata_lsu; // LSU data whenever rf_we_wb is low (flagged as suspicious by the original author)
+  assign rvfi_rd_we_wb    = rf_we_wb | rf_we_lsu;  // a write from either source counts
+
+  always_comb begin
+    // Use always_comb instead of continuous assign so the first assignment can set 0 as the default
+    // everywhere, which is then overridden by the more specific bit assignments below.
+    rvfi_ext_mip                                     = '0;
+    rvfi_ext_mip[CSR_MSIX_BIT]                       = rvfi_ext_stage_mip[RVFI_STAGES].irq_software;
+    rvfi_ext_mip[CSR_MTIX_BIT]                       = rvfi_ext_stage_mip[RVFI_STAGES].irq_timer;
+    rvfi_ext_mip[CSR_MEIX_BIT]                       = rvfi_ext_stage_mip[RVFI_STAGES].irq_external;
+    rvfi_ext_mip[CSR_MFIX_BIT_HIGH:CSR_MFIX_BIT_LOW] = rvfi_ext_stage_mip[RVFI_STAGES].irq_fast;
+  end
+
+  assign rvfi_ext_nmi       = rvfi_ext_stage_nmi[RVFI_STAGES];       // mip/nmi/debug_req arrays have one extra stage
+  assign rvfi_ext_debug_req = rvfi_ext_stage_debug_req[RVFI_STAGES];
+  assign rvfi_ext_mcycle    = rvfi_ext_stage_mcycle[RVFI_STAGES-1];  // mcycle array has RVFI_STAGES entries
+
+  // When an instruction takes a trap the `rvfi_trap` signal will be set. Instructions that take
+  // traps flush the pipeline so ordinarily wouldn't be seen to be retired. The RVFI tracking
+  // pipeline is kept going for flushed instructions that trapped so they are still visible on the
+  // RVFI interface.
+
+  // Factor in exceptions taken in ID so RVFI tracking picks up flushed instructions that took
+  // a trap
+  // kliu 05082024: add the ~wb_exception_o term to handle the corner case where
+  // ID and WB both faulted, e.g., illegal_insn in ID and cheri_wb_err in WB
+  // The previous behavior is 2 rvfi items in the trace (both traps),
+  // even if the instruction in the ID is never executed.
+  // The new behavior only generates 1 rvfi item, for the WB stage fault
+  assign rvfi_id_done = instr_id_done | (id_stage_i.controller_i.rvfi_flush_next &
+                                         id_stage_i.controller_i.id_exception_o &
+                                         ~id_stage_i.controller_i.wb_exception_o);
+
+  if (WritebackStage) begin : gen_rvfi_wb_stage
+    logic unused_instr_new_id;
+
+    assign unused_instr_new_id = instr_new_id;
+
+    // With writeback stage first RVFI stage buffers instruction information captured in ID/EX
+    // awaiting instruction retirement and RF Write data/Mem read data whilst instruction is in WB
+    // So first stage becomes valid when instruction leaves ID/EX stage and remains valid until
+    // instruction leaves WB
+    assign rvfi_stage_valid_d[0] = (rvfi_id_done & ~dummy_instr_id) |
+                                   (rvfi_stage_valid[0] & ~rvfi_wb_done);
+    // Second stage is output stage so simple valid cycle after instruction leaves WB (and so has
+    // retired)
+    assign rvfi_stage_valid_d[1] = rvfi_wb_done;
+
+    // Signal new instruction in WB cycle after instruction leaves ID/EX (to enter WB)
+    logic rvfi_instr_new_wb_q;
+
+    // Signal new instruction in WB either when one has just entered or when a trap is progressing
+    // through the tracking pipeline
+    assign rvfi_instr_new_wb = rvfi_instr_new_wb_q | (rvfi_stage_valid[0] & rvfi_stage_trap[0]);
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        rvfi_instr_new_wb_q <= 0;
+      end else begin
+        rvfi_instr_new_wb_q <= rvfi_id_done;  // one-cycle delayed copy of rvfi_id_done
+      end
+    end
+
+    assign rvfi_trap_id = id_stage_i.controller_i.id_exception_o;
+    assign rvfi_trap_wb = id_stage_i.controller_i.exc_req_wb ;
+    // WB is instantly done in the tracking pipeline when a trap is progressing through the pipeline
+    assign rvfi_wb_done = instr_done_wb | (rvfi_stage_valid[0] & rvfi_stage_trap[0]);
+  end else begin : gen_rvfi_no_wb_stage
+    // Without writeback stage first RVFI stage is output stage so simply valid the cycle after
+    // instruction leaves ID/EX (and so has retired)
+    assign rvfi_stage_valid_d[0] = rvfi_id_done & ~dummy_instr_id;
+    // Without writeback stage signal new instr_new_wb when instruction enters ID/EX to correctly
+    // setup register write signals
+    assign rvfi_instr_new_wb = instr_new_id;
+    assign rvfi_trap_id = id_stage_i.controller_i.exc_req_d | id_stage_i.controller_i.exc_req_lsu;
+    assign rvfi_trap_wb = 1'b0;  // no separate WB trap source in this configuration
+    assign rvfi_wb_done = instr_done_wb;
+  end
+
+  assign rvfi_stage_order_d = dummy_instr_id ? rvfi_stage_order[0] : rvfi_stage_order[0] + 64'd1;  // dummy instructions do not advance the order count
+
+  // For interrupts and debug Ibex will take the relevant trap as soon as whatever instruction in ID
+  // finishes or immediately if the ID stage is empty. The rvfi_ext interface provides the DV
+  // environment with information about the irq/debug_req/nmi state that applies to a particular
+  // instruction.
+  //
+  // When an irq/debug_req/nmi appears the ID stage will finish whatever instruction it is currently
+  // executing (if any) then take the trap the cycle after that instruction leaves the ID stage. The
+  // trap taken depends upon the state of irq/debug_req/nmi on that cycle. In the cycles following
+  // that before the first instruction of the trap handler enters the ID stage the state of
+  // irq/debug_req/nmi could change but this has no effect on the trap handler (e.g. a higher
+  // priority interrupt might appear but this wouldn't stop the lower priority interrupt trap
+  // handler executing first as it's already being fetched). To provide the DV environment with the
+  // correct information for it to verify execution we need to capture the irq/debug_req/nmi state
+  // the cycle the trap decision is made, which is what the captured_X signals below do.
+  //
+  // The new_X signals take the raw irq/debug_req/nmi inputs and factor in the enable terms required
+  // to determine if a trap will actually happen.
+  //
+  // These signals and the comment above are referred to in the documentation (cosim.rst). If
+  // altering the names or meanings of these signals or this comment please adjust the documentation
+  // appropriately.
+  assign new_debug_req = (debug_req_i & ~debug_mode);
+  assign new_nmi = irq_nm_i & ~nmi_mode & ~debug_mode;
+  assign new_irq = irq_pending_o & csr_mstatus_mie & ~nmi_mode & ~debug_mode;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      captured_valid     <= 1'b0;
+      captured_mip       <= '0;
+      captured_nmi       <= 1'b0;
+      captured_debug_req <= 1'b0;
+    end else  begin
+      // Capture when ID stage has emptied out and something occurs that will cause a trap and we
+      // haven't yet captured
+      if (~instr_valid_id & (new_debug_req | new_irq | new_nmi) & ~captured_valid) begin
+        captured_valid     <= 1'b1;
+        captured_nmi       <= irq_nm_i;            // raw inputs are stored, not the qualified new_X
+        captured_mip       <= cs_registers_i.mip;
+        captured_debug_req <= debug_req_i;
+      end
+
+      // Capture cleared out as soon as a new instruction appears in ID
+      if (if_stage_i.instr_valid_id_d) begin
+        captured_valid <= 1'b0;  // later assignment: the clear wins if both conditions hold
+      end
+    end
+  end
+
+  // Feed the first stage of the rvfi_ext tracking pipeline with the irq/debug_req/nmi state.
+  //
+  // The captured_X values are only used when the ID stage was empty and a capture is valid. In
+  // every other case the raw irq/debug_req/nmi inputs are forwarded, whatever they are: should the
+  // capture logic (and its enable terms) be faulty, the DV environment then still observes that a
+  // trap should have been taken but wasn't.
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      rvfi_ext_stage_debug_req[0] <= '0;
+      rvfi_ext_stage_nmi[0]       <= '0;
+      rvfi_ext_stage_mip[0]       <= '0;
+    end else if (if_stage_i.instr_valid_id_d & if_stage_i.instr_new_id_d) begin
+      rvfi_ext_stage_mip[0]       <= (captured_valid & ~instr_valid_id) ? captured_mip :
+                                                                          cs_registers_i.mip;
+      rvfi_ext_stage_nmi[0]       <= (captured_valid & ~instr_valid_id) ? captured_nmi :
+                                                                          irq_nm_i;
+      rvfi_ext_stage_debug_req[0] <= (captured_valid & ~instr_valid_id) ? captured_debug_req :
+                                                                          debug_req_i;
+    end
+  end
+
+  logic is_mem_rd, is_mem_wr;
+  assign is_mem_rd = lsu_req & ~lsu_we;  // LSU request without write enable
+  assign is_mem_wr = lsu_req & lsu_we;   // LSU request with write enable
+
+  for (genvar i = 0; i < RVFI_STAGES; i = i + 1) begin : g_rvfi_stages
+    // Index of the previous tracking stage, clamped to 0 for the first stage. Indexing through a
+    // variable instead of [i-1] avoids the VCS elaboration warning (i-1 out of range when i==0).
+    int im1;
+    assign im1 = (i == 0) ? 0 : i - 1;
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        rvfi_stage_halt[i]            <= '0;
+        rvfi_stage_trap[i]            <= '0;
+        rvfi_stage_intr[i]            <= '0;
+        rvfi_stage_order[i]           <= '0;
+        rvfi_stage_insn[i]            <= '0;
+        rvfi_stage_mode[i]            <= {PRIV_LVL_M};
+        rvfi_stage_ixl[i]             <= CSR_MISA_MXL;
+        rvfi_stage_rs1_addr[i]        <= '0;
+        rvfi_stage_rs2_addr[i]        <= '0;
+        rvfi_stage_rs3_addr[i]        <= '0;
+        rvfi_stage_pc_rdata[i]        <= '0;
+        rvfi_stage_pc_wdata[i]        <= '0;
+        rvfi_stage_mem_rmask[i]       <= '0;
+        rvfi_stage_mem_wmask[i]       <= '0;
+        rvfi_stage_valid[i]           <= '0;
+        rvfi_stage_rs1_rdata[i]       <= '0;
+        rvfi_stage_rs2_rdata[i]       <= '0;
+        rvfi_stage_rs3_rdata[i]       <= '0;
+        rvfi_stage_rs1_rcap[i]        <= NULL_REG_CAP;
+        rvfi_stage_rs2_rcap[i]        <= NULL_REG_CAP;
+        rvfi_stage_rd_wdata[i]        <= '0;
+        rvfi_stage_rd_wcap[i]         <= NULL_REG_CAP;
+        rvfi_stage_rd_addr[i]         <= '0;
+        rvfi_stage_mem_rdata[i]       <= '0;
+        rvfi_stage_mem_wdata[i]       <= '0;
+        rvfi_stage_mem_rcap[i]        <= NULL_REG_CAP;  // capability flops are reset like the rest
+        rvfi_stage_mem_wcap[i]        <= NULL_REG_CAP;
+        rvfi_stage_mem_is_cap[i]      <= '0;
+        rvfi_stage_mem_addr[i]        <= '0;
+        rvfi_ext_stage_mip[i+1]       <= '0;
+        rvfi_ext_stage_nmi[i+1]       <= '0;
+        rvfi_ext_stage_debug_req[i+1] <= '0;
+        rvfi_ext_stage_mcycle[i]      <= '0;
+      end else begin
+        rvfi_stage_valid[i] <= rvfi_stage_valid_d[i];
+
+        if (i == 0) begin
+          if (rvfi_id_done) begin
+            rvfi_stage_halt[i]      <= '0;
+            // TODO: Sort this out for writeback stage
+            rvfi_stage_trap[i]            <= rvfi_trap_id;
+            rvfi_stage_intr[i]            <= rvfi_intr_d;
+            rvfi_stage_order[i]           <= rvfi_stage_order_d;
+            rvfi_stage_insn[i]            <= rvfi_insn_id;
+            rvfi_stage_mode[i]            <= {priv_mode_id};
+            rvfi_stage_ixl[i]             <= CSR_MISA_MXL;
+            rvfi_stage_rs1_addr[i]        <= rvfi_rs1_addr_d;
+            rvfi_stage_rs2_addr[i]        <= rvfi_rs2_addr_d;
+            rvfi_stage_rs3_addr[i]        <= rvfi_rs3_addr_d;
+            rvfi_stage_pc_rdata[i]        <= pc_id;
+            rvfi_stage_pc_wdata[i]        <= pc_set ? branch_target_ex : pc_if;
+            rvfi_stage_mem_rmask[i]       <= is_mem_rd ? rvfi_mem_mask_int : 4'b0000;  // kliu
+            rvfi_stage_mem_wmask[i]       <= is_mem_wr ? rvfi_mem_mask_int : 4'b0000;
+            rvfi_stage_rs1_rdata[i]       <= rvfi_rs1_data_d;
+            rvfi_stage_rs2_rdata[i]       <= rvfi_rs2_data_d;
+            rvfi_stage_rs3_rdata[i]       <= rvfi_rs3_data_d;
+            rvfi_stage_rs1_rcap[i]        <= rvfi_rs1_cap_d;
+            rvfi_stage_rs2_rcap[i]        <= rvfi_rs2_cap_d;
+            rvfi_stage_rd_addr[i]         <= rvfi_rd_addr_d;
+            rvfi_stage_rd_wdata[i]        <= rvfi_rd_wdata_d;
+            rvfi_stage_rd_wcap[i]         <= rvfi_rd_cap_d;
+            rvfi_stage_mem_rdata[i]       <= rvfi_mem_rdata_d;
+            rvfi_stage_mem_wdata[i]       <= rvfi_mem_wdata_d;
+            rvfi_stage_mem_rcap[i]        <= rvfi_mem_rcap_d;
+            rvfi_stage_mem_wcap[i]        <= rvfi_mem_wcap_d;
+            rvfi_stage_mem_is_cap[i]      <= rvfi_mem_is_cap_d;
+            rvfi_stage_mem_addr[i]        <= rvfi_mem_addr_d;
+            rvfi_ext_stage_mip[i+1]       <= rvfi_ext_stage_mip[i];
+            rvfi_ext_stage_nmi[i+1]       <= rvfi_ext_stage_nmi[i];
+            rvfi_ext_stage_debug_req[i+1] <= rvfi_ext_stage_debug_req[i];
+            rvfi_ext_stage_mcycle[i]      <= cs_registers_i.mcycle_counter_i.counter_val_o;
+          end
+        end else begin
+          if (rvfi_wb_done) begin
+            rvfi_stage_halt[i]      <= rvfi_stage_halt[im1];
+            rvfi_stage_trap[i]      <= rvfi_stage_trap[im1] | rvfi_trap_wb;
+            rvfi_stage_intr[i]      <= rvfi_stage_intr[im1];
+            rvfi_stage_order[i]     <= rvfi_stage_order[im1];
+            rvfi_stage_insn[i]      <= rvfi_stage_insn[im1];
+            rvfi_stage_mode[i]      <= rvfi_stage_mode[im1];
+            rvfi_stage_ixl[i]       <= rvfi_stage_ixl[im1];
+            rvfi_stage_rs1_addr[i]  <= rvfi_stage_rs1_addr[im1];
+            rvfi_stage_rs2_addr[i]  <= rvfi_stage_rs2_addr[im1];
+            rvfi_stage_rs3_addr[i]  <= rvfi_stage_rs3_addr[im1];
+            rvfi_stage_pc_rdata[i]  <= rvfi_stage_pc_rdata[im1];
+            rvfi_stage_pc_wdata[i]  <= rvfi_stage_pc_wdata[im1];
+            rvfi_stage_mem_rmask[i] <= rvfi_trap_wb ? 4'b0000 : rvfi_stage_mem_rmask[im1];
+            rvfi_stage_mem_wmask[i] <= rvfi_trap_wb ? 4'b0000 : rvfi_stage_mem_wmask[im1];
+            rvfi_stage_rs1_rdata[i] <= rvfi_stage_rs1_rdata[im1];
+            rvfi_stage_rs2_rdata[i] <= rvfi_stage_rs2_rdata[im1];
+            rvfi_stage_rs3_rdata[i] <= rvfi_stage_rs3_rdata[im1];
+            rvfi_stage_mem_wdata[i] <= rvfi_stage_mem_wdata[im1];
+            rvfi_stage_mem_is_cap[i] <= rvfi_stage_mem_is_cap[im1];
+            rvfi_stage_mem_wcap[i]   <= rvfi_stage_mem_wcap[im1];
+            rvfi_stage_mem_addr[i]  <= rvfi_stage_mem_addr[im1];
+            rvfi_stage_rs1_rcap[i]  <= rvfi_stage_rs1_rcap[im1];
+            rvfi_stage_rs2_rcap[i]  <= rvfi_stage_rs2_rcap[im1];
+
+            // For 2 RVFI_STAGES/Writeback Stage ignore first stage flops for rd_addr, rd_wdata and
+            // mem_rdata. For RF write addr/data actual write happens in writeback so capture
+            // address/data there. For mem_rdata that is only available from the writeback stage.
+            // Previous stage flops still exist in RTL as they are used by the non writeback config
+            rvfi_stage_rd_addr[i]   <= rvfi_rd_addr_d;
+            rvfi_stage_rd_wdata[i]  <= rvfi_rd_wdata_d;
+            rvfi_stage_mem_rdata[i] <= rvfi_mem_rdata_d;
+            rvfi_stage_mem_rcap[i]  <= rvfi_mem_rcap_d;
+            rvfi_stage_rd_wcap[i]   <= rvfi_rd_cap_d;
+
+            rvfi_ext_stage_mip[i+1]       <= rvfi_ext_stage_mip[i];
+            rvfi_ext_stage_nmi[i+1]       <= rvfi_ext_stage_nmi[i];
+            rvfi_ext_stage_debug_req[i+1] <= rvfi_ext_stage_debug_req[i];
+            rvfi_ext_stage_mcycle[i]      <= rvfi_ext_stage_mcycle[im1];
+          end
+        end
+      end
+    end
+  end
+
+  // Memory address, write data, write capability and is-capability flag for the RVFI record.
+  //  - When CheriTBRE is 0 they are sampled in the first cycle of the instruction in ID.
+  //  - When CheriTBRE is 1 they are sampled on an LSU request (lsu_req) that is qualified by
+  //    cpu_lsu_dec and is not an address-increment request (lsu_addr_incr_req).
+  // In every other cycle the previously sampled values are held.
+  always_comb begin
+    // Default: hold the registered values
+    rvfi_mem_addr_d    = rvfi_mem_addr_q;
+    rvfi_mem_wdata_d   = rvfi_mem_wdata_q;
+    rvfi_mem_wcap_d    = rvfi_mem_wcap_q;
+    rvfi_mem_is_cap_d  = rvfi_mem_is_cap_q;
+
+    if ((~CheriTBRE & instr_first_cycle_id) ||
+        (CheriTBRE & lsu_req & cpu_lsu_dec & ~lsu_addr_incr_req)) begin
+      rvfi_mem_addr_d    = lsu_addr;
+      rvfi_mem_wdata_d   = lsu_wdata;
+      rvfi_mem_wcap_d    = lsu_wcap;
+      rvfi_mem_is_cap_d  = lsu_is_cap;
+    end
+  end
+
+  // Track the LSU read response. The read data is taken on every valid response; the read
+  // capability is only updated when the response is flagged as a capability
+  // (load_store_unit_i.resp_is_cap_q). Without a valid response both values are held.
+  always_comb begin
+    rvfi_mem_rdata_d = rvfi_mem_rdata_q;
+    rvfi_mem_rcap_d  = rvfi_mem_rcap_q;
+    if (lsu_resp_valid) begin
+      rvfi_mem_rdata_d = rf_wdata_lsu;
+      if (load_store_unit_i.resp_is_cap_q) begin
+        rvfi_mem_rcap_d = rf_wcap_lsu;
+      end
+    end
+  end
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin  // holding registers for the rvfi_mem_*_d values
+    if (!rst_ni) begin
+      rvfi_mem_addr_q  <= '0;
+      rvfi_mem_rdata_q <= '0;
+      rvfi_mem_wdata_q <= '0;
+      rvfi_mem_rcap_q  <= NULL_REG_CAP;
+      rvfi_mem_wcap_q  <= NULL_REG_CAP;
+      rvfi_mem_is_cap_q <= 1'b0;
+    end else begin
+      rvfi_mem_addr_q  <= rvfi_mem_addr_d;  // updated every cycle; the hold is done in the _d logic
+      rvfi_mem_rdata_q <= rvfi_mem_rdata_d;
+      rvfi_mem_wdata_q <= rvfi_mem_wdata_d;
+      rvfi_mem_rcap_q  <= rvfi_mem_rcap_d;
+      rvfi_mem_wcap_q  <= rvfi_mem_wcap_d;
+      rvfi_mem_is_cap_q <=rvfi_mem_is_cap_d;
+    end
+  end
+  // Byte enable based on data type (lsu_type); used for both the read and the write mask
+  always_comb begin
+    unique case (lsu_type)
+      2'b00:   rvfi_mem_mask_int = 4'b1111;  // all four bytes
+      2'b01:   rvfi_mem_mask_int = 4'b0011;  // lower two bytes
+      2'b10:   rvfi_mem_mask_int = 4'b0001;  // lowest byte
+      2'b11:   rvfi_mem_mask_int = 4'b0001;   // kliu - same mask as 2'b10
+      default: rvfi_mem_mask_int = 4'b0000;  // all 2-bit values are listed; presumably only hit for X/Z
+    endcase
+  end
+
+  // Instruction word for RVFI; a compressed instruction reports instr_rdata_c_id behind 16 zero bits
+  always_comb begin
+    rvfi_insn_id = instr_rdata_id;
+    if (instr_is_compressed_id) begin
+      rvfi_insn_id = {16'b0, instr_rdata_c_id};
+    end
+  end
+
+  // Source registers 1 and 2 are read in the first instruction cycle
+  // Source register 3 is read in the second instruction cycle.
+  if (CHERIoTEn) begin
+    always_comb begin
+      if (instr_first_cycle_id) begin
+        rvfi_rs1_cap_d  = rf_ren_a ? g_cheri_ex.u_cheri_ex.rf_rcap_a : NULL_REG_CAP;  // null cap when port A is not read
+        rvfi_rs2_cap_d  = rf_ren_b ? g_cheri_ex.u_cheri_ex.rf_rcap_b : NULL_REG_CAP;  // null cap when port B is not read
+      end else begin
+        rvfi_rs1_cap_d  = rvfi_rs1_cap_q;  // hold after the first cycle
+        rvfi_rs2_cap_d  = rvfi_rs2_cap_q;
+      end
+    end
+  end else begin
+    assign rvfi_rs1_cap_d  = NULL_REG_CAP;  // without CHERIoT the source capabilities are always null
+    assign rvfi_rs2_cap_d  = NULL_REG_CAP;
+  end
+
+  // rs1/rs2 are sampled in the first instruction cycle (zeroed when not read); rs3 afterwards.
+  always_comb begin
+    rvfi_rs1_data_d = rvfi_rs1_data_q;
+    rvfi_rs1_addr_d = rvfi_rs1_addr_q;
+    rvfi_rs2_data_d = rvfi_rs2_data_q;
+    rvfi_rs2_addr_d = rvfi_rs2_addr_q;
+    rvfi_rs3_data_d = multdiv_operand_a_ex;
+    rvfi_rs3_addr_d = rf_raddr_a;
+    if (instr_first_cycle_id) begin
+      rvfi_rs1_data_d = rf_ren_a ? multdiv_operand_a_ex : '0;
+      rvfi_rs1_addr_d = rf_ren_a ? rf_raddr_a : '0;
+      rvfi_rs2_data_d = rf_ren_b ? multdiv_operand_b_ex : '0;
+      rvfi_rs2_addr_d = rf_ren_b ? rf_raddr_b : '0;
+      rvfi_rs3_data_d = '0;
+      rvfi_rs3_addr_d = '0;
+    end
+  end
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin  // holding registers for rs1/rs2 (rs3 has none)
+    if (!rst_ni) begin
+      rvfi_rs1_data_q <= '0;
+      rvfi_rs1_addr_q <= '0;
+      rvfi_rs2_data_q <= '0;
+      rvfi_rs2_addr_q <= '0;
+      rvfi_rs1_cap_q  <= NULL_REG_CAP;
+      rvfi_rs2_cap_q  <= NULL_REG_CAP;
+    end else begin
+      rvfi_rs1_data_q <= rvfi_rs1_data_d;  // updated every cycle; the hold is done in the _d logic
+      rvfi_rs1_addr_q <= rvfi_rs1_addr_d;
+      rvfi_rs2_data_q <= rvfi_rs2_data_d;
+      rvfi_rs2_addr_q <= rvfi_rs2_addr_d;
+      rvfi_rs1_cap_q  <= rvfi_rs1_cap_d;
+      rvfi_rs2_cap_q  <= rvfi_rs2_cap_d;
+    end
+  end
+
+  always_comb begin
+    if (rvfi_rd_we_wb) begin
+      // Capture address/data of write to register file
+      rvfi_rd_addr_d = rvfi_rd_addr_wb;
+      // If writing to x0 zero write data (and capability) as required by RVFI specification
+      if (rvfi_rd_addr_wb == 5'b0) begin
+        rvfi_rd_wdata_d = '0;
+        rvfi_rd_cap_d   = NULL_REG_CAP;
+      end else begin
+        rvfi_rd_wdata_d = rvfi_rd_wdata_wb;
+        rvfi_rd_cap_d   = rf_wcap_wb;  // capability written alongside the data
+      end
+    end else if (rvfi_instr_new_wb) begin
+      // If no RF write but new instruction in Writeback (when present) or ID/EX (when no writeback
+      // stage present) then zero RF write address/data as required by RVFI specification
+      rvfi_rd_addr_d  = '0;
+      rvfi_rd_wdata_d = '0;
+      rvfi_rd_cap_d   = NULL_REG_CAP;
+    end else begin
+      // Otherwise maintain previous value
+      rvfi_rd_addr_d  = rvfi_rd_addr_q;
+      rvfi_rd_wdata_d = rvfi_rd_wdata_q;
+      rvfi_rd_cap_d   = rvfi_rd_cap_q;
+    end
+  end
+
+  // RD write registers (address, data, capability) are refreshed once per clock cycle from the
+  // rvfi_rd_*_d values and are then kept stable for the cycle.
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      rvfi_rd_addr_q    <= '0;
+      rvfi_rd_wdata_q   <= '0;
+      rvfi_rd_cap_q     <= NULL_REG_CAP;
+    end else begin
+      rvfi_rd_addr_q    <= rvfi_rd_addr_d;
+      rvfi_rd_wdata_q   <= rvfi_rd_wdata_d;
+      rvfi_rd_cap_q     <= rvfi_rd_cap_d;
+    end
+  end
+
+  // rvfi_intr flags the first instruction of a trap handler. In the first cycle of a new
+  // instruction it takes the "trap PC was set" flag left by the previous instruction; in any other
+  // cycle it keeps its value.
+  assign rvfi_intr_d = instr_first_cycle_id ? rvfi_set_trap_pc_q : rvfi_intr_q;
+
+  // Only interrupt entry (EXC_PC_IRQ) sets the flag; exception entry (EXC_PC_EXC) is deliberately
+  // not tracked here (kliu - interrupt only).
+  always_comb begin
+    if (pc_set && pc_mux_id == PC_EXC && (exc_pc_mux_id == EXC_PC_IRQ)) begin
+      // PC is being redirected into a trap handler
+      rvfi_set_trap_pc_d = 1'b1;
+    end else if (rvfi_set_trap_pc_q && rvfi_id_done) begin
+      // The first instruction after the redirect has completed
+      rvfi_set_trap_pc_d = 1'b0;
+    end else begin
+      rvfi_set_trap_pc_d = rvfi_set_trap_pc_q;
+    end
+  end
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin  // state for the rvfi_intr tracking above
+    if (!rst_ni) begin
+      rvfi_set_trap_pc_q <= 1'b0;
+      rvfi_intr_q        <= 1'b0;
+    end else begin
+      rvfi_set_trap_pc_q <= rvfi_set_trap_pc_d;
+      rvfi_intr_q        <= rvfi_intr_d;
+    end
+  end
+
+`else
+  logic unused_instr_new_id, unused_instr_id_done, unused_instr_done_wb;  // sinks for signals otherwise read only by the RVFI logic above
+  assign unused_instr_id_done = instr_id_done;
+  assign unused_instr_new_id = instr_new_id;
+  assign unused_instr_done_wb = instr_done_wb;
+`endif
+
+  // Certain parameter combinations are not supported
+  `ASSERT_INIT(IllegalParamSecure, !(SecureIbex && (RV32M == RV32MNone)))
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_counter.sv b/hw/ip/cheriot-ibex/rtl/cheriot_counter.sv
new file mode 100644
index 0000000..f574eff
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_counter.sv
@@ -0,0 +1,99 @@
+module cheriot_counter #(
+  parameter int CounterWidth = 32,
+  // Set to have `counter_val_upd_o` carry the incremented counter value; when clear that output is
+  // tied to 0, which is required for Xilinx DSP inference to work correctly (no DSPs are
+  // inferred when `ProvideValUpd` is set).
+  parameter bit ProvideValUpd = 0
+) (
+  input  logic        clk_i,
+  input  logic        rst_ni,
+
+  input  logic        counter_inc_i,     // increment by one (ignored while a write is active)
+  input  logic        counterh_we_i,     // write counter_val_i into bits [63:32]
+  input  logic        counter_we_i,      // write counter_val_i into bits [31:0]
+  input  logic [31:0] counter_val_i,
+  output logic [63:0] counter_val_o,     // current value, zero-extended to 64 bits
+  output logic [63:0] counter_val_upd_o  // incremented value or 0, see ProvideValUpd
+);
+
+  logic [63:0]             counter;
+  logic [CounterWidth-1:0] counter_upd;
+  logic [63:0]             counter_load;
+  logic                    we;
+  logic [CounterWidth-1:0] counter_d;
+
+  // Counter value plus one (CounterWidth bits wide)
+  assign counter_upd = counter[CounterWidth-1:0] + {{CounterWidth - 1{1'b0}}, 1'b1};
+
+  // Next-state computation
+  always_comb begin
+    we = counter_we_i | counterh_we_i;
+
+    // Write data: counter_val_i replaces one 32-bit half, the other half keeps its value. The
+    // upper half is selected by counterh_we_i, the lower half otherwise.
+    if (counterh_we_i) begin
+      counter_load = {counter_val_i, counter[31:0]};
+    end else begin
+      counter_load = {counter[63:32], counter_val_i};
+    end
+
+    // Priority: write, then increment, else hold
+    counter_d = counter[CounterWidth-1:0];
+    if (we) begin
+      counter_d = counter_load[CounterWidth-1:0];
+    end else if (counter_inc_i) begin
+      counter_d = counter_upd[CounterWidth-1:0];
+    end
+  end
+
+`ifdef FPGA_XILINX
+  // Set DSP pragma for supported xilinx FPGAs
+  localparam int DspPragma = CounterWidth < 49 ? "yes" : "no";
+  (* use_dsp = DspPragma *) logic [CounterWidth-1:0] counter_q;
+
+  // DSP output register requires synchronous reset.
+  `define COUNTER_FLOP_RST posedge clk_i
+`else
+  logic [CounterWidth-1:0] counter_q;
+
+  `define COUNTER_FLOP_RST posedge clk_i or negedge rst_ni
+`endif
+
+  // Counter state; the sensitivity list (and so the reset style) comes from COUNTER_FLOP_RST
+  always_ff @(`COUNTER_FLOP_RST) begin
+    if (!rst_ni) begin
+      counter_q <= '0;
+    end else begin
+      counter_q <= counter_d;
+    end
+  end
+
+  if (CounterWidth < 64) begin : g_counter_narrow
+    logic [63:CounterWidth] unused_counter_load;
+
+    // Zero-extend the narrow counter to the 64-bit internal view
+    assign counter = {{(64 - CounterWidth){1'b0}}, counter_q};
+
+    if (ProvideValUpd) begin : g_counter_val_upd_o
+      assign counter_val_upd_o = {{(64 - CounterWidth){1'b0}}, counter_upd};
+    end else begin : g_no_counter_val_upd_o
+      assign counter_val_upd_o = '0;
+    end
+    // Write data above CounterWidth is not stored
+    assign unused_counter_load                 = counter_load[63:CounterWidth];
+  end else begin : g_counter_full
+    assign counter           = counter_q;
+
+    if (ProvideValUpd) begin : g_counter_val_upd_o
+      assign counter_val_upd_o = counter_upd;
+    end else begin : g_no_counter_val_upd_o
+      assign counter_val_upd_o = '0;
+    end
+  end
+
+  assign counter_val_o = counter;
+
+endmodule
+
+// Keep helper defines file-local.
+`undef COUNTER_FLOP_RST
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_cs_registers.sv b/hw/ip/cheriot-ibex/rtl/cheriot_cs_registers.sv
new file mode 100644
index 0000000..c7e91dd
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_cs_registers.sv
@@ -0,0 +1,1998 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Control and Status Registers
+ *
+ * Control and Status Registers (CSRs) following the RISC-V Privileged
+ * Specification, draft version 1.11
+ */
+
+`include "prim_assert.sv"
+
+module cheriot_cs_registers import cheri_pkg::*;  #(
+  parameter bit               DbgTriggerEn      = 0,
+  parameter int unsigned      DbgHwBreakNum     = 1,
+  parameter bit               DataIndTiming     = 1'b0,
+  parameter bit               DummyInstructions = 1'b0,
+  parameter bit               ShadowCSR         = 1'b0,
+  parameter bit               ICache            = 1'b0,
+  parameter int unsigned      MHPMCounterNum    = 10,
+  parameter int unsigned      MHPMCounterWidth  = 40,
+  parameter bit               PMPEnable         = 0,
+  parameter int unsigned      PMPGranularity    = 0,
+  parameter int unsigned      PMPNumRegions     = 4,
+  parameter bit               RV32E             = 0,
+  parameter cheriot_pkg::rv32m_e RV32M             = cheriot_pkg::RV32MFast,
+  parameter cheriot_pkg::rv32b_e RV32B             = cheriot_pkg::RV32BNone,
+  parameter bit               CHERIoTEn         = 1'b1
+) (
+  // Clock and Reset
+  input  logic                 clk_i,
+  input  logic                 rst_ni,
+
+  input  logic                 cheri_pmode_i,
+  // Hart ID
+  input  logic [31:0]          hart_id_i,
+
+  // Privilege mode
+  output cheriot_pkg::priv_lvl_e  priv_mode_id_o,
+  output cheriot_pkg::priv_lvl_e  priv_mode_lsu_o,
+  output logic                 csr_mstatus_tw_o,
+
+  // mtvec
+  output logic [31:0]          csr_mtvec_o,
+  input  logic                 csr_mtvec_init_i,
+  input  logic [31:0]          boot_addr_i,
+
+  // Interface to registers (SRAM like)
+  input  logic                 csr_access_i,
+  input  cheriot_pkg::csr_num_e   csr_addr_i,
+  input  logic [31:0]          csr_wdata_i,
+  input  cheriot_pkg::csr_op_e    csr_op_i,
+  input                        csr_op_en_i,
+  output logic [31:0]          csr_rdata_o,
+
+  input  logic                 cheri_csr_access_i,
+  input  logic [4:0]           cheri_csr_addr_i,
+  input  logic [31:0]          cheri_csr_wdata_i,
+  input  reg_cap_t             cheri_csr_wcap_i,
+  input  cheri_csr_op_e        cheri_csr_op_i,
+  input  logic                 cheri_csr_op_en_i,
+  input  logic                 cheri_csr_set_mie_i,
+  input  logic                 cheri_csr_clr_mie_i,
+
+  output logic [31:0]          cheri_csr_rdata_o,
+  output reg_cap_t             cheri_csr_rcap_o,
+
+  // stack highwatermark and fast-clearing function
+  output logic [31:0]          csr_mshwm_o,
+  output logic [31:0]          csr_mshwmb_o,
+  input  logic                 csr_mshwm_set_i,
+  input  logic [31:0]          csr_mshwm_new_i,
+
+  // interrupts
+  input  logic                 irq_software_i,
+  input  logic                 irq_timer_i,
+  input  logic                 irq_external_i,
+  input  logic [14:0]          irq_fast_i,
+  input  logic                 nmi_mode_i,
+  output logic                 irq_pending_o,          // interrupt request pending
+  output cheriot_pkg::irqs_t      irqs_o,                 // interrupt requests qualified with mie
+  output logic                 csr_mstatus_mie_o,
+  output logic [31:0]          csr_mepc_o,
+
+  // PMP
+  output cheriot_pkg::pmp_cfg_t     csr_pmp_cfg_o  [PMPNumRegions],
+  output logic [33:0]            csr_pmp_addr_o [PMPNumRegions],
+  output cheriot_pkg::pmp_mseccfg_t csr_pmp_mseccfg_o,
+
+  // debug
+  input  logic                 debug_mode_i,
+  input  cheriot_pkg::dbg_cause_e debug_cause_i,
+  input  logic                 debug_csr_save_i,
+  output logic [31:0]          csr_depc_o,
+  output logic                 debug_single_step_o,
+  output logic                 debug_ebreakm_o,
+  output logic                 debug_ebreaku_o,
+  output logic                 trigger_match_o,
+
+  input  logic [31:0]          pc_if_i,
+  input  logic [31:0]          pc_id_i,
+  input  logic [31:0]          pc_wb_i,
+
+  // CPU control bits
+  output logic                 data_ind_timing_o,
+  output logic                 dummy_instr_en_o,
+  output logic [2:0]           dummy_instr_mask_o,
+  output logic                 dummy_instr_seed_en_o,
+  output logic [31:0]          dummy_instr_seed_o,
+  output logic                 icache_enable_o,
+  output logic                 csr_shadow_err_o,
+
+  // Exception save/restore
+  input  logic                 csr_save_if_i,
+  input  logic                 csr_save_id_i,
+  input  logic                 csr_save_wb_i,
+  input  logic                 csr_restore_mret_i,
+  input  logic                 csr_restore_dret_i,
+  input  logic                 csr_save_cause_i,
+  input  logic                 csr_mepcc_clrtag_i,
+  input  cheriot_pkg::exc_cause_e csr_mcause_i,
+  input  logic [31:0]          csr_mtval_i,
+  output logic                 illegal_csr_insn_o,     // access to non-existent CSR,
+                                                        // with wrong privilege level, or
+                                                        // missing write permissions
+  output logic                 double_fault_seen_o,
+  // Performance Counters
+  input  logic                 instr_ret_i,                 // instr retired in ID/EX stage
+  input  logic                 instr_ret_compressed_i,      // compressed instr retired
+  input  logic                 instr_ret_spec_i,            // speculative instr_ret_i
+  input  logic                 instr_ret_compressed_spec_i, // speculative instr_ret_compressed_i
+  input  logic                 iside_wait_i,                // core waiting for the iside
+  input  logic                 jump_i,                      // jump instr seen (j, jr, jal, jalr)
+  input  logic                 branch_i,                    // branch instr seen (bf, bnf)
+  input  logic                 branch_taken_i,              // branch was taken
+  input  logic                 mem_load_i,                  // load from memory in this cycle
+  input  logic                 mem_store_i,                 // store to memory in this cycle
+  input  logic                 dside_wait_i,                // core waiting for the dside
+  input  logic                 mul_wait_i,                  // core waiting for multiply
+  input  logic                 div_wait_i,                   // core waiting for divide
+
+  input  logic                 cheri_branch_req_i,
+  input  logic [31:0]          cheri_branch_target_i,
+  input  pcc_cap_t             pcc_cap_i,
+  output pcc_cap_t             pcc_cap_o,
+
+  output logic                 csr_dbg_tclr_fault_o,
+  output logic                 cheri_fatal_err_o
+  );
+
+  import cheriot_pkg::*;
+
+  // Turn the extension-selection parameters into 0/1 integers so that they can be shifted into
+  // MISA_VALUE: 0 when the extension is configured as "None", 1 for any other setting.
+  localparam int unsigned RV32BEnabled = (RV32B == RV32BNone) ? 0 : 1;
+  localparam int unsigned RV32MEnabled = (RV32M == RV32MNone) ? 0 : 1;
+  // 32 when PMPGranularity is 0, otherwise 33 - PMPGranularity.
+  localparam int unsigned PMPAddrWidth = (PMPGranularity > 0) ? 33 - PMPGranularity : 32;
+
+  // misa
+  // Elaboration-time constant with one bit per extension letter (bit 0 = A ... bit 23 = X) and
+  // CSR_MISA_MXL placed in bits 31:30. Only B, E/I, M and X depend on module parameters; the
+  // other bits are hard-wired below.
+  localparam logic [31:0] MISA_VALUE =
+      (0                 <<  0)  // A - Atomic Instructions extension
+    | (RV32BEnabled      <<  1)  // B - Bit-Manipulation extension
+    | (1                 <<  2)  // C - Compressed extension
+    | (0                 <<  3)  // D - Double precision floating-point extension
+    | (32'(RV32E)        <<  4)  // E - RV32E base ISA
+    | (0                 <<  5)  // F - Single precision floating-point extension
+    | (32'(!RV32E)       <<  8)  // I - RV32I/64I/128I base ISA
+    | (RV32MEnabled      << 12)  // M - Integer Multiply/Divide extension
+    | (0                 << 13)  // N - User level interrupts supported
+    | (0                 << 18)  // S - Supervisor mode implemented
+    | (1                 << 20)  // U - User mode implemented
+    | (32'(CHERIoTEn)    << 23)  // X - Non-standard extensions present
+    | (32'(CSR_MISA_MXL) << 30); // M-XLEN
+
+  // Storage type of mstatus_q / mstatus_d: only the mstatus fields implemented by this core.
+  // The fields are scattered to / gathered from their CSR_MSTATUS_*_BIT positions by the CSR
+  // read and write logic, so the packing order here is not the architectural bit order.
+  typedef struct packed {
+    logic      mie;   // interrupt enable
+    logic      mpie;  // mie value saved on exception entry
+    priv_lvl_e mpp;   // privilege level saved on exception entry
+    logic      mprv;  // when set, loads/stores use mpp as their privilege level
+    logic      tw;    // exported on csr_mstatus_tw_o
+  } status_t;
+
+  // Storage type of mstack_q / mstack_d: copy of the mstatus fields that are overwritten on
+  // exception entry, kept so they can be restored by an MRET executed in NMI mode.
+  typedef struct packed {
+    logic      mpie;
+    priv_lvl_e mpp;
+  } status_stk_t;
+
+  // Storage type of dcsr_q / dcsr_d. The struct is read and written as a whole word by the
+  // CSR_DCSR read/write cases, so the field order (first field = most significant bits) is the
+  // register layout and must not be changed.
+  typedef struct packed {
+      x_debug_ver_e xdebugver;  // forced to XDEBUGVER_STD on software writes
+      logic [11:0]  zero2;      // forced to zero on software writes
+      logic         ebreakm;    // exported on debug_ebreakm_o
+      logic         zero1;      // forced to zero on software writes
+      logic         ebreaks;
+      logic         ebreaku;    // exported on debug_ebreaku_o
+      logic         stepie;     // forced to zero on software writes
+      logic         stopcount;  // not supported: forced to zero on software writes
+      logic         stoptime;   // not supported: forced to zero on software writes
+      dbg_cause_e   cause;      // read-only for software, set from debug_cause_i on debug entry
+      logic         zero0;      // forced to zero on software writes
+      logic         mprven;     // not supported: forced to zero on software writes
+      logic         nmip;       // not supported: forced to zero on software writes
+      logic         step;       // exported on debug_single_step_o
+      priv_lvl_e    prv;        // privilege level saved on debug entry, restored by DRET
+  } dcsr_t;
+
+  // CPU control register fields
+  // Read back zero-extended through CSR_CPUCTRL; because the struct is packed, the last field
+  // is bit 0 of the CSR and the first field is bit 7.
+  typedef struct packed {
+    logic        double_fault_seen;  // bit 7
+    logic        sync_exc_seen;      // bit 6
+    logic [2:0]  dummy_instr_mask;   // bits 5:3
+    logic        dummy_instr_en;     // bit 2
+    logic        data_ind_timing;    // bit 1
+    logic        icache_enable;      // bit 0
+  } cpu_ctrl_t;
+
+  // Interrupt and exception control signals
+  logic [31:0] exception_pc;
+
+  // CSRs
+  priv_lvl_e   priv_lvl_q, priv_lvl_d;
+  status_t     mstatus_q, mstatus_d;
+  logic        mstatus_err;
+  logic        mstatus_en;
+  irqs_t       mie_q, mie_d;
+  logic        mie_en;
+  logic [31:0] mscratch_q;
+  logic        mscratch_en;
+  logic [31:0] mepc_q, mepc_d;
+  logic        mepc_en;
+  reg_cap_t    mepc_cap;
+  logic  [5:0] mcause_q, mcause_d;
+  logic        mcause_en;
+  logic [31:0] mtval_q, mtval_d;
+  logic        mtval_en;
+  logic [31:0] mtvec_q, mtvec_d;
+  reg_cap_t    mtvec_cap;
+  logic        mtvec_err;
+  logic        mtvec_en;
+  irqs_t       mip;
+  dcsr_t       dcsr_q, dcsr_d;
+  logic        dcsr_en;
+  logic [31:0] depc_q, depc_d;
+  logic        depc_en;
+  reg_cap_t    depc_cap;
+  logic [31:0] dscratch0_q;
+  logic [31:0] dscratch1_q;
+  logic        dscratch0_en, dscratch1_en;
+  reg_cap_t    dscratch0_cap, dscratch1_cap;
+  logic [31:0] mshwm_q, mshwm_d;
+  logic [31:0] mshwmb_q;
+  logic        mshwm_en, mshwmb_en;
+  logic [31:0] cdbg_ctrl_q;
+  logic        cdbg_ctrl_en;
+  pcc_cap_t    pcc_cap_q, pcc_cap_d;
+
+  // CSRs for recoverable NMIs
+  // NOTE: these CSRS are nonstandard, see https://github.com/riscv/riscv-isa-manual/issues/261
+  status_stk_t mstack_q, mstack_d;
+  logic        mstack_en;
+  logic [31:0] mstack_epc_q, mstack_epc_d;
+  logic  [5:0] mstack_cause_q, mstack_cause_d;
+
+  // PMP Signals
+  logic [31:0]                 pmp_addr_rdata  [PMP_MAX_REGIONS];
+  logic [PMP_CFG_W-1:0]        pmp_cfg_rdata   [PMP_MAX_REGIONS];
+  logic                        pmp_csr_err;
+  pmp_mseccfg_t                pmp_mseccfg;
+
+  // Hardware performance monitor signals
+  logic [31:0]                 mcountinhibit;
+  // Only have mcountinhibit flops for counters that actually exist
+  logic [MHPMCounterNum+3-1:0] mcountinhibit_d, mcountinhibit_q;
+  logic                        mcountinhibit_we;
+
+  // mhpmcounter flops are elaborated below providing only the precise number that is required based
+  // on MHPMCounterNum/MHPMCounterWidth. This signal connects to the Q output of these flops
+  // where they exist and is otherwise 0.
+  logic [63:0] mhpmcounter [32];
+  logic [31:0] mhpmcounter_we;
+  logic [31:0] mhpmcounterh_we;
+  logic [31:0] mhpmcounter_incr;
+  logic [31:0] mhpmevent [32];
+  logic  [4:0] mhpmcounter_idx;
+  logic        unused_mhpmcounter_we_1;
+  logic        unused_mhpmcounterh_we_1;
+  logic        unused_mhpmcounter_incr_1;
+
+  logic [63:0] minstret_next, minstret_raw;
+
+  // Debug / trigger registers
+  logic [31:0] tselect_rdata;
+  logic [31:0] tmatch_control_rdata;
+  logic [31:0] tmatch_value_rdata;
+
+  // CPU control bits
+  cpu_ctrl_t   cpuctrl_q, cpuctrl_d, cpuctrl_wdata_raw, cpuctrl_wdata;
+  logic        cpuctrl_we;
+  logic        cpuctrl_err;
+
+  // CSR update logic
+  logic [31:0] csr_wdata_int;
+  logic [31:0] csr_rdata_int;
+  logic        csr_we_int;
+  logic        csr_wr;
+
+  // Access violation signals
+  logic        illegal_csr;
+  logic        illegal_csr_priv;
+  logic        illegal_csr_write;
+
+  logic [7:0]  unused_boot_addr;
+  logic [2:0]  unused_csr_addr;
+
+  logic        mepc_en_combi, mepc_en_cheri;
+  logic [31:0] mepc_d_combi;
+
+  logic        mtvec_en_combi, mtvec_en_cheri;
+  logic [31:0] mtvec_d_combi;
+
+  logic        depc_en_combi, depc_en_cheri;
+  logic [31:0] depc_d_combi;
+
+  logic        dscratch0_en_combi, dscratch0_en_cheri;
+  logic [31:0] dscratch0_d_combi;
+  logic        dscratch1_en_combi, dscratch1_en_cheri;
+  logic [31:0] dscratch1_d_combi;
+
+  // The low 8 bits of boot_addr_i are not used: the mtvec initialisation value only takes
+  // boot_addr_i[31:8].
+  assign unused_boot_addr = boot_addr_i[7:0];
+
+  // misa read value after run-time masking.
+  logic [31:0] misa_value_masked;
+
+  // The mask has a single 1 at bit 23 (the X bit of MISA_VALUE) whenever cheri_pmode_i is low,
+  // so the X bit reads as zero outside CHERI mode; every other bit of MISA_VALUE passes through.
+  assign misa_value_masked = MISA_VALUE & ~{8'h0, ~cheri_pmode_i, 23'h0};
+
+
+  /////////////
+  // CSR reg //
+  /////////////
+
+  // Plain-vector view of the enum-typed CSR address, used for the bit-field checks below.
+  logic [$bits(csr_num_e)-1:0] csr_addr;
+  assign csr_addr           = {csr_addr_i};
+  assign unused_csr_addr    = csr_addr[7:5];
+  // The low 5 address bits index the mhpmcounter / mhpmevent arrays.
+  assign mhpmcounter_idx    = csr_addr[4:0];
+
+  // See RISC-V Privileged Specification, version 1.11, Section 2.1
+  // Address bits [9:8] are compared against the current privilege level, address bits [11:10]
+  // equal to 2'b11 mark a CSR that must not be written.
+  assign illegal_csr_priv   = (csr_addr[9:8] > {priv_lvl_q});
+  assign illegal_csr_write  = (csr_addr[11:10] == 2'b11) && csr_wr;
+  // Any of the three violations is only reported while a CSR access is actually requested.
+  assign illegal_csr_insn_o = csr_access_i & (illegal_csr | illegal_csr_write | illegal_csr_priv);
+
+  // mip CSR is purely combinational - must be able to re-enable the clock upon WFI
+  assign mip.irq_software = irq_software_i;
+  assign mip.irq_timer    = irq_timer_i;
+  assign mip.irq_external = irq_external_i;
+  assign mip.irq_fast     = irq_fast_i;
+
+  // read logic
+  //
+  // Combinational CSR read mux. csr_addr_i selects the value driven onto csr_rdata_int, and
+  // illegal_csr is raised for addresses that are not implemented (default case) or that are not
+  // accessible in the current configuration or mode. Both signals default to zero and are
+  // overridden per case.
+  always_comb begin
+    csr_rdata_int = '0;
+    illegal_csr   = 1'b0;
+
+    unique case (csr_addr_i)
+      // mvendorid: encoding of manufacturer/provider
+      CSR_MVENDORID: csr_rdata_int = (CHERIoTEn & cheri_pmode_i) ? CSR_MVENDORID_CHERI_VALUE :
+                                                                   CSR_MVENDORID_VALUE;
+      // marchid: encoding of base microarchitecture
+      CSR_MARCHID: csr_rdata_int = (CHERIoTEn & cheri_pmode_i) ? CSR_MARCHID_CHERI_VALUE :
+                                                                 CSR_MARCHID_VALUE;
+      // mimpid: encoding of processor implementation version
+      CSR_MIMPID: csr_rdata_int = CSR_MIMPID_VALUE;
+      // mhartid: unique hardware thread id
+      CSR_MHARTID: csr_rdata_int = hart_id_i;
+
+      // mstatus: MIE, MPIE, MPP, MPRV and TW fields, all other bits read as zero
+      CSR_MSTATUS: begin
+        csr_rdata_int                                                   = '0;
+        csr_rdata_int[CSR_MSTATUS_MIE_BIT]                              = mstatus_q.mie;
+        csr_rdata_int[CSR_MSTATUS_MPIE_BIT]                             = mstatus_q.mpie;
+        csr_rdata_int[CSR_MSTATUS_MPP_BIT_HIGH:CSR_MSTATUS_MPP_BIT_LOW] = mstatus_q.mpp;
+        csr_rdata_int[CSR_MSTATUS_MPRV_BIT]                             = mstatus_q.mprv;
+        csr_rdata_int[CSR_MSTATUS_TW_BIT]                               = mstatus_q.tw;
+      end
+
+      // misa
+      CSR_MISA: csr_rdata_int = misa_value_masked;
+
+      // interrupt enable
+      CSR_MIE: begin
+        csr_rdata_int                                     = '0;
+        csr_rdata_int[CSR_MSIX_BIT]                       = mie_q.irq_software;
+        csr_rdata_int[CSR_MTIX_BIT]                       = mie_q.irq_timer;
+        csr_rdata_int[CSR_MEIX_BIT]                       = mie_q.irq_external;
+        csr_rdata_int[CSR_MFIX_BIT_HIGH:CSR_MFIX_BIT_LOW] = mie_q.irq_fast;
+      end
+
+      // mcounteren: machine counter enable (reads as zero)
+      CSR_MCOUNTEREN: begin
+        csr_rdata_int = '0;
+      end
+
+      CSR_MSCRATCH: csr_rdata_int = mscratch_q;
+
+      // mtvec: trap-vector base address
+      CSR_MTVEC: csr_rdata_int = mtvec_q;
+
+      // mepc: exception program counter
+      CSR_MEPC: csr_rdata_int = mepc_q;
+
+      // mcause: exception cause
+      // Stored as 6 bits: bit 5 is presented as bit 31, bits 4:0 as the low cause bits.
+      CSR_MCAUSE: csr_rdata_int = {mcause_q[5], 26'b0, mcause_q[4:0]};
+
+      // mtval: trap value
+      CSR_MTVAL: csr_rdata_int = mtval_q;
+
+      // mip: interrupt pending
+      CSR_MIP: begin
+        csr_rdata_int                                     = '0;
+        csr_rdata_int[CSR_MSIX_BIT]                       = mip.irq_software;
+        csr_rdata_int[CSR_MTIX_BIT]                       = mip.irq_timer;
+        csr_rdata_int[CSR_MEIX_BIT]                       = mip.irq_external;
+        csr_rdata_int[CSR_MFIX_BIT_HIGH:CSR_MFIX_BIT_LOW] = mip.irq_fast;
+      end
+
+      // mseccfg / mseccfgh only exist when PMP is enabled
+      CSR_MSECCFG: begin
+        if (PMPEnable) begin
+          csr_rdata_int                       = '0;
+          csr_rdata_int[CSR_MSECCFG_MML_BIT]  = pmp_mseccfg.mml;
+          csr_rdata_int[CSR_MSECCFG_MMWP_BIT] = pmp_mseccfg.mmwp;
+          csr_rdata_int[CSR_MSECCFG_RLB_BIT]  = pmp_mseccfg.rlb;
+        end else begin
+          illegal_csr = 1'b1;
+        end
+      end
+
+      CSR_MSECCFGH: begin
+        if (PMPEnable) begin
+          csr_rdata_int = '0;
+        end else begin
+          illegal_csr = 1'b1;
+        end
+      end
+
+      // PMP registers
+      // Each pmpcfg CSR packs four region configurations, lowest region in the lowest bits.
+      CSR_PMPCFG0:   csr_rdata_int = {pmp_cfg_rdata[3],  pmp_cfg_rdata[2],
+                                      pmp_cfg_rdata[1],  pmp_cfg_rdata[0]};
+      CSR_PMPCFG1:   csr_rdata_int = {pmp_cfg_rdata[7],  pmp_cfg_rdata[6],
+                                      pmp_cfg_rdata[5],  pmp_cfg_rdata[4]};
+      CSR_PMPCFG2:   csr_rdata_int = {pmp_cfg_rdata[11], pmp_cfg_rdata[10],
+                                      pmp_cfg_rdata[9],  pmp_cfg_rdata[8]};
+      CSR_PMPCFG3:   csr_rdata_int = {pmp_cfg_rdata[15], pmp_cfg_rdata[14],
+                                      pmp_cfg_rdata[13], pmp_cfg_rdata[12]};
+      CSR_PMPADDR0:  csr_rdata_int = pmp_addr_rdata[0];
+      CSR_PMPADDR1:  csr_rdata_int = pmp_addr_rdata[1];
+      CSR_PMPADDR2:  csr_rdata_int = pmp_addr_rdata[2];
+      CSR_PMPADDR3:  csr_rdata_int = pmp_addr_rdata[3];
+      CSR_PMPADDR4:  csr_rdata_int = pmp_addr_rdata[4];
+      CSR_PMPADDR5:  csr_rdata_int = pmp_addr_rdata[5];
+      CSR_PMPADDR6:  csr_rdata_int = pmp_addr_rdata[6];
+      CSR_PMPADDR7:  csr_rdata_int = pmp_addr_rdata[7];
+      CSR_PMPADDR8:  csr_rdata_int = pmp_addr_rdata[8];
+      CSR_PMPADDR9:  csr_rdata_int = pmp_addr_rdata[9];
+      CSR_PMPADDR10: csr_rdata_int = pmp_addr_rdata[10];
+      CSR_PMPADDR11: csr_rdata_int = pmp_addr_rdata[11];
+      CSR_PMPADDR12: csr_rdata_int = pmp_addr_rdata[12];
+      CSR_PMPADDR13: csr_rdata_int = pmp_addr_rdata[13];
+      CSR_PMPADDR14: csr_rdata_int = pmp_addr_rdata[14];
+      CSR_PMPADDR15: csr_rdata_int = pmp_addr_rdata[15];
+
+      // Debug CSRs: accessing them outside debug mode is illegal
+      CSR_DCSR: begin
+        csr_rdata_int = dcsr_q;
+        illegal_csr = ~debug_mode_i;
+      end
+      CSR_DPC: begin
+        csr_rdata_int = depc_q;
+        illegal_csr = ~debug_mode_i;
+      end
+      CSR_DSCRATCH0: begin
+        csr_rdata_int = dscratch0_q;
+        illegal_csr = ~debug_mode_i;
+      end
+      CSR_DSCRATCH1: begin
+        csr_rdata_int = dscratch1_q;
+        illegal_csr = ~debug_mode_i;
+      end
+
+      // machine counter/timers
+      CSR_MCOUNTINHIBIT: csr_rdata_int = mcountinhibit;
+      CSR_MHPMEVENT3,
+      CSR_MHPMEVENT4,  CSR_MHPMEVENT5,  CSR_MHPMEVENT6,  CSR_MHPMEVENT7,
+      CSR_MHPMEVENT8,  CSR_MHPMEVENT9,  CSR_MHPMEVENT10, CSR_MHPMEVENT11,
+      CSR_MHPMEVENT12, CSR_MHPMEVENT13, CSR_MHPMEVENT14, CSR_MHPMEVENT15,
+      CSR_MHPMEVENT16, CSR_MHPMEVENT17, CSR_MHPMEVENT18, CSR_MHPMEVENT19,
+      CSR_MHPMEVENT20, CSR_MHPMEVENT21, CSR_MHPMEVENT22, CSR_MHPMEVENT23,
+      CSR_MHPMEVENT24, CSR_MHPMEVENT25, CSR_MHPMEVENT26, CSR_MHPMEVENT27,
+      CSR_MHPMEVENT28, CSR_MHPMEVENT29, CSR_MHPMEVENT30, CSR_MHPMEVENT31: begin
+        csr_rdata_int = mhpmevent[mhpmcounter_idx];
+      end
+
+      // Low 32 bits of the selected 64-bit counter
+      CSR_MCYCLE,
+      CSR_MINSTRET,
+      CSR_MHPMCOUNTER3,
+      CSR_MHPMCOUNTER4,  CSR_MHPMCOUNTER5,  CSR_MHPMCOUNTER6,  CSR_MHPMCOUNTER7,
+      CSR_MHPMCOUNTER8,  CSR_MHPMCOUNTER9,  CSR_MHPMCOUNTER10, CSR_MHPMCOUNTER11,
+      CSR_MHPMCOUNTER12, CSR_MHPMCOUNTER13, CSR_MHPMCOUNTER14, CSR_MHPMCOUNTER15,
+      CSR_MHPMCOUNTER16, CSR_MHPMCOUNTER17, CSR_MHPMCOUNTER18, CSR_MHPMCOUNTER19,
+      CSR_MHPMCOUNTER20, CSR_MHPMCOUNTER21, CSR_MHPMCOUNTER22, CSR_MHPMCOUNTER23,
+      CSR_MHPMCOUNTER24, CSR_MHPMCOUNTER25, CSR_MHPMCOUNTER26, CSR_MHPMCOUNTER27,
+      CSR_MHPMCOUNTER28, CSR_MHPMCOUNTER29, CSR_MHPMCOUNTER30, CSR_MHPMCOUNTER31: begin
+        csr_rdata_int = mhpmcounter[mhpmcounter_idx][31:0];
+      end
+
+      // High 32 bits of the selected 64-bit counter
+      CSR_MCYCLEH,
+      CSR_MINSTRETH,
+      CSR_MHPMCOUNTER3H,
+      CSR_MHPMCOUNTER4H,  CSR_MHPMCOUNTER5H,  CSR_MHPMCOUNTER6H,  CSR_MHPMCOUNTER7H,
+      CSR_MHPMCOUNTER8H,  CSR_MHPMCOUNTER9H,  CSR_MHPMCOUNTER10H, CSR_MHPMCOUNTER11H,
+      CSR_MHPMCOUNTER12H, CSR_MHPMCOUNTER13H, CSR_MHPMCOUNTER14H, CSR_MHPMCOUNTER15H,
+      CSR_MHPMCOUNTER16H, CSR_MHPMCOUNTER17H, CSR_MHPMCOUNTER18H, CSR_MHPMCOUNTER19H,
+      CSR_MHPMCOUNTER20H, CSR_MHPMCOUNTER21H, CSR_MHPMCOUNTER22H, CSR_MHPMCOUNTER23H,
+      CSR_MHPMCOUNTER24H, CSR_MHPMCOUNTER25H, CSR_MHPMCOUNTER26H, CSR_MHPMCOUNTER27H,
+      CSR_MHPMCOUNTER28H, CSR_MHPMCOUNTER29H, CSR_MHPMCOUNTER30H, CSR_MHPMCOUNTER31H: begin
+        csr_rdata_int = mhpmcounter[mhpmcounter_idx][63:32];
+      end
+
+      // Debug triggers: illegal when the core is built without trigger support
+      CSR_TSELECT: begin
+        csr_rdata_int = tselect_rdata;
+        illegal_csr   = ~DbgTriggerEn;
+      end
+      CSR_TDATA1: begin
+        csr_rdata_int = tmatch_control_rdata;
+        illegal_csr   = ~DbgTriggerEn;
+      end
+      CSR_TDATA2: begin
+        csr_rdata_int = tmatch_value_rdata;
+        illegal_csr   = ~DbgTriggerEn;
+      end
+      CSR_TDATA3: begin
+        csr_rdata_int = '0;
+        illegal_csr   = ~DbgTriggerEn;
+      end
+      CSR_MCONTEXT: begin
+        csr_rdata_int = '0;
+        illegal_csr   = ~DbgTriggerEn;
+      end
+      CSR_SCONTEXT: begin
+        csr_rdata_int = '0;
+        illegal_csr   = ~DbgTriggerEn;
+      end
+
+      // Custom CSR for controlling CPU features
+      CSR_CPUCTRL: begin
+        csr_rdata_int = {{32 - $bits(cpu_ctrl_t) {1'b0}}, cpuctrl_q};
+      end
+
+      // Custom CSR for LFSR re-seeding (cannot be read)
+      CSR_SECURESEED: begin
+        csr_rdata_int = '0;
+      end
+
+      // CHERIoT-only CSRs. They are qualified with CHERIoTEn & cheri_pmode_i, the same term
+      // that gates their write enables, so that a read and a write of the same CSR are always
+      // either both available or both unavailable. Outside that mode the access is illegal.
+
+      // MSHWM CSR (stack high watermark in cheriot)
+      CSR_MSHWM: begin
+        if (CHERIoTEn & cheri_pmode_i) begin
+          csr_rdata_int = mshwm_q;
+        end else begin
+          illegal_csr = 1'b1;
+        end
+      end
+
+      CSR_MSHWMB: begin
+        if (CHERIoTEn & cheri_pmode_i) begin
+          csr_rdata_int = mshwmb_q;
+        end else begin
+          illegal_csr = 1'b1;
+        end
+      end
+
+      CSR_CDBG_CTRL: begin
+        if (CHERIoTEn & cheri_pmode_i) begin
+          csr_rdata_int = cdbg_ctrl_q;
+        end else begin
+          illegal_csr = 1'b1;
+        end
+      end
+
+      // Anything else is not implemented
+      default: begin
+        illegal_csr = 1'b1;
+      end
+    endcase
+  end
+
+  // write logic
+  //
+  // Computes the next-state values (*_d) and write enables (*_en / *_we) of the CSRs handled
+  // here. The block is evaluated in three steps and later steps override earlier ones:
+  //   1. defaults: enables low, data taken from csr_wdata_int or from the current state;
+  //   2. software CSR writes, decoded from csr_addr_i while csr_we_int is set;
+  //   3. exception save, DRET and MRET handling.
+  always_comb begin
+    exception_pc = pc_id_i;
+
+    priv_lvl_d   = priv_lvl_q;
+    mstatus_en   = 1'b0;
+    mstatus_d    = mstatus_q;
+    mie_en       = 1'b0;
+    mscratch_en  = 1'b0;
+    mepc_en      = 1'b0;
+    // bit 0 of the written mepc value is always cleared
+    mepc_d       = {csr_wdata_int[31:1], 1'b0};
+    mcause_en    = 1'b0;
+    // mcause is stored as 6 bits: written bit 31 plus the low 5 cause bits
+    mcause_d     = {csr_wdata_int[31], csr_wdata_int[4:0]};
+    mtval_en     = 1'b0;
+    mtval_d      = csr_wdata_int;
+    // mtvec is loaded from boot_addr_i whenever csr_mtvec_init_i is set
+    mtvec_en     = csr_mtvec_init_i;
+    // mtvec.MODE set to vectored
+    // (the low bit is 1 unless CHERIoTEn & cheri_pmode_i, in which case it is 0)
+    // mtvec.BASE must be 256-byte aligned
+    mtvec_d      = csr_mtvec_init_i ? {boot_addr_i[31:8], 6'b0, 1'b0, ~(CHERIoTEn&cheri_pmode_i)} :
+                                      {csr_wdata_int[31:8], 6'b0, 1'b0, ~(CHERIoTEn&cheri_pmode_i)};
+    dcsr_en      = 1'b0;
+    dcsr_d       = dcsr_q;
+    // bit 0 of the written dpc value is always cleared
+    depc_d       = {csr_wdata_int[31:1], 1'b0};
+    depc_en      = 1'b0;
+    dscratch0_en = 1'b0;
+    dscratch1_en = 1'b0;
+
+    // mstack captures the current mstatus.MPIE/MPP, mepc and mcause when mstack_en is raised
+    mstack_en      = 1'b0;
+    mstack_d.mpie  = mstatus_q.mpie;
+    mstack_d.mpp   = mstatus_q.mpp;
+    mstack_epc_d   = mepc_q;
+    mstack_cause_d = mcause_q;
+
+    mcountinhibit_we = 1'b0;
+    mhpmcounter_we   = '0;
+    mhpmcounterh_we  = '0;
+
+    cpuctrl_we       = 1'b0;
+    cpuctrl_d        = cpuctrl_q;
+
+    mshwm_en     = 1'b0;
+    mshwmb_en    = 1'b0;
+    cdbg_ctrl_en = 1'b0;
+
+    double_fault_seen_o = 1'b0;
+
+    // Software CSR writes
+    if (csr_we_int) begin
+      unique case (csr_addr_i)
+        // mstatus: IE bit
+        CSR_MSTATUS: begin
+          mstatus_en = 1'b1;
+          mstatus_d    = '{
+              mie:  csr_wdata_int[CSR_MSTATUS_MIE_BIT],
+              mpie: csr_wdata_int[CSR_MSTATUS_MPIE_BIT],
+              mpp:  priv_lvl_e'(csr_wdata_int[CSR_MSTATUS_MPP_BIT_HIGH:CSR_MSTATUS_MPP_BIT_LOW]),
+              mprv: csr_wdata_int[CSR_MSTATUS_MPRV_BIT],
+              tw:   csr_wdata_int[CSR_MSTATUS_TW_BIT]
+          };
+          // Convert illegal values to M-mode
+          if ((mstatus_d.mpp != PRIV_LVL_M) && (mstatus_d.mpp != PRIV_LVL_U)) begin
+            mstatus_d.mpp = PRIV_LVL_M;
+          end
+        end
+
+        // interrupt enable
+        CSR_MIE: mie_en = 1'b1;
+
+        CSR_MSCRATCH: mscratch_en = 1'b1;
+
+        // mepc: exception program counter
+        CSR_MEPC: mepc_en = ~CHERIoTEn | ~cheri_pmode_i;   // disabled for pure cap mode (only allow cap writes)
+
+        // mcause
+        CSR_MCAUSE: mcause_en = 1'b1;
+
+        // mtval: trap value
+        CSR_MTVAL: mtval_en = 1'b1;
+
+        // mtvec
+        CSR_MTVEC: mtvec_en = ~CHERIoTEn | ~cheri_pmode_i;  // disabled for pure cap mode (only allow cap writes)
+
+        // dcsr: take the written word, then force every field software may not control
+        CSR_DCSR: begin
+          dcsr_d = csr_wdata_int;
+          dcsr_d.xdebugver = XDEBUGVER_STD;
+          // Change to PRIV_LVL_M if software writes an unsupported value
+          if ((dcsr_d.prv != PRIV_LVL_M) && (dcsr_d.prv != PRIV_LVL_U)) begin
+            dcsr_d.prv = PRIV_LVL_M;
+          end
+
+          // Read-only for SW
+          dcsr_d.cause = dcsr_q.cause;
+
+          // Interrupts always disabled during single stepping
+          dcsr_d.stepie = 1'b0;
+
+          // currently not supported:
+          dcsr_d.nmip = 1'b0;
+          dcsr_d.mprven = 1'b0;
+          dcsr_d.stopcount = 1'b0;
+          dcsr_d.stoptime = 1'b0;
+
+          // forced to be zero
+          dcsr_d.zero0 = 1'b0;
+          dcsr_d.zero1 = 1'b0;
+          dcsr_d.zero2 = 12'h0;
+          dcsr_en      = 1'b1;
+        end
+
+        // dpc: debug program counter
+        CSR_DPC: depc_en = 1'b1;
+
+        CSR_DSCRATCH0: dscratch0_en = 1'b1;
+        CSR_DSCRATCH1: dscratch1_en = 1'b1;
+
+        // machine counter/timers
+        CSR_MCOUNTINHIBIT: mcountinhibit_we = 1'b1;
+
+        // Low-word counter writes: one-hot write enable selected by the low address bits
+        CSR_MCYCLE,
+        CSR_MINSTRET,
+        CSR_MHPMCOUNTER3,
+        CSR_MHPMCOUNTER4,  CSR_MHPMCOUNTER5,  CSR_MHPMCOUNTER6,  CSR_MHPMCOUNTER7,
+        CSR_MHPMCOUNTER8,  CSR_MHPMCOUNTER9,  CSR_MHPMCOUNTER10, CSR_MHPMCOUNTER11,
+        CSR_MHPMCOUNTER12, CSR_MHPMCOUNTER13, CSR_MHPMCOUNTER14, CSR_MHPMCOUNTER15,
+        CSR_MHPMCOUNTER16, CSR_MHPMCOUNTER17, CSR_MHPMCOUNTER18, CSR_MHPMCOUNTER19,
+        CSR_MHPMCOUNTER20, CSR_MHPMCOUNTER21, CSR_MHPMCOUNTER22, CSR_MHPMCOUNTER23,
+        CSR_MHPMCOUNTER24, CSR_MHPMCOUNTER25, CSR_MHPMCOUNTER26, CSR_MHPMCOUNTER27,
+        CSR_MHPMCOUNTER28, CSR_MHPMCOUNTER29, CSR_MHPMCOUNTER30, CSR_MHPMCOUNTER31: begin
+          mhpmcounter_we[mhpmcounter_idx] = 1'b1;
+        end
+
+        // High-word counter writes
+        CSR_MCYCLEH,
+        CSR_MINSTRETH,
+        CSR_MHPMCOUNTER3H,
+        CSR_MHPMCOUNTER4H,  CSR_MHPMCOUNTER5H,  CSR_MHPMCOUNTER6H,  CSR_MHPMCOUNTER7H,
+        CSR_MHPMCOUNTER8H,  CSR_MHPMCOUNTER9H,  CSR_MHPMCOUNTER10H, CSR_MHPMCOUNTER11H,
+        CSR_MHPMCOUNTER12H, CSR_MHPMCOUNTER13H, CSR_MHPMCOUNTER14H, CSR_MHPMCOUNTER15H,
+        CSR_MHPMCOUNTER16H, CSR_MHPMCOUNTER17H, CSR_MHPMCOUNTER18H, CSR_MHPMCOUNTER19H,
+        CSR_MHPMCOUNTER20H, CSR_MHPMCOUNTER21H, CSR_MHPMCOUNTER22H, CSR_MHPMCOUNTER23H,
+        CSR_MHPMCOUNTER24H, CSR_MHPMCOUNTER25H, CSR_MHPMCOUNTER26H, CSR_MHPMCOUNTER27H,
+        CSR_MHPMCOUNTER28H, CSR_MHPMCOUNTER29H, CSR_MHPMCOUNTER30H, CSR_MHPMCOUNTER31H: begin
+          mhpmcounterh_we[mhpmcounter_idx] = 1'b1;
+        end
+
+        CSR_CPUCTRL: begin
+          cpuctrl_d  = cpuctrl_wdata;
+          cpuctrl_we = 1'b1;
+        end
+
+        // CHERIoT-only CSRs: writable only when CHERIoT is built in and cheri_pmode_i is set
+        CSR_MSHWM:      mshwm_en  = CHERIoTEn & cheri_pmode_i;
+        CSR_MSHWMB:     mshwmb_en = CHERIoTEn & cheri_pmode_i;
+        CSR_CDBG_CTRL:  cdbg_ctrl_en = CHERIoTEn & cheri_pmode_i;
+
+        default:;
+      endcase
+    end
+
+    // exception controller gets priority over other writes
+    unique case (1'b1)
+
+      csr_save_cause_i: begin
+        // Select which pipeline stage's PC is saved; pc_id_i remains the default
+        unique case (1'b1)
+          csr_save_if_i: begin
+            exception_pc = pc_if_i;
+          end
+          csr_save_id_i: begin
+            exception_pc = pc_id_i;
+          end
+          csr_save_wb_i: begin
+            exception_pc = pc_wb_i;
+          end
+          default:;
+        endcase
+
+        // Any exception, including debug mode, causes a switch to M-mode
+        priv_lvl_d = PRIV_LVL_M;
+
+        if (debug_csr_save_i) begin
+          // all interrupts are masked
+          // do not update cause, epc, tval and status
+          dcsr_d.prv   = priv_lvl_q;
+          dcsr_d.cause = debug_cause_i;
+          dcsr_en      = 1'b1;
+          depc_d       = exception_pc;
+          depc_en      = 1'b1;
+        end else if (!debug_mode_i) begin
+          // In debug mode, "exceptions do not update any registers. That
+          // includes cause, epc, tval, dpc and mstatus." [Debug Spec v0.13.2, p.39]
+          mtval_en       = 1'b1;
+          mtval_d        = csr_mtval_i;
+          mstatus_en     = 1'b1;
+          mstatus_d.mie  = 1'b0; // disable interrupts
+          // save current status
+          mstatus_d.mpie = mstatus_q.mie;
+          mstatus_d.mpp  = priv_lvl_q;
+          mepc_en        = 1'b1;
+          mepc_d         = exception_pc;
+          mcause_en      = 1'b1;
+          mcause_d       = {csr_mcause_i};
+          // save previous status for recoverable NMI
+          mstack_en      = 1'b1;
+
+          // mcause_d[5] is the bit that reads back as mcause[31].
+          // NOTE(review): a clear bit presumably identifies a synchronous exception - confirm.
+          // A second such event while sync_exc_seen is still set is reported as a double fault.
+          if (!mcause_d[5]) begin
+            cpuctrl_we = 1'b1;
+
+            cpuctrl_d.sync_exc_seen = 1'b1;
+            if (cpuctrl_q.sync_exc_seen) begin
+              double_fault_seen_o         = 1'b1;
+              cpuctrl_d.double_fault_seen = 1'b1;
+            end
+          end
+        end
+      end // csr_save_cause_i
+
+      csr_restore_dret_i: begin // DRET
+        priv_lvl_d = dcsr_q.prv;
+      end // csr_restore_dret_i
+
+      csr_restore_mret_i: begin // MRET
+        priv_lvl_d     = mstatus_q.mpp;
+        mstatus_en     = 1'b1;
+        mstatus_d.mie  = mstatus_q.mpie; // re-enable interrupts
+
+        // merge in upstream change 9/7/2022  // LEC_NOT_COMPATIBLE
+        // MPRV is cleared when returning to a privilege level other than M-mode
+        if (mstatus_q.mpp != PRIV_LVL_M) begin
+          mstatus_d.mprv = 1'b0;
+        end
+
+        // MRET ends the window tracked by sync_exc_seen
+        cpuctrl_we              = 1'b1;
+        cpuctrl_d.sync_exc_seen = 1'b0;
+
+        if (nmi_mode_i) begin
+          // when returning from an NMI restore state from mstack CSR
+          mstatus_d.mpie = mstack_q.mpie;
+          mstatus_d.mpp  = mstack_q.mpp;
+          mepc_en        = 1'b1;
+          mepc_d         = mstack_epc_q;
+          mcause_en      = 1'b1;
+          mcause_d       = mstack_cause_q;
+        end else begin
+          // otherwise just set mstatus.MPIE/MPP
+          // See RISC-V Privileged Specification, version 1.11, Section 3.1.6.1
+          mstatus_d.mpie = 1'b1;
+          mstatus_d.mpp  = PRIV_LVL_U;
+        end
+      end // csr_restore_mret_i
+
+      default:;
+    endcase
+  end
+
+  // Update current priv level
+  // The core comes out of reset in M-mode; afterwards the level follows priv_lvl_d, which the
+  // write logic changes on exception entry, DRET and MRET.
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      priv_lvl_q     <= PRIV_LVL_M;
+    end else begin
+      priv_lvl_q     <= priv_lvl_d;
+    end
+  end
+
+  // Send current priv level to the decoder
+  assign priv_mode_id_o = priv_lvl_q;
+  // Load/store instructions must factor in MPRV for PMP checking
+  // (with mstatus.MPRV set the LSU sees mstatus.MPP instead of the current level)
+  assign priv_mode_lsu_o = mstatus_q.mprv ? mstatus_q.mpp : priv_lvl_q;
+
+  // CSR write data: plain write data by default, SET/CLEAR merge with the current read value
+  always_comb begin
+    csr_wdata_int = csr_wdata_i;
+
+    unique case (csr_op_i)
+      CSR_OP_SET:   csr_wdata_int = csr_rdata_o |  csr_wdata_i;
+      CSR_OP_CLEAR: csr_wdata_int = csr_rdata_o & ~csr_wdata_i;
+      default: ;  // CSR_OP_WRITE, CSR_OP_READ and any other value keep csr_wdata_i
+    endcase
+  end
+
+  assign csr_wr = (csr_op_i inside {CSR_OP_WRITE, CSR_OP_SET, CSR_OP_CLEAR});
+
+  // only write CSRs during one clock cycle
+
+  // Enforcing the CHERI CSR access policy.
+  //  - exceptions for ASR violations are generated in the controller.
+  //  - CSR writes need PERM_SR in PCC unless CHERIoTEn/cheri_pmode_i is off or in debug mode
+  //  - no need to gate csr_rdata for ASR violation since the instruction will be faulted anyway
+
+  // logic read_ok;
+  // assign read_ok = ~CHERIoTEn || ~cheri_pmode_i || debug_mode_i || pcc_cap_q.perms[PERM_SR] ||
+                   // ((csr_addr_i>=CSR_MCYCLE) && (csr_addr_i<=CSR_CDBG_CTRL));
+  assign csr_we_int  = csr_wr & csr_op_en_i & (~CHERIoTEn | ~cheri_pmode_i | debug_mode_i | pcc_cap_q.perms[PERM_SR]) & ~illegal_csr_insn_o;
+
+  //  assign csr_rdata_o = read_ok ? csr_rdata_int : 0;
+   assign csr_rdata_o = csr_rdata_int;
+
+  // Register contents that are exported unmodified
+  assign csr_mtvec_o = mtvec_q;
+  assign csr_depc_o  = depc_q;
+  assign csr_mepc_o  = mepc_q;
+
+  assign csr_mshwmb_o = mshwmb_q;
+  assign csr_mshwm_o  = mshwm_q;
+
+  assign debug_ebreaku_o     = dcsr_q.ebreaku;
+  assign debug_ebreakm_o     = dcsr_q.ebreakm;
+  assign debug_single_step_o = dcsr_q.step;
+  assign csr_mstatus_tw_o    = mstatus_q.tw;
+  assign csr_mstatus_mie_o   = mstatus_q.mie;
+
+  // Interrupt requests that are both pending (mip) and enabled (mie). Used by the controller and
+  // to re-enable the clock upon WFI, so this must be purely combinational.
+  assign irqs_o        = mie_q & mip;
+  assign irq_pending_o = |irqs_o;
+
+  ////////////////////////
+  // CSR instantiations //
+  ////////////////////////
+
+  // MSTATUS (reset state: MIE = 0, MPIE = 1, MPP = user, MPRV = 0, TW = 0)
+  localparam status_t MSTATUS_RST_VAL = '{mie:  1'b0,
+                                          mpie: 1'b1,
+                                          mpp:  PRIV_LVL_U,
+                                          mprv: 1'b0,
+                                          tw:   1'b0};
+
+  // adding set/clr of mie based on sentry type for CHERIoT (set wins if both are asserted)
+  logic    mstatus_en_combi;
+  status_t mstatus_d_combi;
+
+  assign mstatus_en_combi = mstatus_en | cheri_csr_clr_mie_i | cheri_csr_set_mie_i;
+
+  always_comb begin
+   mstatus_d_combi      = mstatus_d;
+   mstatus_d_combi.mie  = (mstatus_d.mie & ~cheri_csr_clr_mie_i) | cheri_csr_set_mie_i;
+  end
+
+  cheriot_csr #(
+    .Width     ($bits(status_t)),
+    .ShadowCopy(ShadowCSR),
+    .ResetValue({MSTATUS_RST_VAL})
+  ) u_mstatus_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i ({mstatus_d_combi}),
+    .wr_en_i   (mstatus_en_combi),
+    .rd_data_o (mstatus_q),
+    .rd_error_o(mstatus_err)
+  );
+
+  assign mepc_en_combi = mepc_en | mepc_en_cheri;
+  assign mepc_d_combi = ({32{mepc_en}} & mepc_d) | ({32{mepc_en_cheri}} & cheri_csr_wdata_i);
+
+  // MEPC (two write sources: mepc_en/mepc_d and mepc_en_cheri/cheri_csr_wdata_i, data is OR-ed)
+  cheriot_csr #(
+    .Width     (32),
+    .ShadowCopy(1'b0),
+    .ResetValue('0)
+  ) u_mepc_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i (mepc_d_combi),
+    .wr_en_i   (mepc_en_combi),
+    .rd_data_o (mepc_q),
+    .rd_error_o()
+  );
+
+  // MIE (next value is picked field by field from the CSR write data; resets to all-disabled)
+  assign mie_d.irq_software = csr_wdata_int[CSR_MSIX_BIT];
+  assign mie_d.irq_timer    = csr_wdata_int[CSR_MTIX_BIT];
+  assign mie_d.irq_external = csr_wdata_int[CSR_MEIX_BIT];
+  assign mie_d.irq_fast     = csr_wdata_int[CSR_MFIX_BIT_HIGH:CSR_MFIX_BIT_LOW];
+  cheriot_csr #(
+    .Width     ($bits(irqs_t)),
+    .ShadowCopy(1'b0),
+    .ResetValue('0)
+  ) u_mie_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i ({mie_d}),
+    .wr_en_i   (mie_en),
+    .rd_data_o (mie_q),
+    .rd_error_o()
+  );
+
+  // MSCRATCH (32-bit, written with the CSR write data when mscratch_en is set)
+  cheriot_csr #(
+    .Width     (32),
+    .ShadowCopy(1'b0),
+    .ResetValue('0)
+  ) u_mscratch_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i (csr_wdata_int),
+    .wr_en_i   (mscratch_en),
+    .rd_data_o (mscratch_q),
+    .rd_error_o()
+  );
+
+  // MCAUSE (stored as a 6-bit value)
+  cheriot_csr #(
+    .Width     (6),
+    .ShadowCopy(1'b0),
+    .ResetValue('0)
+  ) u_mcause_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i (mcause_d),
+    .wr_en_i   (mcause_en),
+    .rd_data_o (mcause_q),
+    .rd_error_o()
+  );
+
+  // MTVAL (32-bit, no shadow copy)
+  cheriot_csr #(
+    .Width     (32),
+    .ShadowCopy(1'b0),
+    .ResetValue('0)
+  ) u_mtval_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i (mtval_d),
+    .wr_en_i   (mtval_en),
+    .rd_data_o (mtval_q),
+    .rd_error_o()
+  );
+
+
+  // MTVEC write arbitration: CSR path (mtvec_en) or CHERI path (mtvec_en_cheri)
+  assign mtvec_en_combi = mtvec_en_cheri | mtvec_en;
+  // use only 2'b00 (direct mode) for CHERIoT
+  assign mtvec_d_combi = ({32{mtvec_en_cheri}} & {cheri_csr_wdata_i[31:2], 2'b00}) |
+                         ({32{mtvec_en}}       & mtvec_d);
+
+  // MTVEC
+  cheriot_csr #(
+    .ResetValue({32'd1}),   // retain this to make lec vs ibex pass
+    .ShadowCopy(ShadowCSR),
+    .Width     (32)
+  ) u_mtvec_csr (
+    .rst_ni    (rst_ni),
+    .clk_i     (clk_i),
+    .wr_en_i   (mtvec_en_combi),
+    .wr_data_i (mtvec_d_combi),
+    .rd_error_o(mtvec_err),
+    .rd_data_o (mtvec_q)
+  );
+
+  // DCSR (reset: standard xdebugver, no debug cause, prv = machine; every other field zero)
+  localparam dcsr_t DCSR_RESET_VAL = '{
+      xdebugver: XDEBUGVER_STD,
+      cause: DBG_CAUSE_NONE,  // 3'h0
+      prv: PRIV_LVL_M,
+      default: '0
+  };
+  cheriot_csr #(
+    .Width     ($bits(dcsr_t)),
+    .ShadowCopy(1'b0),
+    .ResetValue({DCSR_RESET_VAL})
+  ) u_dcsr_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i ({dcsr_d}),
+    .wr_en_i   (dcsr_en),
+    .rd_data_o (dcsr_q),
+    .rd_error_o()
+  );
+
+  assign depc_en_combi = depc_en | depc_en_cheri;
+  assign depc_d_combi = ({32{depc_en}} & depc_d) | ({32{depc_en_cheri}} & cheri_csr_wdata_i);
+
+  // DEPC (two write sources: depc_en/depc_d and depc_en_cheri/cheri_csr_wdata_i, data is OR-ed)
+  cheriot_csr #(
+    .Width     (32),
+    .ShadowCopy(1'b0),
+    .ResetValue('0)
+  ) u_depc_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i (depc_d_combi),
+    .wr_en_i   (depc_en_combi),
+    .rd_data_o (depc_q),
+    .rd_error_o()
+  );
+
+  assign dscratch0_en_combi = dscratch0_en | dscratch0_en_cheri;
+  assign dscratch0_d_combi = ({32{dscratch0_en}} & csr_wdata_int) | ({32{dscratch0_en_cheri}} & cheri_csr_wdata_i);
+
+  // DSCRATCH0 (written from the CSR path or from the CHERI path)
+  cheriot_csr #(
+    .Width     (32),
+    .ShadowCopy(1'b0),
+    .ResetValue('0)
+  ) u_dscratch0_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i (dscratch0_d_combi),
+    .wr_en_i   (dscratch0_en_combi),
+    .rd_data_o (dscratch0_q),
+    .rd_error_o()
+  );
+
+  assign dscratch1_en_combi = dscratch1_en | dscratch1_en_cheri;
+  assign dscratch1_d_combi = ({32{dscratch1_en}} & csr_wdata_int) | ({32{dscratch1_en_cheri}} & cheri_csr_wdata_i);
+
+  // DSCRATCH1 (written from the CSR path or from the CHERI path)
+  cheriot_csr #(
+    .Width     (32),
+    .ShadowCopy(1'b0),
+    .ResetValue('0)
+  ) u_dscratch1_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i (dscratch1_d_combi),
+    .wr_en_i   (dscratch1_en_combi),
+    .rd_data_o (dscratch1_q),
+    .rd_error_o()
+  );
+
+  // MSTACK (MPIE/MPP copy restored by MRET in NMI mode; all MSTACK* registers share mstack_en)
+  localparam status_stk_t MSTACK_RESET_VAL = '{mpie: 1'b1, mpp: PRIV_LVL_U};
+  cheriot_csr #(
+    .Width     ($bits(status_stk_t)),
+    .ShadowCopy(1'b0),
+    .ResetValue({MSTACK_RESET_VAL})
+  ) u_mstack_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i ({mstack_d}),
+    .wr_en_i   (mstack_en),
+    .rd_data_o (mstack_q),
+    .rd_error_o()
+  );
+
+  // MSTACK_EPC (value written back to mepc by MRET in NMI mode)
+  cheriot_csr #(
+    .Width     (32),
+    .ShadowCopy(1'b0),
+    .ResetValue('0)
+  ) u_mstack_epc_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i (mstack_epc_d),
+    .wr_en_i   (mstack_en),
+    .rd_data_o (mstack_epc_q),
+    .rd_error_o()
+  );
+
+  // MSTACK_CAUSE (value written back to mcause by MRET in NMI mode; 6 bits like MCAUSE)
+  cheriot_csr #(
+    .Width     (6),
+    .ShadowCopy(1'b0),
+    .ResetValue('0)
+  ) u_mstack_cause_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i (mstack_cause_d),
+    .wr_en_i   (mstack_en),
+    .rd_data_o (mstack_cause_q),
+    .rd_error_o()
+  );
+
+  // MSHWM and MSHWMB (hardware update via csr_mshwm_set_i has priority over CSR write data)
+  logic        mshwm_en_combi;
+  assign mshwm_en_combi = mshwm_en | csr_mshwm_set_i;
+  assign mshwm_d = csr_mshwm_set_i ? csr_mshwm_new_i : {csr_wdata_int[31:4], 4'h0};
+
+  if (CHERIoTEn) begin: g_mshwm
+    cheriot_csr #(
+      .Width     (32),
+      .ShadowCopy(ShadowCSR),
+      .ResetValue(32'd0)
+    ) u_mshwm_csr (
+      .clk_i     (clk_i),
+      .rst_ni    (rst_ni),
+      .wr_data_i (mshwm_d),
+      .wr_en_i   (mshwm_en_combi),
+      .rd_data_o (mshwm_q),
+      .rd_error_o()
+    );
+
+    cheriot_csr #(
+      .Width     (32),
+      .ShadowCopy(ShadowCSR),
+      .ResetValue(32'd0)
+    ) u_mshwmb_csr (
+      .clk_i     (clk_i),
+      .rst_ni    (rst_ni),
+      .wr_data_i ({csr_wdata_int[31:4], 4'h0}),
+      .wr_en_i   (mshwmb_en),
+      .rd_data_o (mshwmb_q),
+      .rd_error_o()
+    );
+
+    // cheri debug feature control (only bit 0 is writable, upper bits are written as zero)
+    cheriot_csr #(
+      .Width     (32),
+      .ShadowCopy(ShadowCSR),
+      .ResetValue(32'd0)
+    ) u_cdbg_ctrl_csr (
+      .clk_i     (clk_i),
+      .rst_ni    (rst_ni),
+      .wr_data_i ({31'h0, csr_wdata_int[0]}),
+      .wr_en_i   (cdbg_ctrl_en),
+      .rd_data_o (cdbg_ctrl_q),
+      .rd_error_o()
+    );
+
+    assign csr_dbg_tclr_fault_o = cdbg_ctrl_q[0];
+
+  end else begin : g_no_mshwm
+    assign mshwm_q  = 32'h0;
+    assign mshwmb_q = 32'h0;
+    assign cdbg_ctrl_q = '0;  // tie off: u_cdbg_ctrl_csr (its only visible driver) is absent here
+    assign csr_dbg_tclr_fault_o = 1'b0;
+  end
+
+  // -----------------
+  // PMP registers
+  // -----------------
+
+  if (PMPEnable) begin : g_pmp_registers
+    // PMP reset values (the *_rst constants used below are presumably defined by the include)
+    `ifdef CHERIOT_CUSTOM_PMP_RESET_VALUES
+      `include "cheriot_pmp_reset.svh"
+    `else
+      `include "cheriot_pmp_reset_default.svh"
+    `endif
+
+    pmp_mseccfg_t                pmp_mseccfg_q, pmp_mseccfg_d;
+    logic                        pmp_mseccfg_we;
+    logic                        pmp_mseccfg_err;
+    pmp_cfg_t                    pmp_cfg         [PMPNumRegions];
+    logic [PMPNumRegions-1:0]    pmp_cfg_locked;
+    pmp_cfg_t                    pmp_cfg_wdata   [PMPNumRegions];
+    logic [PMPAddrWidth-1:0]     pmp_addr        [PMPNumRegions];
+    logic [PMPNumRegions-1:0]    pmp_cfg_we;
+    logic [PMPNumRegions-1:0]    pmp_cfg_err;
+    logic [PMPNumRegions-1:0]    pmp_addr_we;
+    logic [PMPNumRegions-1:0]    pmp_addr_err;
+    logic                        any_pmp_entry_locked;
+
+    // Expanded / qualified register read data
+    for (genvar i = 0; i < PMP_MAX_REGIONS; i++) begin : g_exp_rd_data
+      if (i < PMPNumRegions) begin : g_implemented_regions
+        // Add in zero padding for reserved fields
+        assign pmp_cfg_rdata[i] = {pmp_cfg[i].lock, 2'b00, pmp_cfg[i].mode,
+                                   pmp_cfg[i].exec, pmp_cfg[i].write, pmp_cfg[i].read};
+
+        // Address field read data depends on the current programmed mode and the granularity
+        // See RISC-V Privileged Specification, version 1.11, Section 3.6.1
+        if (PMPGranularity == 0) begin : g_pmp_g0
+          // If G == 0, read data is unmodified
+          assign pmp_addr_rdata[i] = pmp_addr[i];
+
+        end else if (PMPGranularity == 1) begin : g_pmp_g1
+          // If G == 1, bit [G-1] reads as zero in TOR or OFF mode
+          always_comb begin
+            pmp_addr_rdata[i] = pmp_addr[i];
+            if ((pmp_cfg[i].mode == PMP_MODE_OFF) || (pmp_cfg[i].mode == PMP_MODE_TOR)) begin
+              pmp_addr_rdata[i][PMPGranularity-1:0] = '0;
+            end
+          end
+
+        end else begin : g_pmp_g2
+          // For G >= 2, bits are masked to one or zero depending on the mode
+          always_comb begin
+            // In NAPOT mode, bits [G-2:0] must read as one
+            pmp_addr_rdata[i] = {pmp_addr[i], {PMPGranularity - 1{1'b1}}};
+
+            if ((pmp_cfg[i].mode == PMP_MODE_OFF) || (pmp_cfg[i].mode == PMP_MODE_TOR)) begin
+              // In TOR or OFF mode, bits [G-1:0] must read as zero
+              pmp_addr_rdata[i][PMPGranularity-1:0] = '0;
+            end
+          end
+        end
+
+      end else begin : g_other_regions
+        // Non-implemented regions read as zero
+        assign pmp_cfg_rdata[i]  = '0;
+        assign pmp_addr_rdata[i] = '0;
+      end
+    end
+
+    // Write data calculation and register instantiation, one iteration per implemented region
+    for (genvar i = 0; i < PMPNumRegions; i++) begin : g_pmp_csrs
+      // -------------------------
+      // Instantiate cfg registers
+      // -------------------------
+      assign pmp_cfg_we[i] = csr_we_int & ~pmp_cfg_locked[i] &
+                             (csr_addr == (CSR_OFF_PMP_CFG + (i[11:0] >> 2)));
+
+      // Select the correct WDATA (each CSR contains 4 CFG fields, each with 2 RES bits)
+      assign pmp_cfg_wdata[i].lock  = csr_wdata_int[(i%4)*PMP_CFG_W+7];
+      // NA4 mode is not selectable when G > 0, mode is treated as OFF
+      always_comb begin
+        unique case (csr_wdata_int[(i%4)*PMP_CFG_W+3+:2])
+          2'b00   : pmp_cfg_wdata[i].mode = PMP_MODE_OFF;
+          2'b01   : pmp_cfg_wdata[i].mode = PMP_MODE_TOR;
+          2'b10   : pmp_cfg_wdata[i].mode = (PMPGranularity == 0) ? PMP_MODE_NA4:
+                                                                    PMP_MODE_OFF;
+          2'b11   : pmp_cfg_wdata[i].mode = PMP_MODE_NAPOT;
+          default : pmp_cfg_wdata[i].mode = PMP_MODE_OFF;
+        endcase
+      end
+      assign pmp_cfg_wdata[i].exec  = csr_wdata_int[(i%4)*PMP_CFG_W+2];
+      // When MSECCFG.MML is unset, W = 1, R = 0 is a reserved combination, so force W to 0 if R ==
+      // 0. Otherwise allow all possible values to be written.
+      assign pmp_cfg_wdata[i].write = pmp_mseccfg_q.mml ? csr_wdata_int[(i%4)*PMP_CFG_W+1] :
+                                                          &csr_wdata_int[(i%4)*PMP_CFG_W+:2];
+      assign pmp_cfg_wdata[i].read  = csr_wdata_int[(i%4)*PMP_CFG_W];
+
+      cheriot_csr #(
+        .Width     ($bits(pmp_cfg_t)),
+        .ShadowCopy(ShadowCSR),
+        .ResetValue(pmp_cfg_rst[i])
+      ) u_pmp_cfg_csr (
+        .clk_i     (clk_i),
+        .rst_ni    (rst_ni),
+        .wr_data_i ({pmp_cfg_wdata[i]}),
+        .wr_en_i   (pmp_cfg_we[i]),
+        .rd_data_o (pmp_cfg[i]),
+        .rd_error_o(pmp_cfg_err[i])
+      );
+
+      // MSECCFG.RLB allows the lock bit to be bypassed (allowing cfg writes when MSECCFG.RLB is
+      // set).
+      assign pmp_cfg_locked[i] = pmp_cfg[i].lock & ~pmp_mseccfg_q.rlb;
+
+      // --------------------------
+      // Instantiate addr registers
+      // --------------------------
+      if (i < PMPNumRegions - 1) begin : g_lower
+        assign pmp_addr_we[i] = csr_we_int & ~pmp_cfg_locked[i] &
+                                (~pmp_cfg_locked[i+1] | (pmp_cfg[i+1].mode != PMP_MODE_TOR)) &
+                                (csr_addr == (CSR_OFF_PMP_ADDR + i[11:0]));
+      end else begin : g_upper
+        assign pmp_addr_we[i] = csr_we_int & ~pmp_cfg_locked[i] &
+                                (csr_addr == (CSR_OFF_PMP_ADDR + i[11:0]));
+      end
+
+      cheriot_csr #(
+        .Width     (PMPAddrWidth),
+        .ShadowCopy(ShadowCSR),
+        .ResetValue(pmp_addr_rst[i][33-:PMPAddrWidth])
+      ) u_pmp_addr_csr (
+        .clk_i     (clk_i),
+        .rst_ni    (rst_ni),
+        .wr_data_i (csr_wdata_int[31-:PMPAddrWidth]),
+        .wr_en_i   (pmp_addr_we[i]),
+        .rd_data_o (pmp_addr[i]),
+        .rd_error_o(pmp_addr_err[i])
+      );
+
+      `ASSERT_INIT(PMPAddrRstLowBitsZero_A, pmp_addr_rst[i][33-PMPAddrWidth:0] == '0)
+
+      assign csr_pmp_cfg_o[i]  = pmp_cfg[i];
+      assign csr_pmp_addr_o[i] = {pmp_addr_rdata[i], 2'b00};
+    end
+
+    assign pmp_mseccfg_we = csr_we_int & (csr_addr == CSR_MSECCFG);
+
+    // MSECCFG.MML/MSECCFG.MMWP cannot be unset once set
+    assign pmp_mseccfg_d.mml  = pmp_mseccfg_q.mml  ? 1'b1 : csr_wdata_int[CSR_MSECCFG_MML_BIT];
+    assign pmp_mseccfg_d.mmwp = pmp_mseccfg_q.mmwp ? 1'b1 : csr_wdata_int[CSR_MSECCFG_MMWP_BIT];
+
+    // pmp_cfg_locked factors in MSECCFG.RLB so any_pmp_entry_locked will only be set if MSECCFG.RLB
+    // is unset
+    assign any_pmp_entry_locked = |pmp_cfg_locked;
+
+    // When any PMP entry is locked (A PMP entry has the L bit set and MSECCFG.RLB is unset),
+    // MSECCFG.RLB cannot be set again
+    assign pmp_mseccfg_d.rlb = any_pmp_entry_locked ? 1'b0 : csr_wdata_int[CSR_MSECCFG_RLB_BIT];
+
+    cheriot_csr #(
+      .Width     ($bits(pmp_mseccfg_t)),
+      .ShadowCopy(ShadowCSR),
+      .ResetValue(pmp_mseccfg_rst)
+    ) u_pmp_mseccfg (
+      .clk_i     (clk_i),
+      .rst_ni    (rst_ni),
+      .wr_data_i (pmp_mseccfg_d),
+      .wr_en_i   (pmp_mseccfg_we),
+      .rd_data_o (pmp_mseccfg_q),
+      .rd_error_o(pmp_mseccfg_err)
+    );
+
+    assign pmp_csr_err = (|pmp_cfg_err) | (|pmp_addr_err) | pmp_mseccfg_err;
+    assign pmp_mseccfg = pmp_mseccfg_q;
+
+  end else begin : g_no_pmp_tieoffs
+    // Generate tieoffs when PMP is not configured (read data, outputs, error flag and mseccfg)
+    for (genvar i = 0; i < PMP_MAX_REGIONS; i++) begin : g_rdata
+      assign pmp_addr_rdata[i] = '0;
+      assign pmp_cfg_rdata[i]  = '0;
+    end
+    for (genvar i = 0; i < PMPNumRegions; i++) begin : g_outputs
+      assign csr_pmp_cfg_o[i]  = pmp_cfg_t'(1'b0);
+      assign csr_pmp_addr_o[i] = '0;
+    end
+    assign pmp_csr_err = 1'b0;
+    assign pmp_mseccfg = '0;
+  end
+
+  assign csr_pmp_mseccfg_o = pmp_mseccfg;
+
+  //////////////////////////
+  //  Performance monitor //
+  //////////////////////////
+
+  // update enable signals
+  always_comb begin : mcountinhibit_update
+    // hold the registered value unless the CSR is being written
+    mcountinhibit_d = mcountinhibit_q;
+    if (mcountinhibit_we == 1'b1) begin
+      // bit 1 must always be 0
+      mcountinhibit_d = {csr_wdata_int[MHPMCounterNum+2:2], 1'b0, csr_wdata_int[0]};
+    end
+  end
+
+  // event selection (hardwired) & control; entries 13..31 keep the default and never increment
+  always_comb begin : gen_mhpmcounter_incr
+
+    // Assign inactive counters (first to prevent latch inference)
+    for (int unsigned i = 0; i < 32; i++) begin : gen_mhpmcounter_incr_inactive
+      mhpmcounter_incr[i] = 1'b0;
+    end
+
+    // When adding or altering performance counter meanings and default
+    // mappings please update dv/verilator/pcount/cpp/ibex_pcounts.cc
+    // appropriately.
+    //
+    // active counters
+    mhpmcounter_incr[0]  = 1'b1;                   // mcycle
+    mhpmcounter_incr[1]  = 1'b0;                   // reserved
+    mhpmcounter_incr[2]  = instr_ret_i;            // minstret
+    mhpmcounter_incr[3]  = dside_wait_i;           // cycles waiting for data memory
+    mhpmcounter_incr[4]  = iside_wait_i;           // cycles waiting for instr fetches
+    mhpmcounter_incr[5]  = mem_load_i;             // num of loads
+    mhpmcounter_incr[6]  = mem_store_i;            // num of stores
+    mhpmcounter_incr[7]  = jump_i;                 // num of jumps (unconditional)
+    mhpmcounter_incr[8]  = branch_i;               // num of branches (conditional)
+    mhpmcounter_incr[9]  = branch_taken_i;         // num of taken branches (conditional)
+    mhpmcounter_incr[10] = instr_ret_compressed_i; // num of compressed instr
+    mhpmcounter_incr[11] = mul_wait_i;             // cycles waiting for multiply
+    mhpmcounter_incr[12] = div_wait_i;             // cycles waiting for divide
+  end
+
+  // event selector (hardwired, 0 means no event)
+  always_comb begin : gen_mhpmevent
+
+    for (int unsigned idx = 0; idx < 32; idx++) begin : gen_mhpmevent_sel
+      // default: no event
+      mhpmevent[idx] = '0;
+
+      // An implemented counter selects the event with its own index (one-hot). Counter 1 is
+      // reserved (not existing) and counters from 3 + MHPMCounterNum upwards are not
+      // implemented, so both keep the all-zero selector.
+      if ((idx != 1) && (idx < 3 + MHPMCounterNum)) begin
+        mhpmevent[idx][idx] = 1'b1;
+      end
+    end
+  end
+
+  // mcycle (64-bit; increment request is constant 1, gated only by mcountinhibit[0])
+  cheriot_counter #(
+    .CounterWidth(64)
+  ) mcycle_counter_i (
+    .clk_i(clk_i),
+    .rst_ni(rst_ni),
+    .counter_inc_i(mhpmcounter_incr[0] & ~mcountinhibit[0]),
+    .counterh_we_i(mhpmcounterh_we[0]),
+    .counter_we_i(mhpmcounter_we[0]),
+    .counter_val_i(csr_wdata_int),
+    .counter_val_o(mhpmcounter[0]),
+    .counter_val_upd_o()
+  );
+
+
+  // minstret (64-bit; counts instr_ret_i unless inhibited, also outputs minstret_next)
+  cheriot_counter #(
+    .CounterWidth(64),
+    .ProvideValUpd(1)
+  ) minstret_counter_i (
+    .clk_i(clk_i),
+    .rst_ni(rst_ni),
+    .counter_inc_i(mhpmcounter_incr[2] & ~mcountinhibit[2]),
+    .counterh_we_i(mhpmcounterh_we[2]),
+    .counter_we_i(mhpmcounter_we[2]),
+    .counter_val_i(csr_wdata_int),
+    .counter_val_o(minstret_raw),
+    .counter_val_upd_o(minstret_next)
+  );
+
+  // Where the writeback stage is present instruction in ID observing value of minstret must take
+  // into account any instruction in the writeback stage. If one is present the incremented value of
+  // minstret is used. A speculative version of the signal is used to aid timing. When the writeback
+  // stage sees an exception (so the speculative signal is incorrect) the ID stage will be flushed
+  // so the incorrect value doesn't matter. A similar behaviour is required for the compressed
+  // instruction retired counter below. When the writeback stage isn't present the speculative
+  // signals are always 0.
+  assign mhpmcounter[2] = instr_ret_spec_i & ~mcountinhibit[2] ? minstret_next : minstret_raw;
+
+  // reserved: counter 1 is not implemented and reads as zero; its controls are lint tie-offs
+  assign mhpmcounter[1]            = '0;
+  assign unused_mhpmcounter_we_1   = mhpmcounter_we[1];
+  assign unused_mhpmcounterh_we_1  = mhpmcounterh_we[1];
+  assign unused_mhpmcounter_incr_1 = mhpmcounter_incr[1];
+
+  // Iterate through optionally included counters (MHPMCounterNum controls how many are included)
+  for (genvar i = 0; i < 29; i++) begin : gen_cntrs
+    localparam int Cnt = i + 3;  // index into the mhpmcounter* / mcountinhibit arrays
+
+    if (i < MHPMCounterNum) begin : gen_imp
+      logic [63:0] mhpmcounter_raw, mhpmcounter_next;
+
+      cheriot_counter #(
+        .CounterWidth(MHPMCounterWidth),
+        .ProvideValUpd(Cnt == 10)
+      ) mcounters_variable_i (
+        .clk_i(clk_i),
+        .rst_ni(rst_ni),
+        .counter_inc_i(mhpmcounter_incr[Cnt] & ~mcountinhibit[Cnt]),
+        .counterh_we_i(mhpmcounterh_we[Cnt]),
+        .counter_we_i(mhpmcounter_we[Cnt]),
+        .counter_val_i(csr_wdata_int),
+        .counter_val_o(mhpmcounter_raw),
+        .counter_val_upd_o(mhpmcounter_next)
+      );
+
+      if (Cnt == 10) begin : gen_compressed_instr_cnt
+        // Special behaviour for reading compressed instruction retired counter, see comment on
+        // `mhpmcounter[2]` above for further information.
+        assign mhpmcounter[Cnt] =
+          instr_ret_compressed_spec_i & ~mcountinhibit[Cnt] ? mhpmcounter_next:
+                                                              mhpmcounter_raw;
+      end else begin : gen_other_cnts
+        logic [63:0] unused_mhpmcounter_next;
+        // All other counters just see the raw counter value directly.
+        assign mhpmcounter[Cnt] = mhpmcounter_raw;
+        assign unused_mhpmcounter_next = mhpmcounter_next;
+      end
+    end else begin : gen_unimp
+      assign mhpmcounter[Cnt] = '0;  // counters that are not implemented read as zero
+
+      if (Cnt == 10) begin : gen_no_compressed_instr_cnt
+        logic unused_instr_ret_compressed_spec_i;
+        assign unused_instr_ret_compressed_spec_i = instr_ret_compressed_spec_i;
+      end
+    end
+  end
+
+  if (MHPMCounterNum < 29) begin : g_mcountinhibit_reduced
+    logic [29-MHPMCounterNum-1:0] unused_mhphcounter_we;
+    logic [29-MHPMCounterNum-1:0] unused_mhphcounterh_we;
+    logic [29-MHPMCounterNum-1:0] unused_mhphcounter_incr;
+    // Inhibit bits of counters that are not implemented are forced to 1
+    assign mcountinhibit = {{29 - MHPMCounterNum{1'b1}}, mcountinhibit_q};
+    // Lint tieoffs for unused bits
+    assign unused_mhphcounter_we   = mhpmcounter_we[31:MHPMCounterNum+3];
+    assign unused_mhphcounterh_we  = mhpmcounterh_we[31:MHPMCounterNum+3];
+    assign unused_mhphcounter_incr = mhpmcounter_incr[31:MHPMCounterNum+3];
+  end else begin : g_mcountinhibit_full
+    assign mcountinhibit = mcountinhibit_q;
+  end
+  // mcountinhibit register (resets to all zero, i.e. no counter inhibited by the register)
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      mcountinhibit_q <= '0;
+    end else begin
+      mcountinhibit_q <= mcountinhibit_d;
+    end
+  end
+
+  /////////////////////////////
+  // Debug trigger registers //
+  /////////////////////////////
+
+  if (DbgTriggerEn) begin : gen_trigger_regs
+    localparam int unsigned DbgHwNumLen = DbgHwBreakNum > 1 ? $clog2(DbgHwBreakNum) : 1;
+    localparam int unsigned MaxTselect = DbgHwBreakNum - 1;
+
+    // Register values
+    logic [DbgHwNumLen-1:0]   tselect_d, tselect_q;
+    logic                     tmatch_control_d;
+    logic [DbgHwBreakNum-1:0] tmatch_control_q;
+    logic [31:0]              tmatch_value_d;
+    logic [31:0]              tmatch_value_q[DbgHwBreakNum];
+    logic                     selected_tmatch_control;
+    logic [31:0]              selected_tmatch_value;
+
+    // Write enables
+    logic                     tselect_we;
+    logic [DbgHwBreakNum-1:0] tmatch_control_we;
+    logic [DbgHwBreakNum-1:0] tmatch_value_we;
+    // Trigger comparison result
+    logic [DbgHwBreakNum-1:0] trigger_match;
+
+    // Write select (all trigger registers are only writable in debug mode)
+    assign tselect_we = csr_we_int & debug_mode_i & (csr_addr_i == CSR_TSELECT);
+    for (genvar i = 0; i < DbgHwBreakNum; i++) begin : g_dbg_tmatch_we
+      assign tmatch_control_we[i] = (i[DbgHwNumLen-1:0] == tselect_q) & csr_we_int & debug_mode_i &
+                                    (csr_addr_i == CSR_TDATA1);
+      assign tmatch_value_we[i]   = (i[DbgHwNumLen-1:0] == tselect_q) & csr_we_int & debug_mode_i &
+                                    (csr_addr_i == CSR_TDATA2);
+    end
+
+    // Debug interface tests the available number of triggers by writing and reading the trigger
+    // select register. Only allow changes to the register if it is within the supported region.
+    assign tselect_d = (csr_wdata_int < DbgHwBreakNum) ? csr_wdata_int[DbgHwNumLen-1:0] :
+                                                         MaxTselect[DbgHwNumLen-1:0];
+
+    // tmatch_control is enabled when the execute bit is set
+    assign tmatch_control_d = csr_wdata_int[2];
+    assign tmatch_value_d   = csr_wdata_int[31:0];
+
+    // Registers
+    cheriot_csr #(
+      .Width     (DbgHwNumLen),
+      .ShadowCopy(1'b0),
+      .ResetValue('0)
+    ) u_tselect_csr (
+      .clk_i     (clk_i),
+      .rst_ni    (rst_ni),
+      .wr_data_i (tselect_d),
+      .wr_en_i   (tselect_we),
+      .rd_data_o (tselect_q),
+      .rd_error_o()
+    );
+
+    for (genvar i = 0; i < DbgHwBreakNum; i++) begin : g_dbg_tmatch_reg
+      cheriot_csr #(
+        .Width     (1),
+        .ShadowCopy(1'b0),
+        .ResetValue('0)
+      ) u_tmatch_control_csr (
+        .clk_i     (clk_i),
+        .rst_ni    (rst_ni),
+        .wr_data_i (tmatch_control_d),
+        .wr_en_i   (tmatch_control_we[i]),
+        .rd_data_o (tmatch_control_q[i]),
+        .rd_error_o()
+      );
+
+      cheriot_csr #(
+        .Width     (32),
+        .ShadowCopy(1'b0),
+        .ResetValue('0)
+      ) u_tmatch_value_csr (
+        .clk_i     (clk_i),
+        .rst_ni    (rst_ni),
+        .wr_data_i (tmatch_value_d),
+        .wr_en_i   (tmatch_value_we[i]),
+        .rd_data_o (tmatch_value_q[i]),
+        .rd_error_o()
+      );
+    end
+
+    // Assign read data
+    // TSELECT - number of supported triggers defined by parameter DbgHwBreakNum
+    localparam int unsigned TSelectRdataPadlen = DbgHwNumLen >= 32 ? 0 : (32 - DbgHwNumLen);
+    assign tselect_rdata = {{TSelectRdataPadlen{1'b0}}, tselect_q};
+
+    if (DbgHwBreakNum > 1) begin : g_dbg_tmatch_multiple_select
+      assign selected_tmatch_control = tmatch_control_q[tselect_q];
+      assign selected_tmatch_value   = tmatch_value_q[tselect_q];
+    end else begin : g_dbg_tmatch_single_select
+      assign selected_tmatch_control = tmatch_control_q[0];
+      assign selected_tmatch_value   = tmatch_value_q[0];
+    end
+
+    // TDATA1 (written via CSR_TDATA1 above) - only support simple address matching
+    assign tmatch_control_rdata = {4'h2,                    // type    : address/data match
+                                   1'b1,                    // dmode   : access from D mode only
+                                   6'h00,                   // maskmax : exact match only
+                                   1'b0,                    // hit     : not supported
+                                   1'b0,                    // select  : address match only
+                                   1'b0,                    // timing  : match before execution
+                                   2'b00,                   // sizelo  : match any access
+                                   4'h1,                    // action  : enter debug mode
+                                   1'b0,                    // chain   : not supported
+                                   4'h0,                    // match   : simple match
+                                   1'b1,                    // m       : match in m-mode
+                                   1'b0,                    // 0       : zero
+                                   1'b0,                    // s       : not supported
+                                   1'b1,                    // u       : match in u-mode
+                                   selected_tmatch_control, // execute : match instruction address
+                                   1'b0,                    // store   : not supported
+                                   1'b0};                   // load    : not supported
+
+    // TDATA2 (written via CSR_TDATA2 above) - address match value only
+    assign tmatch_value_rdata = selected_tmatch_value;
+
+    // Breakpoint matching
+    // We match against the next address, as the breakpoint must be taken before execution
+    for (genvar i = 0; i < DbgHwBreakNum; i++) begin : g_dbg_trigger_match
+      assign trigger_match[i] = tmatch_control_q[i] & (pc_if_i[31:0] == tmatch_value_q[i]);
+    end
+    assign trigger_match_o = |trigger_match;
+
+  end else begin : gen_no_trigger_regs
+    assign tselect_rdata        = 'b0;
+    assign tmatch_control_rdata = 'b0;
+    assign tmatch_value_rdata   = 'b0;
+    assign trigger_match_o      = 'b0;
+  end
+
+  //////////////////////////
+  // CPU control register //
+  //////////////////////////
+
+  // Cast register write data
+  assign cpuctrl_wdata_raw = cpu_ctrl_t'(csr_wdata_int[$bits(cpu_ctrl_t)-1:0]);
+
+  // Generate fixed time execution bit
+  if (DataIndTiming) begin : gen_dit
+    assign cpuctrl_wdata.data_ind_timing = cpuctrl_wdata_raw.data_ind_timing;
+
+  end else begin : gen_no_dit
+    // tieoff for the unused bit
+    logic unused_dit;
+    assign unused_dit = cpuctrl_wdata_raw.data_ind_timing;
+
+    // field will always read as zero if not configured
+    assign cpuctrl_wdata.data_ind_timing = 1'b0;
+  end
+
+  assign data_ind_timing_o = cpuctrl_q.data_ind_timing;
+
+  // Dummy instruction fields: the written values are only honoured when DummyInstructions is set
+  if (DummyInstructions) begin : gen_dummy
+    assign cpuctrl_wdata.dummy_instr_en   = cpuctrl_wdata_raw.dummy_instr_en;
+    assign cpuctrl_wdata.dummy_instr_mask = cpuctrl_wdata_raw.dummy_instr_mask;
+
+    // Signal a write to the seed register; the seed value is the CSR write data itself
+    assign dummy_instr_seed_en_o = csr_we_int && (csr_addr == CSR_SECURESEED);
+    assign dummy_instr_seed_o    = csr_wdata_int;
+
+  end else begin : gen_no_dummy
+    // tie-offs for the unused bits of the raw write data
+    logic       unused_dummy_en;
+    logic [2:0] unused_dummy_mask;
+    assign unused_dummy_en   = cpuctrl_wdata_raw.dummy_instr_en;
+    assign unused_dummy_mask = cpuctrl_wdata_raw.dummy_instr_mask;
+
+    // write values are forced to zero and the seed interface is held inactive if not configured
+    assign cpuctrl_wdata.dummy_instr_en   = 1'b0;
+    assign cpuctrl_wdata.dummy_instr_mask = 3'b000;
+    assign dummy_instr_seed_en_o      = 1'b0;
+    assign dummy_instr_seed_o         = '0;
+  end
+
+  assign dummy_instr_en_o   = cpuctrl_q.dummy_instr_en;    // driven from the registered value
+  assign dummy_instr_mask_o = cpuctrl_q.dummy_instr_mask;  // driven from the registered value
+
+  // icache_enable field: the written value is only honoured when the ICache parameter is set
+  if (ICache) begin : gen_icache_enable
+    assign cpuctrl_wdata.icache_enable = cpuctrl_wdata_raw.icache_enable;
+  end else begin : gen_no_icache
+    // tieoff for the unused icen bit
+    logic unused_icen;
+    assign unused_icen = cpuctrl_wdata_raw.icache_enable;
+
+    // icen write value is forced to zero if ICache not configured
+    assign cpuctrl_wdata.icache_enable = 1'b0;
+  end
+
+  assign cpuctrl_wdata.double_fault_seen = cpuctrl_wdata_raw.double_fault_seen;  // not param-gated
+  assign cpuctrl_wdata.sync_exc_seen     = cpuctrl_wdata_raw.sync_exc_seen;      // not param-gated
+
+  assign icache_enable_o = cpuctrl_q.icache_enable;  // driven from the registered value
+
+  cheriot_csr #(                         // storage for the CPU control register (cpuctrl_q)
+    .Width     ($bits(cpu_ctrl_t)),
+    .ShadowCopy(ShadowCSR),
+    .ResetValue('0)
+  ) u_cpuctrl_csr (
+    .clk_i     (clk_i),
+    .rst_ni    (rst_ni),
+    .wr_data_i ({cpuctrl_d}),            // concatenation passes the value as a plain bit vector
+    .wr_en_i   (cpuctrl_we),
+    .rd_data_o (cpuctrl_q),
+    .rd_error_o(cpuctrl_err)
+  );
+
+  assign csr_shadow_err_o = mstatus_err | mtvec_err | pmp_csr_err | cpuctrl_err;  // OR of error flags
+
+  ////////////////
+  // Assertions //
+  ////////////////
+  // csr_op_en_i may only be asserted together with csr_access_i
+  `ASSERT(IbexCsrOpEnRequiresAccess, csr_op_en_i |-> csr_access_i)
+
+  //////////////////////
+  // Cheriot SCR's
+  //////////////////////
+
+  if (CHERIoTEn) begin: gen_scr
+    reg_cap_t     pcc_exc_cap;     // pcc2mepcc() result; loaded into mepc_cap/depc_cap on cause save
+    reg_cap_t     mtdc_cap;        // MTDC: capability fields
+    logic [31:0]  mtdc_data;       // MTDC: 32-bit data word
+    reg_cap_t     mscratchc_cap;   // MSCRATCHC: capability fields
+    logic [31:0]  mscratchc_data;  // note this is separate from legacy mscratch
+
+
+    logic mtdc_en_cheri, mscratchc_en_cheri;  // write strobes for MTDC / MSCRATCHC
+
+    // SCR read mux: combinationally returns the 32-bit word (cheri_csr_rdata_o) and the
+    // capability fields (cheri_csr_rcap_o) of the special capability register selected by
+    // cheri_csr_addr_i.
+    always_comb begin
+      // Fallback for any address not listed below: zero word and NULL capability.
+      cheri_csr_rdata_o = 32'h0;
+      cheri_csr_rcap_o  = NULL_REG_CAP;
+
+      case (cheri_csr_addr_i)
+        // Debug SCRs return their contents only while debug_mode_i is set.
+        CHERI_SCR_DEPCC: begin
+          cheri_csr_rdata_o = debug_mode_i ? depc_q : 0;
+          cheri_csr_rcap_o  = debug_mode_i ? depc_cap : NULL_REG_CAP;
+        end
+        CHERI_SCR_DSCRATCHC0: begin
+          cheri_csr_rdata_o = debug_mode_i ? dscratch0_q : 0;
+          cheri_csr_rcap_o  = debug_mode_i ? dscratch0_cap : NULL_REG_CAP;
+        end
+        CHERI_SCR_DSCRATCHC1: begin
+          cheri_csr_rdata_o = debug_mode_i ? dscratch1_q : 0;
+          cheri_csr_rcap_o  = debug_mode_i ? dscratch1_cap : NULL_REG_CAP;
+        end
+
+        // The remaining SCRs are returned without any debug-mode gating in this mux.
+        CHERI_SCR_MTCC: begin
+          cheri_csr_rdata_o = mtvec_q;
+          cheri_csr_rcap_o  = mtvec_cap;
+        end
+        CHERI_SCR_MTDC: begin
+          cheri_csr_rdata_o = mtdc_data;
+          cheri_csr_rcap_o  = mtdc_cap;
+        end
+        CHERI_SCR_MSCRATCHC: begin
+          cheri_csr_rdata_o = mscratchc_data;
+          cheri_csr_rcap_o  = mscratchc_cap;
+        end
+        CHERI_SCR_MEPCC: begin
+          cheri_csr_rdata_o = mepc_q;
+          cheri_csr_rcap_o  = mepc_cap;
+        end
+
+        default: ;  // keep the fallback values assigned above
+      endcase
+    end
+
+    assign pcc_cap_o = pcc_cap_q;   // export the registered PCC capability
+
+    assign pcc_exc_cap = pcc2mepcc(pcc_cap_q, exception_pc, csr_mepcc_clrtag_i);  // saved on traps
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin   // PCC capability register
+      if (!rst_ni) begin
+        pcc_cap_q  <= PCC_RESET_CAP;
+      end else begin
+        pcc_cap_q  <= pcc_cap_d;    // no enable here; pcc_cap_d is computed combinationally
+      end
+    end
+
+    // PCC updating
+    //  -- PC address range checking is always against the pcc_cap, which is only updated with
+    //     CHERI CJALR or exceptions. Legacy RV32 jumps/branches can change PC but not the PCC
+    //     bounds/perms, so they are still limited by the original bounds in IF stage checking
+    always_comb begin
+      full_cap_t   scr_full_cap;
+      reg_cap_t    scr_cap;
+      logic [31:0] scr_addr;
+      logic        reload_from_scr;
+
+      // Pick the SCR to reload from; a NULL capability at address 0 when none applies.
+      scr_cap  = NULL_REG_CAP;
+      scr_addr = 32'h0;
+      if (csr_save_cause_i) begin                             // cause save: trap vector
+        scr_cap  = mtvec_cap;
+        scr_addr = mtvec_q;
+      end else if (csr_restore_mret_i) begin                  // MRET restore: mepc
+        scr_cap  = mepc_cap;
+        scr_addr = mepc_q;
+      end else if (csr_restore_dret_i & debug_mode_i) begin   // DRET restore in debug mode: depc
+        scr_cap  = depc_cap;
+        scr_addr = depc_q;
+      end
+
+      scr_full_cap    = reg2fullcap(scr_cap, scr_addr);
+      reload_from_scr = csr_save_cause_i | csr_restore_mret_i | (csr_restore_dret_i & debug_mode_i);
+
+      pcc_cap_d = pcc_cap_q;                     // hold the current PCC by default
+      if (reload_from_scr) begin                 // cause save / restore has priority
+        pcc_cap_d = full2pcap(scr_full_cap);
+      end else if (cheri_branch_req_i) begin     // CHERI branch request loads pcc_cap_i
+        pcc_cap_d = pcc_cap_i;
+      end
+    end
+
+    // mtvec extended capability: written only by a CHERI_CSR_RW operation addressed to MTCC
+    assign mtvec_en_cheri = cheri_csr_op_en_i && (cheri_csr_addr_i == CHERI_SCR_MTCC) && (cheri_csr_op_i == CHERI_CSR_RW);
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni)
+        mtvec_cap <= MTVEC_RESET_CAP;
+      else if (mtvec_en_cheri)
+        mtvec_cap <= cheri_csr_wcap_i;   // holds its value when not written
+    end
+
+    // mepc extended capability; update priority: cause save, then legacy mepc write, then SCR write
+    assign mepc_en_cheri = cheri_csr_op_en_i && (cheri_csr_addr_i == CHERI_SCR_MEPCC) && (cheri_csr_op_i == CHERI_CSR_RW);
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni)
+        mepc_cap <= MEPC_RESET_CAP;
+      else if (csr_save_cause_i & (~debug_csr_save_i) & (~debug_mode_i))
+        mepc_cap <= pcc_exc_cap;                   // non-debug cause save captures the PCC-derived cap
+      else if (cheri_pmode_i & mepc_en)            // legacy csrrw; NMI recover
+        mepc_cap <= NULL_REG_CAP;                  // capability is cleared while cheri_pmode_i is set
+      else if (mepc_en_cheri)
+        mepc_cap <= cheri_csr_wcap_i;
+    end
+
+    // MTDC capability: written only by a CHERI_CSR_RW operation addressed to MTDC
+    assign mtdc_en_cheri = cheri_csr_op_en_i && (cheri_csr_addr_i == CHERI_SCR_MTDC) && (cheri_csr_op_i == CHERI_CSR_RW);
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        mtdc_cap  <= MTDC_RESET_CAP;
+        mtdc_data <= 32'h0;
+      end else if (mtdc_en_cheri) begin
+        mtdc_cap  <= cheri_csr_wcap_i;     // capability fields and data word are written together
+        mtdc_data <= cheri_csr_wdata_i;
+      end
+    end
+
+    // MSCRATCHC capability: written only by a CHERI_CSR_RW operation addressed to MSCRATCHC
+    assign mscratchc_en_cheri = cheri_csr_op_en_i && (cheri_csr_addr_i == CHERI_SCR_MSCRATCHC) && (cheri_csr_op_i == CHERI_CSR_RW);
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        mscratchc_cap  <= MSCRATCHC_RESET_CAP;
+        mscratchc_data <= 32'h0;
+      end else if (mscratchc_en_cheri) begin
+        mscratchc_cap  <= cheri_csr_wcap_i;   // capability fields and data word are written together
+        mscratchc_data <= cheri_csr_wdata_i;
+      end
+    end
+
+    // depc extended capability: SCR writes are accepted only while debug_mode_i is set
+    assign depc_en_cheri = debug_mode_i & cheri_csr_op_en_i && (cheri_csr_addr_i == CHERI_SCR_DEPCC) && (cheri_csr_op_i == CHERI_CSR_RW);
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni)
+        depc_cap <= NULL_REG_CAP;
+      else if (csr_save_cause_i & debug_csr_save_i)
+        depc_cap <= pcc_exc_cap;           // debug cause save has priority over an SCR write
+      else if (depc_en_cheri)
+        depc_cap <= cheri_csr_wcap_i;
+    end
+
+    // dscratch0/1 extended capability: SCR writes are accepted only while debug_mode_i is set
+    assign dscratch0_en_cheri = debug_mode_i & cheri_csr_op_en_i && (cheri_csr_addr_i == CHERI_SCR_DSCRATCHC0) && (cheri_csr_op_i == CHERI_CSR_RW);
+    assign dscratch1_en_cheri = debug_mode_i & cheri_csr_op_en_i && (cheri_csr_addr_i == CHERI_SCR_DSCRATCHC1) && (cheri_csr_op_i == CHERI_CSR_RW);
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        dscratch0_cap <= NULL_REG_CAP;
+        dscratch1_cap <= NULL_REG_CAP;
+      end else if (dscratch0_en_cheri)
+        dscratch0_cap <= cheri_csr_wcap_i;
+      else if (dscratch1_en_cheri)         // the two enables compare against different SCR addresses
+        dscratch1_cap <= cheri_csr_wcap_i;
+
+    end
+
+    // fatal error condition (unrecoverable, need external reset)
+    // cause saved (csr_save_cause_i) with cheri_pmode_i set while mtvec_cap is not valid
+    logic cheri_fatal_err_q;
+
+    assign cheri_fatal_err_o = cheri_fatal_err_q;
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        cheri_fatal_err_q <= 1'b0;
+      end else begin
+        if (cheri_pmode_i & csr_save_cause_i & ~mtvec_cap.valid) 
+          cheri_fatal_err_q <= 1'b1;   // sticky: there is no path that clears it except reset
+      end
+    end
+
+
+  end else begin: gen_no_scr
+    
+    assign cheri_csr_rdata_o = 32'h0;            // SCR reads return zero / NULL when CHERIoTEn is 0
+    assign cheri_csr_rcap_o  = NULL_REG_CAP;
+
+    assign pcc_cap_o         = NULL_PCC_CAP;     // PCC output and state are tied to the NULL PCC
+    assign pcc_cap_q         = NULL_PCC_CAP;
+
+    assign mtvec_en_cheri      = 1'b0;           // SCR write strobes are tied off
+    assign mepc_en_cheri       = 1'b0;
+    assign depc_en_cheri       = 1'b0;
+    assign dscratch0_en_cheri  = 1'b0;
+    assign dscratch1_en_cheri  = 1'b0;
+ 
+    assign cheri_fatal_err_o   = 1'b0;           // fatal error is never reported
+
+  end
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_csr.sv b/hw/ip/cheriot-ibex/rtl/cheriot_csr.sv
new file mode 100644
index 0000000..9dbe1b6
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_csr.sv
@@ -0,0 +1,57 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Control / status register primitive with an optional inverted shadow copy
+ */
+
+`include "prim_assert.sv"
+
+module cheriot_csr #(
+  parameter int unsigned    Width      = 32,    // register width in bits
+  parameter bit             ShadowCopy = 1'b0,  // 1: keep an inverted copy and compare against it
+  parameter bit [Width-1:0] ResetValue = '0     // value loaded while rst_ni is low
+ ) (
+  input  logic             clk_i,
+  input  logic             rst_ni,      // asynchronous, active-low reset
+
+  input  logic [Width-1:0] wr_data_i,   // stored on a rising clock edge when wr_en_i is high
+  input  logic             wr_en_i,
+  output logic [Width-1:0] rd_data_o,   // current register contents
+
+  output logic             rd_error_o   // register and shadow disagree (constant 0 without shadow)
+);
+
+  logic [Width-1:0] rdata_q;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin   // main register, holds when not written
+    if (!rst_ni) begin
+      rdata_q <= ResetValue;
+    end else if (wr_en_i) begin
+      rdata_q <= wr_data_i;
+    end
+  end
+
+  assign rd_data_o = rdata_q;
+
+  if (ShadowCopy) begin : gen_shadow
+    logic [Width-1:0] shadow_q;   // always written with the bitwise inverse of the main register
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        shadow_q <= ~ResetValue;
+      end else if (wr_en_i) begin
+        shadow_q <= ~wr_data_i;
+      end
+    end
+
+    assign rd_error_o = rdata_q != ~shadow_q;   // any divergence between the two copies is an error
+
+  end else begin : gen_no_shadow
+    assign rd_error_o = 1'b0;
+  end
+
+  `ASSERT_KNOWN(IbexCSREnValid, wr_en_i)
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_decoder.sv b/hw/ip/cheriot-ibex/rtl/cheriot_decoder.sv
new file mode 100644
index 0000000..8b0fcdb
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_decoder.sv
@@ -0,0 +1,1432 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+
+/**
+ * Instruction decoder
+ *
+ * This module is fully combinatorial, clock and reset are used for
+ * assertions only.
+ */
+
+`include "prim_assert.sv"
+
+module cheriot_decoder import cheri_pkg::*; #(
+  parameter bit RV32E               = 0,
+  parameter cheriot_pkg::rv32m_e RV32M = cheriot_pkg::RV32MFast,
+  parameter cheriot_pkg::rv32b_e RV32B = cheriot_pkg::RV32BNone,
+  parameter bit BranchTargetALU     = 0,
+  parameter bit CHERIoTEn           = 1'b1,
+  parameter bit CheriPPLBC          = 1'b0,
+  parameter bit CheriSBND2          = 1'b0
+) (
+  input  logic                 clk_i,
+  input  logic                 rst_ni,
+
+  input  logic                 cheri_pmode_i,
+  input  logic                 cheri_tsafe_en_i,
+
+  // to/from controller
+  output logic                 illegal_insn_o,        // illegal instr encountered
+  output logic                 ebrk_insn_o,           // trap instr encountered
+  output logic                 mret_insn_o,           // return from exception instr
+                                                      // encountered
+  output logic                 dret_insn_o,           // return from debug instr encountered
+  output logic                 ecall_insn_o,          // syscall instr encountered
+  output logic                 wfi_insn_o,            // wait for interrupt instr encountered
+  output logic                 jump_set_o,            // jump taken set signal
+  input  logic                 branch_taken_i,        // registered branch decision
+  output logic                 icache_inval_o,
+
+  // from IF-ID pipeline register
+  input  logic                 instr_first_cycle_i,   // instruction read is in its first cycle
+  input  logic [31:0]          instr_rdata_i,         // instruction read from memory/cache
+  input  logic [31:0]          instr_rdata_alu_i,     // instruction read from memory/cache
+                                                      // replicated to ease fan-out)
+
+  input  logic                 illegal_c_insn_i,      // compressed instruction decode failed
+
+  // immediates
+  output cheriot_pkg::imm_a_sel_e  imm_a_mux_sel_o,       // immediate selection for operand a
+  output cheriot_pkg::imm_b_sel_e  imm_b_mux_sel_o,       // immediate selection for operand b
+  output cheriot_pkg::op_a_sel_e   bt_a_mux_sel_o,        // branch target selection operand a
+  output cheriot_pkg::imm_b_sel_e  bt_b_mux_sel_o,        // branch target selection operand b
+  output logic [31:0]           imm_i_type_o,
+  output logic [31:0]           imm_s_type_o,
+  output logic [31:0]           imm_b_type_o,
+  output logic [31:0]           imm_u_type_o,
+  output logic [31:0]           imm_j_type_o,
+  output logic [31:0]           zimm_rs1_type_o,
+
+  // register file
+  output cheriot_pkg::rf_wd_sel_e rf_wdata_sel_o,   // RF write data selection
+  output logic                 rf_we_o,          // write enable for regfile
+  output logic                 rf_we_or_load_o,
+  output logic [4:0]           rf_raddr_a_o,
+  output logic [4:0]           rf_raddr_b_o,
+  output logic [4:0]           rf_waddr_o,
+  output logic                 rf_ren_a_o,          // Instruction reads from RF addr A
+  output logic                 rf_ren_b_o,          // Instruction reads from RF addr B
+
+  // ALU
+  output cheriot_pkg::alu_op_e    alu_operator_o,        // ALU operation selection
+  output cheriot_pkg::op_a_sel_e  alu_op_a_mux_sel_o,    // operand a selection: reg value, PC,
+                                                      // immediate or zero
+  output cheriot_pkg::op_b_sel_e  alu_op_b_mux_sel_o,    // operand b selection: reg value or
+                                                      // immediate
+  output logic                 alu_multicycle_o,      // ternary bitmanip instruction
+
+  // MULT & DIV
+  output logic                 mult_en_o,             // perform integer multiplication
+  output logic                 div_en_o,              // perform integer division or remainder
+  output logic                 mult_sel_o,            // as above but static, for data muxes
+  output logic                 div_sel_o,             // as above but static, for data muxes
+
+  output cheriot_pkg::md_op_e     multdiv_operator_o,
+  output logic [1:0]           multdiv_signed_mode_o,
+
+  // CSRs
+  output logic                 csr_access_o,          // access to CSR
+  output cheriot_pkg::csr_op_e    csr_op_o,              // operation to perform on CSR
+  output logic                 csr_cheri_always_ok_o, // CHERI safe-listed (no ASR needed) CSRs
+
+  // LSU
+  output logic                 data_req_o,            // start transaction to data memory
+  output logic                 cheri_data_req_o,      // cheri lsu transaction
+  output logic                 data_we_o,             // write enable
+  output logic [1:0]           data_type_o,           // size of transaction: byte, half
+                                                      // word or word
+  output logic                 data_sign_extension_o, // sign extension for data read from
+                                                      // memory
+
+  // jump/branches
+  output logic                 jump_in_dec_o,         // jump is being calculated in ALU
+  output logic                 branch_in_dec_o,
+
+  // output to cheri EX
+  output logic                 instr_is_cheri_o,
+  output logic                 instr_is_legal_cheri_o,
+  output logic [11:0]          cheri_imm12_o,
+  output logic [19:0]          cheri_imm20_o,
+  output logic [20:0]          cheri_imm21_o,
+  output logic [OPDW-1:0]      cheri_operator_o,
+  output logic [4:0]           cheri_cs2_dec_o,
+  output logic                 cheri_multicycle_dec_o
+);
+
+  import cheriot_pkg::*;
+
+  localparam bit CheriLimit16Regs = CHERIoTEn;
+
+  logic        illegal_insn;
+  logic        illegal_reg_rv32e;
+  logic        illegal_reg_cheri;
+  logic        csr_illegal;
+  logic        rf_we;
+
+  logic [31:0] instr;
+  logic [31:0] instr_alu;
+  logic [9:0]  unused_instr_alu;
+  // Source/Destination register instruction index
+  logic [4:0] instr_rs1;
+  logic [4:0] instr_rs2;
+  logic [4:0] instr_rs3;
+  logic [4:0] instr_rd;
+
+  logic        use_rs3_d;
+  logic        use_rs3_q;
+
+  csr_op_e     csr_op;
+
+  opcode_e     opcode;
+  opcode_e     opcode_alu;
+
+  logic        cheri_opcode_en;
+  logic        cheri_auipcc_en;
+  logic        cheri_auicgp_en;
+  logic        cheri_jalr_en;
+  logic        cheri_jal_en;
+  logic        cheri_cload_en;
+  logic        cheri_cstore_en;
+  logic        instr_is_legal_cheri;
+  logic        cheri_rf_ren_a, cheri_rf_ren_b;
+  logic        cheri_rf_we_dec;
+
+  // To help timing the flops containing the current instruction are replicated to reduce fan-out.
+  // instr_alu is used to determine the ALU control logic and associated operand/imm select signals
+  // as the ALU is often on the more critical timing paths. instr is used for everything else.
+  assign instr     = instr_rdata_i;
+  assign instr_alu = instr_rdata_alu_i;
+
+  //////////////////////////////////////
+  // Register and immediate selection //
+  //////////////////////////////////////
+
+  // immediate extraction; I/S/B/J are sign-extended from instr[31], U has its low 12 bits zeroed
+  assign imm_i_type_o = { {20{instr[31]}}, instr[31:20] };
+  assign imm_s_type_o = { {20{instr[31]}}, instr[31:25], instr[11:7] };
+  assign imm_b_type_o = { {19{instr[31]}}, instr[31], instr[7], instr[30:25], instr[11:8], 1'b0 };
+  assign imm_u_type_o = { instr[31:12], 12'b0 };
+  assign imm_j_type_o = { {12{instr[31]}}, instr[19:12], instr[20], instr[30:21], 1'b0 };
+
+  // immediate for CSR manipulation (zero extended), taken from the rs1 field of the instruction
+  assign zimm_rs1_type_o = { 27'b0, instr_rs1 }; // rs1
+
+  if (RV32B != RV32BNone) begin : gen_rs3_flop
+    // the use of rs3 is known one cycle ahead, so use_rs3_d is registered into use_rs3_q
+    always_ff  @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        use_rs3_q <= 1'b0;
+      end else begin
+        use_rs3_q <= use_rs3_d;
+      end
+    end
+  end else begin : gen_no_rs3_flop
+    logic unused_clk;
+    logic unused_rst_n;
+
+    // Clock and reset unused when there's no rs3 flop
+    assign unused_clk = clk_i;
+    assign unused_rst_n = rst_ni;
+
+    // no flop: use_rs3_d is passed straight through (expected to be always zero in this config)
+    assign use_rs3_q = use_rs3_d;
+  end
+
+  // source registers: fixed bit fields of the instruction word
+  assign instr_rs1 = instr[19:15];
+  assign instr_rs2 = instr[24:20];
+  assign instr_rs3 = instr[31:27];
+
+  // read cx3 if AUICGP
+  // note for GDC (c3) we want to use the regular scheme to resolve data hazards, instead of using
+  // sideband signals to export CX3 from register file directly
+  logic [4:0] raddr_a, raddr_b;
+  assign raddr_a = cheri_auicgp_en ? 5'h3 : ((use_rs3_q & ~instr_first_cycle_i) ? instr_rs3 : instr_rs1); // rs3 / rs1
+  assign raddr_b = instr_rs2; // rs2
+
+  // cheriot only uses 16 registers and repurposes the MSB addr bits (forced to 0 in cheri_pmode_i)
+  if (CheriLimit16Regs) begin : gen_rf_raddr_cheri16
+    assign rf_raddr_a_o = cheri_pmode_i ? {1'b0, raddr_a[3:0]} : raddr_a;
+    assign rf_raddr_b_o = cheri_pmode_i ? {1'b0, raddr_b[3:0]} : raddr_b;
+  end else begin : gen_rf_raddr_full
+    assign rf_raddr_a_o = raddr_a;
+    assign rf_raddr_b_o = raddr_b;
+  end
+
+  // destination register: same MSB handling as the read addresses
+  assign instr_rd = instr[11:7];
+  if (CheriLimit16Regs) begin : gen_rf_waddr_cheri16
+    assign rf_waddr_o   = cheri_pmode_i ? {1'b0, instr_rd[3:0]} : instr_rd; // rd
+  end else begin : gen_rf_waddr_full
+    assign rf_waddr_o   = instr_rd; // rd
+  end
+
+  ////////////////////
+  // Register check //
+  ////////////////////
+
+  // rf_we from decoder doesn't cover memory load case (where regfile write signal comes from LSU response)
+  logic rf_we_or_load;
+  assign rf_we_or_load = rf_we | (opcode == OPCODE_LOAD);
+
+  assign rf_we_or_load_o = rf_we_or_load;
+ 
+  if (RV32E) begin : gen_rv32e_reg_check_active
+    // Flag any register index with bit 4 set on a port that is actually used. This check looks at
+    // the rf_*_o ports, whose bit 4 is already forced to 0 when CheriLimit16Regs and cheri_pmode_i.
+    assign illegal_reg_rv32e = ((rf_raddr_a_o[4] & rf_ren_a_o) |
+                                (rf_raddr_b_o[4] & rf_ren_b_o) |
+                                (instr_rs3[4] & use_rs3_d & rf_ren_a_o) | 
+                                (rf_waddr_o[4]   & rf_we_or_load));
+  end else begin : gen_rv32e_reg_check_inactive
+    assign illegal_reg_rv32e = 1'b0;
+  end
+
+  if (CheriLimit16Regs) begin : gen_cheri_reg_check_active  // uses the untruncated indices
+    assign illegal_reg_cheri = cheri_pmode_i & 
+                               ((raddr_a[4]  & rf_ren_a_o) |
+                                (raddr_b[4]  & rf_ren_b_o) |
+                                (instr_rs3[4] & use_rs3_d & rf_ren_a_o) | 
+                                (instr_rd[4] & rf_we_or_load ));
+  end else begin : gen_cheri_reg_check_inactive
+    assign illegal_reg_cheri = 1'b0;
+  end
+
+  ///////////////////////
+  // CSR operand check //
+  ///////////////////////
+  always_comb begin : csr_operand_check
+    // CSRRS/CSRRC must not write from x0 to CSRs (rs1=='0) and CSRRSI/CSRRCI must not write 0 to
+    // CSRs (uimm[4:0]=='0). Both operands sit in the rs1 field, so a set/clear operation with an
+    // all-zero rs1 field is issued as a plain read; every other operation is passed on unchanged.
+    if ((csr_op == CSR_OP_SET || csr_op == CSR_OP_CLEAR) && instr_rs1 == '0) begin
+      csr_op_o = CSR_OP_READ;
+    end else begin
+      csr_op_o = csr_op;
+    end
+  end
+
+  /////////////
+  // Decoder //
+  /////////////
+
+  always_comb begin
+    jump_in_dec_o         = 1'b0;
+    jump_set_o            = 1'b0;
+    branch_in_dec_o       = 1'b0;
+    icache_inval_o        = 1'b0;
+
+    multdiv_operator_o    = MD_OP_MULL;
+    multdiv_signed_mode_o = 2'b00;
+
+    rf_wdata_sel_o        = RF_WD_EX;
+    rf_we                 = 1'b0;
+    rf_ren_a_o            = 1'b0;
+    rf_ren_b_o            = 1'b0;
+
+    csr_access_o          = 1'b0;
+    csr_illegal           = 1'b0;
+    csr_op                = CSR_OP_READ;
+    csr_cheri_always_ok_o = 1'b0;
+
+    data_we_o             = 1'b0;
+    data_type_o           = 2'b00;
+    data_sign_extension_o = 1'b0;
+    data_req_o            = 1'b0;
+    cheri_data_req_o      = 1'b0;
+
+    illegal_insn          = 1'b0;
+    ebrk_insn_o           = 1'b0;
+    mret_insn_o           = 1'b0;
+    dret_insn_o           = 1'b0;
+    ecall_insn_o          = 1'b0;
+    wfi_insn_o            = 1'b0;
+
+    cheri_opcode_en       = 1'b0;
+    cheri_cload_en        = 1'b0;
+    cheri_cstore_en       = 1'b0;
+    cheri_auipcc_en       = 1'b0;
+    cheri_auicgp_en       = 1'b0;
+    cheri_jalr_en         = 1'b0;
+    cheri_jal_en          = 1'b0;
+
+    opcode                = opcode_e'(instr[6:0]);
+
+    unique case (opcode)
+
+      ///////////
+      // Jumps //
+      ///////////
+
+      OPCODE_JAL: begin   // Jump and Link
+        if (CHERIoTEn & cheri_pmode_i & ~illegal_c_insn_i) begin
+          // cheri_ex takes over JAL now as a single-cycle jump
+          cheri_jal_en      = 1'b1;
+          illegal_insn      = 1'b0;
+          rf_we             = 1'b1;
+        end else begin
+          jump_in_dec_o     = 1'b1;
+
+          if (instr_first_cycle_i) begin
+            // Calculate jump target (and store PC + 4 if BranchTargetALU is configured)
+            rf_we            = BranchTargetALU;
+            jump_set_o       = 1'b1;
+          end else begin
+            // Calculate and store PC+4
+            rf_we            = 1'b1;
+          end
+        end
+      end
+
+      OPCODE_JALR: begin  // Jump and Link Register
+        if (CHERIoTEn & cheri_pmode_i & ~illegal_c_insn_i) begin
+          // cheri_ex takes over JALR now as a single-cycle jump
+          cheri_jalr_en     = (instr[14:12] == 3'b0);
+          rf_ren_a_o        = 1'b1;
+          rf_we             = 1'b1;
+
+          if (instr[14:12] != 3'b0) begin
+            illegal_insn    = 1'b1;
+          end
+        end else begin
+          jump_in_dec_o      = 1'b1;
+
+          if (instr_first_cycle_i) begin
+            // Calculate jump target (and store PC + 4 if BranchTargetALU is configured)
+            rf_we            = BranchTargetALU;
+            jump_set_o       = 1'b1;
+          end else begin
+            // Calculate and store PC+4
+            rf_we            = 1'b1;
+          end
+          if (instr[14:12] != 3'b0) begin
+            illegal_insn = 1'b1;
+          end
+
+          rf_ren_a_o = 1'b1;
+        end
+      end
+
+      OPCODE_BRANCH: begin // Branch
+        branch_in_dec_o       = 1'b1;
+        // Check branch condition selection
+        unique case (instr[14:12])
+          3'b000,
+          3'b001,
+          3'b100,
+          3'b101,
+          3'b110,
+          3'b111:  illegal_insn = 1'b0;
+          default: illegal_insn = 1'b1;
+        endcase
+
+        rf_ren_a_o = 1'b1;
+        rf_ren_b_o = 1'b1;
+      end
+
+      ////////////////
+      // Load/store //
+      ////////////////
+
+      OPCODE_STORE: begin
+        rf_ren_a_o         = 1'b1;
+        rf_ren_b_o         = 1'b1;
+        data_req_o         = 1'b1;  // keep this to pass LEC w/ ibex
+        data_we_o          = 1'b1;
+
+        if (instr[14]) begin
+          illegal_insn     = 1'b1;
+        end else if (instr[13:12] == 2'b11) begin
+          if (CHERIoTEn & cheri_pmode_i) begin 
+            cheri_cstore_en  =  ~illegal_c_insn_i; // csc
+            cheri_data_req_o =  ~illegal_c_insn_i;
+            data_req_o       =  1'b0; 
+            illegal_insn     =  1'b0;
+          end else begin
+            cheri_cstore_en  =  1'b0; // csc
+            cheri_data_req_o =  1'b0;
+            illegal_insn     =  1'b1;
+          end
+        end
+
+        // store size
+        unique case (instr[13:12])
+          2'b00:   data_type_o  = 2'b10; // sb
+          2'b01:   data_type_o  = 2'b01; // sh
+          2'b10:   data_type_o  = 2'b00; // sw
+          default: data_type_o  = 2'b00;
+        endcase
+
+      end
+
+      OPCODE_LOAD: begin
+        rf_ren_a_o          = 1'b1;
+        data_req_o          = 1'b1;
+        data_type_o         = 2'b00;
+
+        // sign/zero extension
+        data_sign_extension_o = ~instr[14];
+
+        // load size
+        unique case (instr[13:12])
+          2'b00: data_type_o = 2'b10; // lb(u)
+          2'b01: data_type_o = 2'b01; // lh(u)
+          2'b10: begin
+            data_type_o = 2'b00;      // lw
+            if (instr[14]) begin
+              illegal_insn = 1'b1;    // lwu does not exist
+            end
+          end
+          2'b11: begin
+            // illegal_c_insn_i is added to fix the c.clcsp case 
+            //   (compressed decoder translate to cheri instruction but could still assert illegal_c_insn
+            //   if rd == 0
+            if (CHERIoTEn & cheri_pmode_i && ~instr[14] && ~illegal_c_insn_i) begin
+              cheri_cload_en   = 1'b1;
+              cheri_data_req_o = ~cheri_tsafe_en_i | CheriPPLBC;
+              data_req_o       = 1'b0;    // req generated by cheri_ex
+              illegal_insn     = 1'b0;
+            end else begin                // CHERIoT consider instr[14]=1 illegal
+              cheri_cload_en   = 1'b0;
+              cheri_data_req_o = 1'b0;
+              illegal_insn     = 1'b1;
+            end
+          end
+          default: begin
+            illegal_insn = 1'b1;
+          end
+        endcase
+      end
+
+      /////////
+      // ALU //
+      /////////
+
+      OPCODE_LUI: begin  // Load Upper Immediate
+        rf_we            = 1'b1;
+      end
+
+      OPCODE_AUIPC: begin
+        if (CHERIoTEn & cheri_pmode_i & ~illegal_c_insn_i) begin
+          cheri_auipcc_en  = 1'b1;
+          illegal_insn     = 1'b0;
+          rf_we            = 1'b1;
+        end else begin
+          // OPCODE_AUIPC: begin  // Add Upper Immediate to PC
+          rf_we            = 1'b1;
+        end
+      end
+
+      OPCODE_OP_IMM: begin // Register-Immediate ALU Operations
+        rf_ren_a_o       = 1'b1;
+        rf_we            = 1'b1;
+
+        unique case (instr[14:12])
+          3'b000,
+          3'b010,
+          3'b011,
+          3'b100,
+          3'b110,
+          3'b111: illegal_insn = 1'b0;
+
+          3'b001: begin
+            unique case (instr[31:27])
+              5'b0_0000: illegal_insn = (instr[26:25] == 2'b00) ? 1'b0 : 1'b1;        // slli
+              5'b0_0100: begin                                                        // sloi
+                illegal_insn = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? 1'b0 : 1'b1;
+              end
+              5'b0_1001,                                                              // bclri
+              5'b0_0101,                                                              // bseti
+              // 5'b0_1101: illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1;           // binvi
+              5'b0_1101: illegal_insn = (RV32B != RV32BNone) ? (instr[26:25] != 2'b00) : 1'b1;    // binvi
+              5'b0_0001: begin
+                if (instr[26] == 1'b0) begin                                          // shfl
+                  illegal_insn = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? 1'b0 : 1'b1;
+                end else begin
+                  illegal_insn = 1'b1;
+                end
+              end
+              5'b0_1100: begin
+                unique case(instr[26:20])
+                  7'b000_0000,                                                         // clz
+                  7'b000_0001,                                                         // ctz
+                  7'b000_0010,                                                         // cpop
+                  7'b000_0100,                                                         // sext.b
+                  7'b000_0101: illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1;      // sext.h
+                  7'b001_0000,                                                         // crc32.b
+                  7'b001_0001,                                                         // crc32.h
+                  7'b001_0010,                                                         // crc32.w
+                  7'b001_1000,                                                         // crc32c.b
+                  7'b001_1001,                                                         // crc32c.h
+                  7'b001_1010: begin                                                   // crc32c.w
+                    illegal_insn = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? 1'b0 : 1'b1;
+                  end
+                  default: illegal_insn = 1'b1;
+                endcase
+              end
+              default : illegal_insn = 1'b1;
+            endcase
+          end
+
+          3'b101: begin
+            if (instr[26]) begin
+              illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1;                       // fsri
+            end else begin
+              unique case (instr[31:27])
+                5'b0_0000,                                                             // srli
+                5'b0_1000: illegal_insn = (instr[26:25] == 2'b00) ? 1'b0 : 1'b1;       // srai
+
+                5'b0_0100: begin                                                       // sroi
+                  illegal_insn = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? 1'b0 : 1'b1;
+                end
+                5'b0_1100,                                                             // rori
+                // 5'b0_1001: illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1;          // bexti
+                5'b0_1001: illegal_insn = (RV32B != RV32BNone) ? (instr[26:25] != 2'b00) : 1'b1;          // bexti
+
+                5'b0_1101: begin
+                  if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin
+                    illegal_insn = 1'b0;                                               // grevi
+                  end else if (RV32B == RV32BBalanced) begin
+                    illegal_insn = (instr[24:20] == 5'b11000) ? 1'b0 : 1'b1;           // rev8
+                  end else begin
+                    illegal_insn = 1'b1;
+                  end
+                end
+                5'b0_0101: begin
+                  if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin
+                    illegal_insn = 1'b0;                                              // gorci
+                  end else if (instr[24:20] == 5'b00111) begin
+                    illegal_insn = (RV32B == RV32BBalanced) ? 1'b0 : 1'b1;            // orc.b
+                  end else begin
+                    illegal_insn = 1'b1;
+                  end
+                end
+                5'b0_0001: begin
+                  if (instr[26] == 1'b0) begin                                        // unshfl
+                    illegal_insn = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? 1'b0 : 1'b1;
+                  end else begin
+                    illegal_insn = 1'b1;
+                  end
+                end
+
+                default: illegal_insn = 1'b1;
+              endcase
+            end
+          end
+
+          default: illegal_insn = 1'b1;
+        endcase
+      end
+
+      OPCODE_OP: begin  // Register-Register ALU operation
+        rf_ren_a_o      = 1'b1;
+        rf_ren_b_o      = 1'b1;
+        rf_we           = 1'b1;
+        if ({instr[26], instr[13:12]} == {1'b1, 2'b01}) begin
+          illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1; // cmix / cmov / fsl / fsr
+        end else begin
+          unique case ({instr[31:25], instr[14:12]})
+            // RV32I ALU operations
+            {7'b000_0000, 3'b000},
+            {7'b010_0000, 3'b000},
+            {7'b000_0000, 3'b010},
+            {7'b000_0000, 3'b011},
+            {7'b000_0000, 3'b100},
+            {7'b000_0000, 3'b110},
+            {7'b000_0000, 3'b111},
+            {7'b000_0000, 3'b001},
+            {7'b000_0000, 3'b101},
+            {7'b010_0000, 3'b101}: illegal_insn = 1'b0;
+
+            // RV32B zba
+            {7'b001_0000, 3'b010}, // sh1add
+            {7'b001_0000, 3'b100}, // sh2add
+            {7'b001_0000, 3'b110}, // sh3add
+            // RV32B zbb
+            {7'b010_0000, 3'b111}, // andn
+            {7'b010_0000, 3'b110}, // orn
+            {7'b010_0000, 3'b100}, // xnor
+            {7'b011_0000, 3'b001}, // rol
+            {7'b011_0000, 3'b101}, // ror
+            {7'b000_0101, 3'b100}, // min
+            {7'b000_0101, 3'b110}, // max
+            {7'b000_0101, 3'b101}, // minu
+            {7'b000_0101, 3'b111}, // maxu
+            {7'b000_0100, 3'b100}, // pack
+            {7'b010_0100, 3'b100}, // packu
+            {7'b000_0100, 3'b111}, // packh
+            // RV32B zbs
+            {7'b010_0100, 3'b001}, // bclr
+            {7'b001_0100, 3'b001}, // bset
+            {7'b011_0100, 3'b001}, // binv
+            {7'b010_0100, 3'b101}, // bext
+            // RV32B zbf
+            {7'b010_0100, 3'b111}: illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1; // bfp
+            // RV32B zbp
+            {7'b011_0100, 3'b101}, // grev
+            {7'b001_0100, 3'b101}, // gorc
+            {7'b000_0100, 3'b001}, // shfl
+            {7'b000_0100, 3'b101}, // unshfl
+            {7'b001_0100, 3'b010}, // xperm.n
+            {7'b001_0100, 3'b100}, // xperm.b
+            {7'b001_0100, 3'b110}, // xperm.h
+            {7'b001_0000, 3'b001}, // slo
+            {7'b001_0000, 3'b101}, // sro
+            // RV32B zbc
+            {7'b000_0101, 3'b001}, // clmul
+            {7'b000_0101, 3'b010}, // clmulr
+            {7'b000_0101, 3'b011}: begin // clmulh
+              illegal_insn = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? 1'b0 : 1'b1;
+            end
+            // RV32B zbe
+            {7'b010_0100, 3'b110}, // bdecompress
+            {7'b000_0100, 3'b110}: illegal_insn = (RV32B == RV32BFull) ? 1'b0 : 1'b1; // bcompress
+
+            // RV32M instructions
+            {7'b000_0001, 3'b000}: begin // mul
+              multdiv_operator_o    = MD_OP_MULL;
+              multdiv_signed_mode_o = 2'b00;
+              illegal_insn          = (RV32M == RV32MNone) ? 1'b1 : 1'b0;
+            end
+            {7'b000_0001, 3'b001}: begin // mulh
+              multdiv_operator_o    = MD_OP_MULH;
+              multdiv_signed_mode_o = 2'b11;
+              illegal_insn          = (RV32M == RV32MNone) ? 1'b1 : 1'b0;
+            end
+            {7'b000_0001, 3'b010}: begin // mulhsu
+              multdiv_operator_o    = MD_OP_MULH;
+              multdiv_signed_mode_o = 2'b01;
+              illegal_insn          = (RV32M == RV32MNone) ? 1'b1 : 1'b0;
+            end
+            {7'b000_0001, 3'b011}: begin // mulhu
+              multdiv_operator_o    = MD_OP_MULH;
+              multdiv_signed_mode_o = 2'b00;
+              illegal_insn          = (RV32M == RV32MNone) ? 1'b1 : 1'b0;
+            end
+            {7'b000_0001, 3'b100}: begin // div
+              multdiv_operator_o    = MD_OP_DIV;
+              multdiv_signed_mode_o = 2'b11;
+              illegal_insn          = (RV32M == RV32MNone) ? 1'b1 : 1'b0;
+            end
+            {7'b000_0001, 3'b101}: begin // divu
+              multdiv_operator_o    = MD_OP_DIV;
+              multdiv_signed_mode_o = 2'b00;
+              illegal_insn          = (RV32M == RV32MNone) ? 1'b1 : 1'b0;
+            end
+            {7'b000_0001, 3'b110}: begin // rem
+              multdiv_operator_o    = MD_OP_REM;
+              multdiv_signed_mode_o = 2'b11;
+              illegal_insn          = (RV32M == RV32MNone) ? 1'b1 : 1'b0;
+            end
+            {7'b000_0001, 3'b111}: begin // remu
+              multdiv_operator_o    = MD_OP_REM;
+              multdiv_signed_mode_o = 2'b00;
+              illegal_insn          = (RV32M == RV32MNone) ? 1'b1 : 1'b0;
+            end
+            default: begin
+              illegal_insn = 1'b1;
+            end
+          endcase
+        end
+      end
+
+      /////////////
+      // Special //
+      /////////////
+
+      OPCODE_MISC_MEM: begin
+        unique case (instr[14:12])
+          3'b000: begin
+            // FENCE is treated as a NOP since all memory operations are already strictly ordered.
+            rf_we           = 1'b0;
+          end
+          3'b001: begin
+            // FENCE.I is implemented as a jump to the next PC, this gives the required flushing
+            // behaviour (iside prefetch buffer flushed and response to any outstanding iside
+            // requests will be ignored).
+            // If present, the ICache will also be flushed.
+            jump_in_dec_o   = 1'b1;
+
+            rf_we           = 1'b0;
+
+            if (instr_first_cycle_i) begin
+              jump_set_o       = 1'b1;
+              icache_inval_o   = 1'b1;
+            end
+          end
+          default: begin
+            illegal_insn       = 1'b1;
+          end
+        endcase
+      end
+
+      OPCODE_SYSTEM: begin
+        if (instr[14:12] == 3'b000) begin
+          // non CSR related SYSTEM instructions
+          unique case (instr[31:20])
+            12'h000:  // ECALL
+              // environment (system) call
+              ecall_insn_o = 1'b1;
+
+            12'h001:  // ebreak
+              // debugger trap
+              ebrk_insn_o = 1'b1;
+
+            12'h302:  // mret
+              mret_insn_o = 1'b1;
+
+            12'h7b2:  // dret
+              dret_insn_o = 1'b1;
+
+            12'h105:  // wfi
+              wfi_insn_o = 1'b1;
+
+            default:
+              illegal_insn = 1'b1;
+          endcase
+
+          // rs1 and rd must be 0
+          if (instr_rs1 != 5'b0 || instr_rd != 5'b0) begin
+            illegal_insn = 1'b1;
+          end
+        end else begin
+          // instruction to read/modify CSR
+          csr_access_o     = 1'b1;
+          rf_wdata_sel_o   = RF_WD_CSR;
+          rf_we            = 1'b1;
+
+          if (~instr[14]) begin
+            rf_ren_a_o         = 1'b1;
+          end
+
+          unique case (instr[13:12])
+            2'b01:   csr_op = CSR_OP_WRITE;
+            2'b10:   csr_op = CSR_OP_SET;
+            2'b11:   csr_op = CSR_OP_CLEAR;
+            default: csr_illegal = 1'b1;
+          endcase
+
+          // always allow access to the following CSRs even without ASR permission
+          //   -- 0xC00-0xC9F (unprivileged counters)
+          //   -- 0xB00-0xB9F (m-mode counters).
+          //      note 0xb01 is undefined per rvi spec. CSR register logic will handle it.
+          csr_cheri_always_ok_o = CHERIoTEn & cheri_pmode_i &
+                                  (((instr[31:28] == 4'hb) || (instr[31:28] == 4'hc)) && 
+                                   ((instr[27] == 1'b0) || (instr[26:25] == 2'b00))); 
+       
+          illegal_insn = csr_illegal;
+        end
+      end
+
+      OPCODE_CHERI: begin
+        if (CHERIoTEn & cheri_pmode_i & ~illegal_c_insn_i) begin
+          cheri_opcode_en  = 1'b1;
+          rf_ren_a_o       = cheri_rf_ren_a;
+          rf_ren_b_o       = cheri_rf_ren_b;
+          rf_we            = cheri_rf_we_dec;
+          illegal_insn     = ~instr_is_legal_cheri;
+        end else begin
+          cheri_opcode_en  = 1'b0;
+          rf_ren_a_o       = 1'b0;
+          rf_ren_b_o       = 1'b0;
+          rf_we            = 1'b0;
+          illegal_insn     = 1'b1;
+        end
+      end
+
+      OPCODE_AUICGP: begin
+        if (CHERIoTEn & cheri_pmode_i & ~illegal_c_insn_i) begin
+          cheri_auicgp_en  = 1'b1;
+          rf_ren_a_o       = 1'b1;
+          rf_ren_b_o       = 1'b0;
+          rf_we            = 1'b1;
+          illegal_insn     = 1'b0;
+        end else begin
+          cheri_opcode_en  = 1'b0;
+          rf_ren_a_o       = 1'b0;
+          rf_ren_b_o       = 1'b0;
+          illegal_insn     = 1'b1;
+        end
+      end
+
+      default: begin
+        illegal_insn = 1'b1;
+      end
+    endcase
+
+    // make sure illegal compressed instructions cause illegal instruction exceptions
+    if (illegal_c_insn_i) begin
+      illegal_insn = 1'b1;
+    end
+
+    // make sure illegal instructions detected in the decoder do not propagate from decoder
+    // into register file, LSU, EX, WB, CSRs, PC
+    // NOTE: instructions can also be detected to be illegal inside the CSRs (upon accesses with
+    // insufficient privileges), or when accessing non-available registers in RV32E,
+    // these cases are not handled here
+    if (illegal_insn) begin
+      rf_we           = 1'b0;
+      data_req_o      = 1'b0;
+      data_we_o       = 1'b0;
+      jump_in_dec_o   = 1'b0;
+      jump_set_o      = 1'b0;
+      branch_in_dec_o = 1'b0;
+      csr_access_o    = 1'b0;
+    end
+  end
+
+  /////////////////////////////
+  // Decoder for ALU control //
+  /////////////////////////////
+
+  always_comb begin // derives ALU operator and operand/immediate mux selects from instr_alu
+    alu_operator_o     = ALU_SLTU; // defaults; overridden per opcode in the case below
+    alu_op_a_mux_sel_o = OP_A_IMM;
+    alu_op_b_mux_sel_o = OP_B_IMM;
+
+    imm_a_mux_sel_o    = IMM_A_ZERO;
+    imm_b_mux_sel_o    = IMM_B_I;
+
+    bt_a_mux_sel_o     = OP_A_CURRPC; // branch target ALU operand defaults
+    bt_b_mux_sel_o     = IMM_B_I;
+
+
+    opcode_alu         = opcode_e'(instr_alu[6:0]); // opcode field of the instruction
+
+    use_rs3_d          = 1'b0;
+    alu_multicycle_o   = 1'b0;
+    mult_sel_o         = 1'b0;
+    div_sel_o          = 1'b0;
+
+    unique case (opcode_alu)
+
+      ///////////
+      // Jumps //
+      ///////////
+
+      OPCODE_JAL: begin // Jump and Link
+        if (BranchTargetALU) begin
+          bt_a_mux_sel_o = OP_A_CURRPC;
+          bt_b_mux_sel_o = IMM_B_J;
+        end
+
+        // Jumps take two cycles without the BTALU
+        if (instr_first_cycle_i && !BranchTargetALU) begin
+          // Calculate jump target
+          alu_op_a_mux_sel_o  = OP_A_CURRPC;
+          alu_op_b_mux_sel_o  = OP_B_IMM;
+          imm_b_mux_sel_o     = IMM_B_J;
+          alu_operator_o      = ALU_ADD;
+        end else begin
+          // Calculate and store PC+4
+          alu_op_a_mux_sel_o  = OP_A_CURRPC;
+          alu_op_b_mux_sel_o  = OP_B_IMM;
+          imm_b_mux_sel_o     = IMM_B_INCR_PC;
+          alu_operator_o      = ALU_ADD;
+        end
+      end
+
+      OPCODE_JALR: begin // Jump and Link Register
+        if (BranchTargetALU) begin
+          bt_a_mux_sel_o = OP_A_REG_A;
+          bt_b_mux_sel_o = IMM_B_I;
+        end
+
+        // Jumps take two cycles without the BTALU
+        if (instr_first_cycle_i && !BranchTargetALU) begin
+          // Calculate jump target
+          alu_op_a_mux_sel_o  = OP_A_REG_A;
+          alu_op_b_mux_sel_o  = OP_B_IMM;
+          imm_b_mux_sel_o     = IMM_B_I;
+          alu_operator_o      = ALU_ADD;
+        end else begin
+          // Calculate and store PC+4
+          alu_op_a_mux_sel_o  = OP_A_CURRPC;
+          alu_op_b_mux_sel_o  = OP_B_IMM;
+          imm_b_mux_sel_o     = IMM_B_INCR_PC;
+          alu_operator_o      = ALU_ADD;
+        end
+      end
+
+      OPCODE_BRANCH: begin // Branch
+        // Check branch condition selection
+        unique case (instr_alu[14:12])
+          3'b000:  alu_operator_o = ALU_EQ;
+          3'b001:  alu_operator_o = ALU_NE;
+          3'b100:  alu_operator_o = ALU_LT;
+          3'b101:  alu_operator_o = ALU_GE;
+          3'b110:  alu_operator_o = ALU_LTU;
+          3'b111:  alu_operator_o = ALU_GEU;
+          default: ; // remaining encodings keep the default operator
+        endcase
+
+        if (BranchTargetALU) begin
+          bt_a_mux_sel_o = OP_A_CURRPC;
+          // Not-taken branch will jump to next instruction (used in secure mode)
+          bt_b_mux_sel_o = branch_taken_i ? IMM_B_B : IMM_B_INCR_PC;
+        end
+
+        // Without branch target ALU, a branch is a two-stage operation using the Main ALU in both
+        // stages
+        if (instr_first_cycle_i) begin
+          // First evaluate the branch condition
+          alu_op_a_mux_sel_o  = OP_A_REG_A;
+          alu_op_b_mux_sel_o  = OP_B_REG_B;
+        end else if (!BranchTargetALU) begin
+          // Then calculate jump target
+          alu_op_a_mux_sel_o  = OP_A_CURRPC;
+          alu_op_b_mux_sel_o  = OP_B_IMM;
+          // Not-taken branch will jump to next instruction (used in secure mode)
+          imm_b_mux_sel_o     = branch_taken_i ? IMM_B_B : IMM_B_INCR_PC;
+          alu_operator_o      = ALU_ADD;
+        end
+      end
+
+      ////////////////
+      // Load/store //
+      ////////////////
+
+      OPCODE_STORE: begin
+        alu_op_a_mux_sel_o = OP_A_REG_A;
+        alu_op_b_mux_sel_o = OP_B_REG_B;
+        alu_operator_o     = ALU_ADD;
+
+        if (!instr_alu[14]) begin
+          // offset from immediate
+          imm_b_mux_sel_o     = IMM_B_S;
+          alu_op_b_mux_sel_o  = OP_B_IMM;
+        end
+      end
+
+      OPCODE_LOAD: begin
+        alu_op_a_mux_sel_o  = OP_A_REG_A;
+
+        // offset from immediate
+        alu_operator_o      = ALU_ADD;
+        alu_op_b_mux_sel_o  = OP_B_IMM;
+        imm_b_mux_sel_o     = IMM_B_I;
+      end
+
+      /////////
+      // ALU //
+      /////////
+
+      OPCODE_LUI: begin  // Load Upper Immediate
+        alu_op_a_mux_sel_o  = OP_A_IMM;
+        alu_op_b_mux_sel_o  = OP_B_IMM;
+        imm_a_mux_sel_o     = IMM_A_ZERO;
+        imm_b_mux_sel_o     = IMM_B_U;
+        alu_operator_o      = ALU_ADD;
+      end
+
+      // use CHERI version of AUIPCC when pmode == 1
+      OPCODE_AUIPC: begin  // Add Upper Immediate to PC
+        alu_op_a_mux_sel_o  = OP_A_CURRPC;
+        alu_op_b_mux_sel_o  = OP_B_IMM;
+        imm_b_mux_sel_o     = IMM_B_U;
+        alu_operator_o      = ALU_ADD;
+      end
+
+      OPCODE_OP_IMM: begin // Register-Immediate ALU Operations
+        alu_op_a_mux_sel_o  = OP_A_REG_A;
+        alu_op_b_mux_sel_o  = OP_B_IMM;
+        imm_b_mux_sel_o     = IMM_B_I;
+
+        unique case (instr_alu[14:12])
+          3'b000: alu_operator_o = ALU_ADD;  // Add Immediate
+          3'b010: alu_operator_o = ALU_SLT;  // Set to one if Lower Than Immediate
+          3'b011: alu_operator_o = ALU_SLTU; // Set to one if Lower Than Immediate Unsigned
+          3'b100: alu_operator_o = ALU_XOR;  // Exclusive Or with Immediate
+          3'b110: alu_operator_o = ALU_OR;   // Or with Immediate
+          3'b111: alu_operator_o = ALU_AND;  // And with Immediate
+
+          3'b001: begin
+            if (RV32B != RV32BNone) begin
+              unique case (instr_alu[31:27])
+                5'b0_0000: alu_operator_o = ALU_SLL;    // Shift Left Logical by Immediate
+                // Shift Left Ones by Immediate
+                5'b0_0100: begin
+                  if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) alu_operator_o = ALU_SLO;
+                end
+                5'b0_1001: alu_operator_o = ALU_BCLR; // Clear bit specified by immediate
+                5'b0_0101: alu_operator_o = ALU_BSET; // Set bit specified by immediate
+                5'b0_1101: alu_operator_o = ALU_BINV; // Invert bit specified by immediate.
+                // Shuffle with Immediate Control Value
+                5'b0_0001: if (instr_alu[26] == 0) alu_operator_o = ALU_SHFL;
+                5'b0_1100: begin
+                  unique case (instr_alu[26:20])
+                    7'b000_0000: alu_operator_o = ALU_CLZ;   // clz
+                    7'b000_0001: alu_operator_o = ALU_CTZ;   // ctz
+                    7'b000_0010: alu_operator_o = ALU_CPOP;  // cpop
+                    7'b000_0100: alu_operator_o = ALU_SEXTB; // sext.b
+                    7'b000_0101: alu_operator_o = ALU_SEXTH; // sext.h
+                    7'b001_0000: begin
+                      if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin
+                        alu_operator_o = ALU_CRC32_B;  // crc32.b
+                        alu_multicycle_o = 1'b1;
+                      end
+                    end
+                    7'b001_0001: begin
+                      if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin
+                        alu_operator_o = ALU_CRC32_H;  // crc32.h
+                        alu_multicycle_o = 1'b1;
+                      end
+                    end
+                    7'b001_0010: begin
+                      if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin
+                        alu_operator_o = ALU_CRC32_W;  // crc32.w
+                        alu_multicycle_o = 1'b1;
+                      end
+                    end
+                    7'b001_1000: begin
+                      if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin
+                        alu_operator_o = ALU_CRC32C_B; // crc32c.b
+                        alu_multicycle_o = 1'b1;
+                      end
+                    end
+                    7'b001_1001: begin
+                      if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin
+                        alu_operator_o = ALU_CRC32C_H; // crc32c.h
+                        alu_multicycle_o = 1'b1;
+                      end
+                    end
+                    7'b001_1010: begin
+                      if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin
+                        alu_operator_o = ALU_CRC32C_W; // crc32c.w
+                        alu_multicycle_o = 1'b1;
+                      end
+                    end
+                    default: ;
+                  endcase
+                end
+
+                default: ;
+              endcase
+            end else begin
+              alu_operator_o = ALU_SLL; // Shift Left Logical by Immediate
+            end
+          end
+
+          3'b101: begin
+            if (RV32B != RV32BNone) begin
+              if (instr_alu[26] == 1'b1) begin // fsri
+                alu_operator_o = ALU_FSR;
+                alu_multicycle_o = 1'b1;
+                if (instr_first_cycle_i) begin // use_rs3_d is asserted in the first cycle only
+                  use_rs3_d = 1'b1;
+                end else begin
+                  use_rs3_d = 1'b0;
+                end
+              end else begin
+                unique case (instr_alu[31:27])
+                  5'b0_0000: alu_operator_o = ALU_SRL;   // Shift Right Logical by Immediate
+                  5'b0_1000: alu_operator_o = ALU_SRA;   // Shift Right Arithmetically by Immediate
+                  // Shift Right Ones by Immediate
+                  5'b0_0100: begin
+                    if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) alu_operator_o = ALU_SRO;
+                  end
+                  5'b0_1001: alu_operator_o = ALU_BEXT;  // Extract bit specified by immediate.
+                  5'b0_1100: begin
+                    alu_operator_o = ALU_ROR;            // Rotate Right by Immediate
+                    alu_multicycle_o = 1'b1;
+                  end
+                  5'b0_1101: alu_operator_o = ALU_GREV;  // General Reverse with Imm Control Val
+                  5'b0_0101: alu_operator_o = ALU_GORC;  // General Or-combine with Imm Control Val
+                  // Unshuffle with Immediate Control Value
+                  5'b0_0001: begin
+                    if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin
+                      if (instr_alu[26] == 1'b0) alu_operator_o = ALU_UNSHFL;
+                    end
+                  end
+                  default: ;
+                endcase
+              end
+
+            end else begin
+              if (instr_alu[31:27] == 5'b0_0000) begin
+                alu_operator_o = ALU_SRL;               // Shift Right Logical by Immediate
+              end else if (instr_alu[31:27] == 5'b0_1000) begin
+                alu_operator_o = ALU_SRA;               // Shift Right Arithmetically by Immediate
+              end
+            end
+          end
+
+          default: ;
+        endcase
+      end
+
+      OPCODE_OP: begin  // Register-Register ALU operation
+        alu_op_a_mux_sel_o = OP_A_REG_A;
+        alu_op_b_mux_sel_o = OP_B_REG_B;
+
+        if (instr_alu[26]) begin // candidate cmix / cmov / fsl / fsr encodings
+          if (RV32B != RV32BNone) begin
+            unique case ({instr_alu[26:25], instr_alu[14:12]})
+              {2'b11, 3'b001}: begin
+                alu_operator_o   = ALU_CMIX; // cmix
+                alu_multicycle_o = 1'b1;
+                if (instr_first_cycle_i) begin
+                  use_rs3_d = 1'b1;
+                end else begin
+                  use_rs3_d = 1'b0;
+                end
+              end
+              {2'b11, 3'b101}: begin
+                alu_operator_o   = ALU_CMOV; // cmov
+                alu_multicycle_o = 1'b1;
+                if (instr_first_cycle_i) begin
+                  use_rs3_d = 1'b1;
+                end else begin
+                  use_rs3_d = 1'b0;
+                end
+              end
+              {2'b10, 3'b001}: begin
+                alu_operator_o   = ALU_FSL;  // fsl
+                alu_multicycle_o = 1'b1;
+                if (instr_first_cycle_i) begin
+                  use_rs3_d = 1'b1;
+                end else begin
+                  use_rs3_d = 1'b0;
+                end
+              end
+              {2'b10, 3'b101}: begin
+                alu_operator_o   = ALU_FSR;  // fsr
+                alu_multicycle_o = 1'b1;
+                if (instr_first_cycle_i) begin
+                  use_rs3_d = 1'b1;
+                end else begin
+                  use_rs3_d = 1'b0;
+                end
+              end
+              default: ;
+            endcase
+          end
+        end else begin
+          unique case ({instr_alu[31:25], instr_alu[14:12]})
+            // RV32I ALU operations
+            {7'b000_0000, 3'b000}: alu_operator_o = ALU_ADD;   // Add
+            {7'b010_0000, 3'b000}: alu_operator_o = ALU_SUB;   // Sub
+            {7'b000_0000, 3'b010}: alu_operator_o = ALU_SLT;   // Set Lower Than
+            {7'b000_0000, 3'b011}: alu_operator_o = ALU_SLTU;  // Set Lower Than Unsigned
+            {7'b000_0000, 3'b100}: alu_operator_o = ALU_XOR;   // Xor
+            {7'b000_0000, 3'b110}: alu_operator_o = ALU_OR;    // Or
+            {7'b000_0000, 3'b111}: alu_operator_o = ALU_AND;   // And
+            {7'b000_0000, 3'b001}: alu_operator_o = ALU_SLL;   // Shift Left Logical
+            {7'b000_0000, 3'b101}: alu_operator_o = ALU_SRL;   // Shift Right Logical
+            {7'b010_0000, 3'b101}: alu_operator_o = ALU_SRA;   // Shift Right Arithmetic
+
+            // RV32B ALU Operations
+            {7'b011_0000, 3'b001}: begin // rol
+              if (RV32B != RV32BNone) begin
+                alu_operator_o = ALU_ROL;
+                alu_multicycle_o = 1'b1;
+              end
+            end
+            {7'b011_0000, 3'b101}: begin // ror
+              if (RV32B != RV32BNone) begin
+                alu_operator_o = ALU_ROR;
+                alu_multicycle_o = 1'b1;
+              end
+            end
+
+            {7'b000_0101, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_MIN;
+            {7'b000_0101, 3'b110}: if (RV32B != RV32BNone) alu_operator_o = ALU_MAX;
+            {7'b000_0101, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_MINU;
+            {7'b000_0101, 3'b111}: if (RV32B != RV32BNone) alu_operator_o = ALU_MAXU;
+
+            {7'b000_0100, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_PACK;
+            {7'b010_0100, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_PACKU;
+            {7'b000_0100, 3'b111}: if (RV32B != RV32BNone) alu_operator_o = ALU_PACKH;
+
+            {7'b010_0000, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_XNOR;
+            {7'b010_0000, 3'b110}: if (RV32B != RV32BNone) alu_operator_o = ALU_ORN;
+            {7'b010_0000, 3'b111}: if (RV32B != RV32BNone) alu_operator_o = ALU_ANDN;
+
+            // RV32B zba
+            {7'b001_0000, 3'b010}: if (RV32B != RV32BNone) alu_operator_o = ALU_SH1ADD;
+            {7'b001_0000, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_SH2ADD;
+            {7'b001_0000, 3'b110}: if (RV32B != RV32BNone) alu_operator_o = ALU_SH3ADD;
+
+            // RV32B zbs
+            {7'b010_0100, 3'b001}: if (RV32B != RV32BNone) alu_operator_o = ALU_BCLR;
+            {7'b001_0100, 3'b001}: if (RV32B != RV32BNone) alu_operator_o = ALU_BSET;
+            {7'b011_0100, 3'b001}: if (RV32B != RV32BNone) alu_operator_o = ALU_BINV;
+            {7'b010_0100, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_BEXT;
+
+            // RV32B zbf
+            {7'b010_0100, 3'b111}: if (RV32B != RV32BNone) alu_operator_o = ALU_BFP;
+
+            // RV32B zbp
+            {7'b011_0100, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_GREV;
+            {7'b001_0100, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_GORC;
+            {7'b000_0100, 3'b001}: begin
+              if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) alu_operator_o = ALU_SHFL;
+            end
+            {7'b000_0100, 3'b101}: begin
+              if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) alu_operator_o = ALU_UNSHFL;
+            end
+            {7'b001_0100, 3'b010}: begin
+              if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) alu_operator_o = ALU_XPERM_N;
+            end
+            {7'b001_0100, 3'b100}: begin
+              if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) alu_operator_o = ALU_XPERM_B;
+            end
+            {7'b001_0100, 3'b110}: begin
+              if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) alu_operator_o = ALU_XPERM_H;
+            end
+            {7'b001_0000, 3'b001}: begin
+              if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) alu_operator_o = ALU_SLO;
+            end
+            {7'b001_0000, 3'b101}: begin
+              if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) alu_operator_o = ALU_SRO;
+            end
+
+            // RV32B zbc
+            {7'b000_0101, 3'b001}: begin
+              if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) alu_operator_o = ALU_CLMUL;
+            end
+            {7'b000_0101, 3'b010}: begin
+              if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) alu_operator_o = ALU_CLMULR;
+            end
+            {7'b000_0101, 3'b011}: begin
+              if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) alu_operator_o = ALU_CLMULH;
+            end
+
+            // RV32B zbe
+            {7'b010_0100, 3'b110}: begin
+              if (RV32B == RV32BFull) begin
+                alu_operator_o = ALU_BDECOMPRESS;
+                alu_multicycle_o = 1'b1;
+              end
+            end
+            {7'b000_0100, 3'b110}: begin
+              if (RV32B == RV32BFull) begin
+                alu_operator_o = ALU_BCOMPRESS;
+                alu_multicycle_o = 1'b1;
+              end
+            end
+
+            // RV32M instructions, all use the same ALU operation
+            {7'b000_0001, 3'b000}: begin // mul
+              alu_operator_o = ALU_ADD;
+              mult_sel_o     = (RV32M == RV32MNone) ? 1'b0 : 1'b1;
+            end
+            {7'b000_0001, 3'b001}: begin // mulh
+              alu_operator_o = ALU_ADD;
+              mult_sel_o     = (RV32M == RV32MNone) ? 1'b0 : 1'b1;
+            end
+            {7'b000_0001, 3'b010}: begin // mulhsu
+              alu_operator_o = ALU_ADD;
+              mult_sel_o     = (RV32M == RV32MNone) ? 1'b0 : 1'b1;
+            end
+            {7'b000_0001, 3'b011}: begin // mulhu
+              alu_operator_o = ALU_ADD;
+              mult_sel_o     = (RV32M == RV32MNone) ? 1'b0 : 1'b1;
+            end
+            {7'b000_0001, 3'b100}: begin // div
+              alu_operator_o = ALU_ADD;
+              div_sel_o      = (RV32M == RV32MNone) ? 1'b0 : 1'b1;
+            end
+            {7'b000_0001, 3'b101}: begin // divu
+              alu_operator_o = ALU_ADD;
+              div_sel_o      = (RV32M == RV32MNone) ? 1'b0 : 1'b1;
+            end
+            {7'b000_0001, 3'b110}: begin // rem
+              alu_operator_o = ALU_ADD;
+              div_sel_o      = (RV32M == RV32MNone) ? 1'b0 : 1'b1;
+            end
+            {7'b000_0001, 3'b111}: begin // remu
+              alu_operator_o = ALU_ADD;
+              div_sel_o      = (RV32M == RV32MNone) ? 1'b0 : 1'b1;
+            end
+
+            default: ; // unmatched encodings keep the default ALU_SLTU operator
+          endcase
+        end
+      end
+
+      /////////////
+      // Special //
+      /////////////
+
+      OPCODE_MISC_MEM: begin
+        unique case (instr_alu[14:12])
+          3'b000: begin
+            // FENCE is treated as a NOP since all memory operations are already strictly ordered.
+            alu_operator_o     = ALU_ADD; // nop
+            alu_op_a_mux_sel_o = OP_A_REG_A;
+            alu_op_b_mux_sel_o = OP_B_IMM;
+          end
+          3'b001: begin
+            // FENCE.I will flush the IF stage, prefetch buffer and ICache if present.
+            if (BranchTargetALU) begin
+              bt_a_mux_sel_o     = OP_A_CURRPC;
+              bt_b_mux_sel_o     = IMM_B_INCR_PC;
+            end else begin
+              alu_op_a_mux_sel_o = OP_A_CURRPC;
+              alu_op_b_mux_sel_o = OP_B_IMM;
+              imm_b_mux_sel_o    = IMM_B_INCR_PC;
+              alu_operator_o     = ALU_ADD;
+            end
+          end
+          default: ;
+        endcase
+      end
+
+      OPCODE_SYSTEM: begin
+        if (instr_alu[14:12] == 3'b000) begin
+          // non CSR related SYSTEM instructions
+          alu_op_a_mux_sel_o = OP_A_REG_A;
+          alu_op_b_mux_sel_o = OP_B_IMM;
+        end else begin
+          // instruction to read/modify CSR
+          alu_op_b_mux_sel_o = OP_B_IMM;
+          imm_a_mux_sel_o    = IMM_A_Z;
+          imm_b_mux_sel_o    = IMM_B_I;  // CSR address is encoded in I imm
+
+          if (instr_alu[14]) begin
+            // rs1 field is used as immediate
+            alu_op_a_mux_sel_o = OP_A_IMM;
+          end else begin
+            alu_op_a_mux_sel_o = OP_A_REG_A;
+          end
+        end
+
+      end
+      default: ; // any other opcode leaves every output at the defaults set above
+    endcase
+  end
+
+  // Suppress the multiplier/divider enables when the instruction is reported as illegal.
+  assign mult_en_o = mult_sel_o & ~illegal_insn_o;
+  assign div_en_o  = div_sel_o  & ~illegal_insn_o;
+
+  // An instruction is reported illegal if the decoder rejected it or if either register
+  // legality flag (illegal_reg_rv32e, illegal_reg_cheri) is set.
+  assign illegal_insn_o = illegal_reg_cheri | illegal_reg_rv32e | illegal_insn;
+
+  // Do not propagate the regfile write enable when either register legality flag is set.
+  assign rf_we_o = rf_we & ~(illegal_reg_rv32e | illegal_reg_cheri);
+
+  // Not all bits of instr_alu are used.
+  assign unused_instr_alu = {instr_alu[19:15], instr_alu[11:7]};
+
+  assign instr_is_legal_cheri_o = ~illegal_reg_cheri & instr_is_legal_cheri;
+
+  // CHERI instruction decoder; when CHERIoTEn is not set its outputs are tied to zero instead.
+  if (CHERIoTEn) begin : gen_cheri_decoder
+    cheri_decoder #(
+      .CheriPPLBC (CheriPPLBC),
+      .CheriSBND2 (CheriSBND2)
+    ) u_cheri_decoder (
+      .instr_rdata_i          (instr_rdata_i),
+      .cheri_tsafe_en_i       (cheri_tsafe_en_i),
+      .cheri_opcode_en_i      (cheri_opcode_en),
+      .cheri_auipcc_en_i      (cheri_auipcc_en),
+      .cheri_auicgp_en_i      (cheri_auicgp_en),
+      .cheri_jal_en_i         (cheri_jal_en),
+      .cheri_jalr_en_i        (cheri_jalr_en),
+      .cheri_cload_en_i       (cheri_cload_en),
+      .cheri_cstore_en_i      (cheri_cstore_en),
+      .instr_is_cheri_o       (instr_is_cheri_o),
+      .instr_is_legal_cheri_o (instr_is_legal_cheri),
+      .cheri_operator_o       (cheri_operator_o),
+      .cheri_cs2_dec_o        (cheri_cs2_dec_o),
+      .cheri_imm12_o          (cheri_imm12_o),
+      .cheri_imm20_o          (cheri_imm20_o),
+      .cheri_imm21_o          (cheri_imm21_o),
+      .cheri_rf_ren_a_o       (cheri_rf_ren_a),
+      .cheri_rf_ren_b_o       (cheri_rf_ren_b),
+      .cheri_rf_we_dec_o      (cheri_rf_we_dec),
+      .cheri_multicycle_dec_o (cheri_multicycle_dec_o)
+    );
+  end else begin
+    // No cheri_decoder instance: drive each signal it would have produced to zero.
+    assign instr_is_cheri_o       = 1'b0;
+    assign instr_is_legal_cheri   = 1'b0;
+    assign cheri_operator_o       = '0;
+    assign cheri_cs2_dec_o        = 1'b0;
+    assign cheri_imm12_o          = '0;
+    assign cheri_imm20_o          = '0;
+    assign cheri_imm21_o          = '0;
+    assign cheri_rf_ren_a         = 1'b0;
+    assign cheri_rf_ren_b         = 1'b0;
+    assign cheri_rf_we_dec        = 1'b0;
+    assign cheri_multicycle_dec_o = 1'b0;
+  end
+
+  ////////////////
+  // Assertions //
+  ////////////////
+
+  // Selectors must be known/valid: instr[14:12] must not be X/Z when opcode is OPCODE_OP_IMM.
+  `ASSERT(IbexRegImmAluOpKnown, (opcode == OPCODE_OP_IMM) |->
+      !$isunknown(instr[14:12]))
+endmodule // cheriot_decoder
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_dummy_instr.sv b/hw/ip/cheriot-ibex/rtl/cheriot_dummy_instr.sv
new file mode 100644
index 0000000..897172d
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_dummy_instr.sv
@@ -0,0 +1,149 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Dummy instruction generator
+ *
+ * Pseudo-randomly (LFSR-driven) inserts fake ADD/MUL/DIV/AND instructions for code obfuscation
+ */
+
+module cheriot_dummy_instr import cheriot_pkg::*; #(
+    parameter lfsr_seed_t RndCnstLfsrSeed = RndCnstLfsrSeedDefault,
+    parameter lfsr_perm_t RndCnstLfsrPerm = RndCnstLfsrPermDefault
+) (
+  // Clock and asynchronous active-low reset
+  input  logic        clk_i,
+  input  logic        rst_ni,
+
+  // Configuration inputs (CSR interface)
+  input  logic        dummy_instr_en_i,
+  input  logic [2:0]  dummy_instr_mask_i,
+  input  logic        dummy_instr_seed_en_i,
+  input  logic [31:0] dummy_instr_seed_i,
+
+  // IF stage interface
+  input  logic        fetch_valid_i,
+  input  logic        id_in_ready_i,
+  output logic        insert_dummy_instr_o,
+  output logic [31:0] dummy_instr_data_o
+);
+
+  // Widths of the count and register-index fields taken from the LFSR output
+  localparam int unsigned TIMEOUT_CNT_W = 5;
+  localparam int unsigned OP_W          = 5;
+
+  // Selects which dummy instruction gets encoded
+  typedef enum logic [1:0] {
+    DUMMY_ADD = 2'd0,
+    DUMMY_MUL = 2'd1,
+    DUMMY_DIV = 2'd2,
+    DUMMY_AND = 2'd3
+  } dummy_instr_e;
+
+  // Field layout of the LFSR output word (first member occupies the MSBs)
+  typedef struct packed {
+    dummy_instr_e             instr_type;
+    logic [OP_W-1:0]          op_b;
+    logic [OP_W-1:0]          op_a;
+    logic [TIMEOUT_CNT_W-1:0] cnt;
+  } lfsr_data_t;
+  localparam int unsigned LFSR_OUT_W = $bits(lfsr_data_t);
+
+  // LFSR and seed accumulation
+  logic                     lfsr_en;
+  logic [LFSR_OUT_W-1:0]    lfsr_state;
+  lfsr_data_t               lfsr_data;
+  logic [31:0]              dummy_instr_seed_d, dummy_instr_seed_q;
+  // Insertion counter
+  logic                     dummy_cnt_en;
+  logic [TIMEOUT_CNT_W-1:0] dummy_cnt_d, dummy_cnt_q;
+  logic [TIMEOUT_CNT_W-1:0] dummy_cnt_threshold, dummy_cnt_incr;
+  logic                     insert_dummy_instr;
+  // Instruction encoding
+  logic [6:0]               dummy_set;
+  logic [2:0]               dummy_opcode;
+  logic [31:0]              dummy_instr;
+
+  // A seed write XORs the written value into the seed accumulated so far
+  assign dummy_instr_seed_d = dummy_instr_seed_i ^ dummy_instr_seed_q;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      dummy_instr_seed_q <= '0;
+    end else if (dummy_instr_seed_en_i) begin
+      dummy_instr_seed_q <= dummy_instr_seed_d;
+    end
+  end
+
+  // Step the LFSR whenever a dummy instruction is inserted
+  assign lfsr_en = id_in_ready_i & insert_dummy_instr;
+
+  prim_lfsr #(
+      .LfsrDw      ( LfsrWidth       ),
+      .StateOutDw  ( LFSR_OUT_W      ),
+      .DefaultSeed ( RndCnstLfsrSeed ),
+      .StatePermEn ( 1'b1            ),
+      .StatePerm   ( RndCnstLfsrPerm )
+  ) lfsr_i (
+      .clk_i     ( clk_i                 ),
+      .rst_ni    ( rst_ni                ),
+      .lfsr_en_i ( lfsr_en               ),
+      .seed_en_i ( dummy_instr_seed_en_i ),
+      .seed_i    ( dummy_instr_seed_d    ),
+      .entropy_i ( '0                    ),
+      .state_o   ( lfsr_state            )
+  );
+
+  // Reinterpret the raw LFSR output as its named fields
+  assign lfsr_data = lfsr_data_t'(lfsr_state);
+
+  // Count threshold for the next insertion: the pseudo-random LFSR count field ANDed with a mask
+  // (upper three bits from the CSR config, remaining low bits all ones) to shorten the period.
+  assign dummy_cnt_threshold = {dummy_instr_mask_i, {(TIMEOUT_CNT_W-3){1'b1}}} & lfsr_data.cnt;
+  assign dummy_cnt_incr      = dummy_cnt_q + TIMEOUT_CNT_W'(1);
+  // Restart from zero when a dummy instruction is inserted, otherwise count up
+  assign dummy_cnt_d         = insert_dummy_instr ? '0 : dummy_cnt_incr;
+  // The counter only moves while dummy instructions are enabled, id_in_ready_i is set and
+  // either a fetched instruction is valid or a dummy instruction is being inserted.
+  assign dummy_cnt_en        = (insert_dummy_instr | fetch_valid_i) &
+                               id_in_ready_i & dummy_instr_en_i;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      dummy_cnt_q <= '0;
+    end else if (dummy_cnt_en) begin
+      dummy_cnt_q <= dummy_cnt_d;
+    end
+  end
+
+  // Request a dummy instruction whenever the counter equals the threshold
+  assign insert_dummy_instr = (dummy_cnt_threshold == dummy_cnt_q) & dummy_instr_en_i;
+
+  // Choose the two instruction fields that differ between the dummy operations
+  always_comb begin
+    // Start from the ADD encoding, which also covers any selector value not overridden below
+    dummy_set    = 7'b0000000;
+    dummy_opcode = 3'b000;
+
+    unique case (lfsr_data.instr_type)
+      DUMMY_ADD: ;
+      DUMMY_MUL: dummy_set = 7'b0000001;
+      DUMMY_DIV: begin
+        dummy_set    = 7'b0000001;
+        dummy_opcode = 3'b100;
+      end
+      DUMMY_AND: dummy_opcode = 3'b111;
+      default: ;
+    endcase
+  end
+
+  // Assemble the 32-bit instruction word; the destination register index is fixed to 0.
+  //                    SET        RS2             RS1             OP            RD
+  assign dummy_instr = {dummy_set, lfsr_data.op_b, lfsr_data.op_a, dummy_opcode, 5'h00, 7'h33};
+
+  // Drive the module outputs
+  assign insert_dummy_instr_o = insert_dummy_instr;
+  assign dummy_instr_data_o   = dummy_instr;
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_ex_block.sv b/hw/ip/cheriot-ibex/rtl/cheriot_ex_block.sv
new file mode 100644
index 0000000..8eb30a5
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_ex_block.sv
@@ -0,0 +1,199 @@
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Execution stage
+ *
+ * Execution block: hosts the ALU and the optional MUL/DIV unit and muxes their outputs
+ */
+module cheriot_ex_block #(
+  parameter cheriot_pkg::rv32m_e RV32M           = cheriot_pkg::RV32MFast,
+  parameter cheriot_pkg::rv32b_e RV32B           = cheriot_pkg::RV32BNone,
+  parameter bit               BranchTargetALU = 0
+) (
+  input  logic                  clk_i,
+  input  logic                  rst_ni,
+
+  // ALU
+  input  cheriot_pkg::alu_op_e     alu_operator_i,
+  input  logic [31:0]           alu_operand_a_i,
+  input  logic [31:0]           alu_operand_b_i,
+  input  logic                  alu_instr_first_cycle_i,
+
+  // Branch Target ALU
+  // Both operands are unused when BranchTargetALU == 0
+  input  logic [31:0]           bt_a_operand_i,
+  input  logic [31:0]           bt_b_operand_i,
+
+  // Multiplier/Divider
+  input  cheriot_pkg::md_op_e      multdiv_operator_i,
+  input  logic                  mult_en_i,             // dynamic enable signal, for FSM control
+  input  logic                  div_en_i,              // dynamic enable signal, for FSM control
+  input  logic                  mult_sel_i,            // static decoder output, for data muxes
+  input  logic                  div_sel_i,             // static decoder output, for data muxes
+  input  logic  [1:0]           multdiv_signed_mode_i,
+  input  logic [31:0]           multdiv_operand_a_i,
+  input  logic [31:0]           multdiv_operand_b_i,
+  input  logic                  multdiv_ready_id_i,
+  input  logic                  data_ind_timing_i,
+
+  // Intermediate value register interface: per-register write enable, next value, current value
+  output logic [1:0]            imd_val_we_o,
+  output logic [33:0]           imd_val_d_o[2],
+  input  logic [33:0]           imd_val_q_i[2],
+
+  // Outputs
+  output logic [31:0]           alu_adder_result_ex_o, // to LSU
+  output logic [31:0]           result_ex_o,
+  output logic [31:0]           branch_target_o,       // to IF
+  output logic                  branch_decision_o,     // to ID
+
+  output logic                  ex_valid_o             // EX has valid output
+);
+
+  import cheriot_pkg::*;
+
+  logic [31:0] alu_result, multdiv_result;
+
+  logic [32:0] multdiv_alu_operand_b, multdiv_alu_operand_a;
+  logic [33:0] alu_adder_result_ext;
+  logic        alu_cmp_result, alu_is_equal_result;
+  logic        multdiv_valid;
+  logic        multdiv_sel;
+  logic [31:0] alu_imd_val_q[2];
+  logic [31:0] alu_imd_val_d[2];
+  logic [ 1:0] alu_imd_val_we;
+  logic [33:0] multdiv_imd_val_d[2];
+  logic [ 1:0] multdiv_imd_val_we;
+
+  /*
+    multdiv_sel is tied to 0 when RV32M == RV32MNone, so the multdiv outputs are never selected.
+    No multdiv_i instance is generated in that configuration (see the generate blocks below),
+    so none of the multdiv_* signals have a driver and only the ALU paths are used.
+  */
+  if (RV32M != RV32MNone) begin : gen_multdiv_m
+    assign multdiv_sel = mult_sel_i | div_sel_i;
+  end else begin : gen_multdiv_no_m
+    assign multdiv_sel = 1'b0;
+  end
+
+  // Intermediate Value Register Mux (32-bit ALU values are zero-extended to 34 bits)
+  assign imd_val_d_o[0] = multdiv_sel ? multdiv_imd_val_d[0] : {2'b0, alu_imd_val_d[0]};
+  assign imd_val_d_o[1] = multdiv_sel ? multdiv_imd_val_d[1] : {2'b0, alu_imd_val_d[1]};
+  assign imd_val_we_o   = multdiv_sel ? multdiv_imd_val_we : alu_imd_val_we;
+  // The ALU only sees the low 32 bits of each intermediate value register
+  assign alu_imd_val_q = '{imd_val_q_i[0][31:0], imd_val_q_i[1][31:0]};
+  // EX result: multiplier/divider output when it is selected, ALU output otherwise
+  assign result_ex_o  = multdiv_sel ? multdiv_result : alu_result;
+
+  // Branch handling: the branch decision is the ALU comparison result
+  assign branch_decision_o  = alu_cmp_result;
+
+  if (BranchTargetALU) begin : g_branch_target_alu
+    logic [32:0] bt_alu_result;
+    logic        unused_bt_carry;
+    // Dedicated 33-bit adder for the branch target; the carry-out bit is discarded
+    assign bt_alu_result   = bt_a_operand_i + bt_b_operand_i;
+
+    assign unused_bt_carry = bt_alu_result[32];
+    assign branch_target_o = bt_alu_result[31:0];
+  end else begin : g_no_branch_target_alu
+    // Unused bt_operand signals cause lint errors, this avoids them
+    logic [31:0] unused_bt_a_operand, unused_bt_b_operand;
+
+    assign unused_bt_a_operand = bt_a_operand_i;
+    assign unused_bt_b_operand = bt_b_operand_i;
+    // Without the dedicated adder the branch target is taken from the main ALU adder
+    assign branch_target_o = alu_adder_result_ex_o;
+  end
+
+  /////////
+  // ALU //
+  /////////
+
+  cheriot_alu #(
+    .RV32B(RV32B)
+  ) alu_i (
+    .operator_i         (alu_operator_i),
+    .operand_a_i        (alu_operand_a_i),
+    .operand_b_i        (alu_operand_b_i),
+    .instr_first_cycle_i(alu_instr_first_cycle_i),
+    .imd_val_q_i        (alu_imd_val_q),
+    .imd_val_we_o       (alu_imd_val_we),
+    .imd_val_d_o        (alu_imd_val_d),
+    .multdiv_operand_a_i(multdiv_alu_operand_a),
+    .multdiv_operand_b_i(multdiv_alu_operand_b),
+    .multdiv_sel_i      (multdiv_sel),
+    .adder_result_o     (alu_adder_result_ex_o),
+    .adder_result_ext_o (alu_adder_result_ext),
+    .result_o           (alu_result),
+    .comparison_result_o(alu_cmp_result),
+    .is_equal_result_o  (alu_is_equal_result)
+  );
+
+  ////////////////
+  // Multiplier //
+  ////////////////
+
+  if (RV32M == RV32MSlow) begin : gen_multdiv_slow
+    cheriot_multdiv_slow multdiv_i (
+      .clk_i             (clk_i),
+      .rst_ni            (rst_ni),
+      .mult_en_i         (mult_en_i),
+      .div_en_i          (div_en_i),
+      .mult_sel_i        (mult_sel_i),
+      .div_sel_i         (div_sel_i),
+      .operator_i        (multdiv_operator_i),
+      .signed_mode_i     (multdiv_signed_mode_i),
+      .op_a_i            (multdiv_operand_a_i),
+      .op_b_i            (multdiv_operand_b_i),
+      .alu_adder_ext_i   (alu_adder_result_ext),
+      .alu_adder_i       (alu_adder_result_ex_o),
+      .equal_to_zero_i   (alu_is_equal_result),
+      .data_ind_timing_i (data_ind_timing_i),
+      .valid_o           (multdiv_valid),
+      .alu_operand_a_o   (multdiv_alu_operand_a),
+      .alu_operand_b_o   (multdiv_alu_operand_b),
+      .imd_val_q_i       (imd_val_q_i),
+      .imd_val_d_o       (multdiv_imd_val_d),
+      .imd_val_we_o      (multdiv_imd_val_we),
+      .multdiv_ready_id_i(multdiv_ready_id_i),
+      .multdiv_result_o  (multdiv_result)
+    );
+  end else if (RV32M == RV32MFast || RV32M == RV32MSingleCycle) begin : gen_multdiv_fast
+    cheriot_multdiv_fast #(
+      .RV32M(RV32M)
+    ) multdiv_i (
+      .clk_i             (clk_i),
+      .rst_ni            (rst_ni),
+      .mult_en_i         (mult_en_i),
+      .div_en_i          (div_en_i),
+      .mult_sel_i        (mult_sel_i),
+      .div_sel_i         (div_sel_i),
+      .operator_i        (multdiv_operator_i),
+      .signed_mode_i     (multdiv_signed_mode_i),
+      .op_a_i            (multdiv_operand_a_i),
+      .op_b_i            (multdiv_operand_b_i),
+      .alu_operand_a_o   (multdiv_alu_operand_a),
+      .alu_operand_b_o   (multdiv_alu_operand_b),
+      .alu_adder_ext_i   (alu_adder_result_ext),
+      .alu_adder_i       (alu_adder_result_ex_o),
+      .equal_to_zero_i   (alu_is_equal_result),
+      .data_ind_timing_i (data_ind_timing_i),
+      .imd_val_q_i       (imd_val_q_i),
+      .imd_val_d_o       (multdiv_imd_val_d),
+      .imd_val_we_o      (multdiv_imd_val_we),
+      .multdiv_ready_id_i(multdiv_ready_id_i),
+      .valid_o           (multdiv_valid),
+      .multdiv_result_o  (multdiv_result)
+    );
+  end
+
+  // Multiplier/divider may require multiple cycles. The ALU output is valid in the same cycle
+  // unless the intermediate result register is being written (which indicates this isn't the
+  // final cycle of ALU operation).
+  assign ex_valid_o = multdiv_sel ? multdiv_valid : ~(|alu_imd_val_we);
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_fetch_fifo.sv b/hw/ip/cheriot-ibex/rtl/cheriot_fetch_fifo.sv
new file mode 100644
index 0000000..463a9ec
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_fetch_fifo.sv
@@ -0,0 +1,298 @@
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Fetch FIFO for a 32-bit memory interface, with realignment of half-word aligned instructions
+ *
+ * Input port: 32-bit fetch data/error are pushed; in_addr_i is only sampled when clear_i is set
+ * clear_i clears the FIFO for the following cycle, including any new request
+ */
+
+`include "prim_assert.sv"
+
+module cheriot_fetch_fifo #(
+  parameter int unsigned NUM_REQS = 2,
+  parameter bit          ResetAll = 1'b0  // 1: data, error and address flops are reset too
+) (
+  input  logic                clk_i,
+  input  logic                rst_ni,
+
+  // control signals
+  input  logic                clear_i,   // clears the contents of the FIFO
+  output logic [NUM_REQS-1:0] busy_o,    // valid bits of the top NUM_REQS entries
+
+  // input port
+  input  logic                in_valid_i,
+  input  logic [31:0]         in_addr_i,
+  input  logic [31:0]         in_rdata_i,
+  input  logic                in_err_i,
+
+  input  logic                cheri_force_uc_i,  // force unaligned compressed based on CHERI bounds check
+
+  // output port
+  output logic                out_valid_o,
+  input  logic                out_ready_i,
+  output logic [31:0]         out_addr_o,
+  output logic [31:0]         out_rdata_o,
+  output logic                out_err_o,
+  output logic                out_err_plus2_o
+);
+
+  localparam int unsigned DEPTH = NUM_REQS+1;
+
+  // index 0 is used for output
+  logic [DEPTH-1:0] [31:0]  rdata_d,   rdata_q;
+  logic [DEPTH-1:0]         err_d,     err_q;
+  logic [DEPTH-1:0]         valid_d,   valid_q;
+  logic [DEPTH-1:0]         lowest_free_entry;
+  logic [DEPTH-1:0]         valid_pushed, valid_popped;
+  logic [DEPTH-1:0]         entry_en;
+
+  logic                     pop_fifo;
+  logic             [31:0]  rdata, rdata_unaligned;
+  logic                     err,   err_unaligned, err_plus2;
+  logic                     valid, valid_unaligned;
+
+  logic                     aligned_is_compressed, unaligned_is_compressed;
+
+  logic                     addr_incr_two;
+  logic [31:1]              instr_addr_next;
+  logic [31:1]              instr_addr_d, instr_addr_q;
+  logic                     instr_addr_en;
+  logic                     unused_addr_in;
+
+  /////////////////
+  // Output port //
+  /////////////////
+  // Bypass path: when entry 0 is empty the incoming data/error is presented directly
+  assign rdata = valid_q[0] ? rdata_q[0] : in_rdata_i;
+  assign err   = valid_q[0] ? err_q[0]   : in_err_i;
+  assign valid = valid_q[0] | in_valid_i;
+
+  // The FIFO contains word aligned memory fetches, but the instructions contained in each entry
+  // might be half-word aligned (due to compressed instructions)
+  // e.g.
+  //              | 31               16 | 15               0 |
+  // FIFO entry 0 | Instr 1 [15:0]      | Instr 0 [15:0]     |
+  // FIFO entry 1 | Instr 2 [15:0]      | Instr 1 [31:16]    |
+  //
+  // The FIFO also has a direct bypass path, so a complete instruction might be made up of data
+  // from the FIFO and new incoming data.
+  //
+
+  // Construct the output data for an unaligned instruction
+  assign rdata_unaligned = valid_q[1] ? {rdata_q[1][15:0], rdata[31:16]} :
+                                        {in_rdata_i[15:0], rdata[31:16]};
+
+  // If entry[1] is valid, an error can come from entry[0] or entry[1], unless the
+  // instruction in entry[0] is compressed (entry[1] is a new instruction)
+  // If entry[1] is not valid, and entry[0] is, an error can come from entry[0] or the incoming
+  // data, unless the instruction in entry[0] is compressed
+  // If entry[0] is not valid, the error must come from the incoming data
+  assign err_unaligned   = valid_q[1] ? ((err_q[1] & ~unaligned_is_compressed) | err_q[0]) :
+                                        ((valid_q[0] & err_q[0]) |
+                                         (in_err_i & (~valid_q[0] | ~unaligned_is_compressed)));
+
+  // Record when an error is caused by the second half of an unaligned 32bit instruction.
+  // Only needs to be correct when unaligned and if err_unaligned is set
+  assign err_plus2       = valid_q[1] ? (err_q[1] & ~err_q[0]) :
+                                        (in_err_i & valid_q[0] & ~err_q[0]);
+
+  // An uncompressed unaligned instruction is only valid if both parts are available
+  assign valid_unaligned = valid_q[1] ? 1'b1 :
+                                        (valid_q[0] & in_valid_i);
+
+  // If there is an error, rdata is unknown, so its encoding bits never mark it as compressed
+`ifdef DII_SIM
+  logic [31:0] instr_rdata_dii;
+  logic [31:0] instr_pc;
+  logic        instr_ack;
+  // NOTE(review): instr_rdata_dii has no driver in this module; presumably forced externally
+  // for DII we directly force out_rdata_o (re-aligned instruction)
+  // to keep the unaligned/aligned_is_compressed signals in sync
+  //   32-bit instruction; instr_rdata_dii[31:0] = instr
+  //   16-bit instruction: instr_rdata_dii[15:0] = compressed instruction
+  //                       instr_rdata_dii[31:0] = don't care
+  // ('&' binds tighter than '|': cheri_force_uc_i only applies here when out_addr_o[1] is set)
+  assign unaligned_is_compressed = out_addr_o[1] & cheri_force_uc_i | ((instr_rdata_dii[1:0] != 2'b11) & ~err);
+  assign aligned_is_compressed   = ~out_addr_o[1] & (instr_rdata_dii[1:0] != 2'b11) & ~err;
+
+  assign instr_ack = out_ready_i & out_valid_o;
+  assign instr_pc  = out_addr_o;
+`else
+  assign unaligned_is_compressed = cheri_force_uc_i | ((rdata[17:16] != 2'b11) & ~err);
+  assign aligned_is_compressed   = (rdata[ 1: 0] != 2'b11) & ~err;
+`endif
+
+  ////////////////////////////////////////
+  // Instruction aligner (if unaligned) //
+  ////////////////////////////////////////
+
+  always_comb begin
+    if (out_addr_o[1]) begin
+      // unaligned case
+
+`ifdef DII_SIM
+     out_rdata_o      = instr_rdata_dii;
+`else
+      out_rdata_o     = rdata_unaligned;
+`endif
+      out_err_o       = err_unaligned;
+      out_err_plus2_o = err_plus2;
+
+      if (unaligned_is_compressed) begin
+        out_valid_o = valid;
+      end else begin
+        out_valid_o = valid_unaligned;
+      end
+    end else begin
+      // aligned case
+`ifdef DII_SIM
+     out_rdata_o      = instr_rdata_dii;
+`else
+      out_rdata_o     = rdata;
+`endif
+      out_err_o       = err;
+      out_err_plus2_o = 1'b0;
+      out_valid_o     = valid;
+    end
+  end
+
+  /////////////////////////
+  // Instruction address //
+  /////////////////////////
+
+  // Update the address on branches and every time an instruction is driven
+  assign instr_addr_en = clear_i | (out_ready_i & out_valid_o);
+
+  // Increment the address by two every time a compressed instruction is popped
+  assign addr_incr_two = instr_addr_q[1] ? unaligned_is_compressed :
+                                           aligned_is_compressed;
+
+  assign instr_addr_next = (instr_addr_q[31:1] +
+                            // Increment address by 4 or 2
+                            {29'd0,~addr_incr_two,addr_incr_two});
+
+  assign instr_addr_d = clear_i ? in_addr_i[31:1] :
+                                  instr_addr_next;
+
+  if (ResetAll) begin : g_instr_addr_ra
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        instr_addr_q <= '0;
+      end else if (instr_addr_en) begin
+        instr_addr_q <= instr_addr_d;
+      end
+    end
+  end else begin : g_instr_addr_nr
+    always_ff @(posedge clk_i) begin
+      if (instr_addr_en) begin
+        instr_addr_q <= instr_addr_d;
+      end
+    end
+  end
+
+  // Output PC of current instruction
+  assign out_addr_o      = {instr_addr_q, 1'b0};
+
+  // The LSB of the address is unused, since all addresses are halfword aligned
+  assign unused_addr_in = in_addr_i[0];
+
+  /////////////////
+  // FIFO status //
+  /////////////////
+
+  // Indicate the fill level of fifo-entries. This is used to determine when a new request can be
+  // made on the bus. The prefetch buffer only needs to know about the upper entries which overlap
+  // with NUM_REQS.
+  assign busy_o = valid_q[DEPTH-1:DEPTH-NUM_REQS];
+
+  /////////////////////
+  // FIFO management //
+  /////////////////////
+  // Consuming an aligned compressed instruction leaves the upper half-word of entry 0 pending,
+  // so the FIFO is only popped for an aligned uncompressed or any unaligned instruction
+  assign pop_fifo = out_ready_i & out_valid_o & (~aligned_is_compressed | out_addr_o[1]);
+
+  for (genvar i = 0; i < (DEPTH - 1); i++) begin : g_fifo_next
+    // Calculate lowest free entry (write pointer)
+    if (i == 0) begin : g_ent0
+      assign lowest_free_entry[i] = ~valid_q[i];
+    end else begin : g_ent_others
+      assign lowest_free_entry[i] = ~valid_q[i] & valid_q[i-1];
+    end
+
+    // An entry is set when an incoming request chooses the lowest available entry
+    assign valid_pushed[i] = (in_valid_i & lowest_free_entry[i]) |
+                             valid_q[i];
+    // Popping the FIFO shifts all entries down
+    assign valid_popped[i] = pop_fifo ? valid_pushed[i+1] : valid_pushed[i];
+    // All entries are wiped out on a clear
+    assign valid_d[i] = valid_popped[i] & ~clear_i;
+
+    // data flops are enabled if there is new data to shift into it, or
+    assign entry_en[i] = (valid_pushed[i+1] & pop_fifo) |
+                         // a new request is incoming and this is the lowest free entry
+                         (in_valid_i & lowest_free_entry[i] & ~pop_fifo);
+
+    // take the next entry or the incoming data
+    assign rdata_d[i]  = valid_q[i+1] ? rdata_q[i+1] : in_rdata_i;
+    assign err_d  [i]  = valid_q[i+1] ? err_q  [i+1] : in_err_i;
+  end
+  // The top entry is similar but with simpler muxing
+  assign lowest_free_entry[DEPTH-1] = ~valid_q[DEPTH-1] & valid_q[DEPTH-2];
+  assign valid_pushed     [DEPTH-1] = valid_q[DEPTH-1] | (in_valid_i & lowest_free_entry[DEPTH-1]);
+  assign valid_popped     [DEPTH-1] = pop_fifo ? 1'b0 : valid_pushed[DEPTH-1];
+  assign valid_d [DEPTH-1]          = valid_popped[DEPTH-1] & ~clear_i;
+  assign entry_en[DEPTH-1]          = in_valid_i & lowest_free_entry[DEPTH-1];
+  assign rdata_d [DEPTH-1]          = in_rdata_i;
+  assign err_d   [DEPTH-1]          = in_err_i;
+
+  ////////////////////
+  // FIFO registers //
+  ////////////////////
+  // Valid bits are always reset; data and error flops are only reset when ResetAll is set
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      valid_q <= '0;
+    end else begin
+      valid_q <= valid_d;
+    end
+  end
+
+  for (genvar i = 0; i < DEPTH; i++) begin : g_fifo_regs
+    if (ResetAll) begin : g_rdata_ra
+      always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (!rst_ni) begin
+          rdata_q[i] <= '0;
+          err_q[i]   <= '0;
+        end else if (entry_en[i]) begin
+          rdata_q[i] <= rdata_d[i];
+          err_q[i]   <= err_d[i];
+        end
+      end
+    end else begin : g_rdata_nr
+      always_ff @(posedge clk_i) begin
+        if (entry_en[i]) begin
+          rdata_q[i] <= rdata_d[i];
+          err_q[i]   <= err_d[i];
+        end
+      end
+    end
+  end
+
+  ////////////////
+  // Assertions //
+  ////////////////
+
+  // Must not push and pop simultaneously when FIFO full.
+  `ASSERT(IbexFetchFifoPushPopFull,
+      (in_valid_i && pop_fifo) |-> (!valid_q[DEPTH-1] || clear_i))
+
+  // Must not push to FIFO when full.
+  `ASSERT(IbexFetchFifoPushFull,
+      (in_valid_i) |-> (!valid_q[DEPTH-1] || clear_i))
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_icache.sv b/hw/ip/cheriot-ibex/rtl/cheriot_icache.sv
new file mode 100644
index 0000000..91ab025
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_icache.sv
@@ -0,0 +1,1155 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Instruction cache
+ *
+ * Provides an instruction cache along with cache management, instruction buffering and prefetching
+ */
+
+`include "prim_assert.sv"
+
+module cheriot_icache import cheriot_pkg::*; #(
+  parameter bit          ICacheECC       = 1'b0,
+  parameter bit          ResetAll        = 1'b0,
+  parameter int unsigned BusSizeECC      = BUS_SIZE,
+  parameter int unsigned TagSizeECC      = IC_TAG_SIZE,
+  parameter int unsigned LineSizeECC     = IC_LINE_SIZE,
+  // Only cache branch targets
+  parameter bit          BranchCache     = 1'b0
+) (
+  // Clock and reset
+  input  logic                           clk_i,
+  input  logic                           rst_ni,
+
+  // Signal that the core would like instructions
+  input  logic                           req_i,
+
+  // Set the cache's address counter
+  input  logic                           branch_i,
+  input  logic                           branch_mispredict_i,
+  input  logic [31:0]                    mispredict_addr_i,
+  input  logic [31:0]                    addr_i,
+
+  // IF stage interface: Pass fetched instructions to the core
+  input  logic                           ready_i,
+  output logic                           valid_o,
+  output logic [31:0]                    rdata_o,
+  output logic [31:0]                    addr_o,
+  output logic                           err_o,
+  output logic                           err_plus2_o,
+
+  // Instruction memory / interconnect interface: Fetch instruction data from memory
+  output logic                           instr_req_o,
+  input  logic                           instr_gnt_i,
+  output logic [31:0]                    instr_addr_o,
+  input  logic [BUS_SIZE-1:0]            instr_rdata_i,
+  input  logic                           instr_err_i,
+  input  logic                           instr_rvalid_i,
+
+  // RAM IO
+  output logic [IC_NUM_WAYS-1:0]         ic_tag_req_o,
+  output logic                           ic_tag_write_o,
+  output logic [IC_INDEX_W-1:0]          ic_tag_addr_o,
+  output logic [TagSizeECC-1:0]          ic_tag_wdata_o,
+  input  logic [TagSizeECC-1:0]          ic_tag_rdata_i [IC_NUM_WAYS],
+  output logic [IC_NUM_WAYS-1:0]         ic_data_req_o,
+  output logic                           ic_data_write_o,
+  output logic [IC_INDEX_W-1:0]          ic_data_addr_o,
+  output logic [LineSizeECC-1:0]         ic_data_wdata_o,
+  input  logic [LineSizeECC-1:0]         ic_data_rdata_i [IC_NUM_WAYS],
+  input  logic                           ic_scr_key_valid_i,
+
+  // Cache status
+  input  logic                           icache_enable_i,
+  input  logic                           icache_inval_i,
+  output logic                           busy_o
+);
+
+  // Number of fill buffers (must be >= 2)
+  localparam int unsigned NUM_FB        = 4;
+  // Request throttling threshold
+  localparam int unsigned FB_THRESHOLD  = NUM_FB - 2;
+
+  // Prefetch signals
+  logic [ADDR_W-1:0]                      lookup_addr_aligned;
+  logic [ADDR_W-1:0]                      prefetch_addr_d, prefetch_addr_q;
+  logic                                   prefetch_addr_en;
+  logic                                   branch_or_mispredict;
+  // Cache pipeline IC0 signals
+  logic                                   lookup_throttle;
+  logic                                   lookup_req_ic0;
+  logic [ADDR_W-1:0]                      lookup_addr_ic0;
+  logic [IC_INDEX_W-1:0]                  lookup_index_ic0;
+  logic                                   fill_req_ic0;
+  logic [IC_INDEX_W-1:0]                  fill_index_ic0;
+  logic [IC_TAG_SIZE-1:0]                 fill_tag_ic0;
+  logic [IC_LINE_SIZE-1:0]                fill_wdata_ic0;
+  logic                                   lookup_grant_ic0;
+  logic                                   lookup_actual_ic0;
+  logic                                   fill_grant_ic0;
+  logic                                   tag_req_ic0;
+  logic [IC_INDEX_W-1:0]                  tag_index_ic0;
+  logic [IC_NUM_WAYS-1:0]                 tag_banks_ic0;
+  logic                                   tag_write_ic0;
+  logic [TagSizeECC-1:0]                  tag_wdata_ic0;
+  logic                                   data_req_ic0;
+  logic [IC_INDEX_W-1:0]                  data_index_ic0;
+  logic [IC_NUM_WAYS-1:0]                 data_banks_ic0;
+  logic                                   data_write_ic0;
+  logic [LineSizeECC-1:0]                 data_wdata_ic0;
+  // Cache pipeline IC1 signals
+  logic [TagSizeECC-1:0]                  tag_rdata_ic1  [IC_NUM_WAYS];
+  logic [LineSizeECC-1:0]                 data_rdata_ic1 [IC_NUM_WAYS];
+  logic [LineSizeECC-1:0]                 hit_data_ecc_ic1;
+  logic [IC_LINE_SIZE-1:0]                hit_data_ic1;
+  logic                                   lookup_valid_ic1;
+  logic [ADDR_W-1:IC_INDEX_HI+1]          lookup_addr_ic1;
+  logic [IC_NUM_WAYS-1:0]                 tag_match_ic1;
+  logic                                   tag_hit_ic1;
+  logic [IC_NUM_WAYS-1:0]                 tag_invalid_ic1;
+  logic [IC_NUM_WAYS-1:0]                 lowest_invalid_way_ic1;
+  logic [IC_NUM_WAYS-1:0]                 round_robin_way_ic1, round_robin_way_q;
+  logic [IC_NUM_WAYS-1:0]                 sel_way_ic1;
+  logic                                   ecc_err_ic1;
+  logic                                   ecc_write_req;
+  logic [IC_NUM_WAYS-1:0]                 ecc_write_ways;
+  logic [IC_INDEX_W-1:0]                  ecc_write_index;
+  // Fill buffer signals
+  logic [$clog2(NUM_FB)-1:0]              fb_fill_level;
+  logic                                   fill_cache_new;
+  logic                                   fill_new_alloc;
+  logic                                   fill_spec_req, fill_spec_done, fill_spec_hold;
+  logic [NUM_FB-1:0][NUM_FB-1:0]          fill_older_d, fill_older_q;
+  logic [NUM_FB-1:0]                      fill_alloc_sel, fill_alloc;
+  logic [NUM_FB-1:0]                      fill_busy_d, fill_busy_q;
+  logic [NUM_FB-1:0]                      fill_done;
+  logic [NUM_FB-1:0]                      fill_in_ic1;
+  logic [NUM_FB-1:0]                      fill_stale_d, fill_stale_q;
+  logic [NUM_FB-1:0]                      fill_cache_d, fill_cache_q;
+  logic [NUM_FB-1:0]                      fill_hit_ic1, fill_hit_d, fill_hit_q;
+  logic [NUM_FB-1:0][IC_LINE_BEATS_W:0]   fill_ext_cnt_d, fill_ext_cnt_q;
+  logic [NUM_FB-1:0]                      fill_ext_hold_d, fill_ext_hold_q;
+  logic [NUM_FB-1:0]                      fill_ext_done_d, fill_ext_done_q;
+  logic [NUM_FB-1:0][IC_LINE_BEATS_W:0]   fill_rvd_cnt_d, fill_rvd_cnt_q;
+  logic [NUM_FB-1:0]                      fill_rvd_done;
+  logic [NUM_FB-1:0]                      fill_ram_done_d, fill_ram_done_q;
+  logic [NUM_FB-1:0]                      fill_out_grant;
+  logic [NUM_FB-1:0][IC_LINE_BEATS_W:0]   fill_out_cnt_d, fill_out_cnt_q;
+  logic [NUM_FB-1:0]                      fill_out_done;
+  logic [NUM_FB-1:0]                      fill_ext_req, fill_rvd_exp, fill_ram_req, fill_out_req;
+  logic [NUM_FB-1:0]                      fill_data_sel, fill_data_reg;
+  logic [NUM_FB-1:0]                      fill_data_hit, fill_data_rvd;
+  logic [NUM_FB-1:0][IC_LINE_BEATS_W-1:0] fill_ext_off, fill_rvd_off;
+  logic [NUM_FB-1:0][IC_LINE_BEATS_W:0]   fill_ext_beat, fill_rvd_beat;
+  logic [NUM_FB-1:0]                      fill_ext_arb, fill_ram_arb, fill_out_arb;
+  logic [NUM_FB-1:0]                      fill_rvd_arb;
+  logic [NUM_FB-1:0]                      fill_entry_en;
+  logic [NUM_FB-1:0]                      fill_addr_en;
+  logic [NUM_FB-1:0]                      fill_way_en;
+  logic [NUM_FB-1:0][IC_LINE_BEATS-1:0]   fill_data_en;
+  logic [NUM_FB-1:0][IC_LINE_BEATS-1:0]   fill_err_d, fill_err_q;
+  logic [ADDR_W-1:0]                      fill_addr_q [NUM_FB];
+  logic [IC_NUM_WAYS-1:0]                 fill_way_q  [NUM_FB];
+  logic [IC_LINE_SIZE-1:0]                fill_data_d [NUM_FB];
+  logic [IC_LINE_SIZE-1:0]                fill_data_q [NUM_FB];
+  logic [ADDR_W-1:BUS_W]                  fill_ext_req_addr;
+  logic [ADDR_W-1:0]                      fill_ram_req_addr;
+  logic [IC_NUM_WAYS-1:0]                 fill_ram_req_way;
+  logic [IC_LINE_SIZE-1:0]                fill_ram_req_data;
+  logic [IC_LINE_SIZE-1:0]                fill_out_data;
+  logic [IC_LINE_BEATS-1:0]               fill_out_err;
+  // External req signals
+  logic                                   instr_req;
+  logic [ADDR_W-1:BUS_W]                  instr_addr;
+  // Data output signals
+  logic                                   skid_complete_instr;
+  logic                                   skid_ready;
+  logic                                   output_compressed;
+  logic                                   skid_valid_d, skid_valid_q, skid_en;
+  logic [15:0]                            skid_data_d, skid_data_q;
+  logic                                   skid_err_q;
+  logic                                   output_valid;
+  logic                                   addr_incr_two;
+  logic                                   output_addr_en;
+  logic [ADDR_W-1:1]                      output_addr_incr;
+  logic [ADDR_W-1:1]                      output_addr_d, output_addr_q;
+  logic [15:0]                            output_data_lo, output_data_hi;
+  logic                                   data_valid, output_ready;
+  logic [IC_LINE_SIZE-1:0]                line_data;
+  logic [IC_LINE_BEATS-1:0]               line_err;
+  logic [31:0]                            line_data_muxed;
+  logic                                   line_err_muxed;
+  logic [31:0]                            output_data;
+  logic                                   output_err;
+  // Invalidations
+  logic                                   start_inval, inval_done;
+  logic                                   inval_lock, inval_req_d, inval_req_q;
+  logic                                   reset_inval_q;
+  logic                                   inval_prog_d, inval_prog_q;
+  logic [IC_INDEX_W-1:0]                  inval_index_d, inval_index_q;
+
+  //////////////////////////
+  // Instruction prefetch //
+  //////////////////////////
+
+  assign branch_or_mispredict = branch_i | branch_mispredict_i;
+
+  assign lookup_addr_aligned = {lookup_addr_ic0[ADDR_W-1:IC_LINE_W], {IC_LINE_W{1'b0}}};
+
+  // The prefetch address increments by one cache line for each granted request.
+  // This address is also updated if there is a branch that is not granted, since the target
+  // address (addr_i) is only valid for one cycle while branch_i is high.
+
+  // The captured branch target address is not forced to be aligned since the offset in the cache
+  // line must also be recorded for later use by the fill buffers.
+  assign prefetch_addr_d     =
+      lookup_grant_ic0 ? (lookup_addr_aligned +
+                          {{ADDR_W-IC_LINE_W-1{1'b0}}, 1'b1, {IC_LINE_W{1'b0}}}) :
+      branch_i         ? addr_i :
+                         mispredict_addr_i;
+
+  assign prefetch_addr_en    = branch_or_mispredict | lookup_grant_ic0;
+
+  if (ResetAll) begin : g_prefetch_addr_ra // ResetAll set: register has an async reset
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        prefetch_addr_q <= '0;
+      end else if (prefetch_addr_en) begin // branch/mispredict or granted lookup
+        prefetch_addr_q <= prefetch_addr_d;
+      end
+    end
+  end else begin : g_prefetch_addr_nr // ResetAll clear: same register with no reset term
+    always_ff @(posedge clk_i) begin
+      if (prefetch_addr_en) begin // holds its value whenever the enable is low
+        prefetch_addr_q <= prefetch_addr_d;
+      end
+    end
+  end
+
+  ////////////////////////
+  // Pipeline stage IC0 //
+  ////////////////////////
+
+  // Cache lookup
+  assign lookup_throttle  = (fb_fill_level > FB_THRESHOLD[$clog2(NUM_FB)-1:0]);
+
+  assign lookup_req_ic0   = req_i & ~&fill_busy_q & (branch_or_mispredict | ~lookup_throttle) &
+                            ~ecc_write_req;
+  assign lookup_addr_ic0  = branch_i            ? addr_i :
+                            branch_mispredict_i ? mispredict_addr_i :
+                                                  prefetch_addr_q;
+  assign lookup_index_ic0 = lookup_addr_ic0[IC_INDEX_HI:IC_LINE_W];
+
+  // Cache write
+  assign fill_req_ic0   = (|fill_ram_req);
+  assign fill_index_ic0 = fill_ram_req_addr[IC_INDEX_HI:IC_LINE_W];
+  assign fill_tag_ic0   = {(~inval_prog_q & ~ecc_write_req),
+                           fill_ram_req_addr[ADDR_W-1:IC_INDEX_HI+1]};
+  assign fill_wdata_ic0 = fill_ram_req_data;
+
+  // Arbitrated signals - lookups have highest priority
+  assign lookup_grant_ic0  = lookup_req_ic0;
+  assign fill_grant_ic0    = fill_req_ic0 & ~lookup_req_ic0 & ~inval_prog_q &
+                             ~ecc_write_req;
+  // Qualified lookup grant to mask ram signals in IC1 if access was not made
+  assign lookup_actual_ic0 = lookup_grant_ic0 & icache_enable_i & ~inval_prog_q &
+                             ~icache_inval_i & ~inval_lock & ~start_inval;
+
+  // Tagram
+  assign tag_req_ic0   = lookup_req_ic0 | fill_req_ic0 | inval_prog_q | ecc_write_req;
+  assign tag_index_ic0 = inval_prog_q   ? inval_index_q :
+                         ecc_write_req  ? ecc_write_index :
+                         fill_grant_ic0 ? fill_index_ic0 :
+                                          lookup_index_ic0;
+  assign tag_banks_ic0 = ecc_write_req  ? ecc_write_ways :
+                         fill_grant_ic0 ? fill_ram_req_way :
+                                          {IC_NUM_WAYS{1'b1}};
+  assign tag_write_ic0 = fill_grant_ic0 | inval_prog_q | ecc_write_req;
+
+  // Dataram
+  assign data_req_ic0   = lookup_req_ic0 | fill_req_ic0;
+  assign data_index_ic0 = tag_index_ic0;
+  assign data_banks_ic0 = tag_banks_ic0;
+  assign data_write_ic0 = tag_write_ic0;
+
+  // Append ECC checkbits to write data if required
+  if (ICacheECC) begin : gen_ecc_wdata
+
+    // Tagram ECC
+    // Reuse the same ecc encoding module for larger cache sizes by padding with zeros
+    logic [21:0]             tag_ecc_input_padded;
+    logic [27:0]             tag_ecc_output_padded;
+    logic [22-IC_TAG_SIZE:0] unused_tag_ecc_output; // sink for encoder bits [21:IC_TAG_SIZE-1]
+
+    assign tag_ecc_input_padded  = {{22-IC_TAG_SIZE{1'b0}},fill_tag_ic0};
+    assign unused_tag_ecc_output = tag_ecc_output_padded[21:IC_TAG_SIZE-1];
+    // NOTE(review): the sink overlaps written bit IC_TAG_SIZE-1, presumably to avoid zero width
+    prim_secded_inv_28_22_enc tag_ecc_enc (
+      .data_i (tag_ecc_input_padded),
+      .data_o (tag_ecc_output_padded)
+    );
+    // Written tag = encoder bits [27:22] followed by the unpadded tag (zero padding is dropped)
+    assign tag_wdata_ic0 = {tag_ecc_output_padded[27:22],tag_ecc_output_padded[IC_TAG_SIZE-1:0]};
+
+    // Dataram ECC
+    for (genvar bank = 0; bank < IC_LINE_BEATS; bank++) begin : gen_ecc_banks // per beat
+      prim_secded_inv_39_32_enc data_ecc_enc (
+        .data_i (fill_wdata_ic0[bank*BUS_SIZE+:BUS_SIZE]),
+        .data_o (data_wdata_ic0[bank*BusSizeECC+:BusSizeECC])
+      );
+    end
+
+  end else begin : gen_noecc_wdata // ECC off: tag and line data are written unmodified
+    assign tag_wdata_ic0  = fill_tag_ic0;
+    assign data_wdata_ic0 = fill_wdata_ic0;
+  end
+
+  ////////////////
+  // IC0 -> IC1 //
+  ////////////////
+
+  // Tag RAMs outputs
+  assign ic_tag_req_o    = {IC_NUM_WAYS{tag_req_ic0}} & tag_banks_ic0;
+  assign ic_tag_write_o  = tag_write_ic0;
+  assign ic_tag_addr_o   = tag_index_ic0;
+  assign ic_tag_wdata_o  = tag_wdata_ic0;
+
+  // Tag RAMs inputs
+  assign tag_rdata_ic1   = ic_tag_rdata_i;
+
+  // Data RAMs outputs
+  assign ic_data_req_o   = {IC_NUM_WAYS{data_req_ic0}} & data_banks_ic0;
+  assign ic_data_write_o = data_write_ic0;
+  assign ic_data_addr_o  = data_index_ic0;
+  assign ic_data_wdata_o = data_wdata_ic0;
+
+  // Data RAMs inputs
+  assign data_rdata_ic1  = ic_data_rdata_i;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin // IC0 -> IC1 valid flag, always reset
+    if (!rst_ni) begin
+      lookup_valid_ic1 <= 1'b0;
+    end else begin
+      lookup_valid_ic1 <= lookup_actual_ic0; // qualified grant, not the raw lookup_grant_ic0
+    end
+  end
+
+  if (ResetAll) begin : g_lookup_addr_ra // ResetAll set: registers have an async reset
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        lookup_addr_ic1 <= '0;
+        fill_in_ic1     <= '0;
+      end else if (lookup_grant_ic0) begin // enabled by the raw grant, not lookup_actual_ic0
+        lookup_addr_ic1 <= lookup_addr_ic0[ADDR_W-1:IC_INDEX_HI+1]; // upper address bits only
+        fill_in_ic1     <= fill_alloc_sel; // fill buffer selected for this lookup
+      end
+    end
+  end else begin : g_lookup_addr_nr // ResetAll clear: same registers with no reset term
+    always_ff @(posedge clk_i) begin
+      if (lookup_grant_ic0) begin // registers hold their value without a granted lookup
+        lookup_addr_ic1 <= lookup_addr_ic0[ADDR_W-1:IC_INDEX_HI+1];
+        fill_in_ic1     <= fill_alloc_sel;
+      end
+    end
+  end
+
+  ////////////////////////
+  // Pipeline stage IC1 //
+  ////////////////////////
+
+  // Tag matching
+  for (genvar way = 0; way < IC_NUM_WAYS; way++) begin : gen_tag_match // per-way compare
+    assign tag_match_ic1[way]   = (tag_rdata_ic1[way][IC_TAG_SIZE-1:0] ==
+                                   {1'b1,lookup_addr_ic1[ADDR_W-1:IC_INDEX_HI+1]});
+    assign tag_invalid_ic1[way] = ~tag_rdata_ic1[way][IC_TAG_SIZE-1]; // tag MSB = valid bit
+  end
+
+  assign tag_hit_ic1 = |tag_match_ic1;
+
+  // Hit data mux
+  always_comb begin
+    hit_data_ecc_ic1 = 'b0; // default: all zeros when no way matches
+    for (int way = 0; way < IC_NUM_WAYS; way++) begin
+      if (tag_match_ic1[way]) begin
+        hit_data_ecc_ic1 |= data_rdata_ic1[way]; // OR-mux: matching ways are OR-ed together
+      end
+    end
+  end
+
+  // Way selection for allocations to the cache (onehot signals)
+  // 1 first invalid way
+  // 2 global round-robin (pseudorandom) way
+  assign lowest_invalid_way_ic1[0] = tag_invalid_ic1[0];
+  assign round_robin_way_ic1[0]    = round_robin_way_q[IC_NUM_WAYS-1]; // rotate-left wrap bit
+  for (genvar way = 1; way < IC_NUM_WAYS; way++) begin : gen_lowest_way // priority + rotate
+    assign lowest_invalid_way_ic1[way] = tag_invalid_ic1[way] & ~|tag_invalid_ic1[way-1:0];
+    assign round_robin_way_ic1[way]    = round_robin_way_q[way-1];
+  end
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      round_robin_way_q <= {{IC_NUM_WAYS-1{1'b0}}, 1'b1}; // one-hot, starts at way 0
+    end else if (lookup_valid_ic1) begin // advance once per valid IC1 lookup
+      round_robin_way_q <= round_robin_way_ic1;
+    end
+  end
+
+  assign sel_way_ic1 = |tag_invalid_ic1 ? lowest_invalid_way_ic1 :
+                                          round_robin_way_q; // else current round-robin way
+
+  // ECC checking logic
+  if (ICacheECC) begin : gen_data_ecc_checking
+    logic [IC_NUM_WAYS-1:0]     tag_err_ic1;
+    logic [IC_LINE_BEATS*2-1:0] data_err_ic1; // two error flags per beat
+    logic                       ecc_correction_write_d, ecc_correction_write_q;
+    logic [IC_NUM_WAYS-1:0]     ecc_correction_ways_d, ecc_correction_ways_q;
+    logic [IC_INDEX_W-1:0]      lookup_index_ic1, ecc_correction_index_q;
+
+    // Tag ECC checking
+    for (genvar way = 0; way < IC_NUM_WAYS; way++) begin : gen_tag_ecc
+      logic [1:0]  tag_err_bank_ic1;
+      logic [27:0] tag_rdata_padded_ic1;
+
+      // Expand the tag rdata with extra padding if the tag size is less than the maximum
+      assign tag_rdata_padded_ic1 = {tag_rdata_ic1[way][TagSizeECC-1-:6],
+                                     {22-IC_TAG_SIZE{1'b0}},
+                                     tag_rdata_ic1[way][IC_TAG_SIZE-1:0]};
+
+      prim_secded_inv_28_22_dec data_ecc_dec ( // NOTE: despite its name this decodes the tag
+        .data_i     (tag_rdata_padded_ic1),
+        .data_o     (),
+        .syndrome_o (),
+        .err_o      (tag_err_bank_ic1)
+      );
+      assign tag_err_ic1[way] = |tag_err_bank_ic1; // either error flag marks the way as bad
+    end
+
+    // Data ECC checking
+    // Note - could generate for all ways and mux after
+    for (genvar bank = 0; bank < IC_LINE_BEATS; bank++) begin : gen_ecc_banks
+      prim_secded_inv_39_32_dec data_ecc_dec (
+        .data_i     (hit_data_ecc_ic1[bank*BusSizeECC+:BusSizeECC]),
+        .data_o     (),
+        .syndrome_o (),
+        .err_o      (data_err_ic1[bank*2+:2])
+      );
+
+      assign hit_data_ic1[bank*BUS_SIZE+:BUS_SIZE] =
+          hit_data_ecc_ic1[bank*BusSizeECC+:BUS_SIZE]; // low BUS_SIZE bits of each bank
+
+    end
+
+    assign ecc_err_ic1 = lookup_valid_ic1 & ((|data_err_ic1) | (|tag_err_ic1));
+
+    // Error correction: the selected ways are rewritten via ecc_write_req on the next cycle
+    // All ways will be invalidated on a tag error to prevent X-propagation from data_err_ic1 on
+    // spurious hits. Also prevents the same line being allocated twice when there was a true
+    // hit and a spurious hit.
+    assign ecc_correction_ways_d  = {IC_NUM_WAYS{|tag_err_ic1}} |
+                                    (tag_match_ic1 & {IC_NUM_WAYS{|data_err_ic1}});
+    assign ecc_correction_write_d = ecc_err_ic1; // registered below into ecc_write_req
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        ecc_correction_write_q <= 1'b0;
+      end else begin
+        ecc_correction_write_q <= ecc_correction_write_d;
+      end
+    end
+
+    // The index is required in IC1 only when ECC is configured so is registered here
+    if (ResetAll) begin : g_lookup_ind_ra
+      always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (!rst_ni) begin
+          lookup_index_ic1 <= '0;
+        end else if (lookup_grant_ic0) begin
+          lookup_index_ic1 <= lookup_addr_ic0[IC_INDEX_HI-:IC_INDEX_W];
+        end
+      end
+    end else begin : g_lookup_ind_nr // same register with no reset term
+      always_ff @(posedge clk_i) begin
+        if (lookup_grant_ic0) begin
+          lookup_index_ic1 <= lookup_addr_ic0[IC_INDEX_HI-:IC_INDEX_W];
+        end
+      end
+    end
+
+    // Store the ways with errors to be invalidated
+    if (ResetAll) begin : g_ecc_correction_ra
+      always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (!rst_ni) begin
+          ecc_correction_ways_q  <= '0;
+          ecc_correction_index_q <= '0;
+        end else if (ecc_err_ic1) begin // captured only in the cycle an error is flagged
+          ecc_correction_ways_q  <= ecc_correction_ways_d;
+          ecc_correction_index_q <= lookup_index_ic1;
+        end
+      end
+    end else begin : g_ecc_correction_nr // same registers with no reset term
+      always_ff @(posedge clk_i) begin
+        if (ecc_err_ic1) begin
+          ecc_correction_ways_q  <= ecc_correction_ways_d;
+          ecc_correction_index_q <= lookup_index_ic1;
+        end
+      end
+    end
+
+    assign ecc_write_req   = ecc_correction_write_q;
+    assign ecc_write_ways  = ecc_correction_ways_q;
+    assign ecc_write_index = ecc_correction_index_q;
+
+  end else begin : gen_no_data_ecc // ECC off: no errors, no correction writes
+    assign ecc_err_ic1     = 1'b0;
+    assign ecc_write_req   = 1'b0;
+    assign ecc_write_ways  = '0;
+    assign ecc_write_index = '0;
+    assign hit_data_ic1    = hit_data_ecc_ic1; // muxed RAM data is used directly
+  end
+
+  ///////////////////////////////
+  // Cache allocation decision //
+  ///////////////////////////////
+
+  if (BranchCache) begin : gen_caching_logic
+
+    // Cache branch target + a number of subsequent lines
+    localparam int unsigned CACHE_AHEAD = 2;
+    localparam int unsigned CACHE_CNT_W = (CACHE_AHEAD == 1) ? 1 : $clog2(CACHE_AHEAD) + 1;
+    logic                   cache_cnt_dec;
+    logic [CACHE_CNT_W-1:0] cache_cnt_d, cache_cnt_q;
+    // NOTE(review): only branch_i reloads the count; branch_mispredict_i does not - confirm
+    assign cache_cnt_dec = lookup_grant_ic0 & (|cache_cnt_q); // one step per granted lookup
+    assign cache_cnt_d   = branch_i ? CACHE_AHEAD[CACHE_CNT_W-1:0] :
+                                      (cache_cnt_q - {{CACHE_CNT_W-1{1'b0}},cache_cnt_dec});
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        cache_cnt_q <= '0;
+      end else begin
+        cache_cnt_q <= cache_cnt_d;
+      end
+    end
+    // NOTE(review): uses ~icache_inval_i & ~inval_lock where gen_cache_all uses ~start_inval
+    assign fill_cache_new = (branch_i | (|cache_cnt_q)) & icache_enable_i &
+                            ~icache_inval_i & ~inval_lock & ~inval_prog_q;
+
+  end else begin : gen_cache_all
+
+    // Cache all missing fetches
+    assign fill_cache_new = icache_enable_i & ~start_inval & ~inval_prog_q;
+  end
+
+  //////////////////////////
+  // Fill buffer tracking //
+  //////////////////////////
+
+  always_comb begin // count the fill buffers that are busy and not stale
+    fb_fill_level = '0;
+    for (int i = 0; i < NUM_FB; i++) begin
+      if (fill_busy_q[i] & ~fill_stale_q[i]) begin
+        fb_fill_level += {{$clog2(NUM_FB) - 1{1'b0}}, 1'b1}; // +1, sized to the counter
+      end // NOTE(review): counter is $clog2(NUM_FB) bits, so a count of NUM_FB (4) wraps to 0
+    end   // lookup_req_ic0 is separately gated by ~&fill_busy_q for the all-busy case
+  end
+
+  // Allocate a new buffer for every granted lookup
+  assign fill_new_alloc = lookup_grant_ic0;
+  // Track whether a speculative external request was made from IC0, and whether it was granted
+  // Speculative requests are only made for branches, or if the cache is disabled
+  assign fill_spec_req  = (~icache_enable_i | branch_or_mispredict) & ~|fill_ext_req;
+  assign fill_spec_done = fill_spec_req & instr_gnt_i;
+  assign fill_spec_hold = fill_spec_req & ~instr_gnt_i;
+
+  for (genvar fb = 0; fb < NUM_FB; fb++) begin : gen_fbs
+
+    /////////////////////////////
+    // Fill buffer allocations //
+    /////////////////////////////
+
+    // Allocate the lowest available buffer
+    if (fb == 0) begin : gen_fb_zero
+      assign fill_alloc_sel[fb] = ~fill_busy_q[fb];
+    end else begin : gen_fb_rest
+      assign fill_alloc_sel[fb] = ~fill_busy_q[fb] & (&fill_busy_q[fb-1:0]);
+    end
+
+    assign fill_alloc[fb]      = fill_alloc_sel[fb] & fill_new_alloc;
+    assign fill_busy_d[fb]     = fill_alloc[fb] | (fill_busy_q[fb] & ~fill_done[fb]);
+
+    // Track which other fill buffers are older than this one (for age-based arbitration)
+    // TODO sparsify
+    assign fill_older_d[fb]    = (fill_alloc[fb] ? fill_busy_q : fill_older_q[fb]) & ~fill_done;
+
+    // A fill buffer can release once all its actions are completed
+                                 // all data written to the cache (unless hit or error)
+    assign fill_done[fb]       = (fill_ram_done_q[fb] | fill_hit_q[fb] | ~fill_cache_q[fb] |
+                                  (|fill_err_q[fb])) &
+                                 // all data output unless stale due to intervening branch
+                                 (fill_out_done[fb] | fill_stale_q[fb] | branch_or_mispredict) &
+                                 // all external requests completed
+                                 fill_rvd_done[fb];
+
+    /////////////////////////////////
+    // Fill buffer status tracking //
+    /////////////////////////////////
+
+    // Track staleness (requests become stale when a branch intervenes)
+    assign fill_stale_d[fb]    = fill_busy_q[fb] & (branch_or_mispredict | fill_stale_q[fb]);
+    // Track whether or not this request should allocate to the cache
+    // Any invalidation or disabling of the cache while the buffer is busy will stop allocation
+    assign fill_cache_d[fb]    = (fill_alloc[fb] & fill_cache_new) |
+                                 (fill_cache_q[fb] & fill_busy_q[fb] &
+                                  icache_enable_i & ~icache_inval_i & ~inval_lock);
+    // Record whether the request hit in the cache
+    assign fill_hit_ic1[fb]    = lookup_valid_ic1 & fill_in_ic1[fb] & tag_hit_ic1 & ~ecc_err_ic1;
+    assign fill_hit_d[fb]      = fill_hit_ic1[fb] | (fill_hit_q[fb] & fill_busy_q[fb]);
+
+    ///////////////////////////////////////////
+    // Fill buffer external request tracking //
+    ///////////////////////////////////////////
+
+    // Make an external request
+    assign fill_ext_req[fb]    = fill_busy_q[fb] & ~fill_ext_done_d[fb];
+
+    // Count the number of completed external requests (each line requires IC_LINE_BEATS requests)
+    assign fill_ext_cnt_d[fb]  = fill_alloc[fb] ?
+                                   {{IC_LINE_BEATS_W{1'b0}},fill_spec_done} :
+                                   (fill_ext_cnt_q[fb] + {{IC_LINE_BEATS_W{1'b0}},
+                                                          fill_ext_arb[fb] & instr_gnt_i});
+    // External request must be held until granted
+    assign fill_ext_hold_d[fb] = (fill_alloc[fb] & fill_spec_hold) |
+                                 (fill_ext_arb[fb] & ~instr_gnt_i);
+    // External requests are completed when the counter is filled or when the request is cancelled
+    assign fill_ext_done_d[fb] = (fill_ext_cnt_q[fb][IC_LINE_BEATS_W] |
+                                  // external requests are considered complete if the request hit
+                                  fill_hit_ic1[fb] | fill_hit_q[fb] |
+                                  // cancel if the line won't be cached and it is stale
+                                  (~fill_cache_q[fb] & (branch_or_mispredict | fill_stale_q[fb] |
+                                   // or we're already at the end of the line
+                                                        fill_ext_beat[fb][IC_LINE_BEATS_W]))) &
+                                 // can't cancel while we are waiting for a grant on the bus
+                                 ~fill_ext_hold_q[fb] & fill_busy_q[fb];
+    // Track whether this fill buffer expects to receive beats of data
+    assign fill_rvd_exp[fb]    = fill_busy_q[fb] & ~fill_rvd_done[fb];
+    // Count the number of rvalid beats received
+    assign fill_rvd_cnt_d[fb]  = fill_alloc[fb] ? '0 :
+                                                  (fill_rvd_cnt_q[fb] +
+                                                   {{IC_LINE_BEATS_W{1'b0}},fill_rvd_arb[fb]});
+    // External data is complete when all issued external requests have received their data
+    assign fill_rvd_done[fb]   = (fill_ext_done_q[fb] & ~fill_ext_hold_q[fb]) &
+                                 (fill_rvd_cnt_q[fb] == fill_ext_cnt_q[fb]);
+
+    //////////////////////////////////////
+    // Fill buffer data output tracking //
+    //////////////////////////////////////
+
+    // Send data to the IF stage for requests that are not stale, have not completed their
+    // data output, and have data available to send.
+    // Data is available if:
+    // - The request hit in the cache
+    // - Buffered data is available (fill_rvd_cnt_q is ahead of fill_out_cnt_q)
+    // - Data is available from the bus this cycle (fill_rvd_arb)
+    assign fill_out_req[fb]    = fill_busy_q[fb] & ~fill_stale_q[fb] & ~fill_out_done[fb] &
+                                 (fill_hit_ic1[fb] | fill_hit_q[fb] |
+                                  (fill_rvd_beat[fb] > fill_out_cnt_q[fb]) | fill_rvd_arb[fb]);
+
+    // Calculate when a beat of data is output. Any ECC error squashes the output that cycle.
+    assign fill_out_grant[fb]  = fill_out_arb[fb] & output_ready;
+
+    // Count the beats of data output to the IF stage
+    assign fill_out_cnt_d[fb]  = fill_alloc[fb] ? {1'b0,lookup_addr_ic0[IC_LINE_W-1:BUS_W]} :
+                                                  (fill_out_cnt_q[fb] +
+                                                   {{IC_LINE_BEATS_W{1'b0}},fill_out_grant[fb]});
+    // Data output complete when the counter fills
+    assign fill_out_done[fb]   = fill_out_cnt_q[fb][IC_LINE_BEATS_W];
+
+    //////////////////////////////////////
+    // Fill buffer ram request tracking //
+    //////////////////////////////////////
+
+                                 // make a fill request once all data beats received
+    assign fill_ram_req[fb]    = fill_busy_q[fb] & fill_rvd_cnt_q[fb][IC_LINE_BEATS_W] &
+                                 // unless the request hit, was non-allocating or got an error
+                                 ~fill_hit_q[fb] & fill_cache_q[fb] & ~|fill_err_q[fb] &
+                                 // or the request was already completed
+                                 ~fill_ram_done_q[fb];
+
+    // Record when a cache allocation request has been completed
+    assign fill_ram_done_d[fb] = fill_ram_arb[fb] | (fill_ram_done_q[fb] & fill_busy_q[fb]);
+
+    //////////////////////////////
+    // Fill buffer line offsets //
+    //////////////////////////////
+
+    // When we branch into the middle of a line, the output count will not start from zero. This
+    // beat count is used to know which incoming rdata beats are relevant.
+    assign fill_ext_beat[fb]   = {1'b0,fill_addr_q[fb][IC_LINE_W-1:BUS_W]} +
+                                 fill_ext_cnt_q[fb][IC_LINE_BEATS_W:0];
+    assign fill_ext_off[fb]    = fill_ext_beat[fb][IC_LINE_BEATS_W-1:0];
+    assign fill_rvd_beat[fb]   = {1'b0,fill_addr_q[fb][IC_LINE_W-1:BUS_W]} +
+                                 fill_rvd_cnt_q[fb][IC_LINE_BEATS_W:0];
+    assign fill_rvd_off[fb]    = fill_rvd_beat[fb][IC_LINE_BEATS_W-1:0];
+
+    /////////////////////////////
+    // Fill buffer arbitration //
+    /////////////////////////////
+
+    // Age-based arbitration - all these signals are one-hot
+    assign fill_ext_arb[fb]    = fill_ext_req[fb] & ~|(fill_ext_req & fill_older_q[fb]);
+    assign fill_ram_arb[fb]    = fill_ram_req[fb] & fill_grant_ic0 &
+                                 ~|(fill_ram_req & fill_older_q[fb]);
+    // Calculate which fill buffer is the oldest one which still needs to output data to IF
+    assign fill_data_sel[fb]   = ~|(fill_busy_q & ~fill_out_done & ~fill_stale_q &
+                                    fill_older_q[fb]);
+    // Arbitrate the request which has data available to send, and is the oldest outstanding
+    assign fill_out_arb[fb]    = fill_out_req[fb] & fill_data_sel[fb];
+    // Assign incoming rvalid data to the oldest fill buffer expecting it
+    assign fill_rvd_arb[fb]    = instr_rvalid_i & fill_rvd_exp[fb] &
+                                 ~|(fill_rvd_exp & fill_older_q[fb]);
+
+    /////////////////////////////
+    // Fill buffer data muxing //
+    /////////////////////////////
+
+    // Output data muxing controls
+    // 1. Select data from the fill buffer data register
+    assign fill_data_reg[fb]   = fill_busy_q[fb] & ~fill_stale_q[fb] &
+                                 ~fill_out_done[fb] & fill_data_sel[fb] &
+    //                           The incoming data is already ahead of the output count
+                                 ((fill_rvd_beat[fb] > fill_out_cnt_q[fb]) | fill_hit_q[fb] |
+                                  (|fill_err_q[fb]));
+    // 2. Select IC1 hit data
+    assign fill_data_hit[fb]   = fill_busy_q[fb] & fill_hit_ic1[fb] & fill_data_sel[fb];
+    // 3. Select incoming instr_rdata_i
+    assign fill_data_rvd[fb]   = fill_busy_q[fb] & fill_rvd_arb[fb] & ~fill_hit_q[fb] &
+                                 ~fill_hit_ic1[fb] & ~fill_stale_q[fb] & ~fill_out_done[fb] &
+    //                           The incoming data lines up with the output count
+                                 (fill_rvd_beat[fb] == fill_out_cnt_q[fb]) & fill_data_sel[fb];
+
+
+    ///////////////////////////
+    // Fill buffer registers //
+    ///////////////////////////
+
+    // Fill buffer general enable
+    assign fill_entry_en[fb]   = fill_alloc[fb] | fill_busy_q[fb];
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        fill_busy_q[fb]     <= 1'b0;
+        fill_older_q[fb]    <= '0;
+        fill_stale_q[fb]    <= 1'b0;
+        fill_cache_q[fb]    <= 1'b0;
+        fill_hit_q[fb]      <= 1'b0;
+        fill_ext_cnt_q[fb]  <= '0;
+        fill_ext_hold_q[fb] <= 1'b0;
+        fill_ext_done_q[fb] <= 1'b0;
+        fill_rvd_cnt_q[fb]  <= '0;
+        fill_ram_done_q[fb] <= 1'b0;
+        fill_out_cnt_q[fb]  <= '0;
+      end else if (fill_entry_en[fb]) begin
+        fill_busy_q[fb]     <= fill_busy_d[fb];
+        fill_older_q[fb]    <= fill_older_d[fb];
+        fill_stale_q[fb]    <= fill_stale_d[fb];
+        fill_cache_q[fb]    <= fill_cache_d[fb];
+        fill_hit_q[fb]      <= fill_hit_d[fb];
+        fill_ext_cnt_q[fb]  <= fill_ext_cnt_d[fb];
+        fill_ext_hold_q[fb] <= fill_ext_hold_d[fb];
+        fill_ext_done_q[fb] <= fill_ext_done_d[fb];
+        fill_rvd_cnt_q[fb]  <= fill_rvd_cnt_d[fb];
+        fill_ram_done_q[fb] <= fill_ram_done_d[fb];
+        fill_out_cnt_q[fb]  <= fill_out_cnt_d[fb];
+      end
+    end
+
+    ////////////////////////////////////////
+    // Fill buffer address / data storage //
+    ////////////////////////////////////////
+
+    assign fill_addr_en[fb]    = fill_alloc[fb];
+    assign fill_way_en[fb]     = (lookup_valid_ic1 & fill_in_ic1[fb]);
+
+    if (ResetAll) begin : g_fill_addr_ra
+      always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (!rst_ni) begin
+          fill_addr_q[fb] <= '0;
+        end else if (fill_addr_en[fb]) begin
+          fill_addr_q[fb] <= lookup_addr_ic0;
+        end
+      end
+    end else begin : g_fill_addr_nr
+      always_ff @(posedge clk_i) begin
+        if (fill_addr_en[fb]) begin
+          fill_addr_q[fb] <= lookup_addr_ic0;
+        end
+      end
+    end
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        fill_way_q[fb]  <= '0;
+      end else if (fill_way_en[fb]) begin
+        fill_way_q[fb]  <= sel_way_ic1;
+      end
+    end
+
+    // Data either comes from the cache or the bus. If there was an ECC error, we must take
+    // the incoming bus data since the cache hit data is corrupted.
+    assign fill_data_d[fb] = fill_hit_ic1[fb] ? hit_data_ic1 :
+                                                {IC_LINE_BEATS{instr_rdata_i}};
+
+    for (genvar b = 0; b < IC_LINE_BEATS; b++) begin : gen_data_buf
+      // Error tracking (per beat)
+      assign fill_err_d[fb][b]   = (fill_rvd_arb[fb] & instr_err_i &
+                                    (fill_rvd_off[fb] == b[IC_LINE_BEATS_W-1:0])) |
+      //                           Hold the error once recorded
+                                   (fill_busy_q[fb] & fill_err_q[fb][b]);
+
+      always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (!rst_ni) begin
+          fill_err_q[fb][b] <= '0;
+        end else if (fill_entry_en[fb]) begin
+          fill_err_q[fb][b] <= fill_err_d[fb][b];
+        end
+      end
+
+      // Enable the relevant part of the data register (or all for cache hits)
+      // Ignore incoming rvalid data when we already have cache hit data
+      assign fill_data_en[fb][b] = fill_hit_ic1[fb] |
+                                   (fill_rvd_arb[fb] & ~fill_hit_q[fb] &
+                                    (fill_rvd_off[fb] == b[IC_LINE_BEATS_W-1:0]));
+
+      if (ResetAll) begin : g_fill_data_ra
+        always_ff @(posedge clk_i or negedge rst_ni) begin
+          if (!rst_ni) begin
+            fill_data_q[fb][b*BUS_SIZE+:BUS_SIZE] <= '0;
+          end else if (fill_data_en[fb][b]) begin
+            fill_data_q[fb][b*BUS_SIZE+:BUS_SIZE] <= fill_data_d[fb][b*BUS_SIZE+:BUS_SIZE];
+          end
+        end
+      end else begin : g_fill_data_nr
+        always_ff @(posedge clk_i) begin
+          if (fill_data_en[fb][b]) begin
+            fill_data_q[fb][b*BUS_SIZE+:BUS_SIZE] <= fill_data_d[fb][b*BUS_SIZE+:BUS_SIZE];
+          end
+        end
+      end
+
+    end
+  end
+
+  ////////////////////////////////
+  // Fill buffer one-hot muxing //
+  ////////////////////////////////
+
+  // External req info
+  always_comb begin
+    fill_ext_req_addr = '0;
+    for (int i = 0; i < NUM_FB; i++) begin
+      if (fill_ext_arb[i]) begin
+        fill_ext_req_addr |= {fill_addr_q[i][ADDR_W-1:IC_LINE_W], fill_ext_off[i]};
+      end
+    end
+  end
+
+  // Cache req info
+  always_comb begin
+    fill_ram_req_addr = '0;
+    fill_ram_req_way  = '0;
+    fill_ram_req_data = '0;
+    for (int i = 0; i < NUM_FB; i++) begin
+      if (fill_ram_arb[i]) begin
+        fill_ram_req_addr |= fill_addr_q[i];
+        fill_ram_req_way  |= fill_way_q[i];
+        fill_ram_req_data |= fill_data_q[i];
+      end
+    end
+  end
+
+  // IF stage output data
+  always_comb begin
+    fill_out_data = '0;
+    fill_out_err  = '0;
+    for (int i = 0; i < NUM_FB; i++) begin
+      if (fill_data_reg[i]) begin
+        fill_out_data |= fill_data_q[i];
+        // Ignore any speculative errors accumulated on cache hits
+        fill_out_err  |= (fill_err_q[i] & ~{IC_LINE_BEATS{fill_hit_q[i]}});
+      end
+    end
+  end
+
+  ///////////////////////
+  // External requests //
+  ///////////////////////
+
+  assign instr_req  = ((~icache_enable_i | branch_or_mispredict) & lookup_grant_ic0) |
+                      (|fill_ext_req);
+
+  assign instr_addr = |fill_ext_req ? fill_ext_req_addr :
+                                      lookup_addr_ic0[ADDR_W-1:BUS_W];
+
+  assign instr_req_o  = instr_req;
+  assign instr_addr_o = {instr_addr[ADDR_W-1:BUS_W],{BUS_W{1'b0}}};
+
+  ////////////////////////
+  // Output data muxing //
+  ////////////////////////
+
+  // Mux between line-width data sources
+  assign line_data = |fill_data_hit ? hit_data_ic1 : fill_out_data;
+  assign line_err  = |fill_data_hit ? {IC_LINE_BEATS{1'b0}} : fill_out_err;
+
+  // Mux the relevant 32-bit beat of line data (and its error bit), based on the output address
+  always_comb begin
+    line_data_muxed = '0;
+    line_err_muxed  = 1'b0;
+    for (int unsigned i = 0; i < IC_LINE_BEATS; i++) begin
+      // When data has been skidded, the output address is behind by one beat
+      if ((output_addr_q[IC_LINE_W-1:BUS_W] + {{IC_LINE_BEATS_W-1{1'b0}},skid_valid_q}) ==
+          i[IC_LINE_BEATS_W-1:0]) begin
+        line_data_muxed |= line_data[i*32+:32];
+        line_err_muxed  |= line_err[i];
+      end
+    end
+  end
+
+  // Mux between incoming rdata and the muxed line data
+  assign output_data = |fill_data_rvd ? instr_rdata_i : line_data_muxed;
+  assign output_err  = |fill_data_rvd ? instr_err_i   : line_err_muxed;
+
+  // Output data is valid (from any of the three possible sources). Note that fill_out_arb
+  // must be used here rather than fill_out_req because data can become valid out of order
+  // (e.g. cache hit data can become available ahead of an older outstanding miss).
+  assign data_valid = |fill_out_arb;
+
+  // Skid buffer data
+  assign skid_data_d = output_data[31:16];
+
+  assign skid_en     = data_valid & (ready_i | skid_ready);
+
+  if (ResetAll) begin : g_skid_data_ra
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        skid_data_q <= '0;
+        skid_err_q  <= '0;
+      end else if (skid_en) begin
+        skid_data_q <= skid_data_d;
+        skid_err_q  <= output_err;
+      end
+    end
+  end else begin : g_skid_data_nr
+    always_ff @(posedge clk_i) begin
+      if (skid_en) begin
+        skid_data_q <= skid_data_d;
+        skid_err_q  <= output_err;
+      end
+    end
+  end
+
+  // The data in the skid buffer is ready if it's a complete compressed instruction or if there's
+  // an error (no need to wait for the second half)
+  assign skid_complete_instr = skid_valid_q & ((skid_data_q[1:0] != 2'b11) | skid_err_q);
+
+  // Data can be loaded into the skid buffer for an unaligned uncompressed instruction
+  assign skid_ready = output_addr_q[1] & ~skid_valid_q & (~output_compressed | output_err);
+
+  assign output_ready = (ready_i | skid_ready) & ~skid_complete_instr;
+
+  assign output_compressed = (rdata_o[1:0] != 2'b11);
+
+  assign skid_valid_d =
+      // Branches invalidate the skid buffer
+      branch_or_mispredict ? 1'b0 :
+      // Once valid, the skid buffer stays valid until a compressed instruction realigns the stream
+      (skid_valid_q ? ~(ready_i & ((skid_data_q[1:0] != 2'b11) | skid_err_q)) :
+      // The skid buffer becomes valid when:
+                        // - we branch to an unaligned uncompressed instruction
+                      (data_valid &
+                       (((output_addr_q[1] & (~output_compressed | output_err)) |
+                        // - a compressed instruction misaligns the stream
+                        (~output_addr_q[1] & output_compressed & ~output_err & ready_i)))));
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      skid_valid_q <= 1'b0;
+    end else begin
+      skid_valid_q <= skid_valid_d;
+    end
+  end
+
+  // Signal that valid data is available to the IF stage.
+  // Note that if the first half of an unaligned instruction reports an error, we do not need
+  // to wait for the second half.
+                        // Skid buffer alone holds a complete instruction (or an error)
+  assign output_valid = skid_complete_instr |
+                        // Output data available, and either: stream aligned, or skid data available,
+                        (data_valid & (~output_addr_q[1] | skid_valid_q |
+                                       // or this is an error or an unaligned compressed instruction
+                                       output_err | (output_data[17:16] != 2'b11)));
+
+  // Update the address on branches and every time an instruction is driven
+  assign output_addr_en = branch_or_mispredict | (ready_i & valid_o);
+
+  // Increment the address by two every time a compressed instruction is popped
+  assign addr_incr_two = output_compressed & ~err_o;
+
+  // Next IF stage PC
+  assign output_addr_incr = (output_addr_q[31:1] +
+                             // Increment address by 4 or 2
+                             {29'd0, ~addr_incr_two, addr_incr_two});
+
+  // Redirect the address on branches or mispredicts
+  assign output_addr_d = branch_i            ? addr_i[31:1] :
+                         branch_mispredict_i ? mispredict_addr_i[31:1] :
+                                               output_addr_incr;
+
+  if (ResetAll) begin : g_output_addr_ra
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        output_addr_q <= '0;
+      end else if (output_addr_en) begin
+        output_addr_q <= output_addr_d;
+      end
+    end
+  end else begin : g_output_addr_nr
+    always_ff @(posedge clk_i) begin
+      if (output_addr_en) begin
+        output_addr_q <= output_addr_d;
+      end
+    end
+  end
+
+  // Mux the data from BUS_SIZE to halfword
+  // This muxing realigns data when instruction words are split across BUS_W e.g.
+  // word 1 |----|*h1*|
+  // word 0 |*h0*|----| --> |*h1*|*h0*|
+  //        31   15   0     31   15   0
+  always_comb begin
+    output_data_lo = '0;
+    for (int unsigned i = 0; i < IC_OUTPUT_BEATS; i++) begin
+      if (output_addr_q[BUS_W-1:1] == i[BUS_W-2:0]) begin
+        output_data_lo |= output_data[i*16+:16];
+      end
+    end
+  end
+
+  always_comb begin
+    output_data_hi = '0;
+    for (int unsigned i = 0; i < IC_OUTPUT_BEATS - 1; i++) begin
+      if (output_addr_q[BUS_W-1:1] == i[BUS_W-2:0]) begin
+        output_data_hi |= output_data[(i+1)*16+:16];
+      end
+    end
+    if (&output_addr_q[BUS_W-1:1]) begin
+      output_data_hi |= output_data[15:0];
+    end
+  end
+
+  assign valid_o     = output_valid & ~branch_mispredict_i;
+  assign rdata_o     = {output_data_hi, (skid_valid_q ? skid_data_q : output_data_lo)};
+  assign addr_o      = {output_addr_q, 1'b0};
+  assign err_o       = (skid_valid_q & skid_err_q) | (~skid_complete_instr & output_err);
+  // Error caused by the second half of a misaligned uncompressed instruction
+  // (only relevant when err_o is set)
+  assign err_plus2_o = skid_valid_q & ~skid_err_q;
+
+  ///////////////////
+  // Invalidations //
+  ///////////////////
+
+
+  // We need to save the invalidation request inside a register. That way we can wait
+  // until we have a valid scrambling key to do it. Since the key itself is needed for
+  // starting to fill in the RAMs and read from them, ICache also needs to stop operating.
+  assign inval_req_d = (inval_req_q | icache_inval_i) & ~(inval_done & inval_prog_q);
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      inval_req_q  <= 1'b0;
+    end else begin
+      inval_req_q  <= inval_req_d;
+    end
+  end
+
+  // This acts as a lock mechanism: the invalidation request is held (locked) until a valid
+  // scrambling key is available.
+  assign inval_lock = inval_req_d & ~ic_scr_key_valid_i;
+
+  // Invalidate on reset, or when instructed. If an invalidation request is received while a
+  // previous invalidation is ongoing, it does not need to be restarted. Do not start
+  // this process until the inval lock is removed, meaning the scrambling key is valid.
+  assign start_inval   = ~inval_lock & (~reset_inval_q | inval_req_q) & ~inval_prog_q ;
+  assign inval_prog_d  = ~inval_lock & (start_inval | (inval_prog_q & ~inval_done));
+  assign inval_done    = &inval_index_q;
+  assign inval_index_d = start_inval ? '0 : (inval_index_q + {{IC_INDEX_W-1{1'b0}},1'b1});
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      inval_prog_q  <= 1'b0;
+      reset_inval_q <= 1'b0;
+    end else begin
+      inval_prog_q  <= inval_prog_d;
+      reset_inval_q <= 1'b1;
+    end
+  end
+
+  if (ResetAll) begin : g_inval_index_ra
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        inval_index_q <= '0;
+      end else if (inval_prog_d) begin
+        inval_index_q <= inval_index_d;
+      end
+    end
+  end else begin : g_inval_index_nr
+    always_ff @(posedge clk_i) begin
+      if (inval_prog_d) begin
+        inval_index_q <= inval_index_d;
+      end
+    end
+  end
+
+  /////////////////
+  // Busy status //
+  /////////////////
+
+  // Only busy (for WFI purposes) while a requested invalidation is pending or in progress, or
+  // external requests are outstanding.
+  assign busy_o = inval_req_q | (|(fill_busy_q & ~fill_rvd_done));
+
+  ////////////////
+  // Assertions //
+  ////////////////
+
+  `ASSERT_INIT(size_param_legal, (IC_LINE_SIZE > 32))
+
+  // ECC primitives will need to be changed for different sizes
+  `ASSERT_INIT(ecc_tag_param_legal, (IC_TAG_SIZE <= 27))
+  `ASSERT_INIT(ecc_data_param_legal, !ICacheECC || (BUS_SIZE == 32))
+
+  // Lookups in the tag ram should always give a known result
+  `ASSERT_KNOWN(TagHitKnown,     lookup_valid_ic1 & tag_hit_ic1)
+  `ASSERT_KNOWN(TagInvalidKnown, lookup_valid_ic1 & tag_invalid_ic1)
+
+  // This is only used for the Yosys-based formal flow. Once we have working bind support, we can
+  // get rid of it.
+`ifdef FORMAL
+ `ifdef YOSYS
+  // Unfortunately, Yosys doesn't support passing unpacked arrays as ports. Explicitly pack up the
+  // signals we need.
+  logic [NUM_FB-1:0][ADDR_W-1:0] packed_fill_addr_q;
+  always_comb begin
+    for (int i = 0; i < NUM_FB; i++) begin
+      packed_fill_addr_q[i][ADDR_W-1:0] = fill_addr_q[i];
+    end
+  end
+
+  `include "formal_tb_frag.svh"
+ `endif
+`endif
+
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_id_stage.sv b/hw/ip/cheriot-ibex/rtl/cheriot_id_stage.sv
new file mode 100644
index 0000000..8305792
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_id_stage.sv
@@ -0,0 +1,1270 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+`ifdef RISCV_FORMAL
+  `define RVFI
+`endif
+
+/**
+ * Instruction Decode Stage
+ *
+ * Decode stage of the core. It decodes the instructions and hosts the register
+ * file.
+ */
+
+`include "prim_assert.sv"
+`include "dv_fcov_macros.svh"
+
+module cheriot_id_stage import cheri_pkg::*; #(
+  parameter bit               RV32E           = 0,
+  parameter cheriot_pkg::rv32m_e RV32M           = cheriot_pkg::RV32MFast,
+  parameter cheriot_pkg::rv32b_e RV32B           = cheriot_pkg::RV32BNone,
+  parameter bit               DataIndTiming   = 1'b0,
+  parameter bit               BranchTargetALU = 0,
+  parameter bit               WritebackStage  = 0,
+  parameter bit               BranchPredictor = 0,
+  parameter bit               CHERIoTEn       = 1'b1,
+  parameter bit               CheriPPLBC      = 1'b1,
+  parameter bit               CheriSBND2      = 1'b0
+) (
+  input  logic                      clk_i,
+  input  logic                      rst_ni,
+
+  input  logic                      cheri_pmode_i,
+  input  logic                      cheri_tsafe_en_i,    
+  output logic                      ctrl_busy_o,
+  output logic                      illegal_insn_o,
+
+  // Interface to IF stage
+  input  logic                      instr_valid_i,
+  input  logic [31:0]               instr_rdata_i,         // from IF-ID pipeline registers
+  input  logic [31:0]               instr_rdata_alu_i,     // from IF-ID pipeline registers
+  input  logic [15:0]               instr_rdata_c_i,       // from IF-ID pipeline registers
+  input  logic                      instr_is_compressed_i,
+  input  logic                      instr_bp_taken_i,
+  output logic                      instr_req_o,
+  output logic                      instr_first_cycle_id_o,
+  output logic                      instr_valid_clear_o,   // kill instr in IF-ID reg
+  output logic                      id_in_ready_o,         // ID stage is ready for next instr
+  output logic                      icache_inval_o,
+
+  // Jumps and branches
+  input  logic                      branch_decision_i,
+
+  // IF and ID stage signals
+  output logic                      pc_set_o,
+  output cheriot_pkg::pc_sel_e         pc_mux_o,
+  output logic                      nt_branch_mispredict_o,
+  output logic [31:0]               nt_branch_addr_o,
+  output cheriot_pkg::exc_pc_sel_e     exc_pc_mux_o,
+  output cheriot_pkg::exc_cause_e      exc_cause_o,
+
+  input  logic                      illegal_c_insn_i,
+  input  logic                      instr_fetch_err_i,
+  input  logic                      instr_fetch_err_plus2_i,
+  input  logic                      instr_fetch_cheri_acc_vio_i,         
+  input  logic                      instr_fetch_cheri_bound_vio_i,         
+
+  input  logic [31:0]               pc_id_i,
+
+  // Stalls
+  input  logic                      ex_valid_i,       // EX stage has valid output
+  input  logic                      lsu_resp_valid_i, // LSU has valid output, or is done
+  // ALU
+  output cheriot_pkg::alu_op_e         alu_operator_ex_o,
+  output logic [31:0]               alu_operand_a_ex_o,
+  output logic [31:0]               alu_operand_b_ex_o,
+
+  // Multicycle Operation Stage Register
+  input  logic [1:0]                imd_val_we_ex_i,
+  input  logic [33:0]               imd_val_d_ex_i[2],
+  output logic [33:0]               imd_val_q_ex_o[2],
+
+  // Branch target ALU
+  output logic [31:0]               bt_a_operand_o,
+  output logic [31:0]               bt_b_operand_o,
+
+  // MUL, DIV
+  output logic                      mult_en_ex_o,
+  output logic                      div_en_ex_o,
+  output logic                      mult_sel_ex_o,
+  output logic                      div_sel_ex_o,
+  output cheriot_pkg::md_op_e          multdiv_operator_ex_o,
+  output logic  [1:0]               multdiv_signed_mode_ex_o,
+  output logic [31:0]               multdiv_operand_a_ex_o,
+  output logic [31:0]               multdiv_operand_b_ex_o,
+  output logic                      multdiv_ready_id_o,
+
+  // CSR
+  output logic                      csr_access_o,
+  output cheriot_pkg::csr_op_e         csr_op_o,
+  output logic                      csr_op_en_o,
+  output logic                      csr_save_if_o,
+  output logic                      csr_save_id_o,
+  output logic                      csr_save_wb_o,
+  output logic                      csr_restore_mret_id_o,
+  output logic                      csr_restore_dret_id_o,
+  output logic                      csr_save_cause_o,
+  output logic                      csr_mepcc_clrtag_o,
+  output logic [31:0]               csr_mtval_o,
+  input  cheriot_pkg::priv_lvl_e       priv_mode_i,
+  input  logic                      csr_mstatus_tw_i,
+  input  logic                      illegal_csr_insn_i,
+  input  logic                      data_ind_timing_i,
+  input  logic                      csr_pcc_perm_sr_i,
+
+  // Interface to load store unit
+  output logic                      lsu_req_o,
+  output logic                      lsu_we_o,
+  output logic [1:0]                lsu_type_o,
+  output logic                      lsu_sign_ext_o,
+  output logic [31:0]               lsu_wdata_o,
+
+  input  logic                      lsu_req_done_i, // Data req to LSU is complete and
+                                                    // instruction can move to writeback
+                                                    // (only relevant where writeback stage is
+                                                    // present)
+
+  input  logic                      lsu_addr_incr_req_i,
+  input  logic [31:0]               lsu_addr_last_i,
+
+  // Interrupt signals
+  input  logic                      csr_mstatus_mie_i,
+  input  logic                      irq_pending_i,
+  input  cheriot_pkg::irqs_t           irqs_i,
+  input  logic                      irq_nm_i,
+  output logic                      nmi_mode_o,
+
+  input  logic                      lsu_load_err_i,
+  input  logic                      lsu_store_err_i,
+  input  logic                      lsu_err_is_cheri_i, 
+
+  // Debug Signal
+  output logic                      debug_mode_o,
+  output cheriot_pkg::dbg_cause_e      debug_cause_o,
+  output logic                      debug_csr_save_o,
+  input  logic                      debug_req_i,
+  input  logic                      debug_single_step_i,
+  input  logic                      debug_ebreakm_i,
+  input  logic                      debug_ebreaku_i,
+  input  logic                      trigger_match_i,
+
+  // Write back signal
+  input  logic [31:0]               result_ex_i,
+  input  logic [31:0]               csr_rdata_i,
+
+  // Register file read
+  output logic [4:0]                rf_raddr_a_o,
+  input  logic [31:0]               rf_rdata_a_i,
+  output logic [4:0]                rf_raddr_b_o,
+  input  logic [31:0]               rf_rdata_b_i,
+  output logic                      rf_ren_a_o,
+  output logic                      rf_ren_b_o,
+
+  // Register file write (via writeback)
+  output logic [4:0]                rf_waddr_id_o,
+  output logic [31:0]               rf_wdata_id_o,
+  output logic                      rf_we_id_o,
+  output logic                      rf_rd_a_wb_match_o,
+  output logic                      rf_rd_b_wb_match_o,
+  input  logic [31:0]               rf_reg_rdy_i,
+
+  // Register write information from writeback (for resolving data hazards)
+  input  logic [4:0]                rf_waddr_wb_i,
+  input  logic [31:0]               rf_wdata_fwd_wb_i,
+  input  logic                      rf_write_wb_i,
+
+  output  logic                     en_wb_o,
+  output  cheriot_pkg::wb_instr_type_e instr_type_wb_o,
+  output  logic                     instr_perf_count_id_o,
+  input logic                       ready_wb_i,
+  input logic                       outstanding_load_wb_i,
+  input logic                       outstanding_store_wb_i,
+
+  // Performance Counters
+  output logic                      perf_jump_o,    // executing a jump instr
+  output logic                      perf_branch_o,  // executing a branch instr
+  output logic                      perf_tbranch_o, // executing a taken branch instr
+  output logic                      perf_dside_wait_o, // instruction in ID/EX is awaiting memory
+                                                        // access to finish before proceeding
+  output logic                      perf_mul_wait_o,
+  output logic                      perf_div_wait_o,
+  output logic                      instr_id_done_o,
+
+  // cheri signals
+  output logic                      cheri_exec_id_o,
+  output logic                      instr_is_cheri_id_o,
+  output logic                      instr_is_rv32lsu_id_o,
+  output logic [11:0]               cheri_imm12_o,
+  output logic [19:0]               cheri_imm20_o,
+  output logic [20:0]               cheri_imm21_o,
+  output logic [OPDW-1:0]           cheri_operator_o,
+  output logic  [4:0]               cheri_cs2_dec_o,
+  output logic                      cheri_load_o,
+  output logic                      cheri_store_o,
+
+  input  logic                      cheri_ex_valid_i,
+  input  logic                      cheri_ex_err_i,
+  input  logic [11:0]               cheri_ex_err_info_i,
+  input  logic                      cheri_wb_err_i,
+  input  logic [15:0]               cheri_wb_err_info_i,
+  input  logic                      cheri_branch_req_i,   // from cheri EX
+  input  logic [31:0]               cheri_branch_target_i
+);
+
+  import cheriot_pkg::*;
+
+  // Decoder/Controller, ID stage internal signals
+  logic        illegal_insn_dec;
+  logic        ebrk_insn;
+  logic        mret_insn_dec;
+  logic        dret_insn_dec;
+  logic        ecall_insn_dec;
+  logic        wfi_insn_dec;
+
+  logic        wb_exception;
+  logic        unused_id_exception;
+  logic        id_exception_nc;
+
+  logic        branch_in_dec;
+  logic        branch_set, branch_set_raw, branch_set_raw_d;
+  logic        branch_jump_set_done_q, branch_jump_set_done_d;
+  logic        branch_not_set;
+  logic        branch_taken;
+  logic        jump_in_dec;
+  logic        jump_set_dec;
+  logic        jump_set, jump_set_raw;
+
+  logic        instr_first_cycle;
+  logic        instr_executing_spec;
+  logic        instr_executing;
+  logic        instr_done;
+  logic        controller_run;
+  logic        stall_ld_hz;
+  logic        stall_mem;
+  logic        stall_multdiv;
+  logic        stall_branch;
+  logic        stall_jump;
+  logic        stall_id;
+  logic        stall_wb;
+  logic        stall_cheri;
+  logic        flush_id;
+  logic        multicycle_done;
+
+  // Immediate decoding and sign extension
+  logic [31:0] imm_i_type;
+  logic [31:0] imm_s_type;
+  logic [31:0] imm_b_type;
+  logic [31:0] imm_u_type;
+  logic [31:0] imm_j_type;
+  logic [31:0] zimm_rs1_type;
+
+  logic [31:0] imm_a;       // contains the immediate for operand a
+  logic [31:0] imm_b;       // contains the immediate for operand b
+
+  // Register file interface
+
+  rf_wd_sel_e  rf_wdata_sel;
+  logic        rf_we_dec, rf_we_raw;
+  logic        rf_ren_a, rf_ren_b;
+  logic        rf_ren_a_dec, rf_ren_b_dec;
+  logic        rf_we_or_load;
+
+  // Read enables should only be asserted for valid and legal instructions
+  assign rf_ren_a = instr_valid_i & ~instr_fetch_err_i & ~illegal_insn_o & rf_ren_a_dec;
+  assign rf_ren_b = instr_valid_i & ~instr_fetch_err_i & ~illegal_insn_o & rf_ren_b_dec;
+
+  assign rf_ren_a_o = rf_ren_a;
+  assign rf_ren_b_o = rf_ren_b;
+
+  logic [31:0] rf_rdata_a_fwd;
+  logic [31:0] rf_rdata_b_fwd;
+
+  logic cheri_lsu_req_dec;
+  logic cheri_multicycle_dec;
+  logic ex_valid_all;
+
+  // ALU Control
+  alu_op_e     alu_operator;
+  op_a_sel_e   alu_op_a_mux_sel, alu_op_a_mux_sel_dec;
+  op_b_sel_e   alu_op_b_mux_sel, alu_op_b_mux_sel_dec;
+  logic        alu_multicycle_dec;
+  logic        stall_alu;
+
+  logic [33:0] imd_val_q[2];
+
+  op_a_sel_e   bt_a_mux_sel;
+  imm_b_sel_e  bt_b_mux_sel;
+
+  imm_a_sel_e  imm_a_mux_sel;
+  imm_b_sel_e  imm_b_mux_sel, imm_b_mux_sel_dec;
+
+  // Multiplier Control
+  logic        mult_en_id, mult_en_dec; // use integer multiplier
+  logic        div_en_id, div_en_dec;   // use integer division or remainder
+  logic        multdiv_en_dec;
+  md_op_e      multdiv_operator;
+  logic [1:0]  multdiv_signed_mode;
+
+  // Data Memory Control
+  logic        lsu_we;
+  logic [1:0]  lsu_type;
+  logic        lsu_sign_ext;
+  logic        lsu_req, lsu_req_dec;
+  logic        data_req_allowed;
+
+  // CSR control
+  logic        csr_pipe_flush;
+  logic        csr_cheri_always_ok;
+
+  logic [31:0] alu_operand_a;
+  logic [31:0] alu_operand_b;
+
+  logic        stall_cheri_trvk;
+  logic        instr_is_legal_cheri;
+
+  /////////////
+  // LSU Mux //
+  /////////////
+
+  // Misaligned loads/stores result in two aligned loads/stores, compute second address
+  assign alu_op_a_mux_sel = lsu_addr_incr_req_i ? OP_A_FWD        : alu_op_a_mux_sel_dec;
+  assign alu_op_b_mux_sel = lsu_addr_incr_req_i ? OP_B_IMM        : alu_op_b_mux_sel_dec;
+  assign imm_b_mux_sel    = lsu_addr_incr_req_i ? IMM_B_INCR_ADDR : imm_b_mux_sel_dec;
+
+  ///////////////////
+  // Operand MUXES //
+  ///////////////////
+
+  // Main ALU immediate MUX for Operand A
+  assign imm_a = (imm_a_mux_sel == IMM_A_Z) ? zimm_rs1_type : '0;
+
+  // Operand A of the main ALU; the PC is the fallback for OP_A_CURRPC and unlisted selectors
+  always_comb begin : alu_operand_a_mux
+    alu_operand_a = pc_id_i;
+    unique case (alu_op_a_mux_sel)
+      OP_A_REG_A: alu_operand_a = rf_rdata_a_fwd;
+      OP_A_FWD:   alu_operand_a = lsu_addr_last_i;
+      OP_A_IMM:   alu_operand_a = imm_a;
+      default: ;
+    endcase
+  end
+
+  if (BranchTargetALU) begin : g_btalu_muxes
+    // Branch target ALU operand A mux (every selector other than OP_A_REG_A yields the PC)
+    always_comb begin : bt_operand_a_mux
+      unique case (bt_a_mux_sel)
+        OP_A_REG_A:  bt_a_operand_o = rf_rdata_a_fwd;
+        OP_A_CURRPC: bt_a_operand_o = pc_id_i;
+        default:     bt_a_operand_o = pc_id_i;
+      endcase
+    end
+
+    // Branch target ALU operand B mux (default: 2 if the instruction is compressed, else 4)
+    always_comb begin : bt_immediate_b_mux
+      unique case (bt_b_mux_sel)
+        IMM_B_I:         bt_b_operand_o = imm_i_type;
+        IMM_B_B:         bt_b_operand_o = imm_b_type;
+        IMM_B_J:         bt_b_operand_o = imm_j_type;
+        IMM_B_INCR_PC:   bt_b_operand_o = instr_is_compressed_i ? 32'h2 : 32'h4;
+        default:         bt_b_operand_o = instr_is_compressed_i ? 32'h2 : 32'h4;
+      endcase
+    end
+
+    // Reduced main ALU immediate MUX for Operand B (B/J immediates only feed the mux above)
+    always_comb begin : immediate_b_mux
+      unique case (imm_b_mux_sel)
+        IMM_B_I:         imm_b = imm_i_type;
+        IMM_B_S:         imm_b = imm_s_type;
+        IMM_B_U:         imm_b = imm_u_type;
+        IMM_B_INCR_PC:   imm_b = instr_is_compressed_i ? 32'h2 : 32'h4;
+        IMM_B_INCR_ADDR: imm_b = 32'h4;  // selected while lsu_addr_incr_req_i is set
+        default:         imm_b = 32'h4;
+      endcase
+    end
+    `ASSERT(IbexImmBMuxSelValid, instr_valid_i |-> imm_b_mux_sel inside {
+        IMM_B_I,
+        IMM_B_S,
+        IMM_B_U,
+        IMM_B_INCR_PC,
+        IMM_B_INCR_ADDR})
+  end else begin : g_nobtalu  // no branch target ALU: its operand outputs are tied to zero
+    op_a_sel_e  unused_a_mux_sel;
+    imm_b_sel_e unused_b_mux_sel;
+
+    assign unused_a_mux_sel = bt_a_mux_sel;
+    assign unused_b_mux_sel = bt_b_mux_sel;
+    assign bt_a_operand_o   = '0;
+    assign bt_b_operand_o   = '0;
+
+    // Full main ALU immediate MUX for Operand B (also carries the B and J immediates)
+    always_comb begin : immediate_b_mux
+      unique case (imm_b_mux_sel)
+        IMM_B_I:         imm_b = imm_i_type;
+        IMM_B_S:         imm_b = imm_s_type;
+        IMM_B_B:         imm_b = imm_b_type;
+        IMM_B_U:         imm_b = imm_u_type;
+        IMM_B_J:         imm_b = imm_j_type;
+        IMM_B_INCR_PC:   imm_b = instr_is_compressed_i ? 32'h2 : 32'h4;
+        IMM_B_INCR_ADDR: imm_b = 32'h4;  // selected while lsu_addr_incr_req_i is set
+        default:         imm_b = 32'h4;
+      endcase
+    end
+    `ASSERT(IbexImmBMuxSelValid, instr_valid_i |-> imm_b_mux_sel inside {
+        IMM_B_I,
+        IMM_B_S,
+        IMM_B_B,
+        IMM_B_U,
+        IMM_B_J,
+        IMM_B_INCR_PC,
+        IMM_B_INCR_ADDR})
+  end
+
+  // ALU MUX for Operand B: the immediate when OP_B_IMM is selected, else port B register data
+  assign alu_operand_b = (alu_op_b_mux_sel == OP_B_IMM) ? imm_b : rf_rdata_b_fwd;
+
+  /////////////////////////////////////////
+  // Multicycle Operation Stage Register //
+  /////////////////////////////////////////
+
+  for (genvar i = 0; i < 2; i++) begin : gen_intermediate_val_reg  // one 34-bit register each
+    always_ff @(posedge clk_i or negedge rst_ni) begin : intermediate_val_reg
+      if (!rst_ni) begin
+        imd_val_q[i] <= '0;
+      end else if (imd_val_we_ex_i[i]) begin  // per-register write enable, otherwise hold
+        imd_val_q[i] <= imd_val_d_ex_i[i];
+      end
+    end
+  end
+
+  assign imd_val_q_ex_o = imd_val_q;  // registered values are exported unmodified
+
+  ///////////////////////
+  // Register File MUX //
+  ///////////////////////
+
+  // Register write is dropped unless the instruction executes and its CSR access is legal
+  assign rf_we_id_o = instr_executing & ~illegal_csr_insn_i & rf_we_raw;
+
+  // Write data comes from EX unless the decoder selects the CSR read value
+  always_comb begin : rf_wdata_id_mux
+    // RF_WD_EX and any unlisted selector value take the EX result
+    rf_wdata_id_o = result_ex_i;
+    if (rf_wdata_sel == RF_WD_CSR) begin
+      rf_wdata_id_o = csr_rdata_i;
+    end
+  end
+
+  /////////////
+  // Decoder //
+  /////////////
+
+  cheriot_decoder #(
+    .RV32E          (RV32E),
+    .RV32M          (RV32M),
+    .RV32B          (RV32B),
+    .BranchTargetALU(BranchTargetALU),
+    .CHERIoTEn      (CHERIoTEn),
+    .CheriPPLBC     (CheriPPLBC),
+    .CheriSBND2     (CheriSBND2)
+  ) decoder_i (
+    .clk_i (clk_i),
+    .rst_ni(rst_ni),
+
+    .cheri_pmode_i (cheri_pmode_i),
+    .cheri_tsafe_en_i (cheri_tsafe_en_i),
+    // controller
+    .illegal_insn_o(illegal_insn_dec),  // combined with illegal_csr_insn_i into illegal_insn_o
+    .ebrk_insn_o   (ebrk_insn),
+    .mret_insn_o   (mret_insn_dec),
+    .dret_insn_o   (dret_insn_dec),
+    .ecall_insn_o  (ecall_insn_dec),
+    .wfi_insn_o    (wfi_insn_dec),
+    .jump_set_o    (jump_set_dec),      // becomes jump_set_raw inside the ID/EX FSM
+    .branch_taken_i(branch_taken),      // constant 1 unless DataIndTiming is configured
+    .icache_inval_o(icache_inval_o),
+
+    // from IF-ID pipeline register
+    .instr_first_cycle_i(instr_first_cycle),
+    .instr_rdata_i      (instr_rdata_i),
+    .instr_rdata_alu_i  (instr_rdata_alu_i),
+    .illegal_c_insn_i   (illegal_c_insn_i),
+
+    // immediates
+    .imm_a_mux_sel_o(imm_a_mux_sel),
+    .imm_b_mux_sel_o(imm_b_mux_sel_dec),  // overridden while lsu_addr_incr_req_i is set
+    .bt_a_mux_sel_o (bt_a_mux_sel),
+    .bt_b_mux_sel_o (bt_b_mux_sel),
+
+    .imm_i_type_o   (imm_i_type),
+    .imm_s_type_o   (imm_s_type),
+    .imm_b_type_o   (imm_b_type),
+    .imm_u_type_o   (imm_u_type),
+    .imm_j_type_o   (imm_j_type),
+    .zimm_rs1_type_o(zimm_rs1_type),
+
+    // register file
+    .rf_wdata_sel_o(rf_wdata_sel),
+    .rf_we_o       (rf_we_dec),        // gated into rf_we_raw by the ID/EX FSM
+    .rf_we_or_load_o(rf_we_or_load),   // used for the stall_cheri_trvk destination check
+
+    .rf_raddr_a_o(rf_raddr_a_o),
+    .rf_raddr_b_o(rf_raddr_b_o),
+    .rf_waddr_o  (rf_waddr_id_o),
+    .rf_ren_a_o  (rf_ren_a_dec),
+    .rf_ren_b_o  (rf_ren_b_dec),
+
+    // ALU
+    .alu_operator_o    (alu_operator),
+    .alu_op_a_mux_sel_o(alu_op_a_mux_sel_dec),  // overridden while lsu_addr_incr_req_i is set
+    .alu_op_b_mux_sel_o(alu_op_b_mux_sel_dec),  // overridden while lsu_addr_incr_req_i is set
+    .alu_multicycle_o  (alu_multicycle_dec),
+
+    // MULT & DIV
+    .mult_en_o            (mult_en_dec),
+    .div_en_o             (div_en_dec),
+    .mult_sel_o           (mult_sel_ex_o),
+    .div_sel_o            (div_sel_ex_o),
+    .multdiv_operator_o   (multdiv_operator),
+    .multdiv_signed_mode_o(multdiv_signed_mode),
+
+    // CSRs
+    .csr_access_o(csr_access_o),
+    .csr_op_o    (csr_op_o),
+    .csr_cheri_always_ok_o (csr_cheri_always_ok),
+
+    // LSU
+    .data_req_o           (lsu_req_dec),        // also exported as instr_is_rv32lsu_id_o
+    .cheri_data_req_o     (cheri_lsu_req_dec),  // treated like lsu_req_dec for stalling
+    .data_we_o            (lsu_we),
+    .data_type_o          (lsu_type),
+    .data_sign_extension_o(lsu_sign_ext),
+
+    // jump/branches
+    .jump_in_dec_o  (jump_in_dec),
+    .branch_in_dec_o(branch_in_dec),
+
+    // cheri signals
+    .instr_is_cheri_o   (instr_is_cheri_id_o),
+    .instr_is_legal_cheri_o (instr_is_legal_cheri),
+    .cheri_imm12_o      (cheri_imm12_o),
+    .cheri_imm20_o      (cheri_imm20_o),
+    .cheri_imm21_o      (cheri_imm21_o),
+    .cheri_operator_o   (cheri_operator_o),
+    .cheri_cs2_dec_o    (cheri_cs2_dec_o),
+    .cheri_multicycle_dec_o (cheri_multicycle_dec)
+  );
+
+  // assign cheri_lsu_req_dec     = cheri_load_o | cheri_store_o;  (now driven by decoder_i)
+  assign instr_is_rv32lsu_id_o = lsu_req_dec;    // go to cheri_ex
+
+  assign ex_valid_all   = instr_is_cheri_id_o ? cheri_ex_valid_i : ex_valid_i;  // per class
+
+  // If the "internal" CLBC is used, execution is sequential/multicycle; otherwise it is pipelined.
+  assign cheri_load_o   = cheri_operator_o[CLOAD_CAP] & (~cheri_tsafe_en_i | CheriPPLBC);
+
+  assign cheri_store_o  = cheri_operator_o[CSTORE_CAP];
+
+
+  //////////////////////////////////
+  // CSR-related pipeline flushes //
+  //////////////////////////////////
+  always_comb begin : csr_pipeline_flushes
+    csr_pipe_flush = 1'b0;
+
+    // A pipeline flush is needed to let the controller react after modifying certain CSRs:
+    // - When enabling interrupts, pending IRQs become visible to the controller only during
+    //   the next cycle. If during that cycle the core disables interrupts again, it does not
+    //   see any pending IRQs and consequently does not start to handle interrupts.
+    // - When modifying debug CSRs - TODO: Check if this is really needed
+    if (csr_op_en_o == 1'b1 && (csr_op_o == CSR_OP_WRITE || csr_op_o == CSR_OP_SET)) begin
+      if (csr_num_e'(instr_rdata_i[31:20]) == CSR_MSTATUS   ||  // bits [31:20] = CSR number
+          csr_num_e'(instr_rdata_i[31:20]) == CSR_MIE) begin
+        csr_pipe_flush = 1'b1;
+      end
+    end else if (csr_op_en_o == 1'b1 && csr_op_o != CSR_OP_READ) begin
+      if (csr_num_e'(instr_rdata_i[31:20]) == CSR_DCSR      ||
+          csr_num_e'(instr_rdata_i[31:20]) == CSR_DPC       ||
+          csr_num_e'(instr_rdata_i[31:20]) == CSR_DSCRATCH0 ||
+          csr_num_e'(instr_rdata_i[31:20]) == CSR_DSCRATCH1) begin
+        csr_pipe_flush = 1'b1;  // NOTE(review): unreachable for WRITE/SET (first branch wins)
+      end
+    end
+  end
+
+  ////////////////
+  // Controller //
+  ////////////////
+
+  assign illegal_insn_o = instr_valid_i & (illegal_insn_dec | illegal_csr_insn_i);  // valid only
+
+  cheriot_controller #(
+    .CHERIoTEn      (CHERIoTEn),
+    .WritebackStage (WritebackStage),
+    .BranchPredictor(BranchPredictor)
+  ) controller_i (
+    .clk_i (clk_i),
+    .rst_ni(rst_ni),
+    .cheri_pmode_i (cheri_pmode_i),
+    .ctrl_busy_o(ctrl_busy_o),
+
+    // decoder related signals
+    .illegal_insn_i  (illegal_insn_o),   // decoder or CSR illegal, qualified with instr_valid_i
+    .ecall_insn_i    (ecall_insn_dec),
+    .mret_insn_i     (mret_insn_dec),
+    .dret_insn_i     (dret_insn_dec),
+    .wfi_insn_i      (wfi_insn_dec),
+    .ebrk_insn_i     (ebrk_insn),
+    .csr_pipe_flush_i(csr_pipe_flush),   // from csr_pipeline_flushes
+    .csr_access_i    (csr_access_o),
+    .csr_cheri_always_ok_i (csr_cheri_always_ok),
+
+    // from IF-ID pipeline
+    .instr_valid_i          (instr_valid_i),
+    .instr_i                (instr_rdata_i),
+    .instr_compressed_i     (instr_rdata_c_i),
+    .instr_is_compressed_i  (instr_is_compressed_i),
+    .instr_bp_taken_i       (instr_bp_taken_i),
+    .instr_fetch_err_i      (instr_fetch_err_i),
+    .instr_fetch_err_plus2_i(instr_fetch_err_plus2_i),
+    .instr_fetch_cheri_acc_vio_i  (instr_fetch_cheri_acc_vio_i),       
+    .instr_fetch_cheri_bound_vio_i (instr_fetch_cheri_bound_vio_i),       
+
+    .pc_id_i                (pc_id_i),
+
+    // to IF-ID pipeline
+    .instr_valid_clear_o(instr_valid_clear_o),
+    .id_in_ready_o      (id_in_ready_o),
+    .controller_run_o   (controller_run),  // required for instr_executing(_spec)
+
+    // to prefetcher
+    .instr_req_o           (instr_req_o),
+    .pc_set_o              (pc_set_o),
+    .pc_mux_o              (pc_mux_o),
+    .nt_branch_mispredict_o(nt_branch_mispredict_o),
+    .exc_pc_mux_o          (exc_pc_mux_o),
+    .exc_cause_o           (exc_cause_o),
+
+    // LSU
+    .lsu_addr_last_i(lsu_addr_last_i),
+    .load_err_i     (lsu_load_err_i),
+    .store_err_i    (lsu_store_err_i),
+    .lsu_err_is_cheri_i (lsu_err_is_cheri_i),
+    .wb_exception_o (wb_exception),
+    .id_exception_o (unused_id_exception),
+    .id_exception_nc_o (id_exception_nc),  // feeds instr_kill when a writeback stage is present
+
+    // jump/branch control
+    .branch_set_i     (branch_set),      // first cycle of a branch set only
+    .branch_not_set_i (branch_not_set),
+    .jump_set_i       (jump_set),        // first cycle of a jump set only
+
+    // interrupt signals
+    .csr_mstatus_mie_i(csr_mstatus_mie_i),
+    .irq_pending_i    (irq_pending_i),
+    .irqs_i           (irqs_i),
+    .irq_nm_i         (irq_nm_i),
+    .nmi_mode_o       (nmi_mode_o),
+
+    // CSR Controller Signals
+    .csr_save_if_o        (csr_save_if_o),
+    .csr_save_id_o        (csr_save_id_o),
+    .csr_save_wb_o        (csr_save_wb_o),
+    .csr_restore_mret_id_o(csr_restore_mret_id_o),
+    .csr_restore_dret_id_o(csr_restore_dret_id_o),
+    .csr_save_cause_o     (csr_save_cause_o),
+    .csr_mepcc_clrtag_o   (csr_mepcc_clrtag_o),
+    .csr_mtval_o          (csr_mtval_o),
+    .priv_mode_i          (priv_mode_i),
+    .csr_mstatus_tw_i     (csr_mstatus_tw_i),
+    .csr_pcc_perm_sr_i    (csr_pcc_perm_sr_i),
+
+    // Debug Signal
+    .debug_mode_o       (debug_mode_o),
+    .debug_cause_o      (debug_cause_o),
+    .debug_csr_save_o   (debug_csr_save_o),
+    .debug_req_i        (debug_req_i),
+    .debug_single_step_i(debug_single_step_i),
+    .debug_ebreakm_i    (debug_ebreakm_i),
+    .debug_ebreaku_i    (debug_ebreaku_i),
+    .trigger_match_i    (trigger_match_i),
+
+    .stall_id_i(stall_id),
+    .stall_wb_i(stall_wb),
+    .flush_id_o(flush_id),   // blocks instr_done
+    .ready_wb_i(ready_wb_i),
+
+    // Performance Counters
+    .perf_jump_o   (perf_jump_o),
+    .perf_tbranch_o(perf_tbranch_o),
+
+    .instr_is_cheri_i       (instr_is_cheri_id_o)  ,
+    .cheri_ex_valid_i       (cheri_ex_valid_i)     ,
+    .cheri_ex_err_i         (cheri_ex_err_i)       ,
+    .cheri_ex_err_info_i    (cheri_ex_err_info_i)  ,
+    .cheri_wb_err_i         (cheri_wb_err_i)       ,
+    .cheri_wb_err_info_i    (cheri_wb_err_info_i)  ,
+    .cheri_branch_req_i     (cheri_branch_req_i)   ,   // from cheri EX
+    .cheri_branch_target_i  (cheri_branch_target_i)
+  );
+
+  assign multdiv_en_dec   = mult_en_dec | div_en_dec;
+
+  // note: with a writeback stage data_req_allowed is already part of instr_executing
+  assign lsu_req         = instr_executing ? data_req_allowed & lsu_req_dec  : 1'b0;
+  assign mult_en_id      = instr_executing ? mult_en_dec                     : 1'b0;
+  assign div_en_id       = instr_executing ? div_en_dec                      : 1'b0;
+
+  assign lsu_req_o               = lsu_req;
+  assign lsu_we_o                = lsu_we;
+  assign lsu_type_o              = lsu_type;
+  assign lsu_sign_ext_o          = lsu_sign_ext;
+  assign lsu_wdata_o             = rf_rdata_b_fwd;  // store data comes from register port B
+  // csr_op_en_o is set when CSR access should actually happen.
+  // csr_access_o is set when CSR access instruction is present and is used to compute whether a CSR
+  // access is illegal. A combinational loop would be created if csr_op_en_o were used instead (as
+  // asserting it for an illegal csr access would result in a flush that would need to deassert it).
+
+  // assign csr_op_en_o             = csr_access_o & instr_executing & instr_id_done_o;
+  // improve timing for CHERIoT mode (instr_id_done has too much logic)
+  assign csr_op_en_o             = csr_access_o & instr_executing & 
+                                   (CHERIoTEn ? instr_first_cycle : instr_id_done_o);
+
+  assign alu_operator_ex_o           = alu_operator;
+  assign alu_operand_a_ex_o          = alu_operand_a;
+  assign alu_operand_b_ex_o          = alu_operand_b;
+
+  assign mult_en_ex_o                = mult_en_id;  // enables are gated with instr_executing
+  assign div_en_ex_o                 = div_en_id;
+
+  assign multdiv_operator_ex_o       = multdiv_operator;
+  assign multdiv_signed_mode_ex_o    = multdiv_signed_mode;
+  assign multdiv_operand_a_ex_o      = rf_rdata_a_fwd;  // operands bypass the ALU operand muxes
+  assign multdiv_operand_b_ex_o      = rf_rdata_b_fwd;
+
+  ////////////////////////
+  // Branch set control //
+  ////////////////////////
+
+  if (BranchTargetALU && !DataIndTiming) begin : g_branch_set_direct
+    // Branch set fed straight to controller with branch target ALU
+    // (condition pass/fail used same cycle as generated instruction request)
+    assign branch_set_raw      = branch_set_raw_d;
+  end else begin : g_branch_set_flop
+    // Branch set flopped without branch target ALU, or in fixed time execution mode
+    // (condition pass/fail used next cycle where branch target is calculated)
+    logic branch_set_raw_q;
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        branch_set_raw_q <= 1'b0;
+      // TODO(QQQ): known bug (see the 07082022 report): the update below should be qualified
+      // with instr_executing, the same as id_fsm_q. Deliberately left unfixed for now.
+      end else begin
+        branch_set_raw_q <= branch_set_raw_d;
+      end
+    end
+
+    // Branches always take two cycles in fixed time execution mode, with or without the branch
+    // target ALU (to avoid a path from the branch decision into the branch target ALU operand
+    // muxing).
+    assign branch_set_raw      = (BranchTargetALU && !data_ind_timing_i) ? branch_set_raw_d :
+                                                                           branch_set_raw_q;
+
+  end
+
+  // Track whether the current instruction in ID/EX has done a branch or jump set.
+  assign branch_jump_set_done_d = (branch_set_raw | jump_set_raw | branch_jump_set_done_q) &
+    ~instr_valid_clear_o;  // sticky until instr_valid_clear_o is asserted
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      branch_jump_set_done_q <= 1'b0;
+    end else begin
+      branch_jump_set_done_q <= branch_jump_set_done_d;
+    end
+  end
+
+  // the _raw signals from the state machine may be asserted for multiple cycles when
+  // instr_executing_spec is asserted and instr_executing is not asserted. This may occur where
+  // a memory error is seen or there are outstanding memory accesses (indicating a load or store
+  // is in the WB stage). The branch or jump speculatively begins the fetch but is held back from
+  // completing until it is certain the outstanding access hasn't seen a memory error. This logic
+  // ensures only the first cycle of a branch or jump set is sent to the controller to prevent
+  // needless extra IF flushes and fetches.
+  assign jump_set        = jump_set_raw        & ~branch_jump_set_done_q;
+  assign branch_set      = branch_set_raw      & ~branch_jump_set_done_q;
+
+  // Branch condition is calculated in the first cycle and flopped for use in the second cycle
+  // (only used in fixed time execution mode to determine branch destination).
+  if (DataIndTiming) begin : g_sec_branch_taken
+    logic branch_taken_q;
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        branch_taken_q <= 1'b0;
+      end else begin
+        branch_taken_q <= branch_decision_i;  // captured every cycle, no enable
+      end
+    end
+
+    assign branch_taken = ~data_ind_timing_i | branch_taken_q;  // 1 when fixed-time mode is off
+
+  end else begin : g_nosec_branch_taken
+
+    // Signal unused without fixed time execution mode - only taken branches will trigger
+    // branch_set_raw
+    assign branch_taken = 1'b1;
+
+  end
+
+  // Holding branch_set/jump_set high for more than one cycle should not cause a functional issue.
+  // However it could generate needless prefetch buffer flushes and instruction fetches. The ID/EX
+  // design ensures that this never happens for non-predicted branches.
+  `ASSERT(NeverDoubleBranch, branch_set & ~instr_bp_taken_i |=> ~branch_set)
+  `ASSERT(NeverDoubleJump, jump_set & ~instr_bp_taken_i |=> ~jump_set)
+
+  //////////////////////////////
+  // Branch not-taken address //
+  //////////////////////////////
+
+  if (!BranchPredictor) begin : g_n_calc_nt_addr
+    assign nt_branch_addr_o = 32'h0;  // no branch predictor: output tied to zero
+  end else begin : g_calc_nt_addr
+    assign nt_branch_addr_o = pc_id_i + (!instr_is_compressed_i ? 32'h4 : 32'h2);  // PC + length
+  end
+
+  ///////////////
+  // ID-EX FSM //
+  ///////////////
+
+  typedef enum logic { FIRST_CYCLE, MULTI_CYCLE } id_fsm_e;  // single-bit state encoding
+  id_fsm_e id_fsm_q, id_fsm_d;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin : id_pipeline_reg
+    if (!rst_ni) begin
+      id_fsm_q <= FIRST_CYCLE;
+    end else if (instr_executing) begin  // state is held while the instruction cannot execute
+      id_fsm_q <= id_fsm_d;
+    end
+  end
+
+  // ID/EX stage can be in two states, FIRST_CYCLE and MULTI_CYCLE. An instruction enters
+  // MULTI_CYCLE if it requires multiple cycles to complete regardless of stalls and other
+  // considerations. An instruction may be held in FIRST_CYCLE if it's unable to begin executing
+  // (this is controlled by instr_executing).
+
+  always_comb begin
+    id_fsm_d                = id_fsm_q;   // defaults: hold state, no stalls, no branch/jump set
+    rf_we_raw               = rf_we_dec;
+    stall_multdiv           = 1'b0;
+    stall_jump              = 1'b0;
+    stall_branch            = 1'b0;
+    stall_alu               = 1'b0;
+    stall_cheri             = 1'b0;
+    branch_set_raw_d        = 1'b0;
+    branch_not_set          = 1'b0;
+    jump_set_raw            = 1'b0;
+    perf_branch_o           = 1'b0;
+
+    if (instr_executing_spec) begin  // speculative: does not wait for writeback exceptions
+      unique case (id_fsm_q)
+        FIRST_CYCLE: begin
+          unique case (1'b1)  // selects the arm whose decode flag is set
+            lsu_req_dec: begin
+              if (!WritebackStage) begin
+                // LSU operation
+                id_fsm_d    = MULTI_CYCLE;
+              end else if(~lsu_req_done_i) begin  // with WB stage: only if request not done yet
+                id_fsm_d  = MULTI_CYCLE;
+              end
+            end
+            cheri_lsu_req_dec: begin
+              if (cheri_pmode_i) begin  // same handling as lsu_req_dec, only in CHERI mode
+                if (!WritebackStage) begin
+                  id_fsm_d    = MULTI_CYCLE;
+                end else if(~lsu_req_done_i) begin  // covers the lsu_cheri_err case (1cycle)
+                  id_fsm_d  = MULTI_CYCLE;
+                end
+              end
+            end
+            multdiv_en_dec: begin
+              // MUL or DIV operation
+              if (~ex_valid_i) begin
+                // When single-cycle multiply is configured mul can finish in the first cycle so
+                // only enter MULTI_CYCLE state if a result isn't immediately available
+                id_fsm_d      = MULTI_CYCLE;
+                rf_we_raw     = 1'b0;
+                stall_multdiv = 1'b1;
+              end
+            end
+            branch_in_dec: begin
+              // cond branch operation
+              // All branches take two cycles in fixed time execution mode, regardless of branch
+              // condition.
+              id_fsm_d         = (data_ind_timing_i || (!BranchTargetALU && branch_decision_i)) ?
+                                     MULTI_CYCLE : FIRST_CYCLE;
+              stall_branch     = (~BranchTargetALU & branch_decision_i) | data_ind_timing_i;
+              branch_set_raw_d = (branch_decision_i | data_ind_timing_i);
+
+              if (BranchPredictor) begin
+                branch_not_set = ~branch_decision_i;
+              end
+
+              perf_branch_o = 1'b1;
+            end
+            jump_in_dec: begin
+              // uncond branch operation
+              // BTALU means jumps only need one cycle
+              id_fsm_d      = BranchTargetALU ? FIRST_CYCLE : MULTI_CYCLE;
+              stall_jump    = ~BranchTargetALU;
+              jump_set_raw  = jump_set_dec;
+            end
+            alu_multicycle_dec: begin
+              stall_alu     = 1'b1;
+              id_fsm_d      = MULTI_CYCLE;
+              rf_we_raw     = 1'b0;  // no register write in the first cycle
+            end
+            cheri_multicycle_dec: begin
+              if (cheri_pmode_i) begin
+                id_fsm_d      = MULTI_CYCLE;
+                rf_we_raw     = 1'b0;  // no register write in the first cycle
+                stall_cheri   = 1'b1;
+              end
+            end
+            default: begin
+              id_fsm_d      = FIRST_CYCLE;  // single-cycle instruction
+            end
+          endcase
+        end
+
+        MULTI_CYCLE: begin
+          if(multdiv_en_dec) begin
+            rf_we_raw       = rf_we_dec & ex_valid_i;  // write only once the result is valid
+          end
+
+          if (multicycle_done & ready_wb_i) begin
+            id_fsm_d        = FIRST_CYCLE;
+          end else begin  // not finished: keep stalling for the class being executed
+            stall_multdiv   = multdiv_en_dec;
+            stall_branch    = branch_in_dec;
+            stall_jump      = jump_in_dec;
+            stall_cheri     = cheri_multicycle_dec;  // NOTE(review): no cheri_pmode_i check here
+          end
+        end
+
+        default: begin
+          id_fsm_d          = FIRST_CYCLE;
+        end
+      endcase
+    end
+  end
+
+  // Note for the two-stage configuration ready_wb_i is always set
+  assign multdiv_ready_id_o = ready_wb_i;
+
+  `ASSERT(StallIDIfMulticycle, (id_fsm_q == FIRST_CYCLE) & (id_fsm_d == MULTI_CYCLE) |-> stall_id)
+
+
+  // Stall ID/EX stage for reason that relates to instruction in ID/EX, update assertion below if
+  // modifying this.
+  assign stall_id = stall_ld_hz | stall_mem | stall_multdiv | stall_jump | stall_branch | stall_cheri |
+                      stall_alu | stall_cheri_trvk;
+
+  // Generally illegal instructions have no reason to stall, however they must still stall waiting
+  // for outstanding memory requests so exceptions related to them take priority over the illegal
+  // instruction exception. NOTE(review): stall_id includes stall_cheri, the list below does not.
+  `ASSERT(IllegalInsnStallMustBeMemStall, illegal_insn_o & stall_id |-> stall_mem &
+    ~(stall_ld_hz | stall_multdiv | stall_jump | stall_branch | stall_alu | stall_cheri_trvk))
+
+  assign instr_done = ~stall_id & ~flush_id & instr_executing;  // executing, no stall, no flush
+
+  // Signal instruction in ID is in its first cycle. It can remain in its
+  // first cycle if it is stalled.
+  assign instr_first_cycle      = instr_valid_i & (id_fsm_q == FIRST_CYCLE);
+  // Used by RVFI to know when to capture register read data
+  // Used by ALU to access RS3 if ternary instruction.
+  assign instr_first_cycle_id_o = instr_first_cycle;
+
+  if (WritebackStage) begin : gen_stall_mem
+    // Register read address matches write address in WB
+    logic rf_rd_a_wb_match;
+    logic rf_rd_b_wb_match;
+    // Hazard between registers being read and written
+    logic rf_rd_a_hz;
+    logic rf_rd_b_hz;
+
+    logic outstanding_memory_access;
+
+    logic instr_kill;
+
+    assign multicycle_done = (lsu_req_dec|cheri_lsu_req_dec) ? ~stall_mem : ex_valid_all;
+
+    // Is a memory access ongoing that isn't finishing this cycle
+    assign outstanding_memory_access = (outstanding_load_wb_i | outstanding_store_wb_i) &
+                                       ~lsu_resp_valid_i;
+
+    // Can start a new memory access if any previous one has finished or is finishing
+    assign data_req_allowed = ~outstanding_memory_access;
+
+    // Instruction won't execute because:
+    // - There is a pending exception in writeback
+    //   The instruction in ID/EX will be flushed and the core will jump to an exception handler
+    // - The controller isn't running instructions
+    //   This either happens in preparation for a flush and jump to an exception handler e.g. in
+    //   response to an IRQ or debug request or whilst the core is sleeping or resetting/fetching
+    //   first instruction in which case any valid instruction in ID/EX should be ignored.
+    // - There was an error on instruction fetch
+
+    // A CHERI instruction can only generate an exception after execution.
+    // Excluding CHERI EX exceptions from instr_kill improves timing.
+
+    assign instr_kill = instr_fetch_err_i |
+                        wb_exception      |
+                        id_exception_nc   |   // exclude cheri EX exceptions
+                        ~controller_run;
+
+    // With writeback stage instructions must be prevented from executing if there is:
+    // - A load hazard
+    // - A pending memory access
+    //   (If it receives an error response this results in a precise exception from WB so ID/EX
+    //   instruction must not execute until error response is known).
+    // - A load/store error
+    //   This will cause a precise exception for the instruction in WB so ID/EX instruction must not
+    //   execute
+    //
+    // instr_executing_spec is a speculative signal. It indicates an instruction can execute
+    // assuming there are no exceptions from writeback and any outstanding memory access won't
+    // receive an error. It is required so branch and jump requests don't factor in an incoming dmem
+    // error (that in turn would factor directly into imem requests leading to a feedthrough path).
+    //
+    // instr_executing is the full signal, it will only allow execution once any potential
+    // exceptions from writeback have been resolved.
+    assign instr_executing_spec = instr_valid_i      &
+                                  ~instr_fetch_err_i &
+                                  controller_run     &
+                                  ~stall_ld_hz       &
+                                  ~stall_cheri_trvk;
+
+    assign instr_executing = instr_valid_i              &
+                             ~instr_kill                &
+                             ~stall_ld_hz               &
+                             ~stall_cheri_trvk          &
+                             ~outstanding_memory_access;
+
+    // Allow a CHERI instruction to start execution: valid instruction not stalled by WB/hazards.
+    // Note we can't use instr_kill here since it includes id_exception (cheri_ex_err), which would
+    // cause a combinational loop.
+
+    assign cheri_exec_id_o = cheri_pmode_i & instr_valid_i &
+                            ~instr_fetch_err_i         &
+                            instr_is_legal_cheri       &
+                            controller_run             &
+                            ~wb_exception              &
+                            ~stall_ld_hz               &
+                            ~stall_cheri_trvk          &
+                            ~outstanding_memory_access;
+
+
+    `ASSERT(IbexExecutingSpecIfExecuting, instr_executing |-> instr_executing_spec)
+
+    `ASSERT(IbexStallIfValidInstrNotExecuting,
+      instr_valid_i & ~instr_kill & ~instr_executing |-> stall_id)
+
+    `ASSERT(IbexCannotRetireWithPendingExceptions,
+      instr_done |-> ~(wb_exception | outstanding_memory_access))
+
+    // Stall for reasons related to memory:
+    // * There is an outstanding memory access that won't resolve this cycle (need to wait to allow
+    //   precise exceptions)
+    // * There is a load/store request not being granted or which is unaligned and waiting to issue
+    //   a second request (needs to stay in ID for the address calculation)
+
+
+    // For pipeline timing/stalling, we treat cheri data load/stores the same as legacy RV32 load/stores
+    assign stall_mem = instr_valid_i & (outstanding_memory_access |
+                                        ((lsu_req_dec | cheri_lsu_req_dec) & ~lsu_req_done_i));
+
+    // If we stall a load in ID for any reason, it must not make an LSU request
+    // (otherwise we might issue two requests for the same instruction)
+    `ASSERT(IbexStallMemNoRequest,
+      instr_valid_i & lsu_req_dec & ~instr_done |-> ~lsu_req_done_i)
+
+    assign rf_rd_a_wb_match = (rf_waddr_wb_i == rf_raddr_a_o) & |rf_raddr_a_o;
+    assign rf_rd_b_wb_match = (rf_waddr_wb_i == rf_raddr_b_o) & |rf_raddr_b_o;
+
+    assign rf_rd_a_wb_match_o = rf_rd_a_wb_match;
+    assign rf_rd_b_wb_match_o = rf_rd_b_wb_match;
+
+    // If instruction is reading register that load will be writing stall in
+    // ID until load is complete. No need to stall when reading zero register.
+    assign rf_rd_a_hz = rf_rd_a_wb_match & rf_ren_a;
+    assign rf_rd_b_hz = rf_rd_b_wb_match & rf_ren_b;
+
+    // If the instruction reads a register that writeback is writing, forward the writeback data to
+    // the read data. Note this doesn't factor in load data as it arrives too late, such hazards
+    // are resolved via a stall (see above).
+    assign rf_rdata_a_fwd = rf_rd_a_wb_match & rf_write_wb_i ? rf_wdata_fwd_wb_i : rf_rdata_a_i;
+    assign rf_rdata_b_fwd = rf_rd_b_wb_match & rf_write_wb_i ? rf_wdata_fwd_wb_i : rf_rdata_b_i;
+
+    assign stall_ld_hz = outstanding_load_wb_i & (rf_rd_a_hz | rf_rd_b_hz);
+
+    logic rf_we_or_load_valid;
+    assign rf_we_or_load_valid = rf_we_or_load & instr_valid_i & ~instr_fetch_err_i & ~illegal_insn_o;
+   
+
+    assign stall_cheri_trvk = (CHERIoTEn & cheri_pmode_i & CheriPPLBC) ? 
+                               ((rf_ren_a && ~rf_reg_rdy_i[rf_raddr_a_o]) |
+                                (rf_ren_b && ~rf_reg_rdy_i[rf_raddr_b_o]) |
+                                (rf_we_or_load_valid && ~rf_reg_rdy_i[rf_waddr_id_o])) :
+                               1'b0;
+
+    assign instr_type_wb_o = ~lsu_req_dec ? WB_INSTR_OTHER :
+                              lsu_we      ? WB_INSTR_STORE :
+                                            WB_INSTR_LOAD;
+
+    assign instr_id_done_o = en_wb_o & ready_wb_i;
+
+    // Stall ID/EX as instruction in ID/EX cannot proceed to writeback yet
+    assign stall_wb = en_wb_o & ~ready_wb_i;
+
+    assign perf_dside_wait_o = instr_valid_i & ~instr_kill &
+                               (outstanding_memory_access | stall_ld_hz | stall_cheri_trvk);
+  end else begin : gen_no_stall_mem
+
+    assign multicycle_done = (cheri_lsu_req_dec | lsu_req_dec) ? lsu_resp_valid_i : ex_valid_all;
+
+    assign data_req_allowed = instr_first_cycle;
+
+    // Without Writeback Stage always stall the first cycle of a load/store.
+    // Then stall until it is complete
+    assign stall_mem = instr_valid_i & ((lsu_req_dec | cheri_lsu_req_dec) & (~lsu_resp_valid_i | instr_first_cycle));
+
+    // No load hazards without Writeback Stage
+    assign stall_ld_hz   = 1'b0;
+    assign stall_cheri_trvk = 1'b0;    // CheriPPLBC can't work with 2-stage pipeline configuration
+
+    // Without writeback stage any valid instruction that hasn't seen an error will execute
+    assign instr_executing_spec = instr_valid_i & ~instr_fetch_err_i & controller_run;
+    assign instr_executing      = instr_executing_spec;
+    assign cheri_exec_id_o      = instr_executing;
+
+    `ASSERT(IbexStallIfValidInstrNotExecuting,
+      instr_valid_i & ~instr_fetch_err_i & ~instr_executing & controller_run |-> stall_id)
+
+    // No data forwarding without writeback stage so always take source register data direct from
+    // register file
+    assign rf_rdata_a_fwd = rf_rdata_a_i;
+    assign rf_rdata_b_fwd = rf_rdata_b_i;
+
+    assign rf_rd_a_wb_match_o = 1'b0;
+    assign rf_rd_b_wb_match_o = 1'b0;
+
+    // Unused Writeback stage only IO & wiring
+    // Assign inputs and internal wiring to unused signals to satisfy lint checks
+    // Tie-off outputs to constant values
+    logic unused_data_req_done_ex;
+    logic [4:0] unused_rf_waddr_wb;
+    logic unused_rf_write_wb;
+    logic unused_outstanding_load_wb;
+    logic unused_outstanding_store_wb;
+    logic unused_wb_exception;
+    logic [31:0] unused_rf_wdata_fwd_wb;
+
+    assign unused_data_req_done_ex     = lsu_req_done_i;
+    assign unused_rf_waddr_wb          = rf_waddr_wb_i;
+    assign unused_rf_write_wb          = rf_write_wb_i;
+    assign unused_outstanding_load_wb  = outstanding_load_wb_i;
+    assign unused_outstanding_store_wb = outstanding_store_wb_i;
+    assign unused_wb_exception         = wb_exception;
+    assign unused_rf_wdata_fwd_wb      = rf_wdata_fwd_wb_i;
+
+    assign instr_type_wb_o = WB_INSTR_OTHER;
+    assign stall_wb        = 1'b0;
+    // NOTE(review): unlike stall_mem, cheri_lsu_req_dec accesses are not counted here; confirm
+    assign perf_dside_wait_o = instr_executing & lsu_req_dec & ~lsu_resp_valid_i;
+
+    assign instr_id_done_o = instr_done;
+  end
+
+  // Signal which instructions to count as retired in minstret, all traps along with ebrk and
+  // ecall instructions are not counted.
+  assign instr_perf_count_id_o = ~ebrk_insn & ~ecall_insn_dec & ~illegal_insn_dec &
+      ~illegal_csr_insn_i & ~instr_fetch_err_i;
+
+  // An instruction is ready to move to the writeback stage (or retire if there is no writeback
+  // stage)
+  assign en_wb_o = instr_done;
+
+  assign perf_mul_wait_o = stall_multdiv & mult_en_dec;
+  assign perf_div_wait_o = stall_multdiv & div_en_dec;
+
+  //////////
+  // FCOV //
+  //////////
+
+  `DV_FCOV_SIGNAL_GEN_IF(logic, rf_rd_wb_hz,
+    (gen_stall_mem.rf_rd_a_hz | gen_stall_mem.rf_rd_b_hz) & instr_valid_i, WritebackStage)
+  `DV_FCOV_SIGNAL(logic, branch_taken,
+    instr_executing & (id_fsm_q == FIRST_CYCLE) & branch_decision_i)
+  `DV_FCOV_SIGNAL(logic, branch_not_taken,
+    instr_executing & (id_fsm_q == FIRST_CYCLE) & ~branch_decision_i)
+
+  ////////////////
+  // Assertions //
+  ////////////////
+
+  // Selectors must be known/valid.
+  `ASSERT_KNOWN_IF(IbexAluOpMuxSelKnown, alu_op_a_mux_sel, instr_valid_i)
+  `ASSERT(IbexAluAOpMuxSelValid, instr_valid_i |-> alu_op_a_mux_sel inside {
+      OP_A_REG_A,
+      OP_A_FWD,
+      OP_A_CURRPC,
+      OP_A_IMM})
+  `ASSERT_KNOWN_IF(IbexBTAluAOpMuxSelKnown, bt_a_mux_sel, instr_valid_i)
+  `ASSERT(IbexBTAluAOpMuxSelValid, instr_valid_i |-> bt_a_mux_sel inside {
+      OP_A_REG_A,
+      OP_A_CURRPC})
+  `ASSERT_KNOWN_IF(IbexBTAluBOpMuxSelKnown, bt_b_mux_sel, instr_valid_i)
+  `ASSERT(IbexBTAluBOpMuxSelValid, instr_valid_i |-> bt_b_mux_sel inside {
+      IMM_B_I,
+      IMM_B_B,
+      IMM_B_J,
+      IMM_B_INCR_PC})
+  `ASSERT(IbexRegfileWdataSelValid, instr_valid_i |-> rf_wdata_sel inside {
+      RF_WD_EX,
+      RF_WD_CSR})
+  `ASSERT_KNOWN(IbexWbStateKnown, id_fsm_q)
+
+  // Branch decision must be valid when jumping.
+  `ASSERT_KNOWN_IF(IbexBranchDecisionValid, branch_decision_i,
+      instr_valid_i && !(illegal_csr_insn_i || instr_fetch_err_i))
+
+  // Instruction delivered to ID stage can not contain X.
+  `ASSERT_KNOWN_IF(IbexIdInstrKnown, instr_rdata_i,
+      instr_valid_i && !(illegal_c_insn_i || instr_fetch_err_i))
+
+  // Replicated (ALU) copy of the instruction delivered to ID stage can not contain X.
+  `ASSERT_KNOWN_IF(IbexIdInstrALUKnown, instr_rdata_alu_i,
+      instr_valid_i && !(illegal_c_insn_i || instr_fetch_err_i))
+
+  // Multicycle enable signals must be unique.
+  `ASSERT(IbexMulticycleEnableUnique,
+      $onehot0({lsu_req_dec, multdiv_en_dec, branch_in_dec, jump_in_dec}))
+
+  // Duplicated instruction flops must match
+  // === as DV environment can produce instructions with Xs in, so must use precise match that
+  // includes Xs
+  `ASSERT(IbexDuplicateInstrMatch, instr_valid_i |-> instr_rdata_i === instr_rdata_alu_i)
+
+  `ifdef CHECK_MISALIGNED
+  `ASSERT(IbexMisalignedMemoryAccess, !lsu_addr_incr_req_i)
+  `endif
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_if_stage.sv b/hw/ip/cheriot-ibex/rtl/cheriot_if_stage.sv
new file mode 100644
index 0000000..2829dd5
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_if_stage.sv
@@ -0,0 +1,807 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Instruction Fetch Stage
+ *
+ * Instruction fetch unit: Selection of the next PC, and buffering (sampling) of
+ * the read instruction.
+ */
+
+`include "prim_assert.sv"
+
+module cheriot_if_stage import cheriot_pkg::*; import cheri_pkg::*; #(
+  parameter int unsigned DmHaltAddr        = 32'h1A110800,
+  parameter int unsigned DmExceptionAddr   = 32'h1A110808,
+  parameter bit          DummyInstructions = 1'b0,
+  parameter bit          ICache            = 1'b0,
+  parameter bit          ICacheECC         = 1'b0,
+  parameter int unsigned BusSizeECC        = BUS_SIZE,
+  parameter int unsigned TagSizeECC        = IC_TAG_SIZE,
+  parameter int unsigned LineSizeECC       = IC_LINE_SIZE,
+  parameter bit          PCIncrCheck       = 1'b0,
+  parameter bit          ResetAll          = 1'b0,
+  parameter lfsr_seed_t  RndCnstLfsrSeed   = RndCnstLfsrSeedDefault,
+  parameter lfsr_perm_t  RndCnstLfsrPerm   = RndCnstLfsrPermDefault,
+  parameter bit          BranchPredictor   = 1'b0,
+  parameter bit          CHERIoTEn         = 1'b1
+) (
+  input  logic                         clk_i,
+  input  logic                         rst_ni,
+
+  input  logic                         cheri_pmode_i,
+  input  logic [31:0]                  boot_addr_i,              // also used for mtvec
+  input  logic                         req_i,                    // instruction request control
+  input  logic                         debug_mode_i,
+
+  // instruction cache interface
+  output logic                        instr_req_o,
+  output logic [31:0]                 instr_addr_o,
+  input  logic                        instr_gnt_i,
+  input  logic                        instr_rvalid_i,
+  input  logic [31:0]                 instr_rdata_i,
+  input  logic                        instr_err_i,
+
+  // ICache RAM IO
+  output logic [IC_NUM_WAYS-1:0]      ic_tag_req_o,
+  output logic                        ic_tag_write_o,
+  output logic [IC_INDEX_W-1:0]       ic_tag_addr_o,
+  output logic [TagSizeECC-1:0]       ic_tag_wdata_o,
+  input  logic [TagSizeECC-1:0]       ic_tag_rdata_i [IC_NUM_WAYS],
+  output logic [IC_NUM_WAYS-1:0]      ic_data_req_o,
+  output logic                        ic_data_write_o,
+  output logic [IC_INDEX_W-1:0]       ic_data_addr_o,
+  output logic [LineSizeECC-1:0]      ic_data_wdata_o,
+  input  logic [LineSizeECC-1:0]      ic_data_rdata_i [IC_NUM_WAYS],
+  input  logic                        ic_scr_key_valid_i,
+
+  // output of ID stage
+  output logic                        instr_valid_id_o,         // instr in IF-ID is valid
+  output logic                        instr_new_id_o,           // instr in IF-ID is new
+  output logic [31:0]                 instr_rdata_id_o,         // instr for ID stage
+  output logic [31:0]                 instr_rdata_alu_id_o,     // replicated instr for ID stage
+                                                                // to reduce fan-out
+  output logic [15:0]                 instr_rdata_c_id_o,       // compressed instr for ID stage
+                                                                // (mtval), meaningful only if
+                                                                // instr_is_compressed_id_o = 1'b1
+  output logic                        instr_is_compressed_id_o, // compressed decoder thinks this
+                                                                // is a compressed instr
+  output logic                        instr_bp_taken_o,         // instruction was predicted to be
+                                                                // a taken branch
+  output logic                        instr_fetch_err_o,        // bus error on fetch
+  output logic                        instr_fetch_err_plus2_o,  // bus error misaligned
+  output logic                        illegal_c_insn_id_o,      // compressed decoder thinks this
+                                                                // is an invalid instr
+  output logic                        instr_fetch_cheri_acc_vio_o,         
+  output logic                        instr_fetch_cheri_bound_vio_o,         
+  output logic                        dummy_instr_id_o,         // Instruction is a dummy
+  output logic [31:0]                 pc_if_o,
+  output logic [31:0]                 pc_id_o,
+  input  logic                        pmp_err_if_i,
+  input  logic                        pmp_err_if_plus2_i,
+
+  // control signals
+  input  logic                        instr_valid_clear_i,      // clear instr valid bit in IF-ID
+  input  logic                        pc_set_i,                 // set the PC to a new value
+  input  pc_sel_e                     pc_mux_i,                 // selector for PC multiplexer
+  input  logic                        nt_branch_mispredict_i,   // Not-taken branch in ID/EX was
+                                                                // mispredicted (predicted taken)
+  input  logic [31:0]                 nt_branch_addr_i,         // Not-taken branch address in ID/EX
+  input  exc_pc_sel_e                 exc_pc_mux_i,             // selects ISR address
+  input  exc_cause_e                  exc_cause,                // selects ISR address for
+                                                                // vectorized interrupt lines
+  input logic                         dummy_instr_en_i,
+  input logic [2:0]                   dummy_instr_mask_i,
+  input logic                         dummy_instr_seed_en_i,
+  input logic [31:0]                  dummy_instr_seed_i,
+  input logic                         icache_enable_i,
+  input logic                         icache_inval_i,
+
+  // jump and branch target
+  input  logic [31:0]                 branch_target_ex_i,       // branch/jump target address
+
+  // CSRs
+  input  logic [31:0]                 csr_mepc_i,               // PC to restore after handling
+                                                                // the interrupt/exception
+  input  logic [31:0]                 csr_depc_i,               // PC to restore after handling
+                                                                // the debug request
+  input  logic [31:0]                 csr_mtvec_i,              // base PC to jump to on exception
+  output logic                        csr_mtvec_init_o,         // tell CS regfile to init mtvec
+
+  // pipeline stall
+  input  logic                        id_in_ready_i,            // ID stage is ready for new instr
+
+  // misc signals
+  output logic                        pc_mismatch_alert_o,
+  output logic                        if_busy_o,                // IF stage is busy fetching instr
+  input  pcc_cap_t                    pcc_cap_i
+);
+
+  logic              instr_valid_id_d, instr_valid_id_q;
+  logic              instr_new_id_d, instr_new_id_q;
+
+  // prefetch buffer related signals
+  logic              prefetch_busy;
+  logic              branch_req;
+  logic       [31:0] fetch_addr_n;
+  logic              unused_fetch_addr_n0;
+
+  logic              fetch_valid;
+  logic              fetch_ready;
+  logic       [31:0] fetch_rdata;
+  logic       [31:0] fetch_addr;
+  logic              fetch_err;
+  logic              fetch_err_plus2;
+
+  logic [31:0]       instr_decompressed;
+  logic              illegal_c_insn;
+  logic              instr_is_compressed;
+
+  logic              if_instr_valid;
+  logic       [31:0] if_instr_rdata;
+  logic       [31:0] if_instr_addr;
+  logic              if_instr_bus_err;
+  logic              if_instr_pmp_err;
+  logic              if_instr_err;
+  logic              if_instr_err_plus2;
+
+  logic       [31:0] exc_pc;
+
+  logic        [5:0] irq_id;
+  logic              unused_irq_bit;
+
+  logic              if_id_pipe_reg_we; // IF-ID pipeline reg write enable
+
+  // Dummy instruction signals
+  logic              stall_dummy_instr;
+  logic [31:0]       instr_out;
+  logic              instr_is_compressed_out;
+  logic              illegal_c_instr_out;
+  logic              instr_err_out;
+
+  logic              predict_branch_taken;
+  logic       [31:0] predict_branch_pc;
+
+  cheriot_pkg::pc_sel_e pc_mux_internal;
+
+  logic        [7:0] unused_boot_addr;
+  logic        [7:0] unused_csr_mtvec;
+
+  logic              cheri_acc_vio, cheri_bound_vio;
+  logic              cheri_force_uc;
+
+  assign unused_boot_addr = boot_addr_i[7:0];
+  assign unused_csr_mtvec = csr_mtvec_i[7:0];
+
+  // extract interrupt ID from exception cause
+  assign irq_id         = {exc_cause};
+  assign unused_irq_bit = irq_id[5];   // MSB distinguishes interrupts from exceptions
+
+  // Exception PC mux. csr_mtvec_i[0] set or cheri_pmode_i low selects the 256-byte-aligned base.
+  always_comb begin : exc_pc_mux  // else: exceptions and IRQs both use {csr_mtvec_i[31:2], 2'b00}
+    unique case (exc_pc_mux_i)    // IRQ offset (irq_id[4:0] * 4) applies to the 256-byte base only
+      EXC_PC_EXC:     exc_pc = (csr_mtvec_i[0] | ~cheri_pmode_i)? { csr_mtvec_i[31:8], 8'h00 } : {csr_mtvec_i[31:2], 2'b00};
+      EXC_PC_IRQ:     exc_pc = (csr_mtvec_i[0] | ~cheri_pmode_i) ? { csr_mtvec_i[31:8], 1'b0, irq_id[4:0], 2'b00 } : {csr_mtvec_i[31:2], 2'b00};
+      EXC_PC_DBD:     exc_pc = DmHaltAddr;
+      EXC_PC_DBG_EXC: exc_pc = DmExceptionAddr;
+      default:        exc_pc = { csr_mtvec_i[31:8], 8'h00                    };
+    endcase
+  end
+
+  // The Branch predictor can provide a new PC which is internal to if_stage. Only override the mux
+  // select to choose this if the core isn't already trying to set a PC.
+  assign pc_mux_internal =
+    (BranchPredictor && predict_branch_taken && !pc_set_i) ? PC_BP : pc_mux_i;
+
+  // fetch address selection mux
+  always_comb begin : fetch_addr_mux
+    unique case (pc_mux_internal)
+      PC_BOOT: fetch_addr_n = { boot_addr_i[31:8], 8'h80 };
+      PC_JUMP: fetch_addr_n = branch_target_ex_i;
+      PC_EXC:  fetch_addr_n = exc_pc;                       // set PC to exception handler
+      PC_ERET: fetch_addr_n = csr_mepc_i;                   // restore PC when returning from EXC
+      PC_DRET: fetch_addr_n = csr_depc_i;
+      // Without branch predictor will never get pc_mux_internal == PC_BP. We still handle no branch
+      // predictor case here to ensure redundant mux logic isn't synthesised.
+      PC_BP:   fetch_addr_n = BranchPredictor ? predict_branch_pc : { boot_addr_i[31:8], 8'h80 };
+      default: fetch_addr_n = { boot_addr_i[31:8], 8'h80 };
+    endcase
+  end
+
+  // tell CS register file to initialize mtvec on boot
+  assign csr_mtvec_init_o = (pc_mux_i == PC_BOOT) & pc_set_i;
+  if (ICache) begin : gen_icache
+    // Full I-Cache option
+    cheriot_icache #(
+      .ICacheECC       (ICacheECC),
+      .ResetAll        (ResetAll),
+      .BusSizeECC      (BusSizeECC),
+      .TagSizeECC      (TagSizeECC),
+      .LineSizeECC     (LineSizeECC)
+    ) icache_i (
+        .clk_i               ( clk_i                      ),
+        .rst_ni              ( rst_ni                     ),
+
+        .req_i               ( req_i                      ),
+
+        .branch_i            ( branch_req                 ),
+        .branch_mispredict_i ( nt_branch_mispredict_i     ),
+        .mispredict_addr_i   ( nt_branch_addr_i           ),
+        .addr_i              ( {fetch_addr_n[31:1], 1'b0} ),
+
+        .ready_i             ( fetch_ready                ),
+        .valid_o             ( fetch_valid                ),
+        .rdata_o             ( fetch_rdata                ),
+        .addr_o              ( fetch_addr                 ),
+        .err_o               ( fetch_err                  ),
+        .err_plus2_o         ( fetch_err_plus2            ),
+
+        .instr_req_o         ( instr_req_o                ),
+        .instr_addr_o        ( instr_addr_o               ),
+        .instr_gnt_i         ( instr_gnt_i                ),
+        .instr_rvalid_i      ( instr_rvalid_i             ),
+        .instr_rdata_i       ( instr_rdata_i              ),
+        .instr_err_i         ( instr_err_i                ),
+
+        .ic_tag_req_o        ( ic_tag_req_o               ),
+        .ic_tag_write_o      ( ic_tag_write_o             ),
+        .ic_tag_addr_o       ( ic_tag_addr_o              ),
+        .ic_tag_wdata_o      ( ic_tag_wdata_o             ),
+        .ic_tag_rdata_i      ( ic_tag_rdata_i             ),
+        .ic_data_req_o       ( ic_data_req_o              ),
+        .ic_data_write_o     ( ic_data_write_o            ),
+        .ic_data_addr_o      ( ic_data_addr_o             ),
+        .ic_data_wdata_o     ( ic_data_wdata_o            ),
+        .ic_data_rdata_i     ( ic_data_rdata_i            ),
+        .ic_scr_key_valid_i  ( ic_scr_key_valid_i         ),
+
+        .icache_enable_i     ( icache_enable_i            ),
+        .icache_inval_i      ( icache_inval_i             ),
+        .busy_o              ( prefetch_busy              )
+    );
+
+  end else begin : gen_prefetch_buffer
+
+    // prefetch buffer, caches a fixed number of instructions
+    cheriot_prefetch_buffer #(
+      .ResetAll        (ResetAll)
+    ) prefetch_buffer_i (
+        .clk_i               ( clk_i                      ),
+        .rst_ni              ( rst_ni                     ),
+
+        .req_i               ( req_i                      ),
+
+        .branch_i            ( branch_req                 ),
+        .branch_mispredict_i ( nt_branch_mispredict_i     ),
+        .mispredict_addr_i   ( nt_branch_addr_i           ),
+        .addr_i              ( {fetch_addr_n[31:1], 1'b0} ),
+
+        .ready_i             ( fetch_ready                ),
+        .valid_o             ( fetch_valid                ),
+        .rdata_o             ( fetch_rdata                ),
+        .addr_o              ( fetch_addr                 ),
+        .err_o               ( fetch_err                  ),
+        .err_plus2_o         ( fetch_err_plus2            ),
+
+        .cheri_force_uc_i    ( cheri_force_uc            ),
+
+        .instr_req_o         ( instr_req_o                ),
+        .instr_addr_o        ( instr_addr_o               ),
+        .instr_gnt_i         ( instr_gnt_i                ),
+        .instr_rvalid_i      ( instr_rvalid_i             ),
+        .instr_rdata_i       ( instr_rdata_i              ),
+        .instr_err_i         ( instr_err_i                ),
+
+        .busy_o              ( prefetch_busy              )
+    );
+
+    // ICache tieoffs
+    logic                   unused_icen, unused_icinv, unused_scr_key_valid;
+    logic [TagSizeECC-1:0]  unused_tag_ram_input [IC_NUM_WAYS];
+    logic [LineSizeECC-1:0] unused_data_ram_input [IC_NUM_WAYS];
+    assign unused_icen           = icache_enable_i;
+    assign unused_icinv          = icache_inval_i;
+    assign unused_tag_ram_input  = ic_tag_rdata_i;
+    assign unused_data_ram_input = ic_data_rdata_i;
+    assign unused_scr_key_valid  = ic_scr_key_valid_i;
+    assign ic_tag_req_o          = 'b0;
+    assign ic_tag_write_o        = 'b0;
+    assign ic_tag_addr_o         = 'b0;
+    assign ic_tag_wdata_o        = 'b0;
+    assign ic_data_req_o         = 'b0;
+    assign ic_data_write_o       = 'b0;
+    assign ic_data_addr_o        = 'b0;
+    assign ic_data_wdata_o       = 'b0;
+
+`ifndef SYNTHESIS
+    // If we don't instantiate an icache and this is a simulation then we have a problem because the
+    // simulator might discard the icache module entirely, including some DPI exports that it
+    // implies. This then causes problems for linking against C++ testbench code that expected them.
+    // As a slightly ugly hack, let's define the DPI functions here (the real versions are defined
+    // in prim_util_get_scramble_params.svh)
+    export "DPI-C" function simutil_get_scramble_key;
+    export "DPI-C" function simutil_get_scramble_nonce;
+    function automatic int simutil_get_scramble_key(output bit [127:0] val);
+      return 0;
+    endfunction
+    function automatic int simutil_get_scramble_nonce(output bit [319:0] nonce);
+      return 0;
+    endfunction
+`endif
+  end
+
+  assign unused_fetch_addr_n0 = fetch_addr_n[0];
+
+  assign branch_req  = pc_set_i | predict_branch_taken;
+
+  assign pc_if_o     = if_instr_addr;
+  assign if_busy_o   = prefetch_busy;
+
+  // PMP errors
+  // An error can come from the instruction address, or the next instruction address for unaligned,
+  // uncompressed instructions.
+  assign if_instr_pmp_err = pmp_err_if_i |
+                            (if_instr_addr[1] & ~instr_is_compressed & pmp_err_if_plus2_i);
+
+  // Combine bus errors and pmp errors
+  assign if_instr_err = if_instr_bus_err | if_instr_pmp_err | cheri_acc_vio | cheri_bound_vio;
+
+  // Capture the second half of the address for errors on the second part of an instruction
+  // LEC_NOT_COMPATIBLE
+  assign if_instr_err_plus2 = ((if_instr_addr[1] & ~instr_is_compressed & pmp_err_if_plus2_i) |
+                               fetch_err_plus2) & ~pmp_err_if_i;
+
+  // pre-calculate headroom to improve memory read timing
+  logic [33:0] instr_hdrm;
+  logic        hdrm_ge4, hdrm_ge2, hdrm_ok, base_ok;
+  logic        allow_all;
+
+  // allow_all is used to permit the pc wraparound case (pc = 0xffff_fffe, uncompressed instruction)
+  // - in this case the fetch should be allowed if the pcc bounds cover the entire 32-bit space.
+  // - without this special case the fetch would raise an error since headroom < 4
+  assign allow_all  = (pcc_cap_i.base32==0) & (pcc_cap_i.top33==33'h1_0000_0000);
+
+  assign instr_hdrm = {1'b0, pcc_cap_i.top33} - {2'b00, if_instr_addr};
+  assign hdrm_ge4   = (|instr_hdrm[32:2]) & ~instr_hdrm[33];     // >= 4
+  assign hdrm_ge2   = (|instr_hdrm[32:1]) & ~instr_hdrm[33];     // >= 2
+  assign hdrm_ok    = allow_all || (instr_is_compressed ? hdrm_ge2 : hdrm_ge4);
+  assign base_ok    = ~(if_instr_addr < pcc_cap_i.base32);
+
+  // only issue cheri_bound_vio on valid fetches
+  assign cheri_bound_vio = CHERIoTEn & cheri_pmode_i & ~debug_mode_i & (~base_ok  || ~hdrm_ok);
+
+  // In order to have constant timing (avoid side-channel leakage due to data-dependent behavior),
+  // if base vio or headroom < 4 (we are only authorized to fetch 2 bytes), force the fetch_fifo
+  // to treat the current rdata as an unaligned compressed instruction if pc[1]=1, and push it to
+  // the ID stage without waiting for the 2nd part of the 32-bit instruction.
+  //
+  assign cheri_force_uc = CHERIoTEn & cheri_pmode_i & ~allow_all & (~base_ok | ~hdrm_ge4);
+
+  // we still check seal/perm here to be safe; however, by ISA those can't happen at fetch time
+  // since they are already checked elsewhere
+  assign cheri_acc_vio = CHERIoTEn & cheri_pmode_i & ~debug_mode_i & 
+                         (~pcc_cap_i.perms[PERM_EX] || ~pcc_cap_i.valid || (pcc_cap_i.otype!=0));
+
+  // compressed instruction decoding, or more precisely compressed instruction
+  // expander
+  //
+  // since it does not matter where we decompress instructions, we do it here
+  // to ease timing closure
+  cheriot_compressed_decoder #(
+    .CHERIoTEn (CHERIoTEn)
+  ) compressed_decoder_i (
+    .clk_i          (clk_i),
+    .rst_ni         (rst_ni),
+    .valid_i        (fetch_valid & ~fetch_err),
+    .instr_i        (if_instr_rdata),
+    .cheri_pmode_i  (cheri_pmode_i),
+    .instr_o        (instr_decompressed),
+    .is_compressed_o(instr_is_compressed),
+    .illegal_instr_o(illegal_c_insn)
+  );
+
+  // Dummy instruction insertion
+  if (DummyInstructions) begin : gen_dummy_instr
+    logic        insert_dummy_instr;
+    logic [31:0] dummy_instr_data;
+
+    cheriot_dummy_instr #(
+      .RndCnstLfsrSeed (RndCnstLfsrSeed),
+      .RndCnstLfsrPerm (RndCnstLfsrPerm)
+    ) dummy_instr_i (
+      .clk_i                (clk_i),
+      .rst_ni               (rst_ni),
+      .dummy_instr_en_i     (dummy_instr_en_i),
+      .dummy_instr_mask_i   (dummy_instr_mask_i),
+      .dummy_instr_seed_en_i(dummy_instr_seed_en_i),
+      .dummy_instr_seed_i   (dummy_instr_seed_i),
+      .fetch_valid_i        (fetch_valid),
+      .id_in_ready_i        (id_in_ready_i),
+      .insert_dummy_instr_o (insert_dummy_instr),
+      .dummy_instr_data_o   (dummy_instr_data)
+    );
+
+    // Mux between actual instructions and dummy instructions
+    assign instr_out               = insert_dummy_instr ? dummy_instr_data : instr_decompressed;
+    assign instr_is_compressed_out = insert_dummy_instr ? 1'b0 : instr_is_compressed;
+    assign illegal_c_instr_out     = insert_dummy_instr ? 1'b0 : illegal_c_insn;
+    assign instr_err_out           = insert_dummy_instr ? 1'b0 : if_instr_err;
+
+    // Stall the IF stage if we insert a dummy instruction. The dummy will execute between whatever
+    // is currently in the ID stage and whatever is valid from the prefetch buffer this cycle. The
+    // PC of the dummy instruction will match whatever is next from the prefetch buffer.
+    assign stall_dummy_instr = insert_dummy_instr;
+
+    // Register the dummy instruction indication into the ID stage
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        dummy_instr_id_o <= 1'b0;
+      end else if (if_id_pipe_reg_we) begin
+        dummy_instr_id_o <= insert_dummy_instr;
+      end
+    end
+
+  end else begin : gen_no_dummy_instr
+    logic        unused_dummy_en;
+    logic [2:0]  unused_dummy_mask;
+    logic        unused_dummy_seed_en;
+    logic [31:0] unused_dummy_seed;
+
+    assign unused_dummy_en         = dummy_instr_en_i;
+    assign unused_dummy_mask       = dummy_instr_mask_i;
+    assign unused_dummy_seed_en    = dummy_instr_seed_en_i;
+    assign unused_dummy_seed       = dummy_instr_seed_i;
+    assign instr_out               = instr_decompressed;
+    assign instr_is_compressed_out = instr_is_compressed;
+    assign illegal_c_instr_out     = illegal_c_insn;
+    assign instr_err_out           = if_instr_err;
+    assign stall_dummy_instr       = 1'b0;
+    assign dummy_instr_id_o        = 1'b0;
+  end
+
+  // The ID stage becomes valid as soon as any instruction is registered in the ID stage flops.
+  // Note that the current instruction is squashed by the incoming pc_set_i signal.
+  // Valid is held until it is explicitly cleared (due to an instruction completing or an exception)
+  assign instr_valid_id_d = (if_instr_valid & id_in_ready_i & ~pc_set_i) |
+                            (instr_valid_id_q & ~instr_valid_clear_i);
+  assign instr_new_id_d   = if_instr_valid & id_in_ready_i;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      instr_valid_id_q <= 1'b0;
+      instr_new_id_q   <= 1'b0;
+    end else begin
+      instr_valid_id_q <= instr_valid_id_d;
+      instr_new_id_q   <= instr_new_id_d;
+    end
+  end
+
+  assign instr_valid_id_o = instr_valid_id_q;
+  // Signal when a new instruction enters the ID stage (only used for RVFI signalling).
+  assign instr_new_id_o   = instr_new_id_q;
+
+  // IF-ID pipeline registers, frozen when the ID stage is stalled
+  assign if_id_pipe_reg_we = instr_new_id_d;
+
+  if (ResetAll) begin : g_instr_rdata_ra
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        instr_rdata_id_o         <= '0;
+        instr_rdata_alu_id_o     <= '0;
+        instr_fetch_err_o        <= '0;
+        instr_fetch_err_plus2_o  <= '0;
+        instr_rdata_c_id_o       <= '0;
+        instr_is_compressed_id_o <= '0;
+        illegal_c_insn_id_o      <= '0;
+        pc_id_o                  <= '0;
+        instr_fetch_cheri_acc_vio_o   <= '0;
+        instr_fetch_cheri_bound_vio_o <= '0;
+      end else if (if_id_pipe_reg_we) begin
+        instr_rdata_id_o         <= instr_out;
+        // To reduce fan-out and help timing from the instr_rdata_id flops they are replicated.
+        instr_rdata_alu_id_o     <= instr_out;
+        instr_fetch_err_o        <= instr_err_out;
+        instr_fetch_err_plus2_o  <= if_instr_err_plus2;
+        instr_rdata_c_id_o       <= if_instr_rdata[15:0];
+        instr_is_compressed_id_o <= instr_is_compressed_out;
+        illegal_c_insn_id_o      <= illegal_c_instr_out;
+        pc_id_o                  <= pc_if_o;
+        instr_fetch_cheri_acc_vio_o    <= cheri_acc_vio; 
+        instr_fetch_cheri_bound_vio_o  <= cheri_bound_vio; 
+      end
+    end
+  end else begin : g_instr_rdata_nr
+    always_ff @(posedge clk_i) begin
+      if (if_id_pipe_reg_we) begin
+        instr_rdata_id_o         <= instr_out;
+        // To reduce fan-out and help timing from the instr_rdata_id flops they are replicated.
+        instr_rdata_alu_id_o     <= instr_out;
+        instr_fetch_err_o        <= instr_err_out;
+        instr_fetch_err_plus2_o  <= if_instr_err_plus2;
+        instr_rdata_c_id_o       <= if_instr_rdata[15:0];
+        instr_is_compressed_id_o <= instr_is_compressed_out;
+        illegal_c_insn_id_o      <= illegal_c_instr_out;
+        pc_id_o                  <= pc_if_o;
+        instr_fetch_cheri_acc_vio_o    <= cheri_acc_vio; 
+        instr_fetch_cheri_bound_vio_o  <= cheri_bound_vio; 
+      end
+    end
+  end
+
+  // Check for expected increments of the PC when security hardening enabled
+  if (PCIncrCheck) begin : g_secure_pc
+    logic [31:0] prev_instr_addr_incr, prev_instr_addr_incr_buf;
+    logic        prev_instr_seq_q, prev_instr_seq_d;
+
+    // Do not check for sequential increase after a branch, jump, exception, interrupt or debug
+    // request, all of which will set branch_req. Also do not check after reset or for dummies.
+    assign prev_instr_seq_d = (prev_instr_seq_q | instr_new_id_d) &
+        ~branch_req & ~if_instr_err & ~stall_dummy_instr;
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        prev_instr_seq_q <= 1'b0;
+      end else begin
+        prev_instr_seq_q <= prev_instr_seq_d;
+      end
+    end
+
+    assign prev_instr_addr_incr = pc_id_o + (instr_is_compressed_id_o ? 32'd2 : 32'd4);
+
+    `ifdef FPGA
+    // Buffer anticipated next PC address to ensure optimiser cannot remove the check.
+    prim_buf #(.Width(32)) u_prev_instr_addr_incr_buf (
+      .in_i (prev_instr_addr_incr),
+      .out_o(prev_instr_addr_incr_buf)
+    );
+    `else
+      assign prev_instr_addr_incr_buf = prev_instr_addr_incr;
+    `endif
+
+    // Check that the address equals the previous address +2/+4
+    assign pc_mismatch_alert_o = prev_instr_seq_q & (pc_if_o != prev_instr_addr_incr_buf);
+
+  end else begin : g_no_secure_pc
+    assign pc_mismatch_alert_o = 1'b0;
+  end
+
+  if (BranchPredictor) begin : g_branch_predictor
+    logic [31:0] instr_skid_data_q;
+    logic [31:0] instr_skid_addr_q;
+    logic        instr_skid_bp_taken_q;
+    logic        instr_skid_valid_q, instr_skid_valid_d;
+    logic        instr_skid_en;
+    logic        instr_bp_taken_q, instr_bp_taken_d;
+
+    logic        predict_branch_taken_raw;
+
+    // ID stage needs to know if branch was predicted taken so it can signal mispredicts.
+    // The flag is written when if_id_pipe_reg_we is set and only has a reset value when ResetAll
+    // is set.
+    if (ResetAll) begin : g_bp_taken_ra
+      always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (!rst_ni) begin
+          instr_bp_taken_q <= '0;
+        end else if (if_id_pipe_reg_we) begin
+          instr_bp_taken_q <= instr_bp_taken_d;
+        end
+      end
+    end else begin : g_bp_taken_nr
+      always_ff @(posedge clk_i) begin
+        if (if_id_pipe_reg_we) begin
+          instr_bp_taken_q <= instr_bp_taken_d;
+        end
+      end
+    end
+
+    // When branch prediction is enabled a skid buffer between the IF and ID/EX stage is introduced.
+    // If an instruction in IF is predicted to be a taken branch and ID/EX is not ready the
+    // instruction in IF is moved to the skid buffer which becomes the output of the IF stage until
+    // the ID/EX stage accepts the instruction. The skid buffer is required as otherwise the ID/EX
+    // ready signal is coupled to the instr_req_o output which produces a feedthrough path from
+    // data_gnt_i -> instr_req_o (which needs to be avoided as for some interconnects this will
+    // result in a combinational loop).
+
+    // Capture only when the skid buffer is empty and pc_set_i does not override the prediction.
+    assign instr_skid_en = predict_branch_taken & ~pc_set_i & ~id_in_ready_i & ~instr_skid_valid_q;
+
+    // The entry stays valid until id_in_ready_i or stall_dummy_instr is seen.
+    assign instr_skid_valid_d = (instr_skid_valid_q & ~id_in_ready_i & ~stall_dummy_instr) |
+                                instr_skid_en;
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        instr_skid_valid_q <= 1'b0;
+      end else begin
+        instr_skid_valid_q <= instr_skid_valid_d;
+      end
+    end
+
+    // Skid buffer payload (prediction flag, instruction data and address), written on
+    // instr_skid_en. As above, the payload only has a reset value when ResetAll is set.
+    if (ResetAll) begin : g_instr_skid_ra
+      always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (!rst_ni) begin
+          instr_skid_bp_taken_q <= '0;
+          instr_skid_data_q     <= '0;
+          instr_skid_addr_q     <= '0;
+        end else if (instr_skid_en) begin
+          instr_skid_bp_taken_q <= predict_branch_taken;
+          instr_skid_data_q     <= fetch_rdata;
+          instr_skid_addr_q     <= fetch_addr;
+        end
+      end
+    end else begin : g_instr_skid_nr
+      always_ff @(posedge clk_i) begin
+        if (instr_skid_en) begin
+          instr_skid_bp_taken_q <= predict_branch_taken;
+          instr_skid_data_q     <= fetch_rdata;
+          instr_skid_addr_q     <= fetch_addr;
+        end
+      end
+    end
+
+    cheriot_branch_predict branch_predict_i (
+      .clk_i        (clk_i),
+      .rst_ni       (rst_ni),
+      .fetch_rdata_i(fetch_rdata),
+      .fetch_pc_i   (fetch_addr),
+      .fetch_valid_i(fetch_valid),
+
+      .predict_branch_taken_o(predict_branch_taken_raw),
+      .predict_branch_pc_o   (predict_branch_pc)
+    );
+
+    // If there is an instruction in the skid buffer there must be no branch prediction.
+    // Instructions are only placed in the skid after they have been predicted to be a taken branch
+    // so with the skid valid any prediction has already occurred.
+    // Do not branch predict on instruction errors.
+    assign predict_branch_taken = predict_branch_taken_raw & ~instr_skid_valid_q & ~fetch_err;
+
+    // While the skid buffer is valid it replaces the fetch output as the IF stage instruction.
+    // A skid entry is not presented as valid in a cycle where nt_branch_mispredict_i is set.
+    assign if_instr_valid   = fetch_valid | (instr_skid_valid_q & ~nt_branch_mispredict_i);
+    assign if_instr_rdata   = instr_skid_valid_q ? instr_skid_data_q : fetch_rdata;
+    assign if_instr_addr    = instr_skid_valid_q ? instr_skid_addr_q : fetch_addr;
+
+    // Don't branch predict on instruction error so only instructions without errors end up in the
+    // skid buffer.
+    assign if_instr_bus_err = ~instr_skid_valid_q & fetch_err;
+    assign instr_bp_taken_d = instr_skid_valid_q ? instr_skid_bp_taken_q : predict_branch_taken;
+
+    // The fetch side is held off while the skid buffer is occupied.
+    assign fetch_ready = id_in_ready_i & ~stall_dummy_instr & ~instr_skid_valid_q;
+
+    assign instr_bp_taken_o = instr_bp_taken_q;
+
+    `ASSERT(NoPredictSkid, instr_skid_valid_q |-> ~predict_branch_taken)
+    `ASSERT(NoPredictIllegal, predict_branch_taken |-> ~illegal_c_insn)
+  end else begin : g_no_branch_predictor
+    // No predictor: never predict, and pass the fetch output straight through to the IF outputs.
+    assign instr_bp_taken_o     = 1'b0;
+    assign predict_branch_taken = 1'b0;
+    assign predict_branch_pc    = 32'b0;
+
+    assign if_instr_valid = fetch_valid;
+    assign if_instr_rdata = fetch_rdata;
+    assign if_instr_addr  = fetch_addr;
+    assign if_instr_bus_err = fetch_err;
+    assign fetch_ready = id_in_ready_i & ~stall_dummy_instr;
+  end
+
+  ////////////////
+  // Assertions //
+  ////////////////
+
+  // Selectors must be known/valid.
+  `ASSERT_KNOWN(IbexExcPcMuxKnown, exc_pc_mux_i)
+
+  if (BranchPredictor) begin : g_branch_predictor_asserts
+    // With a branch predictor PC_BP is an additional legal PC mux selection.
+    `ASSERT_IF(IbexPcMuxValid, pc_mux_internal inside {
+        PC_BOOT,
+        PC_JUMP,
+        PC_EXC,
+        PC_ERET,
+        PC_DRET,
+        PC_BP},
+      pc_set_i)
+
+`ifdef INC_ASSERT
+    /**
+     * Checks for branch prediction interface to fetch_fifo/icache
+     *
+     * The interface has two signals:
+     * - predicted_branch_i: When set with a branch (branch_i) indicates the branch is a predicted
+     *   one, it should be ignored when a branch_i isn't set.
+     * - branch_mispredict_i: Indicates the previously predicted branch was mis-predicted and
+     *   execution should resume with the not-taken side of the branch (i.e. continue with the PC
+     *   that followed the predicted branch). This must be raised before the instruction that is
+     *   made available following a predicted branch is accepted (Following a cycle with branch_i
+     *   & predicted_branch_i, branch_mispredict_i can only be asserted before or on the same cycle
+     *   as seeing fetch_valid & fetch_ready). When branch_mispredict_i is asserted, fetch_valid may
+     *   be asserted in response. If fetch_valid is asserted on the same cycle as
+     *   branch_mispredict_i this indicates the fetch_fifo/icache has the not-taken side of the
+     *   branch immediately ready for use
+     */
+    // Tracking state for the checks below; only compiled when INC_ASSERT is defined.
+    //  - predicted_branch_live_q: a predicted branch was taken and the instruction following it
+    //    has not been accepted (fetch_valid & fetch_ready) yet.
+    //  - mispredicted_q: a mispredict has been signalled for that branch.
+    //  - predicted_branch_nt_pc_q: not-taken successor PC recorded when the branch was taken.
+    logic        predicted_branch_live_q, predicted_branch_live_d;
+    logic        mispredicted_d, mispredicted_q;
+    logic [31:0] predicted_branch_nt_pc_q;
+    logic [31:0] next_pc;
+
+    // Not-taken successor of the instruction at the fetch output (fetch_addr + 2 or 4).
+    assign next_pc = fetch_addr + (instr_is_compressed_out ? 32'd2 : 32'd4);
+
+    logic predicted_branch;
+
+    // pc_set_i takes precedence over branch prediction
+    assign predicted_branch = predict_branch_taken & ~pc_set_i;
+
+    always_comb begin
+      predicted_branch_live_d = predicted_branch_live_q;
+      mispredicted_d          = mispredicted_q;
+
+      if (branch_req & predicted_branch) begin
+        // A new predicted branch starts a fresh tracking window.
+        predicted_branch_live_d = 1'b1;
+        mispredicted_d          = 1'b0;
+      end else if (predicted_branch_live_q) begin
+        if (fetch_valid & fetch_ready) begin
+          // The instruction following the predicted branch was accepted: window closes.
+          predicted_branch_live_d = 1'b0;
+        end else if (nt_branch_mispredict_i) begin
+          mispredicted_d = 1'b1;
+        end
+      end
+    end
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        predicted_branch_live_q <= 1'b0;
+        mispredicted_q          <= 1'b0;
+      end else begin
+        predicted_branch_live_q <= predicted_branch_live_d;
+        mispredicted_q          <= mispredicted_d;
+      end
+    end
+
+    // No reset: the value is only compared while predicted_branch_live_q is set, which requires
+    // a prior write here.
+    always_ff @(posedge clk_i) begin
+      if (branch_req & predicted_branch) begin
+        predicted_branch_nt_pc_q <= next_pc;
+      end
+    end
+
+    // Must only see mispredict after we've performed a predicted branch but before we've accepted
+    // any instruction (with fetch_ready & fetch_valid) that follows that predicted branch.
+    `ASSERT(MispredictOnlyImmediatelyAfterPredictedBranch,
+      nt_branch_mispredict_i |-> predicted_branch_live_q)
+    // Check that on mispredict we get the correct PC for the non-taken side of the branch when
+    // prefetch buffer/icache makes that PC available.
+    `ASSERT(CorrectPCOnMispredict,
+      predicted_branch_live_q & mispredicted_d & fetch_valid |->
+      fetch_addr == predicted_branch_nt_pc_q)
+    // Must not signal mispredict over multiple cycles but it's possible to have back to back
+    // mispredicts for different branches (core signals mispredict, prefetch buffer/icache
+    // immediately has not-taken side of the mispredicted branch ready, which itself is a predicted
+    // branch, following cycle core signals that that branch has mispredicted).
+    `ASSERT(MispredictSingleCycle,
+      nt_branch_mispredict_i & ~(fetch_valid & fetch_ready) |=> ~nt_branch_mispredict_i)
+    // Note that we should never see a mispredict and an incoming branch on the same cycle.
+    // The mispredict also cancels any predicted branch so overall branch_req must be low.
+    `ASSERT(NoMispredBranch, nt_branch_mispredict_i |-> ~branch_req)
+`endif
+
+  end else begin : g_no_branch_predictor_asserts
+    // Without a branch predictor PC_BP is not a legal PC mux selection.
+    `ASSERT_IF(IbexPcMuxValid, pc_mux_internal inside {
+        PC_BOOT,
+        PC_JUMP,
+        PC_EXC,
+        PC_ERET,
+        PC_DRET},
+      pc_set_i)
+  end
+
+  // Boot address must be aligned to 256 bytes. (These assertions are named after the violation
+  // they catch; the property itself states the legal condition.)
+  `ASSERT(IbexBootAddrUnaligned, boot_addr_i[7:0] == 8'h00)
+
+  // Address must not contain X when request is sent.
+  `ASSERT(IbexInstrAddrUnknown, instr_req_o |-> !$isunknown(instr_addr_o))
+
+  // Address must be word aligned when request is sent.
+  `ASSERT(IbexInstrAddrUnaligned, instr_req_o |-> (instr_addr_o[1:0] == 2'b00))
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_load_store_unit.sv b/hw/ip/cheriot-ibex/rtl/cheriot_load_store_unit.sv
new file mode 100644
index 0000000..ebbe74a
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_load_store_unit.sv
@@ -0,0 +1,760 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+
+/**
+ * Load Store Unit
+ *
+ * Load Store Unit, used to eliminate multiple access during processor stalls,
+ * and to align bytes and halfwords.
+ */
+
+`include "prim_assert.sv"
+`include "dv_fcov_macros.svh"
+
+module cheriot_load_store_unit import cheriot_pkg::*; import cheri_pkg::*; #(
+  parameter bit CHERIoTEn   = 1'b1,
+  parameter bit MemCapFmt   = 1'b0,
+  parameter bit CheriTBRE   = 1'b0,
+  parameter bit CheriCapIT8 = 1'b0
+)(
+  input  logic         clk_i,
+  input  logic         rst_ni,
+  input  logic         cheri_pmode_i,
+
+  // data interface
+  output logic         data_req_o,
+  output logic         data_is_cap_o,
+  input  logic         data_gnt_i,
+  input  logic         data_rvalid_i,
+  input  logic         data_err_i,
+  input  logic         data_pmp_err_i,
+
+  output logic [31:0]  data_addr_o,
+  output logic         data_we_o,
+  output logic [3:0]   data_be_o,
+  output logic [32:0]  data_wdata_o,         // kliu
+  input  logic [32:0]  data_rdata_i,         // kliu
+
+  // signals to/from ID/EX stage
+  input  logic         lsu_we_i,             // write enable                     -> from ID/EX
+  input  logic         lsu_is_cap_i,         // kliu
+  input  logic         lsu_cheri_err_i,      // kliu
+  input  logic [1:0]   lsu_type_i,           // data type: word, half word, byte -> from ID/EX
+  input  logic [32:0]  lsu_wdata_i,          // data to write to memory          -> from ID/EX
+  input  reg_cap_t     lsu_wcap_i,           // kliu
+  input  logic [3:0]   lsu_lc_clrperm_i,
+  input  logic         lsu_sign_ext_i,       // sign extension                   -> from ID/EX
+  input  logic         cpu_stall_by_stkz_i, 
+  input  logic         cpu_grant_to_stkz_i, 
+
+  output reg_cap_t     lsu_rcap_o,           // kliu
+  output logic [32:0]  lsu_rdata_o,          // requested data                   -> to ID/EX
+  output logic         lsu_rdata_valid_o,
+  input  logic         lsu_req_i,            // data request                     -> from ID/EX
+
+  input  logic [31:0]  lsu_addr_i,           // address computed in ALU          -> from ID/EX
+
+  output logic         lsu_addr_incr_req_o,  // request address increment for
+                                              // misaligned accesses              -> to ID/EX
+  output logic [31:0]  addr_last_o,          // address of last transaction      -> to controller
+                                              // -> mtval
+                                              // -> AGU for misaligned accesses
+
+  output logic         lsu_req_done_o,       // Signals that data request is complete
+                                              // (only need to await final data
+                                              // response)                        -> to ID/EX
+  output logic         lsu_resp_valid_o,     // LSU has response from transaction -> to ID/EX & WB
+  output logic         lsu_resp_is_wr_o,
+
+  // TBRE related signals
+  input  logic         tbre_lsu_req_i,
+  input  logic         cpu_lsu_dec_i,
+  output logic         lsu_tbre_sel_o,        // request-side selection signal
+  output logic         lsu_tbre_addr_incr_req_o,  // request address increment for
+  output logic [32:0]  lsu_tbre_raw_lsw_o,
+  output logic         lsu_tbre_req_done_o,
+  output logic         lsu_tbre_resp_valid_o, // response from transaction -> to TBRE 
+  output logic         lsu_tbre_resp_err_o,
+
+  // exception signals
+  output logic         load_err_o,
+  output logic         store_err_o,
+  output logic         lsu_err_is_cheri_o,
+
+  output logic         busy_o,
+  output logic         busy_tbre_o,
+
+  output logic         perf_load_o,
+  output logic         perf_store_o
+);
+
+  logic [31:0]  data_addr;
+  logic [31:0]  data_addr_w_aligned;
+  logic [31:0]  addr_last_q, addr_last_d;
+
+  logic         addr_update;
+  logic         ctrl_update;
+  logic         rdata_update;
+  logic [31:8]  rdata_q;
+  logic [1:0]   rdata_offset_q;
+  logic [1:0]   data_type_q;
+  logic         data_sign_ext_q;
+  logic         data_we_q;
+
+  logic [1:0]   data_offset;   // mux control for data to be written to memory
+
+  logic [3:0]   data_be;
+  logic [32:0]  data_wdata;
+
+  logic [32:0]  data_rdata_ext;
+
+  logic [32:0]  rdata_w_ext; // word realignment for misaligned loads
+  logic [31:0]  rdata_h_ext; // sign extension for half words
+  logic [31:0]  rdata_b_ext; // sign extension for bytes
+
+  logic         split_misaligned_access;
+  logic         handle_misaligned_q, handle_misaligned_d; // high after receiving grant for first
+                                                          // part of a misaligned access
+  logic         pmp_err_q, pmp_err_d;
+  logic         lsu_err_q, lsu_err_d;
+  logic         data_or_pmp_err;
+
+  logic         resp_is_cap_q;
+  logic         cheri_err_d, cheri_err_q;
+  logic [3:0]   resp_lc_clrperm_q;
+  logic         cur_req_is_tbre;
+  logic         req_is_tbre_q;
+  logic         resp_is_tbre;
+  logic         tbre_req_good;
+  logic         outstanding_resp_q, resp_wait;
+  logic         lsu_resp_valid;
+  logic         lsu_go;
+  logic         addr_incr_req;
+  logic         cpu_req_erred, cpu_req_valid;
+  
+
+  ls_fsm_e ls_fsm_cs, ls_fsm_ns;
+
+  cap_rx_fsm_t cap_rx_fsm_q, cap_rx_fsm_d;
+
+  logic         cap_lsw_err_q;
+  logic [32:0]  cap_lsw_q;
+
+  assign data_addr   = lsu_addr_i;
+  assign data_offset = (cheri_pmode_i & lsu_is_cap_i) ? 2'b00 : data_addr[1:0];
+
+  ///////////////////
+  // BE generation //
+  ///////////////////
+
+  // Byte enables for the current beat. Selected by access type (lsu_type_i), byte offset
+  // (data_offset) and, for accesses split over two words, by whether this is the first or the
+  // second beat (handle_misaligned_q). Every unknown selector falls back to 4'b1111.
+  always_comb begin
+    if (CHERIoTEn & cheri_pmode_i & lsu_is_cap_i)
+      data_be = 4'b1111;        // caps are always word aligned
+    else begin
+      unique case (lsu_type_i) // Data type 00 Word, 01 Half word, 11,10 byte
+        2'b00: begin // Writing a word
+          if (!handle_misaligned_q) begin // first part of potentially misaligned transaction
+            // Lanes from data_offset up to the top of the word.
+            unique case (data_offset)
+              2'b00:   data_be = 4'b1111;
+              2'b01:   data_be = 4'b1110;
+              2'b10:   data_be = 4'b1100;
+              2'b11:   data_be = 4'b1000;
+              default: data_be = 4'b1111;
+            endcase // case (data_offset)
+          end else begin // second part of misaligned transaction
+            // The low lanes that the first part did not cover.
+            unique case (data_offset)
+              2'b00:   data_be = 4'b0000; // this is not used, but included for completeness
+              2'b01:   data_be = 4'b0001;
+              2'b10:   data_be = 4'b0011;
+              2'b11:   data_be = 4'b0111;
+              default: data_be = 4'b1111;
+            endcase // case (data_offset)
+          end
+        end
+
+        2'b01: begin // Writing a half word
+          if (!handle_misaligned_q) begin // first part of potentially misaligned transaction
+            unique case (data_offset)
+              2'b00:   data_be = 4'b0011;
+              2'b01:   data_be = 4'b0110;
+              2'b10:   data_be = 4'b1100;
+              2'b11:   data_be = 4'b1000; // only the low byte fits; the rest is a second part
+              default: data_be = 4'b1111;
+            endcase // case (data_offset)
+          end else begin // second part of misaligned transaction
+            data_be = 4'b0001;
+          end
+        end
+
+        2'b10,
+        2'b11: begin // Writing a byte
+          // handle_misaligned_q is not consulted for byte accesses.
+          unique case (data_offset)
+            2'b00:   data_be = 4'b0001;
+            2'b01:   data_be = 4'b0010;
+            2'b10:   data_be = 4'b0100;
+            2'b11:   data_be = 4'b1000;
+            default: data_be = 4'b1111;
+          endcase // case (data_offset)
+        end
+
+        default:     data_be = 4'b1111;
+      endcase // case (lsu_type_i)
+    end  // else: not a capability access
+  end
+
+  /////////////////////
+  // WData alignment //
+  /////////////////////
+
+  // Prepare the 33-bit data to be written to the memory.
+  //  - Capability stores (CHERIoTEn & cheri_pmode_i & lsu_is_cap_i) take two beats; the beat
+  //    driven while the FSM is in CTX_WAIT_GNT2 is the second one.
+  //      MemCapFmt == 0: first beat is lsu_wdata_i, second beat is the encoded lsu_wcap_i.
+  //      MemCapFmt == 1: both beats come from one 66-bit encoding of (lsu_wcap_i, lsu_wdata_i).
+  //  - All other stores: lsu_wdata_i[31:0] rotated left by data_offset bytes, which handles
+  //    misaligned, half word and byte accesses. Bit 32 is only passed through at offset 0.
+  // The capability paths are qualified with cheri_pmode_i in both formats so that the write data
+  // selection always agrees with the byte enable generation and the LSU FSM.
+  if (~MemCapFmt) begin : gen_memcap_wr_fmt0
+    always_comb begin
+      if (CHERIoTEn & cheri_pmode_i & lsu_is_cap_i && (ls_fsm_cs == CTX_WAIT_GNT2))
+        data_wdata = CheriCapIT8 ? reg2memcap_it8_fmt0(lsu_wcap_i):
+                                   reg2memcap_fmt0(lsu_wcap_i);
+      else if (CHERIoTEn & cheri_pmode_i & lsu_is_cap_i)
+        data_wdata = lsu_wdata_i;
+      else begin
+        unique case (data_offset)
+          2'b00:   data_wdata =  lsu_wdata_i[32:0];
+          2'b01:   data_wdata = {1'b0, lsu_wdata_i[23:0], lsu_wdata_i[31:24]};
+          2'b10:   data_wdata = {1'b0, lsu_wdata_i[15:0], lsu_wdata_i[31:16]};
+          2'b11:   data_wdata = {1'b0, lsu_wdata_i[ 7:0], lsu_wdata_i[31: 8]};
+          default: data_wdata =  lsu_wdata_i[32:0];
+        endcase // case (data_offset)
+      end
+    end
+  end else begin : gen_memcap_wr_fmt1
+    logic [65:0] mem_capaddr;
+    assign mem_capaddr = CheriCapIT8 ? reg2mem_it8_fmt1(lsu_wcap_i, lsu_wdata_i) :
+                                       reg2mem_fmt1(lsu_wcap_i, lsu_wdata_i);
+
+    always_comb begin
+      // Same qualification as gen_memcap_wr_fmt0: capability data only in pmode.
+      if (CHERIoTEn & cheri_pmode_i & lsu_is_cap_i && (ls_fsm_cs == CTX_WAIT_GNT2))
+        data_wdata = mem_capaddr[65:33];
+      else if (CHERIoTEn & cheri_pmode_i & lsu_is_cap_i)
+        data_wdata = mem_capaddr[32:0];
+      else begin
+        unique case (data_offset)
+          2'b00:   data_wdata =  lsu_wdata_i[32:0];
+          2'b01:   data_wdata = {1'b0, lsu_wdata_i[23:0], lsu_wdata_i[31:24]};
+          2'b10:   data_wdata = {1'b0, lsu_wdata_i[15:0], lsu_wdata_i[31:16]};
+          2'b11:   data_wdata = {1'b0, lsu_wdata_i[ 7:0], lsu_wdata_i[31: 8]};
+          default: data_wdata =  lsu_wdata_i[32:0];
+        endcase // case (data_offset)
+      end
+    end
+  end
+  /////////////////////
+  // RData alignment //
+  /////////////////////
+
+  // Register for unaligned rdata: keeps bits [31:8] of the response seen when the FSM raises
+  // rdata_update (the first part of a split load), for reassembly with the second response.
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      rdata_q <= '0;
+    end else if (rdata_update) begin
+      rdata_q <= data_rdata_i[31:8];
+    end
+  end
+
+  // Registers for transaction control: snapshot of the request attributes taken when the FSM
+  // raises ctrl_update. The read-data formatting logic uses these registered copies rather than
+  // the live lsu_*_i inputs.
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      rdata_offset_q  <= 2'h0;
+      data_type_q     <= 2'h0;
+      data_sign_ext_q <= 1'b0;
+      data_we_q       <= 1'b0;
+    end else if (ctrl_update) begin
+      rdata_offset_q  <= data_offset;
+      data_type_q     <= lsu_type_i;
+      data_sign_ext_q <= lsu_sign_ext_i;
+      data_we_q       <= lsu_we_i;
+    end
+  end
+
+  // Store last address for mtval + AGU for misaligned transactions.  Do not update in case of
+  // errors, mtval needs the (first) failing address.  Where an aligned access or the first half of
+  // a misaligned access sees an error provide the calculated access address. For the second half of
+  // a misaligned access provide the word aligned address of the second half.
+  assign addr_last_d = addr_incr_req ? data_addr_w_aligned : data_addr;
+
+  // Requests made on behalf of TBRE (cur_req_is_tbre) never update addr_last_q.
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      addr_last_q <= '0;
+    end else if (addr_update & ~cur_req_is_tbre) begin
+      addr_last_q <= addr_last_d;
+    end
+  end
+
+  // Word realignment for loads. For a non-zero offset the low bytes of the result come from
+  // rdata_q (saved from the first response) and the remaining bytes from the bottom of the
+  // current response; bit 32 is driven to zero in that case.
+  always_comb begin
+    unique case (rdata_offset_q)
+      2'b11:   rdata_w_ext = {1'b0, data_rdata_i[23:0], rdata_q[31:24]};
+      2'b10:   rdata_w_ext = {1'b0, data_rdata_i[15:0], rdata_q[31:16]};
+      2'b01:   rdata_w_ext = {1'b0, data_rdata_i[ 7:0], rdata_q[31:8]};
+      // 2'b00 (aligned) and any unknown offset: pass the full 33-bit response through
+      default: rdata_w_ext = data_rdata_i[32:0];
+    endcase
+  end
+
+  ////////////////////
+  // Sign extension //
+  ////////////////////
+
+  // Half word load formatting.
+  //  - data_sign_ext_q selects zero extension (clear) or sign extension (set).
+  //  - rdata_offset_q selects which 16 bits are returned. At offset 3 the half word spans two
+  //    words: its low byte is taken from rdata_q (saved from the first response) and its high
+  //    byte from the bottom byte of the current response.
+  always_comb begin
+    if (!data_sign_ext_q) begin
+      // Zero extended result
+      unique case (rdata_offset_q)
+        2'b00:   rdata_h_ext = {16'h0000, data_rdata_i[15:0]};
+        2'b01:   rdata_h_ext = {16'h0000, data_rdata_i[23:8]};
+        2'b10:   rdata_h_ext = {16'h0000, data_rdata_i[31:16]};
+        2'b11:   rdata_h_ext = {16'h0000, data_rdata_i[7:0], rdata_q[31:24]};
+        default: rdata_h_ext = {16'h0000, data_rdata_i[15:0]};
+      endcase // case (rdata_offset_q)
+    end else begin
+      // Sign extended result: the upper 16 bits replicate the MSB of the selected half word
+      unique case (rdata_offset_q)
+        2'b00:   rdata_h_ext = {{16{data_rdata_i[15]}}, data_rdata_i[15:0]};
+        2'b01:   rdata_h_ext = {{16{data_rdata_i[23]}}, data_rdata_i[23:8]};
+        2'b10:   rdata_h_ext = {{16{data_rdata_i[31]}}, data_rdata_i[31:16]};
+        2'b11:   rdata_h_ext = {{16{data_rdata_i[7]}}, data_rdata_i[7:0], rdata_q[31:24]};
+        // An unknown offset yields the zero extended low half word in this branch too
+        default: rdata_h_ext = {16'h0000, data_rdata_i[15:0]};
+      endcase // case (rdata_offset_q)
+    end
+  end
+
+  // Byte load formatting.
+  //  - data_sign_ext_q selects zero extension (clear) or sign extension (set).
+  //  - rdata_offset_q selects which byte lane of the current response is returned.
+  always_comb begin
+    if (!data_sign_ext_q) begin
+      // Zero extended result
+      unique case (rdata_offset_q)
+        2'b00:   rdata_b_ext = {24'h00_0000, data_rdata_i[7:0]};
+        2'b01:   rdata_b_ext = {24'h00_0000, data_rdata_i[15:8]};
+        2'b10:   rdata_b_ext = {24'h00_0000, data_rdata_i[23:16]};
+        2'b11:   rdata_b_ext = {24'h00_0000, data_rdata_i[31:24]};
+        default: rdata_b_ext = {24'h00_0000, data_rdata_i[7:0]};
+      endcase // case (rdata_offset_q)
+    end else begin
+      // Sign extended result: the upper 24 bits replicate the MSB of the selected byte
+      unique case (rdata_offset_q)
+        2'b00:   rdata_b_ext = {{24{data_rdata_i[7]}}, data_rdata_i[7:0]};
+        2'b01:   rdata_b_ext = {{24{data_rdata_i[15]}}, data_rdata_i[15:8]};
+        2'b10:   rdata_b_ext = {{24{data_rdata_i[23]}}, data_rdata_i[23:16]};
+        2'b11:   rdata_b_ext = {{24{data_rdata_i[31]}}, data_rdata_i[31:24]};
+        // An unknown offset yields the zero extended lowest byte in this branch too
+        default: rdata_b_ext = {24'h00_0000, data_rdata_i[7:0]};
+      endcase // case (rdata_offset_q)
+    end
+  end
+
+  // Final load data: pick the formatting that matches the registered access type. Half word and
+  // byte results are 32 bits wide, so bit 32 is driven to zero for them.
+  always_comb begin
+    unique case (data_type_q)
+      2'b01:        data_rdata_ext = {1'b0, rdata_h_ext};
+      2'b10, 2'b11: data_rdata_ext = {1'b0, rdata_b_ext};
+      // 2'b00 (word) and any unknown type
+      default:      data_rdata_ext = rdata_w_ext;
+    endcase // case (data_type_q)
+  end
+
+  /////////////
+  // LSU FSM //
+  /////////////
+
+  // check for misaligned accesses that need to be split into two word-aligned accesses
+  assign split_misaligned_access =
+      ((lsu_type_i == 2'b00) && (data_offset != 2'b00)) || // misaligned word access
+      ((lsu_type_i == 2'b01) && (data_offset == 2'b11));   // misaligned half-word access
+
+  // cpu_req_valid: CPU request that may be issued to memory (no CHERI error flagged and
+  //                cpu_stall_by_stkz_i low).
+  // cpu_req_erred: CPU request flagged with a CHERI error. In IDLE the FSM handles it without
+  //                raising data_req_o.
+  assign cpu_req_valid = lsu_req_i & ~lsu_cheri_err_i & ~cpu_stall_by_stkz_i;
+  assign cpu_req_erred = lsu_req_i & lsu_cheri_err_i;
+
+  // FSM
+  // Request-side state machine. It raises data_req_o, splits misaligned accesses into two
+  // word-aligned parts (WAIT_GNT_MIS / WAIT_RVALID_MIS / WAIT_RVALID_MIS_GNTS_DONE / WAIT_GNT)
+  // and sequences the two beats of a capability access (CTX_* states). The assignments at the
+  // top are the defaults for every state; each state only lists what it overrides.
+  always_comb begin
+    ls_fsm_ns       = ls_fsm_cs;
+
+    data_req_o          = 1'b0;
+    addr_incr_req     = 1'b0;
+    handle_misaligned_d = handle_misaligned_q;
+    pmp_err_d           = pmp_err_q;
+    lsu_err_d           = lsu_err_q;
+    cheri_err_d         = cheri_err_q & cheri_pmode_i; // held, but forced low outside pmode
+
+    addr_update         = 1'b0;
+    ctrl_update         = 1'b0;
+    rdata_update        = 1'b0;
+
+    perf_load_o         = 1'b0;
+    perf_store_o        = 1'b0;
+
+    lsu_go              = 1'b0;
+
+    unique case (ls_fsm_cs)
+
+      IDLE: begin
+        pmp_err_d   = 1'b0;
+        cheri_err_d = 1'b0;
+
+        // All three branches below are blocked while resp_wait is set.
+        if (CHERIoTEn & cheri_pmode_i & cpu_req_erred & ~resp_wait) begin
+          // cheri access error case, don't issue data_req but send error response back to WB stage
+          data_req_o   = 1'b0;
+          cheri_err_d  = 1'b1;
+          ctrl_update  = 1'b1;         // update ctrl/address so we can report error correctly
+          addr_update  = 1'b1;
+          pmp_err_d    = 1'b0;
+          lsu_err_d    = 1'b0;
+          perf_load_o  = 1'b0;
+          lsu_go       = 1'b1;         // decision to move forward with a request
+          ls_fsm_ns    = IDLE;
+        end else if (CHERIoTEn & cheri_pmode_i & (cpu_req_valid | tbre_req_good) &
+                     lsu_is_cap_i & ~resp_wait) begin
+          // normal cap access case
+          data_req_o   = 1'b1;
+          cheri_err_d  = 1'b0;
+          pmp_err_d    = data_pmp_err_i;
+          lsu_err_d    = 1'b0;
+          perf_load_o  = ~lsu_we_i;
+          perf_store_o = lsu_we_i;
+          lsu_go       = 1'b1;         // decision to move forward with a request
+
+          // First beat granted: go straight to the second beat, otherwise wait for the grant.
+          if (data_gnt_i) begin
+            ctrl_update         = 1'b1;
+            addr_update         = 1'b1;
+            ls_fsm_ns           = CTX_WAIT_GNT2;
+          end else begin
+            ls_fsm_ns           = CTX_WAIT_GNT1;
+          end
+        end else if ((cpu_req_valid | tbre_req_good) & ~resp_wait) begin
+          // normal data access case
+          data_req_o   = 1'b1;
+          cheri_err_d  = 1'b0;
+          pmp_err_d    = data_pmp_err_i;
+          lsu_err_d    = 1'b0;
+          perf_load_o  = ~lsu_we_i;
+          perf_store_o = lsu_we_i;
+          lsu_go       = 1'b1;         // decision to move forward with a request
+
+          if (data_gnt_i) begin
+            ctrl_update         = 1'b1;
+            addr_update         = 1'b1;
+            handle_misaligned_d = split_misaligned_access;
+            ls_fsm_ns           = split_misaligned_access ? WAIT_RVALID_MIS : IDLE;
+          end else begin
+            ls_fsm_ns           = split_misaligned_access ? WAIT_GNT_MIS    : WAIT_GNT;
+          end
+        end
+
+      end
+
+      WAIT_GNT_MIS: begin
+        data_req_o = 1'b1;
+        // data_pmp_err_i is valid during the address phase of a request. An error will block the
+        // external request and so a data_gnt_i might never be signalled. The registered version
+        // pmp_err_q is only updated for new address phases and so can be used in WAIT_GNT* and
+        // WAIT_RVALID* states
+        if (data_gnt_i || pmp_err_q ) begin
+          addr_update         = 1'b1;
+          ctrl_update         = 1'b1;
+          handle_misaligned_d = 1'b1;
+          ls_fsm_ns           = WAIT_RVALID_MIS;
+        end
+      end
+
+      WAIT_RVALID_MIS: begin
+        // push out second request
+        data_req_o = 1'b1;
+        // tell ID/EX stage to update the address
+        addr_incr_req = 1'b1;
+
+        // first part rvalid is received, or gets a PMP error
+        if (data_rvalid_i || pmp_err_q) begin
+          // Update the PMP error for the second part
+          pmp_err_d = data_pmp_err_i;
+          // Record the error status of the first part
+          lsu_err_d = data_err_i | pmp_err_q;
+          // Capture the first rdata for loads
+          rdata_update = ~data_we_q;
+          // If already granted, wait for second rvalid
+          ls_fsm_ns = data_gnt_i ? IDLE : WAIT_GNT;
+          // Update the address for the second part, if no error
+          addr_update = data_gnt_i & ~(data_err_i | pmp_err_q);
+          // clear handle_misaligned if second request is granted
+          handle_misaligned_d = ~data_gnt_i;
+        end else begin
+          // first part rvalid is NOT received
+          if (data_gnt_i) begin
+            // second grant is received
+            ls_fsm_ns = WAIT_RVALID_MIS_GNTS_DONE;
+            handle_misaligned_d = 1'b0;
+          end
+        end
+      end
+
+      WAIT_GNT: begin
+        // tell ID/EX stage to update the address
+        addr_incr_req = handle_misaligned_q;
+        data_req_o      = 1'b1;
+        if (data_gnt_i || pmp_err_q) begin
+          ctrl_update         = 1'b1;
+          // Update the address, unless there was an error
+          addr_update         = ~lsu_err_q;
+          ls_fsm_ns           = IDLE;
+          handle_misaligned_d = 1'b0;
+        end
+      end
+
+      WAIT_RVALID_MIS_GNTS_DONE: begin
+        // tell ID/EX stage to update the address (to make sure the
+        // second address can be captured correctly for mtval and PMP checking)
+        addr_incr_req = 1'b1;
+        // Wait for the first rvalid, second request is already granted
+        if (data_rvalid_i) begin
+          // Update the pmp error for the second part
+          pmp_err_d = data_pmp_err_i ;
+          // The first part cannot see a PMP error in this state
+          lsu_err_d = data_err_i;
+          // Now we can update the address for the second part if no error
+          addr_update = ~data_err_i;
+          // Capture the first rdata for loads
+          rdata_update = ~data_we_q;
+          // Wait for second rvalid
+          ls_fsm_ns = IDLE;
+        end
+      end
+
+      // Capability access, first beat not yet granted. Outside pmode every CTX_* state simply
+      // returns to IDLE.
+      // NOTE(review): unlike WAIT_GNT / WAIT_GNT_MIS, the CTX_* states do not look at pmp_err_q;
+      // confirm a PMP error cannot leave a capability request waiting for a grant.
+      CTX_WAIT_GNT1: begin
+        if (cheri_pmode_i) begin
+          addr_incr_req = 1'b0;
+          data_req_o      = 1'b1;
+          if (data_gnt_i) begin
+            ls_fsm_ns = CTX_WAIT_GNT2;
+            ctrl_update         = 1'b1;
+            addr_update         = 1'b1;
+           end
+        end else begin
+          ls_fsm_ns = IDLE;
+        end
+      end
+
+      // Capability access, second beat being requested (addr_incr_req selects the incremented
+      // address). Once granted, return to IDLE if the first response arrives in this cycle or was
+      // already received (cap_rx_fsm_q == CRX_WAIT_RESP2); otherwise wait for it in CTX_WAIT_RESP.
+      CTX_WAIT_GNT2: begin
+        if (cheri_pmode_i) begin
+          addr_incr_req = 1'b1;
+          data_req_o      = 1'b1;
+          if (data_gnt_i && (data_rvalid_i || (cap_rx_fsm_q == CRX_WAIT_RESP2))) ls_fsm_ns = IDLE;
+          else if (data_gnt_i) ls_fsm_ns = CTX_WAIT_RESP;
+        end else begin
+          ls_fsm_ns = IDLE;
+        end
+      end
+
+      CTX_WAIT_RESP: begin        // only needed if mem allows 2 active req
+        if (cheri_pmode_i) begin
+          addr_incr_req = 1'b1; // stay 1 to reduce unnecessary addr toggling
+          data_req_o      = 1'b0;
+          if (data_rvalid_i) ls_fsm_ns = IDLE;
+        end else begin
+          ls_fsm_ns = IDLE;
+        end
+      end
+
+      // Any other state encoding returns to IDLE.
+      default: begin
+        ls_fsm_ns = IDLE;
+      end
+    endcase
+  end
+
+  // Next-state logic of the capability response tracker (cap_rx_fsm).
+  // The tracker steps through the two data-bus responses that belong to one
+  // capability access: RESP1 waits for the first rvalid, RESP2 for the second.
+  always_comb begin
+    // Hold the current state unless one of the arcs below fires
+    cap_rx_fsm_d = cap_rx_fsm_q;
+
+    case (cap_rx_fsm_q)
+      CRX_IDLE: begin
+        // Arm when a capability access moves the request FSM out of IDLE
+        if (CHERIoTEn & cheri_pmode_i & lsu_is_cap_i && (ls_fsm_ns != IDLE)) begin
+          cap_rx_fsm_d = CRX_WAIT_RESP1;
+        end
+      end
+
+      CRX_WAIT_RESP1: begin
+        // First response seen, now wait for the second one
+        if (data_rvalid_i) begin
+          cap_rx_fsm_d = CRX_WAIT_RESP2;
+        end
+      end
+
+      CRX_WAIT_RESP2: begin
+        if (data_rvalid_i) begin
+          // Second response seen: go straight to RESP1 again if another
+          // capability access is already in flight, otherwise return to idle
+          if (lsu_is_cap_i && (ls_fsm_ns != IDLE)) begin
+            cap_rx_fsm_d = CRX_WAIT_RESP1;
+          end else begin
+            cap_rx_fsm_d = CRX_IDLE;
+          end
+        end
+      end
+
+      default: begin
+        // Unused encoding: keep the current state
+      end
+    endcase
+  end
+
+  // this is the decision of granting LSU to TBRE/STKZ
+  // The TBRE/STKZ request is eligible only when the CHERIoT and TBRE features are enabled
+  // and either the CPU is not decoding an LSU instruction, or it is and
+  // cpu_grant_to_stkz_i is set.
+  assign tbre_req_good  = CHERIoTEn & cheri_pmode_i & CheriTBRE & tbre_lsu_req_i & 
+                          (~cpu_lsu_dec_i | (cpu_lsu_dec_i & cpu_grant_to_stkz_i));
+
+  // A response is still outstanding (outstanding_resp_q set, lsu_resp_valid not yet seen).
+  // Used below to keep a TBRE request from being selected while the FSM is in IDLE.
+  assign resp_wait = CHERIoTEn & cheri_pmode_i & CheriTBRE & outstanding_resp_q & ~lsu_resp_valid;
+
+  // we assume ctrl will be held till req_done asserted 
+  // (once req captured in IDLE, it can be deasserted)
+  logic lsu_req_done;
+
+  // Asserted in the cycle whose next FSM state is IDLE, either for a request issued in
+  // this very cycle (lsu_go) or for one that already moved the FSM out of IDLE.
+  assign lsu_req_done        = (lsu_go | (ls_fsm_cs != IDLE)) & (ls_fsm_ns == IDLE);
+
+  // Steer the done indication to whichever requester owns the current request
+  assign lsu_req_done_o      = lsu_req_done & (~cur_req_is_tbre);
+  assign lsu_tbre_req_done_o = lsu_req_done & cur_req_is_tbre & cheri_pmode_i;
+
+  // Same steering for the address increment request raised by the request FSM
+  assign lsu_addr_incr_req_o      = addr_incr_req & ~cur_req_is_tbre;
+  assign lsu_tbre_addr_incr_req_o = addr_incr_req & cur_req_is_tbre;
+
+  // Owner of the current request: evaluated combinationally while the FSM is in IDLE,
+  // otherwise taken from the registered copy req_is_tbre_q.
+  assign cur_req_is_tbre = CHERIoTEn & cheri_pmode_i & CheriTBRE & ((ls_fsm_cs == IDLE) ? 
+                           (tbre_req_good & ~resp_wait) : req_is_tbre_q);
+
+  assign lsu_tbre_sel_o = cur_req_is_tbre;        // req ctrl signal mux select (to cheri_ex/tbre_wrapper)
+
+  // registers for FSM
+  // Holds the request FSM state, the error flags carried between the parts of a
+  // transaction, the capability response tracker and the per-request bookkeeping.
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      ls_fsm_cs           <= IDLE;
+      handle_misaligned_q <= '0;
+      pmp_err_q           <= '0;
+      lsu_err_q           <= '0;
+      resp_is_cap_q       <= 1'b0;
+      resp_lc_clrperm_q   <= 4'h0;
+      req_is_tbre_q       <= 1'b0;
+      cheri_err_q         <= 1'b0;
+      cap_rx_fsm_q        <= CRX_IDLE;
+      cap_lsw_err_q       <= 1'b0;
+      cap_lsw_q           <= 33'h0;
+      outstanding_resp_q  <= 1'b0;
+    end else begin
+      // Unconditional state updates, next-state values come from the combinational logic
+      ls_fsm_cs           <= ls_fsm_ns;
+      handle_misaligned_q <= handle_misaligned_d;
+      pmp_err_q           <= pmp_err_d;
+      lsu_err_q           <= lsu_err_d;
+      cheri_err_q         <= cheri_err_d;
+
+      cap_rx_fsm_q        <= cap_rx_fsm_d;
+
+      // resp_is_cap_q aligns with responses on the data interface, lsu_is_cap_i aligns with requests
+      //   we use lsu_go to qualify this update
+      //   - note this implies that the LSU only supports one outstanding request at a time
+      //   - new request can't be issued (go) until resp_valid
+      //   - also note resp_valid is gated by (ls_fsm_cs == IDLE)
+      if (lsu_go) begin
+        resp_is_cap_q     <= lsu_is_cap_i;
+        resp_lc_clrperm_q <= lsu_lc_clrperm_i;
+        req_is_tbre_q     <= cur_req_is_tbre;
+      end
+
+      // Capture the read data of the first response of a capability access (not for writes)
+      if (CHERIoTEn & cheri_pmode_i && (cap_rx_fsm_q == CRX_WAIT_RESP1) && data_rvalid_i && (~data_we_q))
+        cap_lsw_q <= data_rdata_i;
+
+      // Capture the bus error flag of that first response (reads and writes alike)
+      if (CHERIoTEn & cheri_pmode_i && (cap_rx_fsm_q == CRX_WAIT_RESP1) && data_rvalid_i)
+        cap_lsw_err_q <= data_err_i;
+
+      // Set when a request is issued, cleared when its response is seen; set has priority
+      if (lsu_go)
+        outstanding_resp_q <= 1'b1;
+      else if (lsu_resp_valid)
+        outstanding_resp_q <= 1'b0;
+
+    end
+  end
+
+  /////////////
+  // Outputs //
+  /////////////
+
+  // The response belongs to the requester that was registered for the request
+  assign resp_is_tbre = req_is_tbre_q;
+
+  logic all_resp;
+  // Combined error indication: registered LSU/PMP errors, the bus error of the current
+  // response and, when cheri_pmode_i is set, the CHERI error flag or a bus error that was
+  // recorded for the first word of a capability response.
+  assign data_or_pmp_err    = lsu_err_q | data_err_i | pmp_err_q | (cheri_pmode_i & 
+                              (cheri_err_q | (resp_is_cap_q & cap_lsw_err_q)));
+
+  // A response is either a real bus response or a registered PMP/CHERI error;
+  // it is only reported while the request FSM is in IDLE.
+  assign all_resp           = data_rvalid_i | pmp_err_q | (cheri_pmode_i & cheri_err_q);
+  assign lsu_resp_valid     = all_resp & (ls_fsm_cs == IDLE) ;
+
+  // Split the response valid between the CPU side and the TBRE side
+  assign lsu_resp_valid_o        = lsu_resp_valid & (~cheri_pmode_i | (~resp_is_tbre)) ;
+  assign lsu_tbre_resp_valid_o   = lsu_resp_valid & resp_is_tbre;
+  assign lsu_resp_is_wr_o        = data_we_q;
+
+  // this goes to wb as rf_we_lsu, so needs to be gated when data needs to go back to EX
+  // Only error-free load responses that are not owned by the TBRE qualify.
+  assign lsu_rdata_valid_o  = (ls_fsm_cs == IDLE) & data_rvalid_i & ~data_or_pmp_err & ~data_we_q & 
+                              (~cheri_pmode_i | (~resp_is_tbre));
+
+  // output to register file
+  // Three elaboration-time variants, selected by CHERIoTEn and MemCapFmt:
+  //  - fmt0: for capability responses the data output is the stored first word (cap_lsw_q)
+  //  - fmt1: for capability responses the data output is derived by mem2regaddr_fmt1(),
+  //          which also takes the decoded capability lsu_rcap_o as an input
+  //  - no CHERIoT: plain read data, capability output tied to NULL_REG_CAP
+  // In both CHERIoT variants the capability output is only decoded in the cycle the second
+  // response of a capability access arrives without error, otherwise it is NULL_REG_CAP.
+  // CheriCapIT8 selects between the *_it8_* and the regular conversion functions.
+  if (CHERIoTEn & ~MemCapFmt) begin : gen_memcap_rd_fmt0
+    assign lsu_rdata_o = (cheri_pmode_i & resp_is_cap_q) ? cap_lsw_q : data_rdata_ext;
+    assign lsu_rcap_o  = (resp_is_cap_q && data_rvalid_i && (cap_rx_fsm_q == CRX_WAIT_RESP2) && (~data_or_pmp_err)) ?
+                         (CheriCapIT8 ? mem2regcap_it8_fmt0(data_rdata_i, cap_lsw_q, resp_lc_clrperm_q) :
+                                        mem2regcap_fmt0(data_rdata_i, cap_lsw_q, resp_lc_clrperm_q)) : NULL_REG_CAP;
+  end else if (CHERIoTEn) begin : gen_memcap_rd_fmt1
+    assign lsu_rdata_o = (cheri_pmode_i & resp_is_cap_q) ? mem2regaddr_fmt1(data_rdata_ext, cap_lsw_q, lsu_rcap_o): data_rdata_ext;
+    assign lsu_rcap_o  = (resp_is_cap_q && data_rvalid_i && (cap_rx_fsm_q == CRX_WAIT_RESP2) && (~data_or_pmp_err)) ?
+                         (CheriCapIT8 ?  mem2regcap_it8_fmt1(data_rdata_i, cap_lsw_q, resp_lc_clrperm_q) :
+                                         mem2regcap_fmt1(data_rdata_i, cap_lsw_q, resp_lc_clrperm_q)) : NULL_REG_CAP;
+  end else begin : gen_no_cap_rd
+    assign lsu_rdata_o = data_rdata_ext;
+    assign lsu_rcap_o  = NULL_REG_CAP;
+  end
+  
+  
+  // Stored first word of a capability response, handed to the TBRE as the "raw" memory word
+  assign lsu_tbre_raw_lsw_o = cap_lsw_q;
+
+  // Data interface request signals. The address put on the bus must be word aligned,
+  // so the two LSBs of the internal address are forced to zero.
+  assign data_addr_w_aligned = {data_addr[31:2], 2'b00};
+  assign data_addr_o         = data_addr_w_aligned;
+  assign data_be_o           = data_be;
+  assign data_we_o           = lsu_we_i;
+  assign data_wdata_o        = data_wdata;
+  assign data_is_cap_o       = lsu_is_cap_i;
+
+  // To the ID stage: mtval + AGU for misaligned transactions
+  assign addr_last_o = addr_last_q;
+
+  // Error on a response that is not owned by the TBRE, reported as a load or a store
+  // error depending on the direction of the outstanding transaction
+  assign load_err_o  = lsu_resp_valid & data_or_pmp_err & (~resp_is_tbre) & ~data_we_q;
+  assign store_err_o = lsu_resp_valid & data_or_pmp_err & (~resp_is_tbre) &  data_we_q;
+
+  // The same error condition for a response that is owned by the TBRE
+  assign lsu_tbre_resp_err_o = lsu_resp_valid & data_or_pmp_err & resp_is_tbre & cheri_pmode_i;
+
+  // Sent to the controller for mcause encoding
+  assign lsu_err_is_cheri_o = cheri_err_q & cheri_pmode_i;
+
+  // Busy whenever the request FSM is outside IDLE. The TBRE flavour is qualified with the
+  // registered owner (resp_is_tbre) rather than with cur_req_is_tbre.
+  assign busy_o      = (ls_fsm_cs != IDLE);
+  assign busy_tbre_o = resp_is_tbre & cheri_pmode_i & (ls_fsm_cs != IDLE);
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_lockstep.sv b/hw/ip/cheriot-ibex/rtl/cheriot_lockstep.sv
new file mode 100644
index 0000000..15815a0
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_lockstep.sv
@@ -0,0 +1,657 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Ibex lockstep module
+// This module instantiates a second copy of the core logic, and compares its outputs against
+// those from the main core. The second core runs synchronously with the main core, delayed by
+// LockstepOffset cycles.
+module cheriot_lockstep import cheriot_pkg::*; import cheri_pkg::*; #(
+  parameter int unsigned LockstepOffset    = 2,
+  parameter bit          PMPEnable         = 1'b0,
+  parameter int unsigned PMPGranularity    = 0,
+  parameter int unsigned PMPNumRegions     = 4,
+  parameter int unsigned MHPMCounterNum    = 0,
+  parameter int unsigned MHPMCounterWidth  = 40,
+  parameter bit          RV32E             = 1'b0,
+  parameter rv32m_e      RV32M             = RV32MFast,
+  parameter rv32b_e      RV32B             = RV32BNone,
+  parameter bit          BranchTargetALU   = 1'b0,
+  parameter bit          WritebackStage    = 1'b0,
+  parameter bit          ICache            = 1'b0,
+  parameter bit          ICacheECC         = 1'b0,
+  parameter int unsigned BusSizeECC        = BUS_SIZE,
+  parameter int unsigned TagSizeECC        = IC_TAG_SIZE,
+  parameter int unsigned LineSizeECC       = IC_LINE_SIZE,
+  parameter bit          BranchPredictor   = 1'b0,
+  parameter bit          DbgTriggerEn      = 1'b0,
+  parameter int unsigned DbgHwBreakNum     = 1,
+  parameter bit          ResetAll          = 1'b0,
+  parameter lfsr_seed_t  RndCnstLfsrSeed   = RndCnstLfsrSeedDefault,
+  parameter lfsr_perm_t  RndCnstLfsrPerm   = RndCnstLfsrPermDefault,
+  parameter bit          SecureIbex        = 1'b0,
+  parameter bit          DummyInstructions = 1'b0,
+  parameter bit          RegFileECC        = 1'b0,
+  parameter int unsigned RegFileDataWidth  = 32,
+  parameter int unsigned DmHaltAddr        = 32'h1A110800,
+  parameter int unsigned DmExceptionAddr   = 32'h1A110808,
+  // CHERIoT parameters
+  parameter bit          CHERIoTEn         = 1'b1,
+  parameter int unsigned DataWidth         = 33,
+  parameter int unsigned HeapBase          = 32'h2001_0000,
+  parameter int unsigned TSMapBase         = 32'h2002_f000,
+  parameter int unsigned TSMapSize         = 1024,
+  parameter bit          MemCapFmt         = 1'b0,
+  parameter bit          CheriPPLBC        = 1'b1,
+  parameter bit          CheriSBND2        = 1'b0,
+  parameter bit          CheriTBRE         = 1'b1,
+  parameter int unsigned MMRegDinW         = 128,
+  parameter int unsigned MMRegDoutW        = 64
+) (
+  input  logic                         clk_i,
+  input  logic                         rst_ni,
+
+  input  logic [31:0]                  hart_id_i,
+  input  logic [31:0]                  boot_addr_i,
+  input  logic                         cheri_pmode_i,
+  input  logic                         cheri_tsafe_en_i,
+ 
+  input  logic                         instr_req_i,
+  input  logic                         instr_gnt_i,
+  input  logic                         instr_rvalid_i,
+  input  logic [31:0]                  instr_addr_i,
+  input  logic [31:0]                  instr_rdata_i,
+  input  logic [6:0]                   instr_rdata_intg_i,
+  input  logic                         instr_err_i,
+
+  input  logic                         data_req_i,
+  input  logic                         data_gnt_i,
+  input  logic                         data_rvalid_i,
+  input  logic                         data_we_i,
+  input  logic [3:0]                   data_be_i,
+  input  logic [31:0]                  data_addr_i,
+  input  logic [DataWidth-1:0]         data_wdata_i,
+  input  logic                         data_is_cap_i,
+  output logic [6:0]                   data_wdata_intg_o,
+  input  logic [DataWidth-1:0]         data_rdata_i,
+  input  logic [6:0]                   data_rdata_intg_i,
+  input  logic                         data_err_i,
+
+  input  logic                         dummy_instr_id_i,
+  input  logic [4:0]                   rf_raddr_a_i,
+  input  logic [4:0]                   rf_raddr_b_i,
+  input  logic [4:0]                   rf_waddr_wb_i,
+  input  logic                         rf_we_wb_i,
+  input  logic [RegFileDataWidth-1:0]  rf_wdata_wb_ecc_i,
+  input  logic [RegFileDataWidth-1:0]  rf_rdata_a_ecc_i,
+  input  logic [RegFileDataWidth-1:0]  rf_rdata_b_ecc_i,
+ 
+  input  reg_cap_t                     rf_wcap_wb_i,
+  input  reg_cap_t                     rf_rcap_a_i,
+  input  reg_cap_t                     rf_rcap_b_i,
+  input  logic [31:0]                  rf_reg_rdy_i,
+  input  logic                         rf_trsv_en_i,
+  input  logic [4:0]                   rf_trsv_addr_i,
+  input  logic [6:0]                   rf_trsv_par_i,
+  input  logic [4:0]                   rf_trvk_addr_i,
+  input  logic                         rf_trvk_en_i,
+  input  logic                         rf_trvk_clrtag_i,
+  input  logic [6:0]                   rf_trvk_par_i,
+  input  logic                         tsmap_cs_i,
+  input  logic [15:0]                  tsmap_addr_i,
+  input  logic [31:0]                  tsmap_rdata_i,
+  input  logic [6:0]                   tsmap_rdata_intg_i,
+  input  logic [MMRegDinW-1:0]         mmreg_corein_i,
+  input  logic [MMRegDoutW-1:0]        mmreg_coreout_i,
+ 
+  input  logic [IC_NUM_WAYS-1:0]       ic_tag_req_i,
+  input  logic                         ic_tag_write_i,
+  input  logic [IC_INDEX_W-1:0]        ic_tag_addr_i,
+  input  logic [TagSizeECC-1:0]        ic_tag_wdata_i,
+  input  logic [TagSizeECC-1:0]        ic_tag_rdata_i [IC_NUM_WAYS],
+  input  logic [IC_NUM_WAYS-1:0]       ic_data_req_i,
+  input  logic                         ic_data_write_i,
+  input  logic [IC_INDEX_W-1:0]        ic_data_addr_i,
+  input  logic [LineSizeECC-1:0]       ic_data_wdata_i,
+  input  logic [LineSizeECC-1:0]       ic_data_rdata_i [IC_NUM_WAYS],
+  input  logic                         ic_scr_key_valid_i,
+
+  input  logic                         irq_software_i,
+  input  logic                         irq_timer_i,
+  input  logic                         irq_external_i,
+  input  logic [14:0]                  irq_fast_i,
+  input  logic                         irq_nm_i,
+  input  logic                         irq_pending_i,
+
+  input  logic                         debug_req_i,
+  input  crash_dump_t                  crash_dump_i,
+  input  logic                         double_fault_seen_i,
+
+  input  fetch_enable_t                fetch_enable_i,
+  output logic                         alert_minor_o,
+  output logic                         alert_major_internal_o,
+  output logic                         alert_major_bus_o,
+  input  logic                         icache_inval_i,
+  input  logic                         core_busy_i,
+  input  logic                         test_en_i,
+  input  logic                         scan_rst_ni
+);
+
+  // Width of the counter that measures LockstepOffset cycles after reset
+  localparam int unsigned LockstepOffsetW = $clog2(LockstepOffset);
+  // Core outputs are delayed for an extra cycle due to shadow output registers
+  localparam int unsigned OutputsOffset = LockstepOffset + 1;
+
+  //////////////////////
+  // Reset generation //
+  //////////////////////
+
+  // Upon reset, the comparison is stopped and the shadow core is reset, both immediately. A
+  // counter is started. After LockstepOffset clock cycles:
+  // - The counter is stopped.
+  // - The reset of the shadow core is synchronously released.
+  // The comparison is started in the following clock cycle.
+
+  logic [LockstepOffsetW-1:0] rst_shadow_cnt_d, rst_shadow_cnt_q, rst_shadow_cnt_incr;
+  // Internally generated resets cause IMPERFECTSCH warnings
+  /* verilator lint_off IMPERFECTSCH */
+  logic                       rst_shadow_set_d, rst_shadow_set_q;
+  logic                       rst_shadow_n, enable_cmp_q;
+  /* verilator lint_on IMPERFECTSCH */
+
+  assign rst_shadow_cnt_incr = rst_shadow_cnt_q + LockstepOffsetW'(1);
+
+  // The counter saturates at LockstepOffset-1; reaching that value requests the release
+  // of the shadow reset.
+  assign rst_shadow_set_d = (rst_shadow_cnt_q == LockstepOffsetW'(LockstepOffset - 1));
+  assign rst_shadow_cnt_d = rst_shadow_set_d ? rst_shadow_cnt_q : rst_shadow_cnt_incr;
+
+  // enable_cmp_q follows the registered reset release by one cycle
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      rst_shadow_cnt_q <= '0;
+      enable_cmp_q     <= '0;
+    end else begin
+      rst_shadow_cnt_q <= rst_shadow_cnt_d;
+      enable_cmp_q     <= rst_shadow_set_q;
+    end
+  end
+
+  // The primitives below are used to place size-only constraints in order to prevent
+  // synthesis optimizations and preserve anchor points for constraining backend tools.
+  // Registers the reset release request
+  prim_flop #(
+    .Width(1),
+    .ResetValue(1'b0)
+  ) u_prim_rst_shadow_set_flop (
+    .clk_i (clk_i),
+    .rst_ni(rst_ni),
+    .d_i   (rst_shadow_set_d),
+    .q_o   (rst_shadow_set_q)
+  );
+
+  // Selects the shadow core reset: the generated reset normally, scan_rst_ni when
+  // test_en_i is set
+  prim_clock_mux2 #(
+    .NoFpgaBufG(1'b1)
+  ) u_prim_rst_shadow_n_mux2 (
+    .clk0_i(rst_shadow_set_q),
+    .clk1_i(scan_rst_ni),
+    .sel_i (test_en_i),
+    .clk_o (rst_shadow_n)
+  );
+
+  //////////////////
+  // Input delays //
+  //////////////////
+
+  // Bundle of core inputs that travel through the input delay line before they are
+  // applied to the shadow core. The last five members are the CHERIoT additions.
+  typedef struct packed {
+    logic                        instr_gnt;
+    logic                        instr_rvalid;
+    logic [31:0]                 instr_rdata;
+    logic                        instr_err;
+    logic                        data_gnt;
+    logic                        data_rvalid;
+    logic [DataWidth-1:0]        data_rdata;
+    logic                        data_err;
+    logic [RegFileDataWidth-1:0] rf_rdata_a_ecc;
+    logic [RegFileDataWidth-1:0] rf_rdata_b_ecc;
+    logic                        irq_software;
+    logic                        irq_timer;
+    logic                        irq_external;
+    logic [14:0]                 irq_fast;
+    logic                        irq_nm;
+    logic                        debug_req;
+    fetch_enable_t               fetch_enable;
+    logic                        ic_scr_key_valid;
+    logic                        cheri_pmode;
+    logic                        cheri_tsafe_en;
+    reg_cap_t                    rf_rcap_a;
+    reg_cap_t                    rf_rcap_b;
+    logic [31:0]                 rf_reg_rdy;
+    logic [31:0]                 tsmap_rdata;
+    logic [MMRegDinW-1:0]        mmreg_corein;
+  } delayed_inputs_t;
+
+  // Input delay line: index LockstepOffset-1 holds the newest sample, index 0 feeds the
+  // shadow core.
+  delayed_inputs_t [LockstepOffset-1:0] shadow_inputs_q;
+  delayed_inputs_t                      shadow_inputs_in;
+  // Integrity bits are only delayed by a single cycle (used by the local checkers)
+  logic [6:0]                           instr_rdata_intg_q, data_rdata_intg_q;
+  logic [6:0]                           tsmap_rdata_intg_q;
+  // Unpacked arrays must be dealt with separately.
+  // The outer dimension is the delay stage and the inner one the cache way, so that one
+  // stage has the same shape as the ic_*_rdata_i ports ([IC_NUM_WAYS]). Every user of these
+  // arrays indexes the first dimension by stage; with the dimensions the other way round
+  // the code is only legal when IC_NUM_WAYS happens to equal LockstepOffset.
+  logic [TagSizeECC-1:0]                shadow_tag_rdata_q [LockstepOffset][IC_NUM_WAYS];
+  logic [LineSizeECC-1:0]               shadow_data_rdata_q [LockstepOffset][IC_NUM_WAYS];
+
+  // Collect the live core inputs into the delay structure with a single named
+  // assignment pattern (one entry per struct member)
+  assign shadow_inputs_in = '{
+    instr_gnt:        instr_gnt_i,
+    instr_rvalid:     instr_rvalid_i,
+    instr_rdata:      instr_rdata_i,
+    instr_err:        instr_err_i,
+    data_gnt:         data_gnt_i,
+    data_rvalid:      data_rvalid_i,
+    data_rdata:       data_rdata_i,
+    data_err:         data_err_i,
+    rf_rdata_a_ecc:   rf_rdata_a_ecc_i,
+    rf_rdata_b_ecc:   rf_rdata_b_ecc_i,
+    irq_software:     irq_software_i,
+    irq_timer:        irq_timer_i,
+    irq_external:     irq_external_i,
+    irq_fast:         irq_fast_i,
+    irq_nm:           irq_nm_i,
+    debug_req:        debug_req_i,
+    fetch_enable:     fetch_enable_i,
+    ic_scr_key_valid: ic_scr_key_valid_i,
+    cheri_pmode:      cheri_pmode_i,
+    cheri_tsafe_en:   cheri_tsafe_en_i,
+    rf_rcap_a:        rf_rcap_a_i,
+    rf_rcap_b:        rf_rcap_b_i,
+    rf_reg_rdy:       rf_reg_rdy_i,
+    tsmap_rdata:      tsmap_rdata_i,
+    mmreg_corein:     mmreg_corein_i
+  };
+
+  // Input delay line. New samples enter at index LockstepOffset-1 and move one index
+  // towards 0 every cycle. The integrity bits are registered once.
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      for (int unsigned stage = 0; stage < LockstepOffset; stage++) begin
+        shadow_inputs_q[stage]     <= delayed_inputs_t'('0);
+        shadow_tag_rdata_q[stage]  <= '{default: 0};
+        shadow_data_rdata_q[stage] <= '{default: 0};
+      end
+      instr_rdata_intg_q <= '0;
+      data_rdata_intg_q  <= '0;
+      tsmap_rdata_intg_q <= '0;
+    end else begin
+      // Newest sample goes into the last stage
+      shadow_inputs_q[LockstepOffset-1]     <= shadow_inputs_in;
+      shadow_tag_rdata_q[LockstepOffset-1]  <= ic_tag_rdata_i;
+      shadow_data_rdata_q[LockstepOffset-1] <= ic_data_rdata_i;
+      // All other stages take over the content of their successor
+      for (int unsigned stage = 0; stage < LockstepOffset - 1; stage++) begin
+        shadow_inputs_q[stage]     <= shadow_inputs_q[stage+1];
+        shadow_tag_rdata_q[stage]  <= shadow_tag_rdata_q[stage+1];
+        shadow_data_rdata_q[stage] <= shadow_data_rdata_q[stage+1];
+      end
+      instr_rdata_intg_q <= instr_rdata_intg_i;
+      data_rdata_intg_q  <= data_rdata_intg_i;
+      tsmap_rdata_intg_q <= tsmap_rdata_intg_i;
+    end
+  end
+
+  ////////////////////////////
+  // Bus integrity checking //
+  ////////////////////////////
+
+  logic        bus_intg_err;
+  logic [1:0]  instr_intg_err;
+  logic [1:0]  data_intg_err;
+  logic [1:0]  data_intg_err_tmp;
+  logic [31:0] unused_wdata;
+  logic [1:0]  data_we_q;
+  logic [31:0] rdata_tmp;
+
+  // Track the write-enable of the most recently granted data request.
+  // Bit 1 samples data_we_i on a grant; bit 0 is bit 1 delayed by one more cycle
+  // (align with shadow_inputs_q[LockstepOffset-1]).
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      data_we_q <= 2'b00;
+    end else begin
+      data_we_q[0] <= data_we_q[1];
+      if (data_gnt_i) begin
+        data_we_q[1] <= data_we_i;
+      end
+    end
+  end
+
+  // Checks on incoming data
+  // Instruction read data is checked against the once-registered integrity bits
+  prim_secded_inv_39_32_dec u_instr_intg_dec (
+    .data_i     ({instr_rdata_intg_q, shadow_inputs_q[LockstepOffset-1].instr_rdata}),
+    .data_o     (),
+    .syndrome_o (),
+    .err_o      (instr_intg_err)
+  );
+
+  // With CHERIoT enabled, bit 32 of the data word is XORed into bit 0 before the check
+  if (CHERIoTEn) begin
+    assign rdata_tmp = shadow_inputs_q[LockstepOffset-1].data_rdata[31:0] ^ 
+                       {31'h0, shadow_inputs_q[LockstepOffset-1].data_rdata[32]};
+  end else begin
+    assign rdata_tmp = shadow_inputs_q[LockstepOffset-1].data_rdata[31:0]; 
+  end
+
+  prim_secded_inv_39_32_dec u_data_intg_dec (
+    .data_i     ({data_rdata_intg_q, rdata_tmp}),
+    .data_o     (),
+    .syndrome_o (),
+    .err_o      (data_intg_err_tmp)
+  );
+
+  // only check read data (data_rvalid includes both reads and writes)
+  assign data_intg_err = data_we_q[0] ? 2'h0 : data_intg_err_tmp;
+
+  // Any error bit of either decoder counts, qualified by the matching delayed rvalid
+  assign bus_intg_err = (shadow_inputs_q[LockstepOffset-1].instr_rvalid & |instr_intg_err) |
+                        (shadow_inputs_q[LockstepOffset-1].data_rvalid  & |data_intg_err);
+
+  // Generate integrity bits
+  // The write data gets the same bit-32-into-bit-0 folding as the read data check above;
+  // only the integrity bits of the encoder output are used.
+  if (CHERIoTEn) begin
+    prim_secded_inv_39_32_enc u_data_gen (
+      .data_i (data_wdata_i[31:0]^{31'h0, data_wdata_i[32]}),
+      .data_o ({data_wdata_intg_o, unused_wdata})
+    );
+  end else begin
+    prim_secded_inv_39_32_enc u_data_gen (
+      .data_i (data_wdata_i[31:0]),
+      .data_o ({data_wdata_intg_o, unused_wdata})
+    );
+  end
+
+  
+  ////////////////////////////////////////
+  // TSMAP interface integrity checking //
+  ////////////////////////////////////////
+
+  logic       tsmap_intg_err;
+  logic [1:0] tsmap_intg_err_tmp;
+  logic [1:0] tsmap_cs_q;
+
+  // The TSMAP read data is only checked when the interface is in use
+  // (CHERIoTEn && CheriPPLBC); otherwise the error is tied off.
+  if (CHERIoTEn && CheriPPLBC) begin
+    // Delay the chip select by two cycles so that bit 0 qualifies the delayed read data
+    always @(posedge clk_i, negedge rst_ni) begin
+      if (~rst_ni) begin
+        tsmap_cs_q <= 2'b00;
+      end else begin
+        tsmap_cs_q <= {tsmap_cs_i, tsmap_cs_q[1]}; // align with shadow_inputs_q[LockstepOffset-1]
+      end
+    end
+
+    // Checks on incoming data
+    prim_secded_inv_39_32_dec u_tsmap_intg_dec (
+      .data_i     ({tsmap_rdata_intg_q, shadow_inputs_q[LockstepOffset-1].tsmap_rdata}),
+      .data_o     (),
+      .syndrome_o (),
+      .err_o      (tsmap_intg_err_tmp)
+    );
+
+    // err_o is two bits wide and either bit signals an integrity failure, so it has to be
+    // OR-reduced (as done for the instruction and data bus checks). ANDing the 2-bit
+    // vector with the 1-bit chip select and truncating to one bit would only ever look
+    // at bit 0 and silently ignore bit 1.
+    assign tsmap_intg_err = tsmap_cs_q[0] & (|tsmap_intg_err_tmp);
+
+  end else begin
+    assign tsmap_intg_err = 1'b0;
+  end
+
+  ///////////////////
+  // Output delays //
+  ///////////////////
+
+  // Bundle of core outputs. The main core's values are delayed in core_outputs_q, the
+  // shadow core drives its own copy (shadow_outputs_d). The members from rf_wcap_wb
+  // onwards are the CHERIoT additions.
+  typedef struct packed {
+    logic                        instr_req;
+    logic [31:0]                 instr_addr;
+    logic                        data_req;
+    logic                        data_we;
+    logic [3:0]                  data_be;
+    logic [31:0]                 data_addr;
+    logic [DataWidth-1:0]        data_wdata;
+    logic                        data_is_cap;
+    logic                        dummy_instr_id;
+    logic [4:0]                  rf_raddr_a;
+    logic [4:0]                  rf_raddr_b;
+    logic [4:0]                  rf_waddr_wb;
+    logic                        rf_we_wb;
+    logic [RegFileDataWidth-1:0] rf_wdata_wb_ecc;
+    logic [IC_NUM_WAYS-1:0]      ic_tag_req;
+    logic                        ic_tag_write;
+    logic [IC_INDEX_W-1:0]       ic_tag_addr;
+    logic [TagSizeECC-1:0]       ic_tag_wdata;
+    logic [IC_NUM_WAYS-1:0]      ic_data_req;
+    logic                        ic_data_write;
+    logic [IC_INDEX_W-1:0]       ic_data_addr;
+    logic [LineSizeECC-1:0]      ic_data_wdata;
+    logic                        irq_pending;
+    crash_dump_t                 crash_dump;
+    logic                        double_fault_seen;
+    logic                        icache_inval;
+    logic                        core_busy;
+    reg_cap_t                    rf_wcap_wb;
+    logic                        rf_trsv_en;
+    logic [4:0]                  rf_trsv_addr;
+    logic [6:0]                  rf_trsv_par;
+    logic [4:0]                  rf_trvk_addr;
+    logic                        rf_trvk_en;
+    logic                        rf_trvk_clrtag;
+    logic [6:0]                  rf_trvk_par;
+    logic                        tsmap_cs;
+    logic [15:0]                 tsmap_addr;
+    logic [MMRegDoutW-1:0]       mmreg_coreout;
+  } delayed_outputs_t;
+
+  // Delay line for the main core outputs, its input bundle, and the shadow core outputs
+  delayed_outputs_t [OutputsOffset-1:0]  core_outputs_q;
+  delayed_outputs_t                      core_outputs_in;
+  delayed_outputs_t                      shadow_outputs_d, shadow_outputs_q;
+
+  // Collect the main core outputs into the structure with a single named
+  // assignment pattern (one entry per struct member)
+  assign core_outputs_in = '{
+    instr_req:         instr_req_i,
+    instr_addr:        instr_addr_i,
+    data_req:          data_req_i,
+    data_we:           data_we_i,
+    data_be:           data_be_i,
+    data_addr:         data_addr_i,
+    data_wdata:        data_wdata_i,
+    data_is_cap:       data_is_cap_i,
+    dummy_instr_id:    dummy_instr_id_i,
+    rf_raddr_a:        rf_raddr_a_i,
+    rf_raddr_b:        rf_raddr_b_i,
+    rf_waddr_wb:       rf_waddr_wb_i,
+    rf_we_wb:          rf_we_wb_i,
+    rf_wdata_wb_ecc:   rf_wdata_wb_ecc_i,
+    ic_tag_req:        ic_tag_req_i,
+    ic_tag_write:      ic_tag_write_i,
+    ic_tag_addr:       ic_tag_addr_i,
+    ic_tag_wdata:      ic_tag_wdata_i,
+    ic_data_req:       ic_data_req_i,
+    ic_data_write:     ic_data_write_i,
+    ic_data_addr:      ic_data_addr_i,
+    ic_data_wdata:     ic_data_wdata_i,
+    irq_pending:       irq_pending_i,
+    crash_dump:        crash_dump_i,
+    double_fault_seen: double_fault_seen_i,
+    icache_inval:      icache_inval_i,
+    core_busy:         core_busy_i,
+    rf_wcap_wb:        rf_wcap_wb_i,
+    rf_trsv_en:        rf_trsv_en_i,
+    rf_trsv_addr:      rf_trsv_addr_i,
+    rf_trsv_par:       rf_trsv_par_i,
+    rf_trvk_addr:      rf_trvk_addr_i,
+    rf_trvk_en:        rf_trvk_en_i,
+    rf_trvk_clrtag:    rf_trvk_clrtag_i,
+    rf_trvk_par:       rf_trvk_par_i,
+    tsmap_cs:          tsmap_cs_i,
+    tsmap_addr:        tsmap_addr_i,
+    mmreg_coreout:     mmreg_coreout_i
+  };
+                        
+  // Output delay line (no reset): the newest sample enters at index OutputsOffset-1 and
+  // every other stage takes over the content of its successor
+  always_ff @(posedge clk_i) begin
+    core_outputs_q[OutputsOffset-1] <= core_outputs_in;
+    for (int unsigned stage = 0; stage < OutputsOffset - 1; stage++) begin
+      core_outputs_q[stage] <= core_outputs_q[stage+1];
+    end
+  end
+
+  ///////////////////////////////
+  // Shadow core instantiation //
+  ///////////////////////////////
+
+  logic shadow_alert_minor, shadow_alert_major;
+
+  cheriot_core #(
+    .PMPEnable         ( PMPEnable         ),
+    .PMPGranularity    ( PMPGranularity    ),
+    .PMPNumRegions     ( PMPNumRegions     ),
+    .MHPMCounterNum    ( MHPMCounterNum    ),
+    .MHPMCounterWidth  ( MHPMCounterWidth  ),
+    .RV32E             ( RV32E             ),
+    .RV32M             ( RV32M             ),
+    .RV32B             ( RV32B             ),
+    .BranchTargetALU   ( BranchTargetALU   ),
+    .ICache            ( ICache            ),
+    .ICacheECC         ( ICacheECC         ),
+    .BusSizeECC        ( BusSizeECC        ),
+    .TagSizeECC        ( TagSizeECC        ),
+    .LineSizeECC       ( LineSizeECC       ),
+    .BranchPredictor   ( BranchPredictor   ),
+    .DbgTriggerEn      ( DbgTriggerEn      ),
+    .DbgHwBreakNum     ( DbgHwBreakNum     ),
+    .WritebackStage    ( WritebackStage    ),
+    .ResetAll          ( ResetAll          ),
+    .RndCnstLfsrSeed   ( RndCnstLfsrSeed   ),
+    .RndCnstLfsrPerm   ( RndCnstLfsrPerm   ),
+    .SecureIbex        ( SecureIbex        ),
+    .DummyInstructions ( DummyInstructions ),
+    .RegFileECC        ( RegFileECC        ),
+    .RegFileDataWidth  ( RegFileDataWidth  ),
+    .DmHaltAddr        ( DmHaltAddr        ),
+    .DmExceptionAddr   ( DmExceptionAddr   ),
+    .CHERIoTEn         ( CHERIoTEn),
+    .DataWidth         ( DataWidth),
+    .HeapBase          ( HeapBase   ),
+    .TSMapBase         ( TSMapBase  ),
+    .TSMapSize         ( TSMapSize),
+    .MemCapFmt         ( MemCapFmt   ),
+    .CheriPPLBC        ( CheriPPLBC),
+    .CheriSBND2        ( CheriSBND2),
+    .CheriTBRE         ( CheriTBRE)
+  ) u_shadow_core (
+    .clk_i               (clk_i),
+    .rst_ni              (rst_shadow_n),
+
+    .hart_id_i           (hart_id_i),
+    .boot_addr_i         (boot_addr_i),
+
+    .cheri_pmode_i       (shadow_inputs_q[0].cheri_pmode),
+    .cheri_tsafe_en_i    (shadow_inputs_q[0].cheri_tsafe_en),
+
+    .instr_req_o         (shadow_outputs_d.instr_req),
+    .instr_gnt_i         (shadow_inputs_q[0].instr_gnt),
+    .instr_rvalid_i      (shadow_inputs_q[0].instr_rvalid),
+    .instr_addr_o        (shadow_outputs_d.instr_addr),
+    .instr_rdata_i       (shadow_inputs_q[0].instr_rdata),
+    .instr_err_i         (shadow_inputs_q[0].instr_err),
+
+    .data_req_o          (shadow_outputs_d.data_req),
+    .data_gnt_i          (shadow_inputs_q[0].data_gnt),
+    .data_rvalid_i       (shadow_inputs_q[0].data_rvalid),
+    .data_we_o           (shadow_outputs_d.data_we),
+    .data_be_o           (shadow_outputs_d.data_be),
+    .data_addr_o         (shadow_outputs_d.data_addr),
+    .data_wdata_o        (shadow_outputs_d.data_wdata),
+    .data_is_cap_o       (shadow_outputs_d.data_is_cap),
+    .data_rdata_i        (shadow_inputs_q[0].data_rdata),
+    .data_err_i          (shadow_inputs_q[0].data_err),
+
+    .dummy_instr_id_o    (shadow_outputs_d.dummy_instr_id),
+    .rf_raddr_a_o        (shadow_outputs_d.rf_raddr_a),
+    .rf_raddr_b_o        (shadow_outputs_d.rf_raddr_b),
+    .rf_waddr_wb_o       (shadow_outputs_d.rf_waddr_wb),
+    .rf_we_wb_o          (shadow_outputs_d.rf_we_wb),
+    .rf_wdata_wb_ecc_o   (shadow_outputs_d.rf_wdata_wb_ecc),
+    .rf_rdata_a_ecc_i    (shadow_inputs_q[0].rf_rdata_a_ecc),
+    .rf_rdata_b_ecc_i    (shadow_inputs_q[0].rf_rdata_b_ecc),
+    .rf_wcap_wb_o        (shadow_outputs_d.rf_wcap_wb),
+    .rf_rcap_a_i         (shadow_inputs_q[0].rf_rcap_a),
+    .rf_rcap_b_i         (shadow_inputs_q[0].rf_rcap_b),
+    .rf_reg_rdy_i        (shadow_inputs_q[0].rf_reg_rdy),
+    .rf_trsv_en_o        (shadow_outputs_d.rf_trsv_en),  
+    .rf_trsv_addr_o      (shadow_outputs_d.rf_trsv_addr),
+    .rf_trsv_par_o       (shadow_outputs_d.rf_trsv_par),
+    .rf_trvk_addr_o      (shadow_outputs_d.rf_trvk_addr),
+    .rf_trvk_en_o        (shadow_outputs_d.rf_trvk_en),  
+    .rf_trvk_clrtag_o    (shadow_outputs_d.rf_trvk_clrtag),
+    .rf_trvk_par_o       (shadow_outputs_d.rf_trvk_par),
+    .tsmap_cs_o          (shadow_outputs_d.tsmap_cs),    
+    .tsmap_addr_o        (shadow_outputs_d.tsmap_addr), 
+    .tsmap_rdata_i       (shadow_inputs_q[0].tsmap_rdata),
+    .mmreg_corein_i      (shadow_inputs_q[0].mmreg_corein),
+    .mmreg_coreout_o     (shadow_outputs_d.mmreg_coreout),  
+
+    .ic_tag_req_o        (shadow_outputs_d.ic_tag_req),
+    .ic_tag_write_o      (shadow_outputs_d.ic_tag_write),
+    .ic_tag_addr_o       (shadow_outputs_d.ic_tag_addr),
+    .ic_tag_wdata_o      (shadow_outputs_d.ic_tag_wdata),
+    .ic_tag_rdata_i      (shadow_tag_rdata_q[0]),
+    .ic_data_req_o       (shadow_outputs_d.ic_data_req),
+    .ic_data_write_o     (shadow_outputs_d.ic_data_write),
+    .ic_data_addr_o      (shadow_outputs_d.ic_data_addr),
+    .ic_data_wdata_o     (shadow_outputs_d.ic_data_wdata),
+    .ic_data_rdata_i     (shadow_data_rdata_q[0]),
+    .ic_scr_key_valid_i  (shadow_inputs_q[0].ic_scr_key_valid),
+
+    .irq_software_i      (shadow_inputs_q[0].irq_software),
+    .irq_timer_i         (shadow_inputs_q[0].irq_timer),
+    .irq_external_i      (shadow_inputs_q[0].irq_external),
+    .irq_fast_i          (shadow_inputs_q[0].irq_fast),
+    .irq_nm_i            (shadow_inputs_q[0].irq_nm),
+    .irq_pending_o       (shadow_outputs_d.irq_pending),
+
+    .debug_req_i         (shadow_inputs_q[0].debug_req),
+    .crash_dump_o        (shadow_outputs_d.crash_dump),
+    .double_fault_seen_o (shadow_outputs_d.double_fault_seen),
+
+`ifdef RVFI
+    .rvfi_valid         (),
+    .rvfi_order         (),
+    .rvfi_insn          (),
+    .rvfi_trap          (),
+    .rvfi_halt          (),
+    .rvfi_intr          (),
+    .rvfi_mode          (),
+    .rvfi_ixl           (),
+    .rvfi_rs1_addr      (),
+    .rvfi_rs2_addr      (),
+    .rvfi_rs3_addr      (),
+    .rvfi_rs1_rdata     (),
+    .rvfi_rs2_rdata     (),
+    .rvfi_rs3_rdata     (),
+    .rvfi_rd_addr       (),
+    .rvfi_rd_wdata      (),
+    .rvfi_pc_rdata      (),
+    .rvfi_pc_wdata      (),
+    .rvfi_mem_addr      (),
+    .rvfi_mem_rmask     (),
+    .rvfi_mem_wmask     (),
+    .rvfi_mem_rdata     (),
+    .rvfi_mem_wdata     (),
+    .rvfi_ext_mip       (),
+    .rvfi_ext_nmi       (),
+    .rvfi_ext_debug_req (),
+    .rvfi_ext_mcycle    (),
+    .rvfi_mem_wcap      (),
+    .rvfi_mem_rcap      (),
+    .rvfi_mem_is_cap    (),
+    .rvfi_rd_wcap       (),
+    .rvfi_rs2_rcap      (),
+    .rvfi_rs1_rcap      (),
+`endif
+
+    .fetch_enable_i    (shadow_inputs_q[0].fetch_enable),
+    .alert_minor_o     (shadow_alert_minor),
+    .alert_major_o     (shadow_alert_major),
+    .icache_inval_o    (shadow_outputs_d.icache_inval),
+    .core_busy_o       (shadow_outputs_d.core_busy)
+  );
+
+  // Register the shadow core outputs
+  always_ff @(posedge clk_i) begin
+    shadow_outputs_q <= shadow_outputs_d;
+  end
+
+  /////////////////////////
+  // Compare the outputs //
+  /////////////////////////
+
+  logic outputs_mismatch;
+
+  assign outputs_mismatch       = enable_cmp_q & (shadow_outputs_q != core_outputs_q[0]);
+  assign alert_major_internal_o = outputs_mismatch | shadow_alert_major;
+  assign alert_major_bus_o      = bus_intg_err | tsmap_intg_err;
+  assign alert_minor_o          = shadow_alert_minor;
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_multdiv_fast.sv b/hw/ip/cheriot-ibex/rtl/cheriot_multdiv_fast.sv
new file mode 100644
index 0000000..522bb6b
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_multdiv_fast.sv
@@ -0,0 +1,531 @@
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+`define OP_L 15:0
+`define OP_H 31:16
+
+/**
+ * Fast Multiplier and Division
+ *
+ * 16x16 kernel multiplier and Long Division
+ */
+
+`include "prim_assert.sv"
+
+module cheriot_multdiv_fast #(
+  parameter cheriot_pkg::rv32m_e RV32M = cheriot_pkg::RV32MFast // selects the multiplier variant
+  ) (
+  input  logic             clk_i,
+  input  logic             rst_ni,
+  input  logic             mult_en_i,  // dynamic enable signal, for FSM control
+  input  logic             div_en_i,   // dynamic enable signal, for FSM control
+  input  logic             mult_sel_i, // static decoder output (not used by this module)
+  input  logic             div_sel_i,  // static decoder output, for data muxes
+  input  cheriot_pkg::md_op_e operator_i,
+  input  logic  [1:0]      signed_mode_i,     // [0]: op_a_i is signed, [1]: op_b_i is signed
+  input  logic [31:0]      op_a_i,
+  input  logic [31:0]      op_b_i,
+  input  logic [33:0]      alu_adder_ext_i,   // ALU adder result; [32:1] used by the divider
+  input  logic [31:0]      alu_adder_i,       // ALU adder result, used for the negations
+  input  logic             equal_to_zero_i,   // sampled in MD_IDLE to detect a zero divisor
+  input  logic             data_ind_timing_i, // 1: no early exit on division by zero
+
+  output logic [32:0]      alu_operand_a_o,
+  output logic [32:0]      alu_operand_b_o,
+
+  input  logic [33:0]      imd_val_q_i[2],    // [0]: MAC result / remainder, [1]: denominator
+  output logic [33:0]      imd_val_d_o[2],
+  output logic [1:0]       imd_val_we_o,      // per-entry write enables for imd_val_d_o
+
+  input  logic             multdiv_ready_id_i, // when low, the finished result is held
+
+  output logic [31:0]      multdiv_result_o,
+  output logic             valid_o             // mult_valid | div_valid
+);
+
+  import cheriot_pkg::*;
+
+  // Both multiplier variants
+  logic signed [34:0] mac_res_signed;
+  logic        [34:0] mac_res_ext;
+  logic        [33:0] accum;
+  logic        sign_a, sign_b;
+  logic        mult_valid;
+  logic        signed_mult;
+
+  // Results that become intermediate value depending on whether mul or div is being calculated
+  logic [33:0] mac_res_d, op_remainder_d;
+  // Raw output of MAC calculation
+  logic [33:0] mac_res;
+
+  // Divider signals
+  logic        div_sign_a, div_sign_b;
+  logic        is_greater_equal;
+  logic        div_change_sign, rem_change_sign;
+  logic [31:0] one_shift;          // 1 << div_counter_q: quotient bit set in this iteration
+  logic [31:0] op_denominator_q;   // read back from imd_val_q_i[1]
+  logic [31:0] op_numerator_q;
+  logic [31:0] op_quotient_q;
+  logic [31:0] op_denominator_d;
+  logic [31:0] op_numerator_d;
+  logic [31:0] op_quotient_d;
+  logic [31:0] next_remainder;
+  logic [32:0] next_quotient;
+  logic [31:0] res_adder_h;        // alu_adder_ext_i[32:1]
+  logic        div_valid;
+  logic [ 4:0] div_counter_q, div_counter_d;
+  logic        multdiv_en;
+  logic        mult_hold;          // blocks mult_en_internal while ID stage is not ready
+  logic        div_hold;           // blocks div_en_internal while ID stage is not ready
+  logic        div_by_zero_d, div_by_zero_q;
+
+  logic        mult_en_internal;
+  logic        div_en_internal;
+
+  typedef enum logic [2:0] {
+    MD_IDLE, MD_ABS_A, MD_ABS_B, MD_COMP, MD_LAST, MD_CHANGE_SIGN, MD_FINISH
+  } md_fsm_e;
+  md_fsm_e md_state_q, md_state_d;
+
+  logic unused_mult_sel_i;  // mult_sel_i has no other reader in this module
+  assign unused_mult_sel_i = mult_sel_i;
+
+  assign mult_en_internal = mult_en_i & ~mult_hold;
+  assign div_en_internal  = div_en_i & ~div_hold;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      div_counter_q    <= '0;
+      md_state_q       <= MD_IDLE;
+      op_numerator_q   <= '0;
+      op_quotient_q    <= '0;
+      div_by_zero_q    <= '0;
+    end else if (div_en_internal) begin
+      div_counter_q    <= div_counter_d;
+      op_numerator_q   <= op_numerator_d;
+      op_quotient_q    <= op_quotient_d;
+      md_state_q       <= md_state_d;
+      div_by_zero_q    <= div_by_zero_d;
+    end
+  end
+
+  `ASSERT_KNOWN(DivEnKnown, div_en_internal)
+  `ASSERT_KNOWN(MultEnKnown, mult_en_internal)
+  `ASSERT_KNOWN(MultDivEnKnown, multdiv_en)
+
+  assign multdiv_en = mult_en_internal | div_en_internal;
+
+  // Intermediate value register shared with ALU
+  assign imd_val_d_o[0] = div_sel_i ? op_remainder_d : mac_res_d;
+  assign imd_val_we_o[0] = multdiv_en;
+
+  assign imd_val_d_o[1] = {2'b0, op_denominator_d};
+  assign imd_val_we_o[1] = div_en_internal;
+  assign op_denominator_q = imd_val_q_i[1][31:0];
+  logic [1:0] unused_imd_val;
+  assign unused_imd_val = imd_val_q_i[1][33:32];
+  logic unused_mac_res_ext;
+  assign unused_mac_res_ext = mac_res_ext[34];
+
+  assign signed_mult      = (signed_mode_i != 2'b00);
+  assign multdiv_result_o = div_sel_i ? imd_val_q_i[0][31:0] : mac_res_d[31:0];
+
+  // The single cycle multiplier uses three 17 bit multipliers to compute MUL instructions in a
+  // single cycle and MULH instructions in two cycles.
+  if (RV32M == RV32MSingleCycle) begin : gen_mult_single_cycle
+
+    typedef enum logic {
+      MULL, MULH
+    } mult_fsm_e;
+    mult_fsm_e mult_state_q, mult_state_d;
+
+    logic signed [33:0] mult1_res, mult2_res, mult3_res;
+    logic [33:0]        mult1_res_uns;
+    logic [33:32]       unused_mult1_res_uns;
+    logic [15:0]        mult1_op_a, mult1_op_b;
+    logic [15:0]        mult2_op_a, mult2_op_b;
+    logic [15:0]        mult3_op_a, mult3_op_b;
+    logic               mult1_sign_a, mult1_sign_b;
+    logic               mult2_sign_a, mult2_sign_b;
+    logic               mult3_sign_a, mult3_sign_b;
+    logic [33:0]        summand1, summand2, summand3;
+
+    assign mult1_res = $signed({mult1_sign_a, mult1_op_a}) * $signed({mult1_sign_b, mult1_op_b});
+    assign mult2_res = $signed({mult2_sign_a, mult2_op_a}) * $signed({mult2_sign_b, mult2_op_b});
+    assign mult3_res = $signed({mult3_sign_a, mult3_op_a}) * $signed({mult3_sign_b, mult3_op_b});
+
+    assign mac_res_signed = $signed(summand1) + $signed(summand2) + $signed(summand3);
+
+    assign mult1_res_uns  = $unsigned(mult1_res);
+    assign mac_res_ext    = $unsigned(mac_res_signed);
+    assign mac_res        = mac_res_ext[33:0];
+
+    assign sign_a = signed_mode_i[0] & op_a_i[31];
+    assign sign_b = signed_mode_i[1] & op_b_i[31];
+
+    // The first two multipliers are only used in state 1 (MULL). We can assign them statically.
+    // al*bl
+    assign mult1_sign_a = 1'b0;
+    assign mult1_sign_b = 1'b0;
+    assign mult1_op_a = op_a_i[`OP_L];
+    assign mult1_op_b = op_b_i[`OP_L];
+
+    // al*bh
+    assign mult2_sign_a = 1'b0;
+    assign mult2_sign_b = sign_b;
+    assign mult2_op_a = op_a_i[`OP_L];
+    assign mult2_op_b = op_b_i[`OP_H];
+
+    // used in MULH
+    assign accum[17:0] = imd_val_q_i[0][33:16];
+    assign accum[33:18] = {16{signed_mult & imd_val_q_i[0][33]}};
+
+    always_comb begin
+      // Default values == MULL
+
+      // ah*bl
+      mult3_sign_a = sign_a;
+      mult3_sign_b = 1'b0;
+      mult3_op_a = op_a_i[`OP_H];
+      mult3_op_b = op_b_i[`OP_L];
+
+      summand1 = {18'h0, mult1_res_uns[`OP_H]};
+      summand2 = $unsigned(mult2_res);
+      summand3 = $unsigned(mult3_res);
+
+      // mac_res = A*B[47:16], mult1_res = A*B[15:0]
+      mac_res_d = {2'b0, mac_res[`OP_L], mult1_res_uns[`OP_L]};
+      mult_valid = mult_en_i;
+      mult_state_d = MULL;
+
+      mult_hold = 1'b0;
+
+      unique case (mult_state_q)
+
+        MULL: begin
+          if (operator_i != MD_OP_MULL) begin
+            mac_res_d = mac_res;
+            mult_valid = 1'b0;
+            mult_state_d = MULH;
+          end else begin
+            mult_hold = ~multdiv_ready_id_i;
+          end
+        end
+
+        MULH: begin
+          // ah*bh
+          mult3_sign_a = sign_a;
+          mult3_sign_b = sign_b;
+          mult3_op_a = op_a_i[`OP_H];
+          mult3_op_b = op_b_i[`OP_H];
+          mac_res_d = mac_res;
+
+          summand1 = '0;
+          summand2 = accum;
+          summand3 = $unsigned(mult3_res);
+
+          mult_state_d = MULL;
+          mult_valid = 1'b1;
+
+          mult_hold = ~multdiv_ready_id_i;
+        end
+
+        default: begin
+          mult_state_d = MULL;
+        end
+
+      endcase // mult_state_q
+    end
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        mult_state_q <= MULL;
+      end else begin
+        if (mult_en_internal) begin
+          mult_state_q <= mult_state_d;
+        end
+      end
+    end
+
+    assign unused_mult1_res_uns = mult1_res_uns[33:32];
+
+    // States must be known/valid.
+    `ASSERT_KNOWN(IbexMultStateKnown, mult_state_q)
+
+  // The fast multiplier uses one 17 bit multiplier to compute MUL instructions in 3 cycles
+  // and MULH instructions in 4 cycles.
+  end else begin : gen_mult_fast
+    logic [15:0] mult_op_a;
+    logic [15:0] mult_op_b;
+
+    typedef enum logic [1:0] {
+      ALBL, ALBH, AHBL, AHBH
+    } mult_fsm_e;
+    mult_fsm_e mult_state_q, mult_state_d;
+
+    // The 2 MSBs of mac_res_ext (mac_res_ext[34:33]) are always equal since:
+    // 1. The 2 MSBs of the multiplicands are always equal, and
+    // 2. The 16 MSBs of the addend (accum[33:18]) are always equal.
+    // Thus, it is safe to ignore mac_res_ext[34].
+    assign mac_res_signed =
+        $signed({sign_a, mult_op_a}) * $signed({sign_b, mult_op_b}) + $signed(accum);
+    assign mac_res_ext    = $unsigned(mac_res_signed);
+    assign mac_res        = mac_res_ext[33:0];
+
+    always_comb begin
+      mult_op_a    = op_a_i[`OP_L];
+      mult_op_b    = op_b_i[`OP_L];
+      sign_a       = 1'b0;
+      sign_b       = 1'b0;
+      accum        = imd_val_q_i[0];
+      mac_res_d    = mac_res;
+      mult_state_d = mult_state_q;
+      mult_valid   = 1'b0;
+      mult_hold    = 1'b0;
+
+      unique case (mult_state_q)
+
+        ALBL: begin
+          // al*bl
+          mult_op_a = op_a_i[`OP_L];
+          mult_op_b = op_b_i[`OP_L];
+          sign_a    = 1'b0;
+          sign_b    = 1'b0;
+          accum     = '0;
+          mac_res_d = mac_res;
+          mult_state_d = ALBH;
+        end
+
+        ALBH: begin
+          // al*bh<<16
+          mult_op_a = op_a_i[`OP_L];
+          mult_op_b = op_b_i[`OP_H];
+          sign_a    = 1'b0;
+          sign_b    = signed_mode_i[1] & op_b_i[31];
+          // result of AL*BL (in imd_val_q_i[0]) always unsigned with no carry
+          accum     = {18'b0, imd_val_q_i[0][31:16]};
+          if (operator_i == MD_OP_MULL) begin
+            mac_res_d = {2'b0, mac_res[`OP_L], imd_val_q_i[0][`OP_L]};
+          end else begin
+            // MD_OP_MULH
+            mac_res_d = mac_res;
+          end
+          mult_state_d = AHBL;
+        end
+
+        AHBL: begin
+          // ah*bl<<16
+          mult_op_a = op_a_i[`OP_H];
+          mult_op_b = op_b_i[`OP_L];
+          sign_a    = signed_mode_i[0] & op_a_i[31];
+          sign_b    = 1'b0;
+          if (operator_i == MD_OP_MULL) begin
+            accum        = {18'b0, imd_val_q_i[0][31:16]};
+            mac_res_d    = {2'b0, mac_res[15:0], imd_val_q_i[0][15:0]};
+            mult_valid   = 1'b1;
+
+            // Note no state transition will occur if mult_hold is set
+            mult_state_d = ALBL;
+            mult_hold    = ~multdiv_ready_id_i;
+          end else begin
+            accum        = imd_val_q_i[0];
+            mac_res_d    = mac_res;
+            mult_state_d = AHBH;
+          end
+        end
+
+        AHBH: begin
+          // only MD_OP_MULH here
+          // ah*bh
+          mult_op_a = op_a_i[`OP_H];
+          mult_op_b = op_b_i[`OP_H];
+          sign_a    = signed_mode_i[0] & op_a_i[31];
+          sign_b    = signed_mode_i[1] & op_b_i[31];
+          accum[17: 0]  = imd_val_q_i[0][33:16];
+          accum[33:18]  = {16{signed_mult & imd_val_q_i[0][33]}};
+          // result of AH*BL is not signed only if signed_mode_i == 2'b00
+          mac_res_d    = mac_res;
+          mult_valid   = 1'b1;
+
+          // Note no state transition will occur if mult_hold is set
+          mult_state_d = ALBL;
+          mult_hold    = ~multdiv_ready_id_i;
+        end
+        default: begin
+          mult_state_d = ALBL;
+        end
+      endcase // mult_state_q
+    end
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        mult_state_q <= ALBL;
+      end else begin
+        if (mult_en_internal) begin
+          mult_state_q <= mult_state_d;
+        end
+      end
+    end
+
+    // States must be known/valid.
+    `ASSERT_KNOWN(IbexMultStateKnown, mult_state_q)
+
+  end // gen_mult_fast
+
+  // Divider
+  assign res_adder_h    = alu_adder_ext_i[32:1];
+  logic [1:0] unused_alu_adder_ext;
+  assign unused_alu_adder_ext = {alu_adder_ext_i[33],alu_adder_ext_i[0]};
+
+  assign next_remainder = is_greater_equal ? res_adder_h[31:0] : imd_val_q_i[0][31:0];
+  assign next_quotient  = is_greater_equal ? {1'b0, op_quotient_q} | {1'b0, one_shift} :
+                                             {1'b0, op_quotient_q};
+
+  assign one_shift      = {31'b0, 1'b1} << div_counter_q;
+
+  // The adder in the ALU computes alu_operand_a_o + alu_operand_b_o, i.e. Remainder - Divisor.
+  // If Remainder - Divisor >= 0, is_greater_equal is 1, the next Remainder is the difference
+  // (held in res_adder_h) and the div_counter_q-th bit of the Quotient is set to 1.
+  always_comb begin
+    if ((imd_val_q_i[0][31] ^ op_denominator_q[31]) == 1'b0) begin
+      is_greater_equal = (res_adder_h[31] == 1'b0);
+    end else begin
+      is_greater_equal = imd_val_q_i[0][31];
+    end
+  end
+
+  assign div_sign_a      = op_a_i[31] & signed_mode_i[0];
+  assign div_sign_b      = op_b_i[31] & signed_mode_i[1];
+  assign div_change_sign = (div_sign_a ^ div_sign_b) & ~div_by_zero_q;
+  assign rem_change_sign = div_sign_a;
+
+
+  always_comb begin
+    div_counter_d    = div_counter_q - 5'h1;
+    op_remainder_d   = imd_val_q_i[0];
+    op_quotient_d    = op_quotient_q;
+    md_state_d       = md_state_q;
+    op_numerator_d   = op_numerator_q;
+    op_denominator_d = op_denominator_q;
+    alu_operand_a_o  = {32'h0  , 1'b1};
+    alu_operand_b_o  = {~op_b_i, 1'b1};
+    div_valid        = 1'b0;
+    div_hold         = 1'b0;
+    div_by_zero_d    = div_by_zero_q;
+
+    unique case (md_state_q)
+      MD_IDLE: begin
+        if (operator_i == MD_OP_DIV) begin
+          // Check if the Denominator is 0
+          // quotient for division by 0 is specified to be -1
+          // Note with data-independent time option, the full divide operation will proceed as
+          // normal and will naturally return -1
+          op_remainder_d = '1;
+          md_state_d     = (!data_ind_timing_i && equal_to_zero_i) ? MD_FINISH : MD_ABS_A;
+          // Record that this is a div by zero to stop the sign change at the end of the
+          // division (in data_ind_timing mode).
+          div_by_zero_d  = equal_to_zero_i;
+        end else begin
+          // Check if the Denominator is 0
+          // remainder for division by 0 is specified to be the numerator (operand a)
+          // Note with data-independent time option, the full divide operation will proceed as
+          // normal and will naturally return operand a
+          op_remainder_d = {2'b0, op_a_i};
+          md_state_d     = (!data_ind_timing_i && equal_to_zero_i) ? MD_FINISH : MD_ABS_A;
+        end
+        // 0 - B = 0 iff B == 0
+        alu_operand_a_o  = {32'h0  , 1'b1};
+        alu_operand_b_o  = {~op_b_i, 1'b1};
+        div_counter_d    = 5'd31;
+      end
+
+      MD_ABS_A: begin
+        // clear the quotient
+        op_quotient_d   = '0;
+        // |A|: take the ALU result (0 - A) only when A is signed and negative
+        op_numerator_d  = div_sign_a ? alu_adder_i : op_a_i;
+        md_state_d      = MD_ABS_B;
+        div_counter_d   = 5'd31;
+        // ABS(A) = 0 - A
+        alu_operand_a_o = {32'h0  , 1'b1};
+        alu_operand_b_o = {~op_a_i, 1'b1};
+      end
+
+      MD_ABS_B: begin
+        // seed the remainder with the MSB of the numerator
+        op_remainder_d   = { 33'h0, op_numerator_q[31]};
+        // |B|: take the ALU result (0 - B) only when B is signed and negative
+        op_denominator_d = div_sign_b ? alu_adder_i : op_b_i;
+        md_state_d       = MD_COMP;
+        div_counter_d    = 5'd31;
+        // ABS(B) = 0 - B
+        alu_operand_a_o  = {32'h0  , 1'b1};
+        alu_operand_b_o  = {~op_b_i, 1'b1};
+      end
+
+      MD_COMP: begin
+        op_remainder_d  = {1'b0, next_remainder[31:0], op_numerator_q[div_counter_d]};
+        op_quotient_d   = next_quotient[31:0];
+        md_state_d      = (div_counter_q == 5'd1) ? MD_LAST : MD_COMP;
+        // Trial subtraction: Remainder - Denominator
+        alu_operand_a_o = {imd_val_q_i[0][31:0], 1'b1}; // it contains the remainder
+        alu_operand_b_o = {~op_denominator_q[31:0], 1'b1};  // -denominator two's complement
+      end
+
+      MD_LAST: begin
+        if (operator_i == MD_OP_DIV) begin
+          // this time we save the quotient in op_remainder_d (i.e. imd_val_q_i[0]) since
+          // we do not need anymore the remainder
+          op_remainder_d = {1'b0, next_quotient};
+        end else begin
+          // this time we do not save the quotient anymore since we need only the remainder
+          op_remainder_d = {2'b0, next_remainder[31:0]};
+        end
+        // Trial subtraction: Remainder - Denominator
+        alu_operand_a_o  = {imd_val_q_i[0][31:0], 1'b1}; // it contains the remainder
+        alu_operand_b_o  = {~op_denominator_q[31:0], 1'b1};  // -denominator two's complement
+
+        md_state_d = MD_CHANGE_SIGN;
+      end
+
+      MD_CHANGE_SIGN: begin
+        md_state_d  = MD_FINISH;
+        if (operator_i == MD_OP_DIV) begin
+          op_remainder_d = (div_change_sign) ? {2'h0, alu_adder_i} : imd_val_q_i[0];
+        end else begin
+          op_remainder_d = (rem_change_sign) ? {2'h0, alu_adder_i} : imd_val_q_i[0];
+        end
+        // Negation: 0 - Quotient (or 0 - Remainder), used only if a sign change is needed
+        alu_operand_a_o  = {32'h0  , 1'b1};
+        alu_operand_b_o  = {~imd_val_q_i[0][31:0], 1'b1};
+      end
+
+      MD_FINISH: begin
+        // Hold result until ID stage is ready to accept it
+        // Note no state transition will occur if div_hold is set
+        md_state_d = MD_IDLE;
+        div_hold   = ~multdiv_ready_id_i;
+        div_valid   = 1'b1;
+      end
+
+      default: begin
+        md_state_d = MD_IDLE;
+      end
+    endcase // md_state_q
+  end
+
+  assign valid_o = mult_valid | div_valid;
+
+  // States must be known/valid.
+  `ASSERT(IbexMultDivStateValid, md_state_q inside {
+      MD_IDLE, MD_ABS_A, MD_ABS_B, MD_COMP, MD_LAST, MD_CHANGE_SIGN, MD_FINISH})
+
+`ifdef FORMAL
+  `ifdef YOSYS
+    `include "formal_tb_frag.svh"
+  `endif
+`endif
+
+endmodule // cheriot_multdiv_fast
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_multdiv_slow.sv b/hw/ip/cheriot-ibex/rtl/cheriot_multdiv_slow.sv
new file mode 100644
index 0000000..8fbc929
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_multdiv_slow.sv
@@ -0,0 +1,374 @@
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Slow Multiplier and Division
+ *
+ * Baugh-Wooley multiplier and Long Division
+ */
+
+`include "prim_assert.sv"
+
+module cheriot_multdiv_slow
+(
+  input  logic             clk_i,
+  input  logic             rst_ni,
+  input  logic             mult_en_i,  // dynamic enable signal, for FSM control
+  input  logic             div_en_i,   // dynamic enable signal, for FSM control
+  input  logic             mult_sel_i, // static decoder output, for data muxes
+  input  logic             div_sel_i,  // static decoder output, for data muxes
+  input  cheriot_pkg::md_op_e operator_i,
+  input  logic  [1:0]      signed_mode_i,
+  input  logic [31:0]      op_a_i,
+  input  logic [31:0]      op_b_i,
+  input  logic [33:0]      alu_adder_ext_i,
+  input  logic [31:0]      alu_adder_i,
+  input  logic             equal_to_zero_i,
+  input  logic             data_ind_timing_i,
+
+  output logic [32:0]      alu_operand_a_o,
+  output logic [32:0]      alu_operand_b_o,
+
+  input  logic [33:0]      imd_val_q_i[2],
+  output logic [33:0]      imd_val_d_o[2],
+  output logic  [1:0]      imd_val_we_o,
+
+  input  logic             multdiv_ready_id_i,
+
+  output logic [31:0]      multdiv_result_o,
+
+  output logic             valid_o
+);
+
+  import cheriot_pkg::*;
+
+  typedef enum logic [2:0] {
+    MD_IDLE, MD_ABS_A, MD_ABS_B, MD_COMP, MD_LAST, MD_CHANGE_SIGN, MD_FINISH
+  } md_fsm_e;
+  md_fsm_e md_state_q, md_state_d;
+
+  logic [32:0] accum_window_q, accum_window_d;
+  logic        unused_imd_val0;
+  logic [ 1:0] unused_imd_val1;
+
+  logic [32:0] res_adder_l;
+  logic [32:0] res_adder_h;
+
+  logic [ 4:0] multdiv_count_q, multdiv_count_d;
+  logic [32:0] op_b_shift_q, op_b_shift_d;
+  logic [32:0] op_a_shift_q, op_a_shift_d;
+  logic [32:0] op_a_ext, op_b_ext;
+  logic [32:0] one_shift;
+  logic [32:0] op_a_bw_pp, op_a_bw_last_pp;
+  logic [31:0] b_0;
+  logic        sign_a, sign_b;
+  logic [32:0] next_quotient;
+  logic [31:0] next_remainder;
+  logic [31:0] op_numerator_q, op_numerator_d;
+  logic        is_greater_equal;
+  logic        div_change_sign, rem_change_sign;
+  logic        div_by_zero_d, div_by_zero_q;
+  logic        multdiv_hold;
+  logic        multdiv_en;
+
+   // (accum_window_q + op_a_shift_q)
+  assign res_adder_l = alu_adder_ext_i[32:0];
+   // (accum_window_q + op_a_shift_q)>>1
+  assign res_adder_h = alu_adder_ext_i[33:1];
+
+  /////////////////////
+  // ALU Operand MUX //
+  /////////////////////
+
+  // Intermediate value register shared with ALU
+  assign imd_val_d_o[0]  = {1'b0,accum_window_d};
+  assign imd_val_we_o[0] = ~multdiv_hold;
+  assign accum_window_q  = imd_val_q_i[0][32:0];
+  assign unused_imd_val0 = imd_val_q_i[0][33];
+
+  assign imd_val_d_o[1]  = {2'b00, op_numerator_d};
+  assign imd_val_we_o[1] = multdiv_en;
+  assign op_numerator_q  = imd_val_q_i[1][31:0];
+  assign unused_imd_val1 = imd_val_q_i[1][33:32];
+
+  always_comb begin
+    alu_operand_a_o = accum_window_q;
+
+    unique case (operator_i)
+
+      MD_OP_MULL: begin
+        alu_operand_b_o = op_a_bw_pp;
+      end
+
+      MD_OP_MULH: begin
+        alu_operand_b_o = (md_state_q == MD_LAST) ? op_a_bw_last_pp : op_a_bw_pp;
+      end
+
+      MD_OP_DIV,
+      MD_OP_REM: begin
+        unique case (md_state_q)
+          MD_IDLE: begin
+            // 0 - B = 0 iff B == 0
+            alu_operand_a_o = {32'h0  , 1'b1};
+            alu_operand_b_o = {~op_b_i, 1'b1};
+          end
+          MD_ABS_A: begin
+            // ABS(A) = 0 - A
+            alu_operand_a_o = {32'h0  , 1'b1};
+            alu_operand_b_o = {~op_a_i, 1'b1};
+          end
+          MD_ABS_B: begin
+            // ABS(B) = 0 - B
+            alu_operand_a_o = {32'h0  , 1'b1};
+            alu_operand_b_o = {~op_b_i, 1'b1};
+          end
+          MD_CHANGE_SIGN: begin
+            // ABS(Quotient) = 0 - Quotient (or Remainder)
+            alu_operand_a_o = {32'h0  , 1'b1};
+            alu_operand_b_o = {~accum_window_q[31:0], 1'b1};
+          end
+          default: begin
+            // Division
+            alu_operand_a_o = {accum_window_q[31:0], 1'b1}; // it contains the remainder
+            alu_operand_b_o = {~op_b_shift_q[31:0], 1'b1};  // -denominator two's compliment
+          end
+        endcase
+      end
+      default: begin
+        alu_operand_a_o = accum_window_q;
+        alu_operand_b_o = {~op_b_shift_q[31:0], 1'b1};
+      end
+    endcase
+  end
+
+  // Multiplier partial product calculation
+  assign b_0             = {32{op_b_shift_q[0]}};
+  assign op_a_bw_pp      = { ~(op_a_shift_q[32] & op_b_shift_q[0]),  (op_a_shift_q[31:0] & b_0) };
+  assign op_a_bw_last_pp = {  (op_a_shift_q[32] & op_b_shift_q[0]), ~(op_a_shift_q[31:0] & b_0) };
+
+  // Sign extend the input operands
+  assign sign_a   = op_a_i[31] & signed_mode_i[0];
+  assign sign_b   = op_b_i[31] & signed_mode_i[1];
+
+  assign op_a_ext = {sign_a, op_a_i};
+  assign op_b_ext = {sign_b, op_b_i};
+
+  // Divider calculations
+
+  // The adder in the ALU computes Remainder - Divisor. If Remainder - Divisor >= 0,
+  // is_greater_equal is true, the next Remainder is the subtraction result and the Quotient
+  // multdiv_count_q-th bit is set to 1.
+  assign is_greater_equal = (accum_window_q[31] == op_b_shift_q[31]) ?
+      ~res_adder_h[31] : accum_window_q[31];
+
+  assign one_shift      = {32'b0, 1'b1} << multdiv_count_q;
+
+  assign next_remainder = is_greater_equal ? res_adder_h[31:0]        : accum_window_q[31:0];
+  assign next_quotient  = is_greater_equal ? op_a_shift_q | one_shift : op_a_shift_q;
+
+  assign div_change_sign  = (sign_a ^ sign_b) & ~div_by_zero_q;
+  assign rem_change_sign  = sign_a;
+
+  always_comb begin
+    multdiv_count_d  = multdiv_count_q;
+    accum_window_d   = accum_window_q;
+    op_b_shift_d     = op_b_shift_q;
+    op_a_shift_d     = op_a_shift_q;
+    op_numerator_d   = op_numerator_q;
+    md_state_d       = md_state_q;
+    multdiv_hold     = 1'b0;
+    div_by_zero_d    = div_by_zero_q;
+    if (mult_sel_i || div_sel_i) begin
+      unique case (md_state_q)
+        MD_IDLE: begin
+          unique case (operator_i)
+            MD_OP_MULL: begin
+              op_a_shift_d   = op_a_ext << 1;
+              accum_window_d = {       ~(op_a_ext[32]   &     op_b_i[0]),
+                                         op_a_ext[31:0] & {32{op_b_i[0]}}  };
+              op_b_shift_d   = op_b_ext >> 1;
+              // Proceed with multiplication by 0/1 in data-independent time mode
+              md_state_d     = (!data_ind_timing_i && ((op_b_ext >> 1) == 0)) ? MD_LAST : MD_COMP;
+            end
+            MD_OP_MULH: begin
+              op_a_shift_d   = op_a_ext;
+              accum_window_d = { 1'b1, ~(op_a_ext[32]   &     op_b_i[0]),
+                                         op_a_ext[31:1] & {31{op_b_i[0]}}  };
+              op_b_shift_d   = op_b_ext >> 1;
+              md_state_d     = MD_COMP;
+            end
+            MD_OP_DIV: begin
+              // Check if the denominator is 0
+              // quotient for division by 0 is specified to be -1
+              // Note with data-independent time option, the full divide operation will proceed as
+              // normal and will naturally return -1
+              accum_window_d = {33{1'b1}};
+              md_state_d     = (!data_ind_timing_i && equal_to_zero_i) ? MD_FINISH : MD_ABS_A;
+              // Record that this is a div by zero to stop the sign change at the end of the
+              // division (in data_ind_timing mode).
+              div_by_zero_d  = equal_to_zero_i;
+            end
+            MD_OP_REM: begin
+              // Check if the denominator is 0
+              // remainder for division by 0 is specified to be the numerator (operand a)
+              // Note with data-independent time option, the full divide operation will proceed as
+              // normal and will naturally return operand a
+              accum_window_d = op_a_ext;
+              md_state_d     = (!data_ind_timing_i && equal_to_zero_i) ? MD_FINISH : MD_ABS_A;
+            end
+            default:;
+          endcase
+          multdiv_count_d   = 5'd31;
+        end
+
+        MD_ABS_A: begin
+          // quotient
+          op_a_shift_d   = '0;
+          // A abs value
+          op_numerator_d = sign_a ? alu_adder_i : op_a_i;
+          md_state_d     = MD_ABS_B;
+        end
+
+        MD_ABS_B: begin
+          // remainder
+          accum_window_d = {32'h0, op_numerator_q[31]};
+          // B abs value
+          op_b_shift_d   = sign_b ? {1'b0, alu_adder_i} : {1'b0, op_b_i};
+          md_state_d     = MD_COMP;
+        end
+
+        MD_COMP: begin
+          multdiv_count_d = multdiv_count_q - 5'h1;
+          unique case (operator_i)
+            MD_OP_MULL: begin
+              accum_window_d = res_adder_l;
+              op_a_shift_d   = op_a_shift_q << 1;
+              op_b_shift_d   = op_b_shift_q >> 1;
+              // Multiplication is complete once op_b is zero, unless in data_ind_timing mode where
+              // the maximum possible shift-add operations will be completed regardless of op_b
+              md_state_d     = ((!data_ind_timing_i && (op_b_shift_d == 0)) ||
+                                (multdiv_count_q == 5'd1)) ? MD_LAST : MD_COMP;
+            end
+            MD_OP_MULH: begin
+              accum_window_d = res_adder_h;
+              op_a_shift_d   = op_a_shift_q;
+              op_b_shift_d   = op_b_shift_q >> 1;
+              md_state_d     = (multdiv_count_q == 5'd1) ? MD_LAST : MD_COMP;
+            end
+            MD_OP_DIV,
+            MD_OP_REM: begin
+              accum_window_d = {next_remainder[31:0], op_numerator_q[multdiv_count_d]};
+              op_a_shift_d   = next_quotient;
+              md_state_d     = (multdiv_count_q == 5'd1) ? MD_LAST : MD_COMP;
+            end
+            default: ;
+          endcase
+        end
+
+        MD_LAST: begin
+          unique case (operator_i)
+            MD_OP_MULL: begin
+              accum_window_d = res_adder_l;
+
+              // Note no state transition will occur if multdiv_hold is set
+              md_state_d   = MD_IDLE;
+              multdiv_hold = ~multdiv_ready_id_i;
+            end
+            MD_OP_MULH: begin
+              accum_window_d = res_adder_l;
+              md_state_d     = MD_IDLE;
+
+              // Note no state transition will occur if multdiv_hold is set
+              md_state_d   = MD_IDLE;
+              multdiv_hold = ~multdiv_ready_id_i;
+            end
+            MD_OP_DIV: begin
+              // this time we save the quotient in accum_window_q since we do not need anymore the
+              // remainder
+              accum_window_d = next_quotient;
+              md_state_d     = MD_CHANGE_SIGN;
+            end
+            MD_OP_REM: begin
+              // this time we do not save the quotient anymore since we need only the remainder
+              accum_window_d = {1'b0, next_remainder[31:0]};
+              md_state_d     = MD_CHANGE_SIGN;
+            end
+            default: ;
+          endcase
+        end
+
+        MD_CHANGE_SIGN: begin
+          md_state_d = MD_FINISH;
+          unique case (operator_i)
+            MD_OP_DIV:
+              accum_window_d = div_change_sign ? {1'b0,alu_adder_i} : accum_window_q;
+            MD_OP_REM:
+              accum_window_d = rem_change_sign ? {1'b0,alu_adder_i} : accum_window_q;
+            default: ;
+          endcase
+        end
+
+        MD_FINISH: begin
+          // Note no state transition will occur if multdiv_hold is set
+          md_state_d   = MD_IDLE;
+          multdiv_hold = ~multdiv_ready_id_i;
+        end
+
+        default: begin
+          md_state_d = MD_IDLE;
+        end
+      endcase // md_state_q
+    end // (mult_sel_i || div_sel_i)
+  end
+
+  //////////////////////////////////////////
+  // Multiplier / Divider state registers //
+  //////////////////////////////////////////
+
+  assign multdiv_en = (mult_en_i | div_en_i) & ~multdiv_hold;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      multdiv_count_q  <= 5'h0;
+      op_b_shift_q     <= 33'h0;
+      op_a_shift_q     <= 33'h0;
+      md_state_q       <= MD_IDLE;
+      div_by_zero_q    <= 1'b0;
+    end else if (multdiv_en) begin
+      multdiv_count_q  <= multdiv_count_d;
+      op_b_shift_q     <= op_b_shift_d;
+      op_a_shift_q     <= op_a_shift_d;
+      md_state_q       <= md_state_d;
+      div_by_zero_q    <= div_by_zero_d;
+    end
+  end
+
+  /////////////
+  // Outputs //
+  /////////////
+
+  assign valid_o = (md_state_q == MD_FINISH) |
+                   (md_state_q == MD_LAST &
+                   (operator_i == MD_OP_MULL |
+                    operator_i == MD_OP_MULH));
+
+  assign multdiv_result_o = div_en_i ? accum_window_q[31:0] : res_adder_l[31:0];
+
+  ////////////////
+  // Assertions //
+  ////////////////
+
+  // State must be valid.
+  `ASSERT(IbexMultDivStateValid, md_state_q inside {
+      MD_IDLE, MD_ABS_A, MD_ABS_B, MD_COMP, MD_LAST, MD_CHANGE_SIGN, MD_FINISH
+      }, clk_i, !rst_ni)
+
+`ifdef FORMAL
+  `ifdef YOSYS
+    `include "formal_tb_frag.svh"
+  `endif
+`endif
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_pkg.sv b/hw/ip/cheriot-ibex/rtl/cheriot_pkg.sv
new file mode 100644
index 0000000..d40fd94
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_pkg.sv
@@ -0,0 +1,676 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Copyright 2017 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Package with constants used by Ibex
+ */
+package cheriot_pkg;
+
+  ////////////////
+  // IO Structs //
+  ////////////////
+
+  typedef struct packed {
+    logic [31:0] current_pc;
+    logic [31:0] next_pc;
+    logic [31:0] last_data_addr;
+    logic [31:0] exception_addr;
+  } crash_dump_t;
+
+  typedef struct packed {
+    logic        dummy_instr_id;
+    logic [4:0]  raddr_a;
+    logic [4:0]  waddr_a;
+    logic        we_a;
+    logic [4:0]  raddr_b;
+  } core2rf_t;
+
+  /////////////////////
+  // Parameter Enums //
+  /////////////////////
+
+  typedef enum integer {
+    RegFileFF    = 0,
+    RegFileFPGA  = 1,
+    RegFileLatch = 2
+  } regfile_e;
+
+  typedef enum integer {
+    RV32MNone        = 0,
+    RV32MSlow        = 1,
+    RV32MFast        = 2,
+    RV32MSingleCycle = 3
+  } rv32m_e;
+
+  typedef enum integer {
+    RV32BNone       = 0,
+    RV32BBalanced   = 1,
+    RV32BOTEarlGrey = 2,
+    RV32BFull       = 3
+  } rv32b_e;
+
+  /////////////
+  // Opcodes //
+  /////////////
+
+  typedef enum logic [6:0] {
+    OPCODE_LOAD     = 7'h03,
+    OPCODE_MISC_MEM = 7'h0f,
+    OPCODE_OP_IMM   = 7'h13,
+    OPCODE_AUIPC    = 7'h17,
+    OPCODE_STORE    = 7'h23,
+    OPCODE_OP       = 7'h33,
+    OPCODE_LUI      = 7'h37,
+    OPCODE_BRANCH   = 7'h63,
+    OPCODE_JALR     = 7'h67,
+    OPCODE_JAL      = 7'h6f,
+    OPCODE_SYSTEM   = 7'h73,
+    OPCODE_CHERI    = 7'h5b,
+    OPCODE_AUICGP   = 7'h7b
+  } opcode_e;
+
+
+  ////////////////////
+  // ALU operations //
+  ////////////////////
+
+  typedef enum logic [6:0] {
+    // Arithmetics
+    ALU_ADD,
+    ALU_SUB,
+
+    // Logics
+    ALU_XOR,
+    ALU_OR,
+    ALU_AND,
+    // RV32B
+    ALU_XNOR,
+    ALU_ORN,
+    ALU_ANDN,
+
+    // Shifts
+    ALU_SRA,
+    ALU_SRL,
+    ALU_SLL,
+    // RV32B
+    ALU_SRO,
+    ALU_SLO,
+    ALU_ROR,
+    ALU_ROL,
+    ALU_GREV,
+    ALU_GORC,
+    ALU_SHFL,
+    ALU_UNSHFL,
+    ALU_XPERM_N,
+    ALU_XPERM_B,
+    ALU_XPERM_H,
+
+    // Address Calculations
+    // RV32B
+    ALU_SH1ADD,
+    ALU_SH2ADD,
+    ALU_SH3ADD,
+
+    // Comparisons
+    ALU_LT,
+    ALU_LTU,
+    ALU_GE,
+    ALU_GEU,
+    ALU_EQ,
+    ALU_NE,
+    // RV32B
+    ALU_MIN,
+    ALU_MINU,
+    ALU_MAX,
+    ALU_MAXU,
+
+    // Pack
+    // RV32B
+    ALU_PACK,
+    ALU_PACKU,
+    ALU_PACKH,
+
+    // Sign-Extend
+    // RV32B
+    ALU_SEXTB,
+    ALU_SEXTH,
+
+    // Bitcounting
+    // RV32B
+    ALU_CLZ,
+    ALU_CTZ,
+    ALU_CPOP,
+
+    // Set lower than
+    ALU_SLT,
+    ALU_SLTU,
+
+    // Ternary Bitmanip Operations
+    // RV32B
+    ALU_CMOV,
+    ALU_CMIX,
+    ALU_FSL,
+    ALU_FSR,
+
+    // Single-Bit Operations
+    // RV32B
+    ALU_BSET,
+    ALU_BCLR,
+    ALU_BINV,
+    ALU_BEXT,
+
+    // Bit Compress / Decompress
+    // RV32B
+    ALU_BCOMPRESS,
+    ALU_BDECOMPRESS,
+
+    // Bit Field Place
+    // RV32B
+    ALU_BFP,
+
+    // Carry-less Multiply
+    // RV32B
+    ALU_CLMUL,
+    ALU_CLMULR,
+    ALU_CLMULH,
+
+    // Cyclic Redundancy Check
+    ALU_CRC32_B,
+    ALU_CRC32C_B,
+    ALU_CRC32_H,
+    ALU_CRC32C_H,
+    ALU_CRC32_W,
+    ALU_CRC32C_W
+  } alu_op_e;
+
+  typedef enum logic [1:0] {
+    // Multiplier/divider
+    MD_OP_MULL,
+    MD_OP_MULH,
+    MD_OP_DIV,
+    MD_OP_REM
+  } md_op_e;
+
+
+  //////////////////////////////////
+  // Control and status registers //
+  //////////////////////////////////
+
+  // CSR operations
+  typedef enum logic [1:0] {
+    CSR_OP_READ,
+    CSR_OP_WRITE,
+    CSR_OP_SET,
+    CSR_OP_CLEAR
+  } csr_op_e;
+
+  // Privileged mode
+  typedef enum logic[1:0] {
+    PRIV_LVL_M = 2'b11,
+    PRIV_LVL_H = 2'b10,
+    PRIV_LVL_S = 2'b01,
+    PRIV_LVL_U = 2'b00
+  } priv_lvl_e;
+
+  // Constants for the dcsr.xdebugver fields
+  typedef enum logic[3:0] {
+    XDEBUGVER_NO     = 4'd0, // no external debug support
+    XDEBUGVER_STD    = 4'd4, // external debug according to RISC-V debug spec
+    XDEBUGVER_NONSTD = 4'd15 // debug not conforming to RISC-V debug spec
+  } x_debug_ver_e;
+
+  //////////////
+  // WB stage //
+  //////////////
+
+  // Type of instruction present in writeback stage
+  typedef enum logic[1:0] {
+    WB_INSTR_LOAD,  // Instruction is awaiting load data
+    WB_INSTR_STORE, // Instruction is awaiting store response
+    WB_INSTR_OTHER  // Instruction doesn't fit into above categories
+  } wb_instr_type_e;
+
+  //////////////
+  // ID stage //
+  //////////////
+
+  // Operand a selection
+  typedef enum logic[1:0] {
+    OP_A_REG_A,
+    OP_A_FWD,
+    OP_A_CURRPC,
+    OP_A_IMM
+  } op_a_sel_e;
+
+  // Immediate a selection
+  typedef enum logic {
+    IMM_A_Z,
+    IMM_A_ZERO
+  } imm_a_sel_e;
+
+  // Operand b selection
+  typedef enum logic {
+    OP_B_REG_B,
+    OP_B_IMM
+  } op_b_sel_e;
+
+  // Immediate b selection
+  typedef enum logic [2:0] {
+    IMM_B_I,
+    IMM_B_S,
+    IMM_B_B,
+    IMM_B_U,
+    IMM_B_J,
+    IMM_B_INCR_PC,
+    IMM_B_INCR_ADDR
+  } imm_b_sel_e;
+
+  // Regfile write data selection
+  typedef enum logic {
+    RF_WD_EX,
+    RF_WD_CSR
+  } rf_wd_sel_e;
+
+
+  //////////////
+  // IF stage //
+  //////////////
+
+  // PC mux selection
+  typedef enum logic [2:0] {
+    PC_BOOT,
+    PC_JUMP,
+    PC_EXC,
+    PC_ERET,
+    PC_DRET,
+    PC_BP
+  } pc_sel_e;
+
+  // Exception PC mux selection
+  typedef enum logic [1:0] {
+    EXC_PC_EXC,
+    EXC_PC_IRQ,
+    EXC_PC_DBD,
+    EXC_PC_DBG_EXC // Exception while in debug mode
+  } exc_pc_sel_e;
+
+  // Interrupt requests
+  typedef struct packed {
+    logic        irq_software;
+    logic        irq_timer;
+    logic        irq_external;
+    logic [14:0] irq_fast; // 15 fast interrupts,
+                          // one interrupt is reserved for NMI (not visible through mip/mie)
+  } irqs_t;
+
+  // Exception cause
+  typedef enum logic [5:0] {
+    EXC_CAUSE_IRQ_SOFTWARE_M      = {1'b1, 5'd03},
+    EXC_CAUSE_IRQ_TIMER_M         = {1'b1, 5'd07},
+    EXC_CAUSE_IRQ_EXTERNAL_M      = {1'b1, 5'd11},
+    // EXC_CAUSE_IRQ_FAST_0       = {1'b1, 5'd16},
+    // EXC_CAUSE_IRQ_FAST_14      = {1'b1, 5'd30},
+    EXC_CAUSE_IRQ_NM              = {1'b1, 5'd31}, // == EXC_CAUSE_IRQ_FAST_15
+    EXC_CAUSE_INSN_ADDR_MISA      = {1'b0, 5'd00},
+    EXC_CAUSE_INSTR_ACCESS_FAULT  = {1'b0, 5'd01},
+    EXC_CAUSE_ILLEGAL_INSN        = {1'b0, 5'd02},
+    EXC_CAUSE_BREAKPOINT          = {1'b0, 5'd03},
+    EXC_CAUSE_LOAD_ADDR_MISALIGN  = {1'b0, 5'd04},
+    EXC_CAUSE_LOAD_ACCESS_FAULT   = {1'b0, 5'd05},
+    EXC_CAUSE_STORE_ADDR_MISALIGN = {1'b0, 5'd06},
+    EXC_CAUSE_STORE_ACCESS_FAULT  = {1'b0, 5'd07},
+    EXC_CAUSE_ECALL_UMODE         = {1'b0, 5'd08},
+    EXC_CAUSE_ECALL_MMODE         = {1'b0, 5'd11},
+    EXC_CAUSE_CHERI_FAULT         = {1'b0, 5'd28}
+  } exc_cause_e;
+
+  // Debug cause
+  typedef enum logic [2:0] {
+    DBG_CAUSE_NONE    = 3'h0,
+    DBG_CAUSE_EBREAK  = 3'h1,
+    DBG_CAUSE_TRIGGER = 3'h2,
+    DBG_CAUSE_HALTREQ = 3'h3,
+    DBG_CAUSE_STEP    = 3'h4
+  } dbg_cause_e;
+
+  // ICache constants
+  parameter int unsigned ADDR_W           = 32;
+  parameter int unsigned BUS_SIZE         = 32;
+  parameter int unsigned BUS_BYTES        = BUS_SIZE/8;
+  parameter int unsigned BUS_W            = $clog2(BUS_BYTES);
+  parameter int unsigned IC_SIZE_BYTES    = 4096;
+  parameter int unsigned IC_NUM_WAYS      = 2;
+  parameter int unsigned IC_LINE_SIZE     = 64;
+  parameter int unsigned IC_LINE_BYTES    = IC_LINE_SIZE/8;
+  parameter int unsigned IC_LINE_W        = $clog2(IC_LINE_BYTES);
+  parameter int unsigned IC_NUM_LINES     = IC_SIZE_BYTES / IC_NUM_WAYS / IC_LINE_BYTES;
+  parameter int unsigned IC_LINE_BEATS    = IC_LINE_BYTES / BUS_BYTES;
+  parameter int unsigned IC_LINE_BEATS_W  = $clog2(IC_LINE_BEATS);
+  parameter int unsigned IC_INDEX_W       = $clog2(IC_NUM_LINES);
+  parameter int unsigned IC_INDEX_HI      = IC_INDEX_W + IC_LINE_W - 1;
+  parameter int unsigned IC_TAG_SIZE      = ADDR_W - IC_INDEX_W - IC_LINE_W + 1; // 1 valid bit
+  parameter int unsigned IC_OUTPUT_BEATS  = (BUS_BYTES / 2); // number of halfwords
+  // ICache Scrambling Parameters
+  parameter int unsigned SCRAMBLE_KEY_W   = 128;
+  parameter int unsigned SCRAMBLE_NONCE_W = 64;
+
+  // PMP constants
+  parameter int unsigned PMP_MAX_REGIONS      = 16;
+  parameter int unsigned PMP_CFG_W            = 8;
+
+  // PMP access type
+  parameter int unsigned PMP_I  = 0;
+  parameter int unsigned PMP_I2 = 1;
+  parameter int unsigned PMP_D  = 2;
+
+  typedef enum logic [1:0] {
+    PMP_ACC_EXEC    = 2'b00,
+    PMP_ACC_WRITE   = 2'b01,
+    PMP_ACC_READ    = 2'b10
+  } pmp_req_e;
+
+  // PMP cfg structures
+  typedef enum logic [1:0] {
+    PMP_MODE_OFF   = 2'b00,
+    PMP_MODE_TOR   = 2'b01,
+    PMP_MODE_NA4   = 2'b10,
+    PMP_MODE_NAPOT = 2'b11
+  } pmp_cfg_mode_e;
+
+  typedef struct packed {
+    logic          lock;
+    pmp_cfg_mode_e mode;
+    logic          exec;
+    logic          write;
+    logic          read;
+  } pmp_cfg_t;
+
+  // Machine Security Configuration (ePMP)
+  typedef struct packed {
+    logic rlb;  // Rule Locking Bypass
+    logic mmwp; // Machine Mode Whitelist Policy
+    logic mml;  // Machine Mode Lockdown
+  } pmp_mseccfg_t;
+
+  // CSRs
+  typedef enum logic[11:0] {
+    // Machine information
+    CSR_MVENDORID = 12'hF11,
+    CSR_MARCHID   = 12'hF12,
+    CSR_MIMPID    = 12'hF13,
+    CSR_MHARTID   = 12'hF14,
+
+    // Machine trap setup
+    CSR_MSTATUS   = 12'h300,
+    CSR_MISA      = 12'h301,
+    CSR_MIE       = 12'h304,
+    CSR_MTVEC     = 12'h305,
+    CSR_MCOUNTEREN= 12'h306,
+
+    // Machine trap handling
+    CSR_MSCRATCH  = 12'h340,
+    CSR_MEPC      = 12'h341,
+    CSR_MCAUSE    = 12'h342,
+    CSR_MTVAL     = 12'h343,
+    CSR_MIP       = 12'h344,
+
+    // Physical memory protection
+    CSR_PMPCFG0   = 12'h3A0,
+    CSR_PMPCFG1   = 12'h3A1,
+    CSR_PMPCFG2   = 12'h3A2,
+    CSR_PMPCFG3   = 12'h3A3,
+    CSR_PMPADDR0  = 12'h3B0,
+    CSR_PMPADDR1  = 12'h3B1,
+    CSR_PMPADDR2  = 12'h3B2,
+    CSR_PMPADDR3  = 12'h3B3,
+    CSR_PMPADDR4  = 12'h3B4,
+    CSR_PMPADDR5  = 12'h3B5,
+    CSR_PMPADDR6  = 12'h3B6,
+    CSR_PMPADDR7  = 12'h3B7,
+    CSR_PMPADDR8  = 12'h3B8,
+    CSR_PMPADDR9  = 12'h3B9,
+    CSR_PMPADDR10 = 12'h3BA,
+    CSR_PMPADDR11 = 12'h3BB,
+    CSR_PMPADDR12 = 12'h3BC,
+    CSR_PMPADDR13 = 12'h3BD,
+    CSR_PMPADDR14 = 12'h3BE,
+    CSR_PMPADDR15 = 12'h3BF,
+
+    // ePMP control
+    CSR_MSECCFG   = 12'h747,
+    CSR_MSECCFGH  = 12'h757,
+
+    // Debug trigger
+    CSR_TSELECT   = 12'h7A0,
+    CSR_TDATA1    = 12'h7A1,
+    CSR_TDATA2    = 12'h7A2,
+    CSR_TDATA3    = 12'h7A3,
+    CSR_MCONTEXT  = 12'h7A8,
+    CSR_SCONTEXT  = 12'h7AA,
+
+    // Debug/trace
+    CSR_DCSR      = 12'h7b0,
+    CSR_DPC       = 12'h7b1,
+
+    // Debug
+    CSR_DSCRATCH0 = 12'h7b2, // optional
+    CSR_DSCRATCH1 = 12'h7b3, // optional
+
+    // Machine Counter/Timers
+    CSR_MCOUNTINHIBIT  = 12'h320,
+    CSR_MHPMEVENT3     = 12'h323,
+    CSR_MHPMEVENT4     = 12'h324,
+    CSR_MHPMEVENT5     = 12'h325,
+    CSR_MHPMEVENT6     = 12'h326,
+    CSR_MHPMEVENT7     = 12'h327,
+    CSR_MHPMEVENT8     = 12'h328,
+    CSR_MHPMEVENT9     = 12'h329,
+    CSR_MHPMEVENT10    = 12'h32A,
+    CSR_MHPMEVENT11    = 12'h32B,
+    CSR_MHPMEVENT12    = 12'h32C,
+    CSR_MHPMEVENT13    = 12'h32D,
+    CSR_MHPMEVENT14    = 12'h32E,
+    CSR_MHPMEVENT15    = 12'h32F,
+    CSR_MHPMEVENT16    = 12'h330,
+    CSR_MHPMEVENT17    = 12'h331,
+    CSR_MHPMEVENT18    = 12'h332,
+    CSR_MHPMEVENT19    = 12'h333,
+    CSR_MHPMEVENT20    = 12'h334,
+    CSR_MHPMEVENT21    = 12'h335,
+    CSR_MHPMEVENT22    = 12'h336,
+    CSR_MHPMEVENT23    = 12'h337,
+    CSR_MHPMEVENT24    = 12'h338,
+    CSR_MHPMEVENT25    = 12'h339,
+    CSR_MHPMEVENT26    = 12'h33A,
+    CSR_MHPMEVENT27    = 12'h33B,
+    CSR_MHPMEVENT28    = 12'h33C,
+    CSR_MHPMEVENT29    = 12'h33D,
+    CSR_MHPMEVENT30    = 12'h33E,
+    CSR_MHPMEVENT31    = 12'h33F,
+    CSR_MCYCLE         = 12'hB00,
+    CSR_MINSTRET       = 12'hB02,
+    CSR_MHPMCOUNTER3   = 12'hB03,
+    CSR_MHPMCOUNTER4   = 12'hB04,
+    CSR_MHPMCOUNTER5   = 12'hB05,
+    CSR_MHPMCOUNTER6   = 12'hB06,
+    CSR_MHPMCOUNTER7   = 12'hB07,
+    CSR_MHPMCOUNTER8   = 12'hB08,
+    CSR_MHPMCOUNTER9   = 12'hB09,
+    CSR_MHPMCOUNTER10  = 12'hB0A,
+    CSR_MHPMCOUNTER11  = 12'hB0B,
+    CSR_MHPMCOUNTER12  = 12'hB0C,
+    CSR_MHPMCOUNTER13  = 12'hB0D,
+    CSR_MHPMCOUNTER14  = 12'hB0E,
+    CSR_MHPMCOUNTER15  = 12'hB0F,
+    CSR_MHPMCOUNTER16  = 12'hB10,
+    CSR_MHPMCOUNTER17  = 12'hB11,
+    CSR_MHPMCOUNTER18  = 12'hB12,
+    CSR_MHPMCOUNTER19  = 12'hB13,
+    CSR_MHPMCOUNTER20  = 12'hB14,
+    CSR_MHPMCOUNTER21  = 12'hB15,
+    CSR_MHPMCOUNTER22  = 12'hB16,
+    CSR_MHPMCOUNTER23  = 12'hB17,
+    CSR_MHPMCOUNTER24  = 12'hB18,
+    CSR_MHPMCOUNTER25  = 12'hB19,
+    CSR_MHPMCOUNTER26  = 12'hB1A,
+    CSR_MHPMCOUNTER27  = 12'hB1B,
+    CSR_MHPMCOUNTER28  = 12'hB1C,
+    CSR_MHPMCOUNTER29  = 12'hB1D,
+    CSR_MHPMCOUNTER30  = 12'hB1E,
+    CSR_MHPMCOUNTER31  = 12'hB1F,
+    CSR_MCYCLEH        = 12'hB80,
+    CSR_MINSTRETH      = 12'hB82,
+    CSR_MHPMCOUNTER3H  = 12'hB83,
+    CSR_MHPMCOUNTER4H  = 12'hB84,
+    CSR_MHPMCOUNTER5H  = 12'hB85,
+    CSR_MHPMCOUNTER6H  = 12'hB86,
+    CSR_MHPMCOUNTER7H  = 12'hB87,
+    CSR_MHPMCOUNTER8H  = 12'hB88,
+    CSR_MHPMCOUNTER9H  = 12'hB89,
+    CSR_MHPMCOUNTER10H = 12'hB8A,
+    CSR_MHPMCOUNTER11H = 12'hB8B,
+    CSR_MHPMCOUNTER12H = 12'hB8C,
+    CSR_MHPMCOUNTER13H = 12'hB8D,
+    CSR_MHPMCOUNTER14H = 12'hB8E,
+    CSR_MHPMCOUNTER15H = 12'hB8F,
+    CSR_MHPMCOUNTER16H = 12'hB90,
+    CSR_MHPMCOUNTER17H = 12'hB91,
+    CSR_MHPMCOUNTER18H = 12'hB92,
+    CSR_MHPMCOUNTER19H = 12'hB93,
+    CSR_MHPMCOUNTER20H = 12'hB94,
+    CSR_MHPMCOUNTER21H = 12'hB95,
+    CSR_MHPMCOUNTER22H = 12'hB96,
+    CSR_MHPMCOUNTER23H = 12'hB97,
+    CSR_MHPMCOUNTER24H = 12'hB98,
+    CSR_MHPMCOUNTER25H = 12'hB99,
+    CSR_MHPMCOUNTER26H = 12'hB9A,
+    CSR_MHPMCOUNTER27H = 12'hB9B,
+    CSR_MHPMCOUNTER28H = 12'hB9C,
+    CSR_MHPMCOUNTER29H = 12'hB9D,
+    CSR_MHPMCOUNTER30H = 12'hB9E,
+    CSR_MHPMCOUNTER31H = 12'hB9F,
+    CSR_MSHWM          = 12'hBC1,
+    CSR_MSHWMB         = 12'hBC2,
+    CSR_CDBG_CTRL      = 12'hBC4,
+    CSR_CPUCTRL        = 12'h7C0,
+    CSR_SECURESEED     = 12'h7C1
+  } csr_num_e;
+
+  // CSR pmp-related offsets
+  parameter logic [11:0] CSR_OFF_PMP_CFG  = 12'h3A0; // pmp_cfg  @ 12'h3a0 - 12'h3a3
+  parameter logic [11:0] CSR_OFF_PMP_ADDR = 12'h3B0; // pmp_addr @ 12'h3b0 - 12'h3bf
+
+  // CSR status bits
+  parameter int unsigned CSR_MSTATUS_MIE_BIT      = 3;
+  parameter int unsigned CSR_MSTATUS_MPIE_BIT     = 7;
+  parameter int unsigned CSR_MSTATUS_MPP_BIT_LOW  = 11;
+  parameter int unsigned CSR_MSTATUS_MPP_BIT_HIGH = 12;
+  parameter int unsigned CSR_MSTATUS_MPRV_BIT     = 17;
+  parameter int unsigned CSR_MSTATUS_TW_BIT       = 21;
+
+  // CSR machine ISA
+  parameter logic [1:0] CSR_MISA_MXL = 2'd1; // M-XLEN: XLEN in M-Mode for RV32
+
+  // CSR interrupt pending/enable bits
+  parameter int unsigned CSR_MSIX_BIT      = 3;
+  parameter int unsigned CSR_MTIX_BIT      = 7;
+  parameter int unsigned CSR_MEIX_BIT      = 11;
+  parameter int unsigned CSR_MFIX_BIT_LOW  = 16;
+  parameter int unsigned CSR_MFIX_BIT_HIGH = 30;
+
+  // CSR Machine Security Configuration bits
+  parameter int unsigned CSR_MSECCFG_MML_BIT  = 0;
+  parameter int unsigned CSR_MSECCFG_MMWP_BIT = 1;
+  parameter int unsigned CSR_MSECCFG_RLB_BIT  = 2;
+
+  // Vendor ID
+  // No JEDEC ID has been allocated to lowRISC so the value is 0 to indicate the field is not
+  // implemented
+  localparam logic [31:0] CSR_MVENDORID_VALUE  = 32'b0;
+  localparam logic [31:0] CSR_MVENDORID_CHERI_VALUE  = 32'h255;
+
+  // Architecture ID
+  // Top bit is unset to indicate an open source project. The lower bits are an ID allocated by the
+  // RISC-V Foundation. Note this is allocated specifically to Ibex, should significant changes be
+  // made a different architecture ID should be supplied.
+  localparam logic [31:0] CSR_MARCHID_VALUE = {1'b0, 31'd22};
+  localparam logic [31:0] CSR_MARCHID_CHERI_VALUE = 32'hce1;
+
+
+  // Implementation ID
+  // 0 indicates this field is not implemented. Ibex implementors may wish to indicate an
+  // RTL/netlist version here using their own unique encoding (e.g. 32 bits of the git hash of the
+  // implemented commit).
+  localparam logic [31:0] CSR_MIMPID_VALUE = 32'b0;
+
+  // These LFSR parameters have been generated with
+  // $ opentitan/util/design/gen-lfsr-seed.py --width 32 --seed 2480124384 --prefix ""
+  parameter int LfsrWidth = 32;
+  typedef logic [LfsrWidth-1:0] lfsr_seed_t;
+  typedef logic [LfsrWidth-1:0][$clog2(LfsrWidth)-1:0] lfsr_perm_t;
+  parameter lfsr_seed_t RndCnstLfsrSeedDefault = 32'hac533bf4;
+  parameter lfsr_perm_t RndCnstLfsrPermDefault = {
+    160'h1e35ecba467fd1b12e958152c04fa43878a8daed
+  };
+  parameter logic [SCRAMBLE_KEY_W-1:0]   RndCnstIbexKeyDefault =
+      128'h14e8cecae3040d5e12286bb3cc113298;
+  parameter logic [SCRAMBLE_NONCE_W-1:0] RndCnstIbexNonceDefault =
+      64'hf79780bc735f3843;
+
+  // Fetch enable. Multi-bit signal used for security hardening. For non-secure implementation all
+  // bits other than the bottom bit are ignored.
+  typedef logic [3:0] fetch_enable_t;
+
+  // Note that if adjusting these parameters it is assumed the bottom bit is set for On and unset
+  // for Off. This allows the use of FetchEnableOn/FetchEnableOff to work for both secure and
+  // non-secure Ibex. If this assumption is broken the RTL that uses the fetch_enable signal within
+  // `cheriot_core` may need adjusting.
+  parameter fetch_enable_t FetchEnableOn  = 4'b1001;
+  parameter fetch_enable_t FetchEnableOff = 4'b0110;
+
+  typedef logic [3:0] ibex_mubi_t;
+
+  // Note that if adjusting these parameters it is assumed the bottom bit is set for On and unset
+  // for Off. This allows the use of IbexMuBiOn/IbexMuBiOff to work for both secure and non-secure
+  // Ibex. If this assumption is broken the RTL that uses ibex_mubi_t types such as the fetch_enable
+  // and core_busy signals within `cheriot_core` may need adjusting.
+  parameter ibex_mubi_t IbexMuBiOn  = 4'b0101;
+  parameter ibex_mubi_t IbexMuBiOff = 4'b1010;
+
+  //////////////
+  // ID stage //
+  //////////////
+
+  typedef enum logic [3:0] {
+    RESET,
+    BOOT_SET,
+    WAIT_SLEEP,
+    SLEEP,
+    FIRST_FETCH,
+    DECODE,
+    FLUSH,
+    IRQ_TAKEN,
+    DBG_TAKEN_IF,
+    DBG_TAKEN_ID
+  } ctrl_fsm_e;
+
+  //////////////
+  // LSU      //
+  //////////////
+
+  typedef enum logic [3:0]  {
+    IDLE, WAIT_GNT_MIS, WAIT_RVALID_MIS, WAIT_GNT,
+    WAIT_RVALID_MIS_GNTS_DONE,
+    CTX_WAIT_GNT1, CTX_WAIT_GNT2, CTX_WAIT_RESP
+  } ls_fsm_e;
+
+  typedef enum logic [2:0] {CRX_IDLE, CRX_WAIT_RESP1, CRX_WAIT_RESP2} cap_rx_fsm_t;
+
+
+endpackage
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_pmp.sv b/hw/ip/cheriot-ibex/rtl/cheriot_pmp.sv
new file mode 100644
index 0000000..6363e70
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_pmp.sv
@@ -0,0 +1,184 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+module cheriot_pmp #(
+  // Granularity of NAPOT access,
+  // 0 = No restriction, 1 = 8 byte, 2 = 16 byte, 3 = 32 byte, etc.
+  parameter int unsigned PMPGranularity = 0,
+  // Number of access channels (e.g. i-side + d-side)
+  parameter int unsigned PMPNumChan     = 2,
+  // Number of implemented regions
+  parameter int unsigned PMPNumRegions  = 4
+) (
+  // Clock and Reset
+  input  logic                    clk_i,
+  input  logic                    rst_ni,
+
+  // Interface to CSRs
+  input  cheriot_pkg::pmp_cfg_t      csr_pmp_cfg_i     [PMPNumRegions],
+  input  logic [33:0]             csr_pmp_addr_i    [PMPNumRegions],
+  input  cheriot_pkg::pmp_mseccfg_t  csr_pmp_mseccfg_i,
+
+  input  cheriot_pkg::priv_lvl_e     priv_mode_i    [PMPNumChan],
+  // Access checking channels
+  input  logic [33:0]             pmp_req_addr_i [PMPNumChan],
+  input  cheriot_pkg::pmp_req_e      pmp_req_type_i [PMPNumChan],
+  output logic                    pmp_req_err_o  [PMPNumChan]
+
+);
+
+  import cheriot_pkg::*;
+
+  // Access Checking Signals
+  logic [33:0]                                region_start_addr [PMPNumRegions];
+  logic [33:PMPGranularity+2]                 region_addr_mask  [PMPNumRegions];
+  logic [PMPNumChan-1:0][PMPNumRegions-1:0]   region_match_gt;
+  logic [PMPNumChan-1:0][PMPNumRegions-1:0]   region_match_lt;
+  logic [PMPNumChan-1:0][PMPNumRegions-1:0]   region_match_eq;
+  logic [PMPNumChan-1:0][PMPNumRegions-1:0]   region_match_all;
+  logic [PMPNumChan-1:0][PMPNumRegions-1:0]   region_basic_perm_check;
+  logic [PMPNumChan-1:0][PMPNumRegions-1:0]   region_mml_perm_check;
+  logic [PMPNumChan-1:0]                      access_fault;
+
+
+  // ---------------
+  // Access checking
+  // ---------------
+
+  for (genvar r = 0; r < PMPNumRegions; r++) begin : g_addr_exp
+    // Start address for TOR matching: 0 for entry 0, otherwise the previous entry's address
+    if (r == 0) begin : g_entry0
+      assign region_start_addr[r] = (csr_pmp_cfg_i[r].mode == PMP_MODE_TOR) ? 34'h000000000 :
+                                                                              csr_pmp_addr_i[r];
+    end else begin : g_oth
+      assign region_start_addr[r] = (csr_pmp_cfg_i[r].mode == PMP_MODE_TOR) ? csr_pmp_addr_i[r-1] :
+                                                                              csr_pmp_addr_i[r];
+    end
+    // Address mask for NA matching
+    for (genvar b = PMPGranularity + 2; b < 34; b++) begin : g_bitmask
+      if (b == 2) begin : g_bit0
+        // Always mask bit 2 for NAPOT
+        assign region_addr_mask[r][b] = (csr_pmp_cfg_i[r].mode != PMP_MODE_NAPOT);
+      end else begin : g_others
+        // We will mask this bit if it is within the programmed granule
+        // i.e. addr = yyyy 0111
+        //                  ^
+        //                  | This bit pos is the top of the mask, all lower bits set
+        // thus mask = 1111 0000
+        if (PMPGranularity == 0) begin : g_region_addr_mask_zero_granularity
+          assign region_addr_mask[r][b] = (csr_pmp_cfg_i[r].mode != PMP_MODE_NAPOT) |
+                                          ~&csr_pmp_addr_i[r][b-1:2];
+        end else begin : g_region_addr_mask_other_granularity
+          assign region_addr_mask[r][b] = (csr_pmp_cfg_i[r].mode != PMP_MODE_NAPOT) |
+                                          ~&csr_pmp_addr_i[r][b-1:PMPGranularity+1];
+        end
+      end
+    end
+  end
+
+  for (genvar c = 0; c < PMPNumChan; c++) begin : g_access_check
+    for (genvar r = 0; r < PMPNumRegions; r++) begin : g_regions
+      // Comparators are sized according to granularity
+      assign region_match_eq[c][r] = (pmp_req_addr_i[c][33:PMPGranularity+2] &
+                                      region_addr_mask[r]) ==
+                                     (region_start_addr[r][33:PMPGranularity+2] &
+                                      region_addr_mask[r]);
+      assign region_match_gt[c][r] = pmp_req_addr_i[c][33:PMPGranularity+2] >
+                                     region_start_addr[r][33:PMPGranularity+2];
+      assign region_match_lt[c][r] = pmp_req_addr_i[c][33:PMPGranularity+2] <
+                                     csr_pmp_addr_i[r][33:PMPGranularity+2];
+
+      always_comb begin
+        region_match_all[c][r] = 1'b0;
+        unique case (csr_pmp_cfg_i[r].mode)
+          PMP_MODE_OFF:   region_match_all[c][r] = 1'b0;
+          PMP_MODE_NA4:   region_match_all[c][r] = region_match_eq[c][r];
+          PMP_MODE_NAPOT: region_match_all[c][r] = region_match_eq[c][r];
+          PMP_MODE_TOR: begin
+            region_match_all[c][r] = (region_match_eq[c][r] | region_match_gt[c][r]) &
+                                     region_match_lt[c][r];
+          end
+          default:        region_match_all[c][r] = 1'b0;
+        endcase
+      end
+
+      // Check specific required permissions
+      assign region_basic_perm_check[c][r] =
+          ((pmp_req_type_i[c] == PMP_ACC_EXEC)  & csr_pmp_cfg_i[r].exec) |
+          ((pmp_req_type_i[c] == PMP_ACC_WRITE) & csr_pmp_cfg_i[r].write) |
+          ((pmp_req_type_i[c] == PMP_ACC_READ)  & csr_pmp_cfg_i[r].read);
+
+
+      // Compute permission checks that apply when MSECCFG.MML is set.
+      always_comb begin
+        region_mml_perm_check[c][r] = 1'b0;
+
+        if (!csr_pmp_cfg_i[r].read && csr_pmp_cfg_i[r].write) begin
+          // Special-case shared regions where R = 0, W = 1
+          unique case ({csr_pmp_cfg_i[r].lock, csr_pmp_cfg_i[r].exec})
+            // Read/write in M, read only in S/U
+            2'b00: region_mml_perm_check[c][r] =
+                (pmp_req_type_i[c] == PMP_ACC_READ) |
+                ((pmp_req_type_i[c] == PMP_ACC_WRITE) & (priv_mode_i[c] == PRIV_LVL_M));
+            // Read/write in M/S/U
+            2'b01: region_mml_perm_check[c][r] =
+                (pmp_req_type_i[c] == PMP_ACC_READ) | (pmp_req_type_i[c] == PMP_ACC_WRITE);
+            // Execute only on M/S/U
+            2'b10: region_mml_perm_check[c][r] = (pmp_req_type_i[c] == PMP_ACC_EXEC);
+            // Read/execute in M, execute only on S/U
+            2'b11: region_mml_perm_check[c][r] =
+                (pmp_req_type_i[c] == PMP_ACC_EXEC) |
+                ((pmp_req_type_i[c] == PMP_ACC_READ) & (priv_mode_i[c] == PRIV_LVL_M));
+            default: ;
+          endcase
+        end else begin
+          if (csr_pmp_cfg_i[r].read & csr_pmp_cfg_i[r].write & csr_pmp_cfg_i[r].exec
+              & csr_pmp_cfg_i[r].lock) begin
+            // Special-case shared read only region when R = 1, W = 1, X = 1, L = 1
+            region_mml_perm_check[c][r] = pmp_req_type_i[c] == PMP_ACC_READ;
+          end else begin
+            // Otherwise use basic permission check. Permission is always denied if in S/U mode and
+            // L is set or if in M mode and L is unset.
+            region_mml_perm_check[c][r] =
+              priv_mode_i[c] == PRIV_LVL_M ? csr_pmp_cfg_i[r].lock & region_basic_perm_check[c][r] :
+                                            ~csr_pmp_cfg_i[r].lock & region_basic_perm_check[c][r];
+          end
+        end
+      end
+    end
+
+    // Access fault determination / prioritization
+    always_comb begin
+      // When MSECCFG.MMWP is set default deny always, otherwise allow for M-mode, deny for other
+      // modes
+      access_fault[c] = csr_pmp_mseccfg_i.mmwp | (priv_mode_i[c] != PRIV_LVL_M);
+
+      // PMP entries are statically prioritized, from 0 to N-1
+      // The lowest-numbered PMP entry which matches an address determines accessibility
+      for (int r = PMPNumRegions - 1; r >= 0; r--) begin
+        if (region_match_all[c][r]) begin
+          if (csr_pmp_mseccfg_i.mml) begin
+            // When MSECCFG.MML is set use MML specific permission check
+            access_fault[c] = ~region_mml_perm_check[c][r];
+          end else begin
+            // Otherwise use original PMP behaviour
+            access_fault[c] = (priv_mode_i[c] == PRIV_LVL_M) ?
+                // For M-mode, any region which matches with the L-bit clear, or with sufficient
+                // access permissions will be allowed
+                (csr_pmp_cfg_i[r].lock & ~region_basic_perm_check[c][r]) :
+                // For other modes, the lock bit doesn't matter
+                ~region_basic_perm_check[c][r];
+          end
+        end
+      end
+    end
+
+    assign pmp_req_err_o[c] = access_fault[c];
+  end
+
+  // RLB, rule locking bypass, is only relevant to cheriot_cs_registers which controls writes to the
+  // PMP CSRs. Tie to unused signal here to prevent lint warnings.
+  logic unused_csr_pmp_mseccfg_rlb;
+  assign unused_csr_pmp_mseccfg_rlb = csr_pmp_mseccfg_i.rlb;
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_pmp_reset_default.svh b/hw/ip/cheriot-ibex/rtl/cheriot_pmp_reset_default.svh
new file mode 100644
index 0000000..cda701b
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_pmp_reset_default.svh
@@ -0,0 +1,53 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Default reset values for PMP CSRs. Where the number of regions
+// (PMPNumRegions) is less than 16 the reset values for the higher numbered
+// regions are ignored.
+//
+// See the Ibex Reference Guide (Custom Reset Values under Physical Memory
+// Protection) for more information.
+
+localparam pmp_cfg_t pmp_cfg_rst[16] = '{ // default: every region OFF, unlocked, no R/W/X
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 0
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 1
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 2
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 3
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 4
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 5
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 6
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 7
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 8
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 9
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 10
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 11
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 12
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 13
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}, // region 14
+  '{lock: 1'b0, mode: PMP_MODE_OFF, exec: 1'b0, write: 1'b0, read: 1'b0}  // region 15
+};
+
+// Addresses are given in byte granularity for readability. A minimum of two
+// bits will be stripped off the bottom (PMPGranularity == 0) with more stripped
+// off at coarser granularities.
+localparam [33:0] pmp_addr_rst[16] = '{ // default: every region address is zero
+  34'h0, // region 0
+  34'h0, // region 1
+  34'h0, // region 2
+  34'h0, // region 3
+  34'h0, // region 4
+  34'h0, // region 5
+  34'h0, // region 6
+  34'h0, // region 7
+  34'h0, // region 8
+  34'h0, // region 9
+  34'h0, // region 10
+  34'h0, // region 11
+  34'h0, // region 12
+  34'h0, // region 13
+  34'h0, // region 14
+  34'h0  // region 15
+};
+
+localparam pmp_mseccfg_t pmp_mseccfg_rst = '{rlb : 1'b0, mmwp: 1'b0, mml: 1'b0}; // all clear
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_prefetch_buffer.sv b/hw/ip/cheriot-ibex/rtl/cheriot_prefetch_buffer.sv
new file mode 100644
index 0000000..00de519
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_prefetch_buffer.sv
@@ -0,0 +1,281 @@
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Prefetch buffer for a 32-bit instruction memory interface
+ *
+ * Prefetch Buffer that caches instructions. This cuts overly long critical
+ * paths to the instruction cache.
+ */
+module cheriot_prefetch_buffer #(
+  parameter bit ResetAll        = 1'b0  // 1: address registers are reset too; also given to FIFO
+) (
+  input  logic        clk_i,
+  input  logic        rst_ni,
+
+  input  logic        req_i,                // enables the issue of new fetch requests
+
+  input  logic        branch_i,             // redirect fetch to addr_i
+  input  logic        branch_mispredict_i,  // redirect fetch to mispredict_addr_i
+  input  logic [31:0] mispredict_addr_i,
+  input  logic [31:0] addr_i,
+
+  // Fetched-instruction output side, sourced from the fetch FIFO
+  input  logic        ready_i,
+  output logic        valid_o,
+  output logic [31:0] rdata_o,
+  output logic [31:0] addr_o,
+  output logic        err_o,
+  output logic        err_plus2_o,
+
+  input  logic       cheri_force_uc_i,      // forwarded unmodified to the fetch FIFO
+
+  // goes to instruction memory / instruction cache
+  output logic        instr_req_o,
+  input  logic        instr_gnt_i,
+  output logic [31:0] instr_addr_o,
+  input  logic [31:0] instr_rdata_i,
+  input  logic        instr_err_i,
+  input  logic        instr_rvalid_i,
+
+  // Prefetch Buffer Status
+  output logic        busy_o
+);
+
+  localparam int unsigned NUM_REQS  = 2;  // outstanding-request slots; also given to the FIFO
+
+  logic                valid_new_req, valid_req;
+  logic                valid_req_d, valid_req_q;
+  logic                discard_req_d, discard_req_q;
+  logic [NUM_REQS-1:0] rdata_outstanding_n, rdata_outstanding_s, rdata_outstanding_q;
+  logic [NUM_REQS-1:0] branch_discard_n, branch_discard_s, branch_discard_q;
+  logic [NUM_REQS-1:0] rdata_outstanding_rev;
+
+  logic [31:0]         stored_addr_d, stored_addr_q;
+  logic                stored_addr_en;
+  logic [31:0]         fetch_addr_d, fetch_addr_q;
+  logic                fetch_addr_en;
+  logic [31:0]         instr_addr, instr_addr_w_aligned;
+
+  logic                fifo_valid;
+  logic [31:0]         fifo_addr;
+  logic                fifo_ready;
+  logic                fifo_clear;
+  logic [NUM_REQS-1:0] fifo_busy;
+
+  logic                valid_raw;
+
+  logic                branch_or_mispredict;
+
+  ////////////////////////////
+  // Prefetch buffer status //
+  ////////////////////////////
+
+  assign busy_o = (|rdata_outstanding_q) | instr_req_o;
+
+  assign branch_or_mispredict = branch_i | branch_mispredict_i;
+
+  //////////////////////////////////////////////
+  // Fetch fifo - consumes addresses and data //
+  //////////////////////////////////////////////
+
+  // A branch will invalidate any previously fetched instructions.
+  // Note that the FENCE.I instruction relies on this flushing behaviour on branch. If it is
+  // altered the FENCE.I implementation may require changes.
+  assign fifo_clear = branch_or_mispredict;
+
+  // Reversed version of rdata_outstanding_q which can be overlaid with fifo fill state
+  for (genvar i = 0; i < NUM_REQS; i++) begin : gen_rd_rev
+    assign rdata_outstanding_rev[i] = rdata_outstanding_q[NUM_REQS-1-i];
+  end
+
+  // The fifo is ready to accept a new request if it is not full - including space reserved for
+  // requests already outstanding.
+  // Overlay the fifo fill state with the outstanding requests to see if there is space.
+  assign fifo_ready = ~&(fifo_busy | rdata_outstanding_rev);
+
+  cheriot_fetch_fifo #(
+    .NUM_REQS (NUM_REQS),
+    .ResetAll (ResetAll)
+  ) fifo_i (
+      .clk_i                 ( clk_i             ),
+      .rst_ni                ( rst_ni            ),
+
+      .clear_i               ( fifo_clear        ),
+      .busy_o                ( fifo_busy         ),
+
+      .in_valid_i            ( fifo_valid        ),
+      .in_addr_i             ( fifo_addr         ),
+      .in_rdata_i            ( instr_rdata_i     ),
+      .in_err_i              ( instr_err_i       ),
+      .cheri_force_uc_i      ( cheri_force_uc_i  ),
+
+      .out_valid_o           ( valid_raw         ),
+      .out_ready_i           ( ready_i           ),
+      .out_rdata_o           ( rdata_o           ),
+      .out_addr_o            ( addr_o            ),
+      .out_err_o             ( err_o             ),
+      .out_err_plus2_o       ( err_plus2_o       )
+  );
+
+  //////////////
+  // Requests //
+  //////////////
+
+  // Make a new request any time there is space in the FIFO, and space in the request queue
+  assign valid_new_req = req_i & (fifo_ready | branch_or_mispredict) &
+                         ~rdata_outstanding_q[NUM_REQS-1];
+
+  assign valid_req = valid_req_q | valid_new_req;
+
+  // Hold the request stable for requests that didn't get granted
+  assign valid_req_d = valid_req & ~instr_gnt_i;
+
+  // Record whether an outstanding bus request is cancelled by a branch
+  assign discard_req_d = valid_req_q & (branch_or_mispredict | discard_req_q);
+
+  ////////////////
+  // Fetch addr //
+  ////////////////
+
+  // Two addresses are tracked in the prefetch buffer:
+  // 1. stored_addr_q - This is the address issued on the bus. It stays stable until
+  //                    the request is granted.
+  // 2. fetch_addr_q  - This is our next address to fetch from. It is updated on branches to
+  //                    capture the new address, and then for each new request issued.
+  // A third address is tracked in the fetch FIFO itself:
+  // 3. instr_addr_q  - This is the address at the head of the FIFO, effectively our oldest fetched
+  //                    address. This address is updated on branches, and does its own increment
+  //                    each time the FIFO is popped.
+
+  // 1. stored_addr_q
+
+  // Only update stored_addr_q for new ungranted requests
+  assign stored_addr_en = valid_new_req & ~valid_req_q & ~instr_gnt_i;
+
+  // Store whatever address was issued on the bus
+  assign stored_addr_d = instr_addr;
+
+  // CPU resets with a branch, so no need to reset these addresses
+  if (ResetAll) begin : g_stored_addr_ra
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        stored_addr_q <= '0;
+      end else if (stored_addr_en) begin
+        stored_addr_q <= stored_addr_d;
+      end
+    end
+  end else begin : g_stored_addr_nr
+    always_ff @(posedge clk_i) begin
+      if (stored_addr_en) begin
+        stored_addr_q <= stored_addr_d;
+      end
+    end
+  end
+  // 2. fetch_addr_q
+
+  // Update on a branch or as soon as a request is issued
+  assign fetch_addr_en = branch_or_mispredict | (valid_new_req & ~valid_req_q);
+
+  assign fetch_addr_d = (branch_i            ? addr_i :
+                         branch_mispredict_i ? {mispredict_addr_i[31:2], 2'b00} :
+                                               {fetch_addr_q[31:2], 2'b00}) +
+                        // Current address + 4
+                        {{29{1'b0}},(valid_new_req & ~valid_req_q),2'b00};
+
+  if (ResetAll) begin : g_fetch_addr_ra
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        fetch_addr_q <= '0;
+      end else if (fetch_addr_en) begin
+        fetch_addr_q <= fetch_addr_d;
+      end
+    end
+  end else begin : g_fetch_addr_nr
+    always_ff @(posedge clk_i) begin
+      if (fetch_addr_en) begin
+        fetch_addr_q <= fetch_addr_d;
+      end
+    end
+  end
+
+  // Address mux
+  assign instr_addr = valid_req_q         ? stored_addr_q :
+                      branch_i            ? addr_i :
+                      branch_mispredict_i ? mispredict_addr_i :
+                                            fetch_addr_q;
+
+  assign instr_addr_w_aligned = {instr_addr[31:2], 2'b00};
+
+  ///////////////////////////////
+  // Request outstanding queue //
+  ///////////////////////////////
+
+  for (genvar i = 0; i < NUM_REQS; i++) begin : g_outstanding_reqs
+    // Request 0 (always the oldest outstanding request)
+    if (i == 0) begin : g_req0
+      // A request becomes outstanding once granted, and is cleared once the rvalid is received.
+      // Outstanding requests shift down the queue towards entry 0.
+      assign rdata_outstanding_n[i] = (valid_req & instr_gnt_i) |
+                                      rdata_outstanding_q[i];
+      // If a branch is received at any point while a request is outstanding, it must be tracked
+      // to ensure we discard the data once received
+      assign branch_discard_n[i]    = (valid_req & instr_gnt_i & discard_req_d) |
+                                      (branch_or_mispredict & rdata_outstanding_q[i]) |
+                                      branch_discard_q[i];
+
+    end else begin : g_reqtop
+    // Entries > 0 consider the FIFO fill state to calculate their next state (by checking
+    // whether the previous entry is valid)
+
+      assign rdata_outstanding_n[i] = (valid_req & instr_gnt_i &
+                                       rdata_outstanding_q[i-1]) |
+                                      rdata_outstanding_q[i];
+      assign branch_discard_n[i]    = (valid_req & instr_gnt_i & discard_req_d &
+                                       rdata_outstanding_q[i-1]) |
+                                      (branch_or_mispredict & rdata_outstanding_q[i]) |
+                                      branch_discard_q[i];
+    end
+  end
+
+  // Shift the entries down on each instr_rvalid_i
+  assign rdata_outstanding_s = instr_rvalid_i ? {1'b0,rdata_outstanding_n[NUM_REQS-1:1]} :
+                                                rdata_outstanding_n;
+  assign branch_discard_s    = instr_rvalid_i ? {1'b0,branch_discard_n[NUM_REQS-1:1]} :
+                                                branch_discard_n;
+
+  // Push a new entry to the FIFO once complete (and not cancelled by a branch)
+  assign fifo_valid = instr_rvalid_i & ~branch_discard_q[0];
+  // Address handed to the FIFO: addr_i when branch_i is set, otherwise mispredict_addr_i
+  assign fifo_addr = branch_i ? addr_i : mispredict_addr_i;
+
+  ///////////////
+  // Registers //
+  ///////////////
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      valid_req_q          <= 1'b0;
+      discard_req_q        <= 1'b0;
+      rdata_outstanding_q  <= 'b0;
+      branch_discard_q     <= 'b0;
+    end else begin
+      valid_req_q          <= valid_req_d;
+      discard_req_q        <= discard_req_d;
+      rdata_outstanding_q  <= rdata_outstanding_s;
+      branch_discard_q     <= branch_discard_s;
+    end
+  end
+
+  /////////////
+  // Outputs //
+  /////////////
+
+  assign instr_req_o  = valid_req;
+  assign instr_addr_o = instr_addr_w_aligned;
+  // The FIFO's valid is suppressed while branch_mispredict_i is asserted
+  assign valid_o = valid_raw & ~branch_mispredict_i;
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_register_file_ff.sv b/hw/ip/cheriot-ibex/rtl/cheriot_register_file_ff.sv
new file mode 100644
index 0000000..1da818e
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_register_file_ff.sv
@@ -0,0 +1,103 @@
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * RISC-V register file
+ *
+ * Register file with 31 or 15x 32 bit wide registers. Register 0 is fixed to 0.
+ * This register file is based on flip flops. Use this register file when
+ * targeting FPGA synthesis or Verilator simulation.
+ */
+module cheriot_register_file_ff #(
+  parameter bit                   RV32E             = 0,
+  parameter int unsigned          DataWidth         = 32,
+  parameter bit                   DummyInstructions = 0,
+  parameter logic [DataWidth-1:0] WordZeroVal       = '0
+) (
+  // Clock and Reset
+  input  logic                 clk_i,
+  input  logic                 rst_ni,
+
+  input  logic                 test_en_i,
+  input  logic                 dummy_instr_id_i,
+
+  //Read port R1
+  input  logic [4:0]           raddr_a_i,
+  output logic [DataWidth-1:0] rdata_a_o,
+
+  //Read port R2
+  input  logic [4:0]           raddr_b_i,
+  output logic [DataWidth-1:0] rdata_b_o,
+
+
+  // Write port W1
+  input  logic [4:0]           waddr_a_i,
+  input  logic [DataWidth-1:0] wdata_a_i,
+  input  logic                 we_a_i
+
+);
+
+  localparam int unsigned ADDR_WIDTH = RV32E ? 4 : 5;
+  localparam int unsigned NUM_WORDS  = 2**ADDR_WIDTH;
+
+  logic [NUM_WORDS-1:0][DataWidth-1:0] rf_reg;    // read view of all words, including word 0
+  logic [NUM_WORDS-1:1][DataWidth-1:0] rf_reg_q;  // flop storage for words 1..NUM_WORDS-1
+  logic [NUM_WORDS-1:1]                we_a_dec;  // per-word write enable
+
+  // Storage for words 1..NUM_WORDS-1. Every word gets its own write enable, decoded from
+  // waddr_a_i and qualified by we_a_i, and an asynchronous active-low reset to WordZeroVal.
+  // Word 0 is handled separately below, so neither a decode bit nor a flop is created for it.
+  for (genvar w = 1; w < NUM_WORDS; w++) begin : g_rf_flops
+    assign we_a_dec[w] = we_a_i & (waddr_a_i == 5'(w));
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        rf_reg_q[w] <= WordZeroVal;
+      end else begin
+        if (we_a_dec[w]) begin
+          rf_reg_q[w] <= wdata_a_i;
+        end
+      end
+    end
+  end
+
+  // Word 0. When DummyInstructions is set a real flop backs it: dummy instructions may write it
+  // and read the stored value back, while every other read of word 0 returns WordZeroVal.
+  if (DummyInstructions) begin : g_dummy_r0
+    logic                 we_r0_dummy;
+    logic [DataWidth-1:0] rf_r0_q;
+
+    // Any write issued by a dummy instruction lands in the word-0 flop; waddr_a_i is not decoded
+    // here (it is expected to be 0 for dummy instructions).
+    assign we_r0_dummy = dummy_instr_id_i & we_a_i;
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        rf_r0_q <= WordZeroVal;
+      end else begin
+        if (we_r0_dummy) begin
+          rf_r0_q <= wdata_a_i;
+        end
+      end
+    end
+
+    assign rf_reg[0] = dummy_instr_id_i ? rf_r0_q : WordZeroVal;
+  end else begin : g_normal_r0
+    // dummy_instr_id_i has no effect in this configuration; tie it to an unused signal.
+    logic unused_dummy_instr_id;
+    assign unused_dummy_instr_id = dummy_instr_id_i;
+    assign rf_reg[0] = WordZeroVal;  // word 0 is a constant
+  end
+
+  // Words 1..NUM_WORDS-1 of the read view come straight from the flops
+  assign rf_reg[NUM_WORDS-1:1] = rf_reg_q;
+
+  // Both read ports are combinational lookups into the read view
+  assign rdata_b_o = rf_reg[raddr_b_i];
+  assign rdata_a_o = rf_reg[raddr_a_i];
+
+  logic unused_test_en;  // test_en_i is not used by the flop-based register file
+  assign unused_test_en = test_en_i;
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_register_file_fpga.sv b/hw/ip/cheriot-ibex/rtl/cheriot_register_file_fpga.sv
new file mode 100644
index 0000000..2c00bc6
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_register_file_fpga.sv
@@ -0,0 +1,83 @@
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * RISC-V register file
+ *
+ * Register file with 31 or 15x 32 bit wide registers. Register 0 always reads as WordZeroVal.
+ *
+ * This register file is designed to make FPGA synthesis tools infer RAM primitives. For Xilinx
+ * FPGA architectures, it will produce RAM32M primitives. Other vendors have not yet been tested.
+ */
+module cheriot_register_file_fpga #(
+    parameter bit                   RV32E             = 0,
+    parameter int unsigned          DataWidth         = 32,
+    parameter bit                   DummyInstructions = 0,
+    parameter logic [DataWidth-1:0] WordZeroVal       = '0
+) (
+  // Clock and Reset
+  input  logic                 clk_i,
+  input  logic                 rst_ni,
+
+  input  logic                 test_en_i,
+  input  logic                 dummy_instr_id_i,
+
+  //Read port R1
+  input  logic [          4:0] raddr_a_i,
+  output logic [DataWidth-1:0] rdata_a_o,
+  //Read port R2
+  input  logic [          4:0] raddr_b_i,
+  output logic [DataWidth-1:0] rdata_b_o,
+  // Write port W1
+  input  logic [          4:0] waddr_a_i,
+  input  logic [DataWidth-1:0] wdata_a_i,
+  input  logic                 we_a_i
+);
+
+  localparam int ADDR_WIDTH = RV32E ? 4 : 5;
+  localparam int NUM_WORDS = 2 ** ADDR_WIDTH;
+
+  logic [DataWidth-1:0] mem[NUM_WORDS];
+  logic we; // write enable if writing to any register other than R0
+
+  // async_read a: R0 returns WordZeroVal (not a literal zero), as in the FF and latch variants
+  assign rdata_a_o = (raddr_a_i == '0) ? WordZeroVal : mem[raddr_a_i];
+
+  // async_read b: same R0 handling as port a
+  assign rdata_b_o = (raddr_b_i == '0) ? WordZeroVal : mem[raddr_b_i];
+
+  // we select: writes addressed to R0 are dropped
+  assign we = (waddr_a_i == '0) ? 1'b0 : we_a_i;
+
+  // Note that the SystemVerilog LRM requires variables on the LHS of assignments within
+  // "always_ff" to not be written to by any other process. However, to enable the initialization
+  // of the inferred RAM32M primitives with non-zero values, below "initial" procedure is needed.
+  // Therefore, we use "always" instead of the generally preferred "always_ff" for the synchronous
+  // write procedure.
+  always @(posedge clk_i) begin : sync_write
+    if (we == 1'b1) begin
+      mem[waddr_a_i] <= wdata_a_i;
+    end
+  end : sync_write
+
+  // Make sure we initialize the BRAM with the correct register reset value.
+  initial begin
+    for (int k = 0; k < NUM_WORDS; k++) begin
+      mem[k] = WordZeroVal;
+    end
+  end
+
+  // Reset not used in this register file version
+  logic unused_rst_ni;
+  assign unused_rst_ni = rst_ni;
+
+  // Dummy instruction changes not relevant for FPGA implementation
+  logic unused_dummy_instr;
+  assign unused_dummy_instr = dummy_instr_id_i;
+  // Test enable signal not used in FPGA implementation
+  logic unused_test_en;
+  assign unused_test_en = test_en_i;
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_register_file_latch.sv b/hw/ip/cheriot-ibex/rtl/cheriot_register_file_latch.sv
new file mode 100644
index 0000000..d953b79
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_register_file_latch.sv
@@ -0,0 +1,163 @@
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * RISC-V register file
+ *
+ * Register file with 31 or 15x 32 bit wide registers. Register 0 is fixed to 0.
+ * This register file is based on latches and is thus smaller than the flip-flop
+ * based RF. It requires a target technology-specific clock gating cell. Use this
+ * register file when targeting ASIC synthesis or event-based simulators.
+ */
+module cheriot_register_file_latch #(
+  parameter bit                   RV32E             = 0,
+  parameter int unsigned          DataWidth         = 32,
+  parameter bit                   DummyInstructions = 0,
+  parameter logic [DataWidth-1:0] WordZeroVal       = '0
+) (
+  // Clock and Reset
+  input  logic                 clk_i,
+  input  logic                 rst_ni,
+
+  input  logic                 test_en_i,
+  input  logic                 dummy_instr_id_i,
+
+  //Read port R1
+  input  logic [4:0]           raddr_a_i,
+  output logic [DataWidth-1:0] rdata_a_o,
+
+  //Read port R2
+  input  logic [4:0]           raddr_b_i,
+  output logic [DataWidth-1:0] rdata_b_o,
+
+  // Write port W1
+  input  logic [4:0]           waddr_a_i,
+  input  logic [DataWidth-1:0] wdata_a_i,
+  input  logic                 we_a_i
+
+);
+
+  localparam int unsigned ADDR_WIDTH = RV32E ? 4 : 5;
+  localparam int unsigned NUM_WORDS  = 2**ADDR_WIDTH;
+
+  logic [DataWidth-1:0] mem[NUM_WORDS];  // word 0 is driven separately from words 1..NUM_WORDS-1
+
+  logic [NUM_WORDS-1:1] waddr_onehot_a;  // one-hot write select, already qualified by we_a_i
+
+  logic [NUM_WORDS-1:1] mem_clocks;      // per-word gated clocks used as latch enables
+  logic [DataWidth-1:0] wdata_a_q;       // write data sampled on clk_int
+
+  // internal addresses
+  logic [ADDR_WIDTH-1:0] raddr_a_int, raddr_b_int, waddr_a_int;
+
+  assign raddr_a_int = raddr_a_i[ADDR_WIDTH-1:0];
+  assign raddr_b_int = raddr_b_i[ADDR_WIDTH-1:0];
+  assign waddr_a_int = waddr_a_i[ADDR_WIDTH-1:0];
+
+  logic clk_int;  // clk_i gated by we_a_i (see cg_we_global)
+
+  //////////
+  // READ //
+  //////////
+  assign rdata_a_o = mem[raddr_a_int];
+  assign rdata_b_o = mem[raddr_b_int];
+
+  ///////////
+  // WRITE //
+  ///////////
+  // Global clock gating
+  prim_clock_gating cg_we_global (
+      .clk_i     ( clk_i     ),
+      .en_i      ( we_a_i    ),
+      .test_en_i ( test_en_i ),
+      .clk_o     ( clk_int   )
+  );
+
+  // Sample input data
+  // Use clk_int here, since otherwise we don't want to write anything anyway.
+  always_ff @(posedge clk_int or negedge rst_ni) begin : sample_wdata
+    if (!rst_ni) begin
+      wdata_a_q   <= WordZeroVal;
+    end else begin
+      if (we_a_i) begin
+        wdata_a_q <= wdata_a_i;
+      end
+    end
+  end
+
+  // Write address decoding
+  always_comb begin : wad
+    for (int i = 1; i < NUM_WORDS; i++) begin : wad_word_iter
+      if (we_a_i && (waddr_a_int == 5'(i))) begin
+        waddr_onehot_a[i] = 1'b1;
+      end else begin
+        waddr_onehot_a[i] = 1'b0;
+      end
+    end
+  end
+
+  // Individual clock gating (if integrated clock-gating cells are available)
+  for (genvar x = 1; x < NUM_WORDS; x++) begin : gen_cg_word_iter
+    prim_clock_gating cg_i (
+        .clk_i     ( clk_int           ),
+        .en_i      ( waddr_onehot_a[x] ),
+        .test_en_i ( test_en_i         ),
+        .clk_o     ( mem_clocks[x]     )
+    );
+  end
+
+  // Actual write operation:
+  // Generate the sequential process for the NUM_WORDS words of the memory.
+  // The process is synchronized with the clocks mem_clocks[i], i = 1, ..., NUM_WORDS-1.
+  for (genvar i = 1; i < NUM_WORDS; i++) begin : g_rf_latches
+    always_latch begin
+      if (mem_clocks[i]) begin
+        mem[i] = wdata_a_q;
+      end
+    end
+  end
+
+  // With dummy instructions enabled, R0 behaves as a real register but will always return 0 for
+  // real instructions.
+  if (DummyInstructions) begin : g_dummy_r0
+    logic                 we_r0_dummy;
+    logic                 r0_clock;
+    logic [DataWidth-1:0] mem_r0;
+
+    // Write enable for dummy R0 register (waddr_a_i will always be 0 for dummy instructions)
+    assign we_r0_dummy = we_a_i & dummy_instr_id_i;
+
+    // R0 clock gate
+    prim_clock_gating cg_i (
+        .clk_i     ( clk_int     ),
+        .en_i      ( we_r0_dummy ),
+        .test_en_i ( test_en_i   ),
+        .clk_o     ( r0_clock    )
+    );
+
+    always_latch begin : latch_wdata
+      if (r0_clock) begin
+        mem_r0 = wdata_a_q;
+      end
+    end
+
+    // Output the dummy data for dummy instructions, otherwise R0 reads as zero
+    assign mem[0] = dummy_instr_id_i ? mem_r0 : WordZeroVal;
+
+  end else begin : g_normal_r0
+    logic unused_dummy_instr_id;
+    assign unused_dummy_instr_id = dummy_instr_id_i;
+    // R0 is nil
+    assign mem[0] = WordZeroVal;
+  end
+
+`ifdef VERILATOR
+  initial begin
+    $display("Latch-based register file not supported for Verilator simulation");
+    $fatal;
+  end
+`endif
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_top.sv b/hw/ip/cheriot-ibex/rtl/cheriot_top.sv
new file mode 100644
index 0000000..7dd2663
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_top.sv
@@ -0,0 +1,1191 @@
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+`ifdef RISCV_FORMAL
+  `define RVFI
+`endif
+
+`include "prim_assert.sv"
+
+/**
+ * Top level module of the ibex RISC-V core
+ */
+module cheriot_top import cheriot_pkg::*; import cheri_pkg::*; #(
+  parameter bit          PMPEnable        = 1'b0,
+  parameter int unsigned PMPGranularity   = 0,
+  parameter int unsigned PMPNumRegions    = 4,
+  parameter int unsigned MHPMCounterNum   = 0,
+  parameter int unsigned MHPMCounterWidth = 40,
+  parameter bit          RV32E            = 1'b0,
+  parameter rv32m_e      RV32M            = RV32MFast,
+  parameter rv32b_e      RV32B            = RV32BNone,
+  parameter regfile_e    RegFile          = RegFileFF,
+  parameter bit          BranchTargetALU  = 1'b0,
+  parameter bit          WritebackStage   = 1'b0,
+  parameter bit          ICache           = 1'b0,
+  parameter bit          ICacheECC        = 1'b0,
+  parameter bit          BranchPredictor  = 1'b0,
+  parameter bit          DbgTriggerEn     = 1'b0,
+  parameter int unsigned DbgHwBreakNum    = 1,
+  parameter bit          SecureIbex       = 1'b0,
+  parameter bit          ICacheScramble   = 1'b0,
+  parameter lfsr_seed_t  RndCnstLfsrSeed  = RndCnstLfsrSeedDefault,
+  parameter lfsr_perm_t  RndCnstLfsrPerm  = RndCnstLfsrPermDefault,
+  parameter int unsigned DmHaltAddr       = 32'h1A110800,
+  parameter int unsigned DmExceptionAddr  = 32'h1A110808,
+  // Default seed and nonce for scrambling
+  parameter logic [SCRAMBLE_KEY_W-1:0]   RndCnstIbexKey   = RndCnstIbexKeyDefault,
+  parameter logic [SCRAMBLE_NONCE_W-1:0] RndCnstIbexNonce = RndCnstIbexNonceDefault,
+  // CHERIoT parameters
+  parameter bit          CHERIoTEn        = 1'b1,
+  parameter int unsigned DataWidth        = 33,
+  parameter int unsigned HeapBase         = 32'h2001_0000,
+  parameter int unsigned TSMapBase        = 32'h2002_f000, // 4kB default
+  parameter int unsigned TSMapSize        = 1024,           // 32-bit words
+  parameter bit          MemCapFmt        = 1'b0,
+  parameter bit          CheriPPLBC       = 1'b1,
+  parameter bit          CheriSBND2       = 1'b0,
+  parameter bit          CheriTBRE        = 1'b1,
+  parameter bit          CheriStkZ        = 1'b1,
+  parameter int unsigned MMRegDinW         = 128,
+  parameter int unsigned MMRegDoutW        = 64
+) (
+  // Clock and Reset
+  input  logic                         clk_i,
+  input  logic                         rst_ni,
+
+  input  logic                         test_en_i,     // enable all clock gates for testing
+  input  prim_ram_1p_pkg::ram_1p_cfg_t ram_cfg_i,
+
+  input  logic                         cheri_pmode_i,
+  input  logic                         cheri_tsafe_en_i,
+
+  input  logic [31:0]                  hart_id_i,
+  input  logic [31:0]                  boot_addr_i,
+
+  // Instruction memory interface
+  output logic                         instr_req_o,
+  input  logic                         instr_gnt_i,
+  input  logic                         instr_rvalid_i,
+  output logic [31:0]                  instr_addr_o,
+  input  logic [31:0]                  instr_rdata_i,
+  input  logic [6:0]                   instr_rdata_intg_i,
+  input  logic                         instr_err_i,
+
+  // Data memory interface
+  output logic                         data_req_o,
+  output logic                         data_is_cap_o,
+  input  logic                         data_gnt_i,
+  input  logic                         data_rvalid_i,
+  output logic                         data_we_o,
+  output logic [3:0]                   data_be_o,
+  output logic [31:0]                  data_addr_o,
+  output logic [DataWidth-1:0]         data_wdata_o,
+  output logic [6:0]                   data_wdata_intg_o,
+  input  logic [DataWidth-1:0]         data_rdata_i,
+  input  logic [6:0]                   data_rdata_intg_i,
+  input  logic                         data_err_i,
+
+  // TS map memory interface
+  output logic                         tsmap_cs_o,
+  output logic [15:0]                  tsmap_addr_o,
+  input  logic [31:0]                  tsmap_rdata_i,
+  input  logic [6:0]                   tsmap_rdata_intg_i,
+  input  logic [MMRegDinW-1:0]         mmreg_corein_i,
+  output logic [MMRegDoutW-1:0]        mmreg_coreout_o,
+
+  // Interrupt inputs
+  input  logic                         irq_software_i,
+  input  logic                         irq_timer_i,
+  input  logic                         irq_external_i,
+  input  logic [14:0]                  irq_fast_i,
+  input  logic                         irq_nm_i,       // non-maskable interrupt
+
+  // Scrambling Interface
+  input  logic                         scramble_key_valid_i,
+  input  logic [SCRAMBLE_KEY_W-1:0]    scramble_key_i,
+  input  logic [SCRAMBLE_NONCE_W-1:0]  scramble_nonce_i,
+  output logic                         scramble_req_o,
+
+  // Debug Interface
+  input  logic                         debug_req_i,
+  output crash_dump_t                  crash_dump_o,
+  output logic                         double_fault_seen_o,
+
+  // RISC-V Formal Interface
+  // Does not comply with the coding standards of _i/_o suffixes, but follows
+  // the convention of RISC-V Formal Interface Specification.
+`ifdef RVFI
+  output logic                         rvfi_valid,
+  output logic [63:0]                  rvfi_order,
+  output logic [31:0]                  rvfi_insn,
+  output logic                         rvfi_trap,
+  output logic                         rvfi_halt,
+  output logic                         rvfi_intr,
+  output logic [ 1:0]                  rvfi_mode,
+  output logic [ 1:0]                  rvfi_ixl,
+  output logic [ 4:0]                  rvfi_rs1_addr,
+  output logic [ 4:0]                  rvfi_rs2_addr,
+  output logic [ 4:0]                  rvfi_rs3_addr,
+  output logic [31:0]                  rvfi_rs1_rdata,
+  output logic [31:0]                  rvfi_rs2_rdata,
+  output logic [31:0]                  rvfi_rs3_rdata,
+  output reg_cap_t                     rvfi_rs1_rcap,
+  output reg_cap_t                     rvfi_rs2_rcap,
+  output reg_cap_t                     rvfi_rd_wcap,
+  output logic [ 4:0]                  rvfi_rd_addr,
+  output logic [31:0]                  rvfi_rd_wdata,
+  output logic [31:0]                  rvfi_pc_rdata,
+  output logic [31:0]                  rvfi_pc_wdata,
+  output logic [31:0]                  rvfi_mem_addr,
+  output logic [ 3:0]                  rvfi_mem_rmask,
+  output logic [ 3:0]                  rvfi_mem_wmask,
+  output logic [DataWidth-1:0]         rvfi_mem_rdata,
+  output logic [DataWidth-1:0]         rvfi_mem_wdata,
+  output logic                         rvfi_mem_is_cap,
+  output reg_cap_t                     rvfi_mem_rcap,
+  output reg_cap_t                     rvfi_mem_wcap,
+
+  output logic [31:0]                  rvfi_ext_mip,
+  output logic                         rvfi_ext_nmi,
+  output logic                         rvfi_ext_debug_req,
+  output logic [63:0]                  rvfi_ext_mcycle,
+`endif
+
+  // CPU Control Signals
+  input  fetch_enable_t                fetch_enable_i,
+  output logic                         alert_minor_o,
+  output logic                         alert_major_internal_o,
+  output logic                         alert_major_bus_o,
+  output logic                         core_sleep_o,
+
+  // DFT bypass controls
+  input logic                          scan_rst_ni
+);
+
+  localparam bit          Lockstep          = SecureIbex;  // SecureIbex enables this and the 3 below
+  localparam bit          ResetAll          = Lockstep;
+  localparam bit          DummyInstructions = SecureIbex;
+  localparam bit          RegFileECC        = SecureIbex;
+  localparam int unsigned RegFileDataWidth  = RegFileECC ? 32 + 7 : 32;  // +7 bits/word with ECC
+  // Icache parameters (bus beats and tags are widened when ICacheECC is set)
+  localparam int unsigned BusSizeECC        = ICacheECC ? (BUS_SIZE + 7) : BUS_SIZE;
+  localparam int unsigned LineSizeECC       = BusSizeECC * IC_LINE_BEATS;
+  localparam int unsigned TagSizeECC        = ICacheECC ? (IC_TAG_SIZE + 6) : IC_TAG_SIZE;
+  // Scrambling parameters (both round counts are 0 when ICacheScramble is off)
+  localparam int unsigned NumAddrScrRounds  = ICacheScramble ? 2 : 0;
+  localparam int unsigned NumDiffRounds     = NumAddrScrRounds;
+
+  // Clock signals
+  logic                        clk;  // gated version of clk_i (output of core_clock_gate_i)
+  logic                        core_busy_d, core_busy_q;
+  logic                        clock_en;
+  logic                        irq_pending;
+  // Core <-> Register file signals
+  logic                        dummy_instr_id;
+  logic [4:0]                  rf_raddr_a;
+  logic [4:0]                  rf_raddr_b;
+  logic [4:0]                  rf_waddr_wb;
+  logic                        rf_we_wb;
+  logic [RegFileDataWidth-1:0] rf_wdata_wb_ecc;
+  logic [RegFileDataWidth-1:0] rf_rdata_a_ecc, rf_rdata_a_ecc_buf;
+  logic [RegFileDataWidth-1:0] rf_rdata_b_ecc, rf_rdata_b_ecc_buf;
+  reg_cap_t                    rf_rcap_a, rf_rcap_b;
+  reg_cap_t                    rf_wcap;
+
+  // Core <-> RAMs signals
+  logic [IC_NUM_WAYS-1:0]      ic_tag_req;
+  logic                        ic_tag_write;
+  logic [IC_INDEX_W-1:0]       ic_tag_addr;
+  logic [TagSizeECC-1:0]       ic_tag_wdata;
+  logic [TagSizeECC-1:0]       ic_tag_rdata [IC_NUM_WAYS];
+  logic [IC_NUM_WAYS-1:0]      ic_data_req;
+  logic                        ic_data_write;
+  logic [IC_INDEX_W-1:0]       ic_data_addr;
+  logic [LineSizeECC-1:0]      ic_data_wdata;
+  logic [LineSizeECC-1:0]      ic_data_rdata [IC_NUM_WAYS];
+  // Alert signals
+  logic                        core_alert_major, core_alert_minor;
+  logic                        lockstep_alert_major_internal, lockstep_alert_major_bus;
+  logic                        lockstep_alert_minor;
+  // Scramble signals
+  logic                         icache_inval;
+  logic [SCRAMBLE_KEY_W-1:0]    scramble_key_q;
+  logic [SCRAMBLE_NONCE_W-1:0]  scramble_nonce_q;
+  logic                         scramble_key_valid_d, scramble_key_valid_q;
+  logic                         scramble_req_d, scramble_req_q;
+
+  fetch_enable_t fetch_enable_buf;  // buffered copy of fetch_enable_i that is fed to the core
+
+  logic [31:0]   rf_reg_rdy;      // CHERIoT register file sideband (core <-> cheri_regfile)
+  logic [4:0]    rf_trvk_addr;
+  logic          rf_trvk_en;
+  logic          rf_trvk_clrtag;
+  logic [6:0]    rf_trvk_par;
+  logic [4:0]    rf_trsv_addr;
+  logic          rf_trsv_en;
+  logic [6:0]    rf_trsv_par;
+  logic          rf_alert;        // from cheri_regfile; tied to 0 when CHERIoTEn is 0
+
+  /////////////////////
+  // Main clock gate //
+  /////////////////////
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin  // clocked by ungated clk_i, not by clk
+    if (!rst_ni) begin
+      core_busy_q <= 1'b0;
+    end else begin
+      core_busy_q <= core_busy_d;
+    end
+  end
+
+  assign clock_en     = core_busy_q | debug_req_i | irq_pending | irq_nm_i;  // wake-up sources
+  assign core_sleep_o = ~clock_en;
+
+  prim_clock_gating core_clock_gate_i (
+    .clk_i    (clk_i),
+    .en_i     (clock_en),
+    .test_en_i(test_en_i),
+    .clk_o    (clk)  // gated clock feeding the core and the register file
+  );
+
+  ////////////////////////
+  // Core instantiation //
+  ////////////////////////
+
+  // Buffer security-critical signals (fetch enable, RF read data) so synthesis cannot remove them
+  prim_buf #(.Width($bits(fetch_enable_t))) u_fetch_enable_buf (
+    .in_i (fetch_enable_i),
+    .out_o(fetch_enable_buf)
+  );
+
+  prim_buf #(.Width(RegFileDataWidth)) u_rf_rdata_a_ecc_buf (
+    .in_i (rf_rdata_a_ecc),
+    .out_o(rf_rdata_a_ecc_buf)  // this buffered copy is what the core reads on port A
+  );
+
+  prim_buf #(.Width(RegFileDataWidth)) u_rf_rdata_b_ecc_buf (
+    .in_i (rf_rdata_b_ecc),
+    .out_o(rf_rdata_b_ecc_buf)  // this buffered copy is what the core reads on port B
+  );
+
+  cheriot_core #(
+    .PMPEnable        (PMPEnable),
+    .PMPGranularity   (PMPGranularity),
+    .PMPNumRegions    (PMPNumRegions),
+    .MHPMCounterNum   (MHPMCounterNum),
+    .MHPMCounterWidth (MHPMCounterWidth),
+    .RV32E            (RV32E),
+    .RV32M            (RV32M),
+    .RV32B            (RV32B),
+    .BranchTargetALU  (BranchTargetALU),
+    .ICache           (ICache),
+    .ICacheECC        (ICacheECC),
+    .BusSizeECC       (BusSizeECC),
+    .TagSizeECC       (TagSizeECC),
+    .LineSizeECC      (LineSizeECC),
+    .BranchPredictor  (BranchPredictor),
+    .DbgTriggerEn     (DbgTriggerEn),
+    .DbgHwBreakNum    (DbgHwBreakNum),
+    .WritebackStage   (WritebackStage),
+    .ResetAll         (ResetAll),
+    .RndCnstLfsrSeed  (RndCnstLfsrSeed),
+    .RndCnstLfsrPerm  (RndCnstLfsrPerm),
+    .SecureIbex       (SecureIbex),
+    .DummyInstructions(DummyInstructions),
+    .RegFileECC       (RegFileECC),
+    .RegFileDataWidth (RegFileDataWidth),
+    .DmHaltAddr       (DmHaltAddr),
+    .DmExceptionAddr  (DmExceptionAddr),
+    .CHERIoTEn        (CHERIoTEn),
+    .DataWidth        (DataWidth),
+    .HeapBase         (HeapBase   ),
+    .TSMapBase        (TSMapBase  ),
+    .TSMapSize        (TSMapSize),
+    .MemCapFmt        (MemCapFmt   ),
+    .CheriPPLBC       (CheriPPLBC),
+    .CheriSBND2       (CheriSBND2),
+    .CheriTBRE        (CheriTBRE),
+    .CheriStkZ        (CheriStkZ)
+  ) u_cheriot_core (
+    .clk_i(clk),  // gated core clock, not the free-running clk_i
+    .rst_ni,      // implicit .name connection to the top-level signal of the same name
+
+    .hart_id_i,
+    .boot_addr_i,
+    .cheri_pmode_i,
+    .cheri_tsafe_en_i,
+
+    .instr_req_o,
+    .instr_gnt_i,
+    .instr_rvalid_i,
+    .instr_addr_o,
+    .instr_rdata_i,
+    .instr_err_i,  // NOTE(review): instr_rdata_intg_i is not connected to the core here
+
+    .data_req_o,
+    .data_is_cap_o,
+    .data_gnt_i,
+    .data_rvalid_i,
+    .data_we_o,
+    .data_be_o,
+    .data_addr_o,
+    .data_wdata_o,
+    .data_rdata_i,
+    .data_err_i,  // NOTE(review): data_wdata_intg_o/data_rdata_intg_i are not connected here
+
+    .dummy_instr_id_o (dummy_instr_id),
+    .rf_raddr_a_o     (rf_raddr_a),
+    .rf_raddr_b_o     (rf_raddr_b),
+    .rf_waddr_wb_o    (rf_waddr_wb),
+    .rf_we_wb_o       (rf_we_wb),
+    .rf_wdata_wb_ecc_o(rf_wdata_wb_ecc),
+    .rf_rdata_a_ecc_i (rf_rdata_a_ecc_buf),
+    .rf_rdata_b_ecc_i (rf_rdata_b_ecc_buf),
+    .rf_wcap_wb_o     (rf_wcap),
+    .rf_rcap_a_i      (rf_rcap_a),
+    .rf_rcap_b_i      (rf_rcap_b),
+    .rf_reg_rdy_i     (rf_reg_rdy),
+    .rf_trsv_en_o     (rf_trsv_en),
+    .rf_trsv_addr_o   (rf_trsv_addr),
+    .rf_trsv_par_o    (rf_trsv_par),
+    .rf_trvk_addr_o   (rf_trvk_addr),
+    .rf_trvk_en_o     (rf_trvk_en    ),
+    .rf_trvk_clrtag_o (rf_trvk_clrtag),
+    .rf_trvk_par_o    (rf_trvk_par),
+    .tsmap_cs_o,
+    .tsmap_addr_o,
+    .tsmap_rdata_i,  // NOTE(review): tsmap_rdata_intg_i is not connected to the core here
+    .mmreg_corein_i,
+    .mmreg_coreout_o,
+
+    .ic_tag_req_o      (ic_tag_req),
+    .ic_tag_write_o    (ic_tag_write),
+    .ic_tag_addr_o     (ic_tag_addr),
+    .ic_tag_wdata_o    (ic_tag_wdata),
+    .ic_tag_rdata_i    (ic_tag_rdata),
+    .ic_data_req_o     (ic_data_req),
+    .ic_data_write_o   (ic_data_write),
+    .ic_data_addr_o    (ic_data_addr),
+    .ic_data_wdata_o   (ic_data_wdata),
+    .ic_data_rdata_i   (ic_data_rdata),
+    .ic_scr_key_valid_i(scramble_key_valid_q),
+
+    .irq_software_i,
+    .irq_timer_i,
+    .irq_external_i,
+    .irq_fast_i,
+    .irq_nm_i,
+    .irq_pending_o(irq_pending),  // also used as a wake-up source for clock_en
+
+    .debug_req_i,
+    .crash_dump_o,
+    .double_fault_seen_o,
+
+`ifdef RVFI
+    .rvfi_valid,
+    .rvfi_order,
+    .rvfi_insn,
+    .rvfi_trap,
+    .rvfi_halt,
+    .rvfi_intr,
+    .rvfi_mode,
+    .rvfi_ixl,
+    .rvfi_rs1_addr,
+    .rvfi_rs2_addr,
+    .rvfi_rs3_addr,
+    .rvfi_rs1_rdata,
+    .rvfi_rs1_rcap,
+    .rvfi_rs2_rdata,
+    .rvfi_rs2_rcap,
+    .rvfi_rs3_rdata,
+    .rvfi_rd_addr,
+    .rvfi_rd_wdata,
+    .rvfi_rd_wcap,
+    .rvfi_pc_rdata,
+    .rvfi_pc_wdata,
+    .rvfi_mem_addr,
+    .rvfi_mem_rmask,
+    .rvfi_mem_wmask,
+    .rvfi_mem_rdata,
+    .rvfi_mem_wdata,
+    .rvfi_mem_rcap,
+    .rvfi_mem_wcap,
+    .rvfi_mem_is_cap,
+    .rvfi_ext_mip,
+    .rvfi_ext_nmi,
+    .rvfi_ext_debug_req,
+    .rvfi_ext_mcycle,
+`endif
+
+    .fetch_enable_i(fetch_enable_buf),  // buffered copy of the top-level fetch_enable_i
+    .alert_minor_o (core_alert_minor),
+    .alert_major_o (core_alert_major),
+    .icache_inval_o(icache_inval),
+    .core_busy_o   (core_busy_d)  // registered into core_busy_q, which feeds clock_en
+  );
+
+  /////////////////////////////////
+  // Register file Instantiation //
+  /////////////////////////////////
+  if (!CHERIoTEn) begin : gen_no_rf_alert
+    assign rf_alert = 1'b0;     // only cheri_regfile drives rf_alert; tie it off without CHERIoT
+  end
+
+  if (CHERIoTEn) begin : gen_regfile_cheriot  // takes precedence over the RegFile parameter
+
+    localparam int unsigned NRegs = RV32E? 16 : 32;
+    localparam int unsigned NCaps = 16;
+
+    cheri_regfile #(
+      .NREGS     (NRegs),
+      .NCAPS     (NCaps),
+      .RegFileECC(RegFileECC),
+      .DataWidth (RegFileDataWidth),
+      .CheriPPLBC(CheriPPLBC)
+    ) register_file_i (
+      .clk_i         (clk),
+      .rst_ni        (rst_ni),
+      .par_rst_ni    (rst_ni),  // driven by the same reset as rst_ni
+      .raddr_a_i     (rf_raddr_a),
+      .rdata_a_o     (rf_rdata_a_ecc),
+      .rcap_a_o      (rf_rcap_a),
+      .raddr_b_i     (rf_raddr_b),
+      .rdata_b_o     (rf_rdata_b_ecc),
+      .rcap_b_o      (rf_rcap_b),
+      .waddr_a_i     (rf_waddr_wb),
+      .wdata_a_i     (rf_wdata_wb_ecc),
+      .wcap_a_i      (rf_wcap),
+      .we_a_i        (rf_we_wb),
+      .reg_rdy_o     (rf_reg_rdy),
+      .trvk_addr_i   (rf_trvk_addr),
+      .trvk_en_i     (rf_trvk_en),
+      .trvk_clrtag_i (rf_trvk_clrtag),
+      .trvk_par_i    (rf_trvk_par),
+      .trsv_addr_i   (rf_trsv_addr),
+      .trsv_en_i     (rf_trsv_en),
+      .trsv_par_i    (rf_trsv_par),
+      .alert_o       (rf_alert)
+    );
+
+  end else if (RegFile == RegFileFF) begin : gen_regfile_ff
+    cheriot_register_file_ff #(
+      .RV32E            (RV32E),
+      .DataWidth        (RegFileDataWidth),
+      .DummyInstructions(DummyInstructions),
+      .WordZeroVal      (RegFileDataWidth'(prim_secded_pkg::SecdedInv3932ZeroWord))
+    ) register_file_i (
+      .clk_i (clk),
+      .rst_ni(rst_ni),
+
+      .test_en_i       (test_en_i),
+      .dummy_instr_id_i(dummy_instr_id),
+
+      .raddr_a_i(rf_raddr_a),
+      .rdata_a_o(rf_rdata_a_ecc),
+      .raddr_b_i(rf_raddr_b),
+      .rdata_b_o(rf_rdata_b_ecc),
+      .waddr_a_i(rf_waddr_wb),
+      .wdata_a_i(rf_wdata_wb_ecc),
+      .we_a_i   (rf_we_wb)
+    );
+
+    assign rf_rcap_a  = NULL_REG_CAP;  // this register file has no capability read ports
+    assign rf_rcap_b  = NULL_REG_CAP;
+    assign rf_reg_rdy = {32{1'b1}};    // all 32 bits tied high (presumably: always ready)
+
+  end else if (RegFile == RegFileFPGA) begin : gen_regfile_fpga
+    cheriot_register_file_fpga #(
+      .RV32E            (RV32E),
+      .DataWidth        (RegFileDataWidth),
+      .DummyInstructions(DummyInstructions),
+      .WordZeroVal      (RegFileDataWidth'(prim_secded_pkg::SecdedInv3932ZeroWord))
+    ) register_file_i (
+      .clk_i (clk),
+      .rst_ni(rst_ni),
+
+      .test_en_i       (test_en_i),
+      .dummy_instr_id_i(dummy_instr_id),
+
+      .raddr_a_i(rf_raddr_a),
+      .rdata_a_o(rf_rdata_a_ecc),
+      .raddr_b_i(rf_raddr_b),
+      .rdata_b_o(rf_rdata_b_ecc),
+      .waddr_a_i(rf_waddr_wb),
+      .wdata_a_i(rf_wdata_wb_ecc),
+      .we_a_i   (rf_we_wb)
+    );
+
+    assign rf_rcap_a  = NULL_REG_CAP;  // same tie-offs as in gen_regfile_ff
+    assign rf_rcap_b  = NULL_REG_CAP;
+    assign rf_reg_rdy = {32{1'b1}};
+
+  end else if (RegFile == RegFileLatch) begin : gen_regfile_latch
+    cheriot_register_file_latch #(
+      .RV32E            (RV32E),
+      .DataWidth        (RegFileDataWidth),
+      .DummyInstructions(DummyInstructions),
+      .WordZeroVal      (RegFileDataWidth'(prim_secded_pkg::SecdedInv3932ZeroWord))
+    ) register_file_i (
+      .clk_i (clk),
+      .rst_ni(rst_ni),
+
+      .test_en_i       (test_en_i),
+      .dummy_instr_id_i(dummy_instr_id),
+
+      .raddr_a_i(rf_raddr_a),
+      .rdata_a_o(rf_rdata_a_ecc),
+      .raddr_b_i(rf_raddr_b),
+      .rdata_b_o(rf_rdata_b_ecc),
+      .waddr_a_i(rf_waddr_wb),
+      .wdata_a_i(rf_wdata_wb_ecc),
+      .we_a_i   (rf_we_wb)
+    );
+
+    assign rf_rcap_a  = NULL_REG_CAP;  // same tie-offs as in gen_regfile_ff
+    assign rf_rcap_b  = NULL_REG_CAP;
+    assign rf_reg_rdy = {32{1'b1}};
+
+  end
+
+  ///////////////////////////////
+  // Scrambling Infrastructure //
+  ///////////////////////////////
+
+  if (ICacheScramble) begin : gen_scramble
+
+    // Scramble key valid starts with OTP returning new valid key and stays high
+    // until we request a new valid key.
+    assign scramble_key_valid_d = scramble_req_q ? scramble_key_valid_i :
+                                  icache_inval   ? 1'b0                 :
+                                                   scramble_key_valid_q;
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin  // key/nonce capture, ungated clock
+      if (!rst_ni) begin
+        scramble_key_q       <= RndCnstIbexKey;
+        scramble_nonce_q     <= RndCnstIbexNonce;
+      end else if (scramble_key_valid_i) begin
+        scramble_key_q       <= scramble_key_i;
+        scramble_nonce_q     <= scramble_nonce_i;
+      end
+    end
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin  // key-valid and request state
+      if (!rst_ni) begin
+        scramble_key_valid_q <= 1'b1;  // the reset-default key counts as valid
+        scramble_req_q       <= '0;
+      end else begin
+        scramble_key_valid_q <= scramble_key_valid_d;
+        scramble_req_q       <= scramble_req_d;
+      end
+    end
+
+    // Scramble key request starts with invalidate signal from ICache and stays high
+    // until we got a valid key.
+    assign scramble_req_d = scramble_req_q ? ~scramble_key_valid_i : icache_inval;
+    assign scramble_req_o = scramble_req_q;
+
+  end else begin : gen_noscramble
+    logic unused_scramble_inputs;  // sink for signals that are unused without scrambling
+    assign unused_scramble_inputs = scramble_key_valid_i & (|scramble_key_i) & (|RndCnstIbexKey) &
+                                    (|scramble_nonce_i) & (|RndCnstIbexNonce) & scramble_req_q &
+                                    icache_inval & scramble_key_valid_d & scramble_req_d;
+
+    // No scrambling: never request a key and report the (all-zero) key as permanently valid.
+    assign scramble_req_d       = 1'b0;
+    assign scramble_req_q       = 1'b0;
+    assign scramble_req_o       = 1'b0;
+    assign scramble_key_q       = '0;
+    assign scramble_nonce_q     = '0;
+    assign scramble_key_valid_q = 1'b1;
+    assign scramble_key_valid_d = 1'b1;
+  end
+
+  ////////////////////////
+  // Rams Instantiation //
+  ////////////////////////
+
+  if (ICache) begin : gen_rams
+
+    for (genvar way = 0; way < IC_NUM_WAYS; way++) begin : gen_rams_inner  // one RAM pair per way
+
+      // Tag RAM instantiation
+      prim_ram_1p_scr #(
+        .Width            (TagSizeECC),
+        .Depth            (IC_NUM_LINES),
+        .DataBitsPerMask  (TagSizeECC),
+        .EnableParity     (0),
+        .DiffWidth        (TagSizeECC),
+        .NumAddrScrRounds (NumAddrScrRounds),
+        .NumDiffRounds    (NumDiffRounds)
+      ) tag_bank (
+        .clk_i,
+        .rst_ni,
+
+        .key_valid_i (scramble_key_valid_q),
+        .key_i       (scramble_key_q),
+        .nonce_i     (scramble_nonce_q),
+
+        .req_i       (ic_tag_req[way]),
+
+        .gnt_o       (),
+        .write_i     (ic_tag_write),
+        .addr_i      (ic_tag_addr),
+        .wdata_i     (ic_tag_wdata),
+        .wmask_i     ({TagSizeECC{1'b1}}),  // always write the full word
+        .intg_error_i(1'b0),
+
+        .rdata_o     (ic_tag_rdata[way]),
+        .rvalid_o    (),
+        .raddr_o     (),
+        .rerror_o    (),
+        .cfg_i       (ram_cfg_i)
+      );
+
+      // Data RAM instantiation
+      prim_ram_1p_scr #(
+        .Width              (LineSizeECC),
+        .Depth              (IC_NUM_LINES),
+        .DataBitsPerMask    (LineSizeECC),
+        .ReplicateKeyStream (1),
+        .EnableParity       (0),
+        .DiffWidth          (LineSizeECC),
+        .NumAddrScrRounds   (NumAddrScrRounds),
+        .NumDiffRounds      (NumDiffRounds)
+      ) data_bank (
+        .clk_i,
+        .rst_ni,
+
+        .key_valid_i (scramble_key_valid_q),
+        .key_i       (scramble_key_q),
+        .nonce_i     (scramble_nonce_q),
+
+        .req_i       (ic_data_req[way]),
+
+        .gnt_o       (),
+        .write_i     (ic_data_write),
+        .addr_i      (ic_data_addr),
+        .wdata_i     (ic_data_wdata),
+        .wmask_i     ({LineSizeECC{1'b1}}),  // always write the full line
+        .intg_error_i(1'b0),
+
+        .rdata_o     (ic_data_rdata[way]),
+        .rvalid_o    (),
+        .raddr_o     (),
+        .rerror_o    (),
+        .cfg_i       (ram_cfg_i)
+      );
+    end
+
+  end else begin : gen_norams
+
+    prim_ram_1p_pkg::ram_1p_cfg_t unused_ram_cfg;  // sinks for signals unused without an ICache
+    logic unused_ram_inputs;
+
+    assign unused_ram_cfg    = ram_cfg_i;
+    assign unused_ram_inputs = (|ic_tag_req) & ic_tag_write & (|ic_tag_addr) & (|ic_tag_wdata) &
+                               (|ic_data_req) & ic_data_write & (|ic_data_addr) & (|ic_data_wdata) &
+                               (|scramble_key_q) & (|scramble_nonce_q) & scramble_key_valid_q &
+                               scramble_key_valid_d &
+                               (|NumAddrScrRounds);
+
+    assign ic_tag_rdata      = '{default:'b0};  // no RAMs: read data is constant zero
+    assign ic_data_rdata     = '{default:'b0};
+
+  end
+
+  // Redundant lockstep core implementation
+  if (Lockstep) begin : gen_lockstep
+    // Note: certain synthesis tools like DC are very smart at optimizing away redundant logic.
+    // Hence, we have to insert an optimization barrier at the IOs of the lockstep Ibex.
+    // This is achieved by manually buffering each bit using prim_buf.
+    // Our Xilinx and DC synthesis flows make sure that these buffers cannot be optimized away
+    // using keep attributes (Vivado) and size_only constraints (DC).
+    logic [37:0] rf_wcap_vec, rf_rcap_a_vec, rf_rcap_b_vec;
+
+    localparam int NumBufferBits = $bits({
+      hart_id_i,
+      boot_addr_i,
+      instr_req_o,
+      instr_gnt_i,
+      instr_rvalid_i,
+      instr_addr_o,
+      instr_rdata_i,
+      instr_rdata_intg_i,
+      instr_err_i,
+      data_req_o,
+      data_gnt_i,
+      data_rvalid_i,
+      data_we_o,
+      data_be_o,
+      data_addr_o,
+      data_wdata_o,
+      data_is_cap_o,
+      data_rdata_i,
+      data_rdata_intg_i,
+      data_err_i,
+      dummy_instr_id,
+      rf_raddr_a,
+      rf_raddr_b,
+      rf_waddr_wb,
+      rf_we_wb,
+      rf_wdata_wb_ecc,
+      rf_rdata_a_ecc,
+      rf_rdata_b_ecc,
+      ic_tag_req,
+      ic_tag_write,
+      ic_tag_addr,
+      ic_tag_wdata,
+      ic_data_req,
+      ic_data_write,
+      ic_data_addr,
+      ic_data_wdata,
+      scramble_key_valid_i,
+      irq_software_i,
+      irq_timer_i,
+      irq_external_i,
+      irq_fast_i,
+      irq_nm_i,
+      irq_pending,
+      debug_req_i,
+      crash_dump_o,
+      double_fault_seen_o,
+      fetch_enable_i,
+      icache_inval,
+      core_busy_d,
+      cheri_pmode_i,
+      cheri_tsafe_en_i,
+      rf_wcap_vec,
+      rf_rcap_a_vec,
+      rf_rcap_b_vec,
+      rf_reg_rdy,
+      rf_trsv_en,
+      rf_trsv_addr,
+      rf_trsv_par,
+      rf_trvk_addr,
+      rf_trvk_en,
+      rf_trvk_clrtag,
+      rf_trvk_par,
+      tsmap_cs_o,
+      tsmap_addr_o,
+      tsmap_rdata_i,
+      tsmap_rdata_intg_i,
+      mmreg_corein_i,
+      mmreg_coreout_o
+    });
+
+    logic [NumBufferBits-1:0] buf_in, buf_out;
+
+    logic [31:0]                  hart_id_local;
+    logic [31:0]                  boot_addr_local;
+
+    logic                         instr_req_local;
+    logic                         instr_gnt_local;
+    logic                         instr_rvalid_local;
+    logic [31:0]                  instr_addr_local;
+    logic [31:0]                  instr_rdata_local;
+    logic [6:0]                   instr_rdata_intg_local;
+    logic                         instr_err_local;
+
+    logic                         data_req_local;
+    logic                         data_gnt_local;
+    logic                         data_rvalid_local;
+    logic                         data_we_local;
+    logic [3:0]                   data_be_local;
+    logic [31:0]                  data_addr_local;
+    logic [DataWidth-1:0]         data_wdata_local;
+    logic                         data_is_cap_local;
+    logic [6:0]                   data_wdata_intg_local;
+    logic [DataWidth-1:0]         data_rdata_local;
+    logic [6:0]                   data_rdata_intg_local;
+    logic                         data_err_local;
+
+    logic                         dummy_instr_id_local;
+    logic [4:0]                   rf_raddr_a_local;
+    logic [4:0]                   rf_raddr_b_local;
+    logic [4:0]                   rf_waddr_wb_local;
+    logic                         rf_we_wb_local;
+    logic [RegFileDataWidth-1:0]  rf_wdata_wb_ecc_local;
+    logic [RegFileDataWidth-1:0]  rf_rdata_a_ecc_local;
+    logic [RegFileDataWidth-1:0]  rf_rdata_b_ecc_local;
+
+    logic                         cheri_pmode_local;
+    logic                         cheri_tsafe_en_local;
+    logic [37:0]                  rf_wcap_vec_local;
+    logic [37:0]                  rf_rcap_a_vec_local; 
+    logic [37:0]                  rf_rcap_b_vec_local;
+    logic [31:0]                  rf_reg_rdy_local;
+    logic                         rf_trsv_en_local;
+    logic [4:0]                   rf_trsv_addr_local;
+    logic [6:0]                   rf_trsv_par_local;
+    logic [4:0]                   rf_trvk_addr_local;
+    logic                         rf_trvk_en_local;
+    logic                         rf_trvk_clrtag_local;
+    logic [6:0]                   rf_trvk_par_local;
+    logic                         tsmap_cs_local;
+    logic [15:0]                  tsmap_addr_local;
+    logic [31:0]                  tsmap_rdata_local;
+    logic [6:0]                   tsmap_rdata_intg_local;
+    logic [MMRegDinW-1:0]         mmreg_corein_local;
+    logic [MMRegDoutW-1:0]        mmreg_coreout_local;
+    reg_cap_t                     rf_wcap_local, rf_rcap_a_local, rf_rcap_b_local;
+
+    logic [IC_NUM_WAYS-1:0]       ic_tag_req_local;
+    logic                         ic_tag_write_local;
+    logic [IC_INDEX_W-1:0]        ic_tag_addr_local;
+    logic [TagSizeECC-1:0]        ic_tag_wdata_local;
+    logic [IC_NUM_WAYS-1:0]       ic_data_req_local;
+    logic                         ic_data_write_local;
+    logic [IC_INDEX_W-1:0]        ic_data_addr_local;
+    logic [LineSizeECC-1:0]       ic_data_wdata_local;
+    logic                         scramble_key_valid_local;
+
+    logic                         irq_software_local;
+    logic                         irq_timer_local;
+    logic                         irq_external_local;
+    logic [14:0]                  irq_fast_local;
+    logic                         irq_nm_local;
+    logic                         irq_pending_local;
+
+    logic                         debug_req_local;
+    crash_dump_t                  crash_dump_local;
+    logic                         double_fault_seen_local;
+    fetch_enable_t                fetch_enable_local;
+
+    logic                         icache_inval_local;
+    logic                         core_busy_local;
+
+    assign buf_in = {
+      hart_id_i,
+      boot_addr_i,
+      instr_req_o,
+      instr_gnt_i,
+      instr_rvalid_i,
+      instr_addr_o,
+      instr_rdata_i,
+      instr_rdata_intg_i,
+      instr_err_i,
+      data_req_o,
+      data_gnt_i,
+      data_rvalid_i,
+      data_we_o,
+      data_be_o,
+      data_addr_o,
+      data_wdata_o,
+      data_is_cap_o,
+      data_rdata_i,
+      data_rdata_intg_i,
+      data_err_i,
+      dummy_instr_id,
+      rf_raddr_a,
+      rf_raddr_b,
+      rf_waddr_wb,
+      rf_we_wb,
+      rf_wdata_wb_ecc,
+      rf_rdata_a_ecc,
+      rf_rdata_b_ecc,
+      ic_tag_req,
+      ic_tag_write,
+      ic_tag_addr,
+      ic_tag_wdata,
+      ic_data_req,
+      ic_data_write,
+      ic_data_addr,
+      ic_data_wdata,
+      scramble_key_valid_q,
+      irq_software_i,
+      irq_timer_i,
+      irq_external_i,
+      irq_fast_i,
+      irq_nm_i,
+      irq_pending,
+      debug_req_i,
+      crash_dump_o,
+      double_fault_seen_o,
+      fetch_enable_i,
+      icache_inval,
+      core_busy_d,
+      cheri_pmode_i,
+      cheri_tsafe_en_i,
+      rf_wcap_vec,
+      rf_rcap_a_vec,
+      rf_rcap_b_vec,
+      rf_reg_rdy,
+      rf_trsv_en,
+      rf_trsv_addr,
+      rf_trsv_par,
+      rf_trvk_addr,
+      rf_trvk_en,
+      rf_trvk_clrtag,
+      rf_trvk_par,
+      tsmap_cs_o,
+      tsmap_addr_o,
+      tsmap_rdata_i,
+      tsmap_rdata_intg_i,
+      mmreg_corein_i,
+      mmreg_coreout_o
+    };
+
+    assign {
+      hart_id_local,
+      boot_addr_local,
+      instr_req_local,
+      instr_gnt_local,
+      instr_rvalid_local,
+      instr_addr_local,
+      instr_rdata_local,
+      instr_rdata_intg_local,
+      instr_err_local,
+      data_req_local,
+      data_gnt_local,
+      data_rvalid_local,
+      data_we_local,
+      data_be_local,
+      data_addr_local,
+      data_wdata_local,
+      data_is_cap_local,
+      data_rdata_local,
+      data_rdata_intg_local,
+      data_err_local,
+      dummy_instr_id_local,
+      rf_raddr_a_local,
+      rf_raddr_b_local,
+      rf_waddr_wb_local,
+      rf_we_wb_local,
+      rf_wdata_wb_ecc_local,
+      rf_rdata_a_ecc_local,
+      rf_rdata_b_ecc_local,
+      ic_tag_req_local,
+      ic_tag_write_local,
+      ic_tag_addr_local,
+      ic_tag_wdata_local,
+      ic_data_req_local,
+      ic_data_write_local,
+      ic_data_addr_local,
+      ic_data_wdata_local,
+      scramble_key_valid_local,
+      irq_software_local,
+      irq_timer_local,
+      irq_external_local,
+      irq_fast_local,
+      irq_nm_local,
+      irq_pending_local,
+      debug_req_local,
+      crash_dump_local,
+      double_fault_seen_local,
+      fetch_enable_local,
+      icache_inval_local,
+      core_busy_local,
+      cheri_pmode_local,
+      cheri_tsafe_en_local,
+      rf_wcap_vec_local,
+      rf_rcap_a_vec_local,
+      rf_rcap_b_vec_local,
+      rf_reg_rdy_local,
+      rf_trsv_en_local,
+      rf_trsv_addr_local,
+      rf_trsv_par_local,
+      rf_trvk_addr_local,
+      rf_trvk_en_local,
+      rf_trvk_clrtag_local,
+      rf_trvk_par_local,
+      tsmap_cs_local,
+      tsmap_addr_local,
+      tsmap_rdata_local,
+      tsmap_rdata_intg_local,
+      mmreg_corein_local,
+      mmreg_coreout_local
+    } = buf_out;
+
+    assign rf_wcap_vec     = reg2vec(rf_wcap);
+    assign rf_rcap_a_vec   = reg2vec(rf_rcap_a);
+    assign rf_rcap_b_vec   = reg2vec(rf_rcap_b);
+    assign rf_wcap_local   = vec2reg(rf_wcap_vec_local);
+    assign rf_rcap_a_local = vec2reg(rf_rcap_a_vec_local);
+    assign rf_rcap_b_local = vec2reg(rf_rcap_b_vec_local);
+
+    // Manually buffer all signals driving the inputs of u_cheriot_lockstep.
+    prim_buf #(.Width(NumBufferBits)) u_signals_prim_buf (
+      .in_i(buf_in),
+      .out_o(buf_out)
+    );
+
+    logic [TagSizeECC-1:0]  ic_tag_rdata_local [IC_NUM_WAYS];
+    logic [LineSizeECC-1:0] ic_data_rdata_local [IC_NUM_WAYS];
+    for (genvar k = 0; k < IC_NUM_WAYS; k++) begin : gen_ways
+      prim_buf #(.Width(TagSizeECC)) u_tag_prim_buf (
+        .in_i(ic_tag_rdata[k]),
+        .out_o(ic_tag_rdata_local[k])
+      );
+      prim_buf #(.Width(LineSizeECC)) u_data_prim_buf (
+        .in_i(ic_data_rdata[k]),
+        .out_o(ic_data_rdata_local[k])
+      );
+    end
+
+    logic lockstep_alert_minor_local, lockstep_alert_major_internal_local;
+    logic lockstep_alert_major_bus_local;
+
+    cheriot_lockstep #(
+      .PMPEnable        (PMPEnable),
+      .PMPGranularity   (PMPGranularity),
+      .PMPNumRegions    (PMPNumRegions),
+      .MHPMCounterNum   (MHPMCounterNum),
+      .MHPMCounterWidth (MHPMCounterWidth),
+      .RV32E            (RV32E),
+      .RV32M            (RV32M),
+      .RV32B            (RV32B),
+      .BranchTargetALU  (BranchTargetALU),
+      .ICache           (ICache),
+      .ICacheECC        (ICacheECC),
+      .BusSizeECC       (BusSizeECC),
+      .TagSizeECC       (TagSizeECC),
+      .LineSizeECC      (LineSizeECC),
+      .BranchPredictor  (BranchPredictor),
+      .DbgTriggerEn     (DbgTriggerEn),
+      .DbgHwBreakNum    (DbgHwBreakNum),
+      .WritebackStage   (WritebackStage),
+      .ResetAll         (ResetAll),
+      .RndCnstLfsrSeed  (RndCnstLfsrSeed),
+      .RndCnstLfsrPerm  (RndCnstLfsrPerm),
+      .SecureIbex       (SecureIbex),
+      .DummyInstructions(DummyInstructions),
+      .RegFileECC       (RegFileECC),
+      .RegFileDataWidth (RegFileDataWidth),
+      .DmHaltAddr       (DmHaltAddr),
+      .DmExceptionAddr  (DmExceptionAddr),
+      .CHERIoTEn        (CHERIoTEn),
+      .DataWidth        (DataWidth),
+      .HeapBase         (HeapBase   ),
+      .TSMapBase        (TSMapBase  ),
+      .TSMapSize        (TSMapSize),
+      .MemCapFmt        (MemCapFmt   ),
+      .CheriPPLBC       (CheriPPLBC),
+      .CheriSBND2       (CheriSBND2),
+      .CheriTBRE        (CheriTBRE)
+    ) u_cheriot_lockstep (
+      .clk_i                  (clk),
+      .rst_ni                 (rst_ni),   // should use a different reset tree 
+
+      .hart_id_i              (hart_id_local),
+      .boot_addr_i            (boot_addr_local),
+      .cheri_pmode_i          (cheri_pmode_local),
+      .cheri_tsafe_en_i       (cheri_tsafe_en_local),
+
+      .instr_req_i            (instr_req_local),
+      .instr_gnt_i            (instr_gnt_local),
+      .instr_rvalid_i         (instr_rvalid_local),
+      .instr_addr_i           (instr_addr_local),
+      .instr_rdata_i          (instr_rdata_local),
+      .instr_rdata_intg_i     (instr_rdata_intg_local),
+      .instr_err_i            (instr_err_local),
+
+      .data_req_i             (data_req_local),
+      .data_gnt_i             (data_gnt_local),
+      .data_rvalid_i          (data_rvalid_local),
+      .data_we_i              (data_we_local),
+      .data_be_i              (data_be_local),
+      .data_addr_i            (data_addr_local),
+      .data_wdata_i           (data_wdata_local),
+      .data_is_cap_i          (data_is_cap_local),
+      .data_wdata_intg_o      (data_wdata_intg_local),
+      .data_rdata_i           (data_rdata_local),
+      .data_rdata_intg_i      (data_rdata_intg_local),
+      .data_err_i             (data_err_local),
+
+      .dummy_instr_id_i       (dummy_instr_id_local),
+      .rf_raddr_a_i           (rf_raddr_a_local),
+      .rf_raddr_b_i           (rf_raddr_b_local),
+      .rf_waddr_wb_i          (rf_waddr_wb_local),
+      .rf_we_wb_i             (rf_we_wb_local),
+      .rf_wdata_wb_ecc_i      (rf_wdata_wb_ecc_local),
+      .rf_rdata_a_ecc_i       (rf_rdata_a_ecc_local),
+      .rf_rdata_b_ecc_i       (rf_rdata_b_ecc_local),
+      .rf_wcap_wb_i           (rf_wcap_local     ),
+      .rf_rcap_a_i            (rf_rcap_a_local      ),
+      .rf_rcap_b_i            (rf_rcap_b_local      ),
+      .rf_reg_rdy_i           (rf_reg_rdy_local     ),
+      .rf_trsv_en_i           (rf_trsv_en_local     ),
+      .rf_trsv_addr_i         (rf_trsv_addr_local   ),
+      .rf_trsv_par_i          (rf_trsv_par_local ),
+      .rf_trvk_addr_i         (rf_trvk_addr_local   ),
+      .rf_trvk_en_i           (rf_trvk_en_local     ),
+      .rf_trvk_clrtag_i       (rf_trvk_clrtag_local ),
+      .rf_trvk_par_i          (rf_trvk_par_local ),
+      .tsmap_cs_i             (tsmap_cs_local       ),
+      .tsmap_addr_i           (tsmap_addr_local     ),
+      .tsmap_rdata_i          (tsmap_rdata_local    ),
+      .tsmap_rdata_intg_i     (tsmap_rdata_intg_local),
+      .mmreg_corein_i         (mmreg_corein_local  ),
+      .mmreg_coreout_i        (mmreg_coreout_local      ), 
+
+      .ic_tag_req_i           (ic_tag_req_local),
+      .ic_tag_write_i         (ic_tag_write_local),
+      .ic_tag_addr_i          (ic_tag_addr_local),
+      .ic_tag_wdata_i         (ic_tag_wdata_local),
+      .ic_tag_rdata_i         (ic_tag_rdata_local),
+      .ic_data_req_i          (ic_data_req_local),
+      .ic_data_write_i        (ic_data_write_local),
+      .ic_data_addr_i         (ic_data_addr_local),
+      .ic_data_wdata_i        (ic_data_wdata_local),
+      .ic_data_rdata_i        (ic_data_rdata_local),
+      .ic_scr_key_valid_i     (scramble_key_valid_local),
+
+      .irq_software_i         (irq_software_local),
+      .irq_timer_i            (irq_timer_local),
+      .irq_external_i         (irq_external_local),
+      .irq_fast_i             (irq_fast_local),
+      .irq_nm_i               (irq_nm_local),
+      .irq_pending_i          (irq_pending_local),
+
+      .debug_req_i            (debug_req_local),
+      .crash_dump_i           (crash_dump_local),
+      .double_fault_seen_i    (double_fault_seen_local),
+
+      .fetch_enable_i         (fetch_enable_local),
+      .alert_minor_o          (lockstep_alert_minor_local),
+      .alert_major_internal_o (lockstep_alert_major_internal_local),
+      .alert_major_bus_o      (lockstep_alert_major_bus_local),
+      .icache_inval_i         (icache_inval_local),
+      .core_busy_i            (core_busy_local),
+      .test_en_i              (test_en_i),
+      .scan_rst_ni            (scan_rst_ni)
+    );
+
+    // Manually buffer the output signals.
+    prim_buf #(.Width (7)) u_prim_buf_wdata_intg (
+      .in_i(data_wdata_intg_local),
+      .out_o(data_wdata_intg_o)
+    );
+
+    prim_buf u_prim_buf_alert_minor (
+      .in_i (lockstep_alert_minor_local),
+      .out_o(lockstep_alert_minor)
+    );
+
+    prim_buf u_prim_buf_alert_major_internal (
+      .in_i (lockstep_alert_major_internal_local),
+      .out_o(lockstep_alert_major_internal)
+    );
+
+    prim_buf u_prim_buf_alert_major_bus (
+      .in_i (lockstep_alert_major_bus_local),
+      .out_o(lockstep_alert_major_bus)
+    );
+
+  end else begin : gen_no_lockstep
+    assign lockstep_alert_major_internal = 1'b0;
+    assign lockstep_alert_major_bus      = 1'b0;
+    assign lockstep_alert_minor          = 1'b0;
+    assign data_wdata_intg_o             = 'b0;
+    logic unused_scan, unused_intg;
+    assign unused_scan = scan_rst_ni;
+    assign unused_intg = |{instr_rdata_intg_i, data_rdata_intg_i};
+  end
+
+  assign alert_major_internal_o = core_alert_major | lockstep_alert_major_internal | rf_alert;
+  assign alert_major_bus_o      = lockstep_alert_major_bus;
+  assign alert_minor_o          = core_alert_minor | lockstep_alert_minor;
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_top_tracing.sv b/hw/ip/cheriot-ibex/rtl/cheriot_top_tracing.sv
new file mode 100644
index 0000000..aa74060
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_top_tracing.sv
@@ -0,0 +1,347 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Top level of the CHERIoT Ibex core with tracing: cheriot_top plus a cheriot_tracer on its RVFI.
+ */
+
+module cheriot_top_tracing import cheriot_pkg::*; import cheri_pkg::*; #(
+  parameter bit          PMPEnable        = 1'b0,
+  parameter int unsigned PMPGranularity   = 0,
+  parameter int unsigned PMPNumRegions    = 4,
+  parameter int unsigned MHPMCounterNum   = 0,
+  parameter int unsigned MHPMCounterWidth = 40,
+  parameter bit          RV32E            = 1'b0,
+  parameter rv32m_e      RV32M            = RV32MFast,
+  parameter rv32b_e      RV32B            = RV32BNone,
+  parameter regfile_e    RegFile          = RegFileFF,
+  parameter bit          BranchTargetALU  = 1'b1,
+  parameter bit          WritebackStage   = 1'b1,
+  parameter bit          ICache           = 1'b0,
+  parameter bit          ICacheECC        = 1'b0,
+  parameter bit          BranchPredictor  = 1'b0,
+  parameter bit          DbgTriggerEn     = 1'b0,
+  parameter int unsigned DbgHwBreakNum    = 1,
+  parameter bit          SecureIbex       = 1'b0,
+  parameter bit          ICacheScramble   = 1'b0,
+  parameter lfsr_seed_t  RndCnstLfsrSeed  = RndCnstLfsrSeedDefault,
+  parameter lfsr_perm_t  RndCnstLfsrPerm  = RndCnstLfsrPermDefault,
+  parameter bit          HWTraceEn        = 1'b0,  // NOTE(review): not referenced in this module
+  parameter int unsigned DmHaltAddr       = 32'h1A110800,
+  parameter int unsigned DmExceptionAddr  = 32'h1A110808,
+  parameter bit          CHERIoTEn        = 1'b1,
+  parameter int unsigned DataWidth        = 33,
+  parameter int unsigned HeapBase         = 32'h2001_0000,
+  parameter int unsigned TSMapBase        = 32'h2004_0000, // 4kB default
+  parameter int unsigned TSMapSize        = 1024,
+  parameter bit          MemCapFmt        = 1'b0,
+  parameter bit          CheriPPLBC       = 1'b1,
+  parameter bit          CheriSBND2       = 1'b0,
+  parameter bit          CheriTBRE        = 1'b1,
+  parameter bit          CheriStkZ        = 1'b1,
+  parameter int unsigned MMRegDinW        = 128,   // NOTE(review): not forwarded to u_cheriot_top
+  parameter int unsigned MMRegDoutW       = 64     // NOTE(review): not forwarded to u_cheriot_top
+) (
+  // Clock and Reset
+  input  logic                         clk_i,
+  input  logic                         rst_ni,
+
+  input  logic                         test_en_i,     // enable all clock gates for testing
+  input  logic                         scan_rst_ni,
+  input  prim_ram_1p_pkg::ram_1p_cfg_t ram_cfg_i,
+
+  input  logic                         cheri_pmode_i,
+  input  logic                         cheri_tsafe_en_i,
+  input  logic [31:0]                  hart_id_i,
+  input  logic [31:0]                  boot_addr_i,
+
+  // Instruction memory interface
+  output logic                         instr_req_o,
+  input  logic                         instr_gnt_i,
+  input  logic                         instr_rvalid_i,
+  output logic [31:0]                  instr_addr_o,
+  input  logic [31:0]                  instr_rdata_i,
+  input  logic [6:0]                   instr_rdata_intg_i,
+  input  logic                         instr_err_i,
+
+  // Data memory interface
+  output logic                         data_req_o,
+  output logic                         data_is_cap_o,
+  input  logic                         data_gnt_i,
+  input  logic                         data_rvalid_i,
+  output logic                         data_we_o,
+  output logic [3:0]                   data_be_o,
+  output logic [31:0]                  data_addr_o,
+  output logic [DataWidth-1:0]         data_wdata_o,
+  output logic [6:0]                   data_wdata_intg_o,
+  input  logic [DataWidth-1:0]         data_rdata_i,
+  input  logic [6:0]                   data_rdata_intg_i,
+  input  logic                         data_err_i,
+
+  // TS map memory interface
+  output logic                         tsmap_cs_o,
+  output logic [15:0]                  tsmap_addr_o,
+  input  logic [31:0]                  tsmap_rdata_i,
+  input  logic [6:0]                   tsmap_rdata_intg_i,
+  input  logic [MMRegDinW-1:0]         mmreg_corein_i,
+  output logic [MMRegDoutW-1:0]        mmreg_coreout_o,
+
+  // Interrupt inputs
+  input  logic                         irq_software_i,
+  input  logic                         irq_timer_i,
+  input  logic                         irq_external_i,
+  input  logic [14:0]                  irq_fast_i,
+  input  logic                         irq_nm_i,       // non-maskeable interrupt
+
+  // Scrambling Interface
+  input  logic                         scramble_key_valid_i,
+  input  logic [SCRAMBLE_KEY_W-1:0]    scramble_key_i,
+  input  logic [SCRAMBLE_NONCE_W-1:0]  scramble_nonce_i,
+  output logic                         scramble_req_o,
+
+  // Debug Interface
+  input  logic                         debug_req_i,
+  output crash_dump_t                  crash_dump_o,
+  output logic                         double_fault_seen_o,
+
+  // CPU Control Signals
+  input  fetch_enable_t                fetch_enable_i,
+  output logic                         core_sleep_o
+);
+
+  // cheriot_tracer relies on the signals from the RISC-V Formal Interface
+  `ifndef RVFI
+    $fatal(1, "Fatal error: RVFI needs to be defined globally.");
+  `endif
+
+  logic        rvfi_valid;
+  logic [63:0] rvfi_order;
+  logic [31:0] rvfi_insn;
+  logic        rvfi_trap;
+  logic        rvfi_halt;
+  logic        rvfi_intr;
+  logic [ 1:0] rvfi_mode;
+  logic [ 1:0] rvfi_ixl;
+  logic [ 4:0] rvfi_rs1_addr;
+  logic [ 4:0] rvfi_rs2_addr;
+  logic [ 4:0] rvfi_rs3_addr;
+  logic [31:0] rvfi_rs1_rdata;
+  reg_cap_t    rvfi_rs1_rcap;
+  reg_cap_t    rvfi_rs2_rcap;
+  logic [31:0] rvfi_rs2_rdata;
+  logic [31:0] rvfi_rs3_rdata;
+  logic [ 4:0] rvfi_rd_addr;
+  logic [31:0] rvfi_rd_wdata;
+  reg_cap_t    rvfi_rd_wcap;
+  logic [31:0] rvfi_pc_rdata;
+  logic [31:0] rvfi_pc_wdata;
+  logic [31:0] rvfi_mem_addr;
+  logic [ 3:0] rvfi_mem_rmask;
+  logic [ 3:0] rvfi_mem_wmask;
+  logic [DataWidth-1:0] rvfi_mem_rdata;
+  logic [DataWidth-1:0] rvfi_mem_wdata;
+  logic        rvfi_mem_is_cap;
+  reg_cap_t     rvfi_mem_rcap;
+  reg_cap_t     rvfi_mem_wcap;
+  logic [31:0] rvfi_mem2_addr;   // NOTE(review): rvfi_mem2_* are declared but not referenced below
+  logic        rvfi_mem2_we;
+  logic [65:0] rvfi_mem2_rdata;
+  logic [65:0] rvfi_mem2_wdata;
+  logic [31:0] rvfi_ext_mip;
+  logic        rvfi_ext_nmi;
+  logic        rvfi_ext_debug_req;
+  logic [63:0] rvfi_ext_mcycle;
+
+  logic [31:0] unused_rvfi_ext_mip;
+  logic        unused_rvfi_ext_nmi;
+  logic        unused_rvfi_ext_debug_req;
+  logic [63:0] unused_rvfi_ext_mcycle;
+
+
+  // Tracer doesn't use these signals, though other modules may probe down into tracer to observe
+  // them.
+  assign unused_rvfi_ext_mip = rvfi_ext_mip;
+  assign unused_rvfi_ext_nmi = rvfi_ext_nmi;
+  assign unused_rvfi_ext_debug_req = rvfi_ext_debug_req;
+  assign unused_rvfi_ext_mcycle = rvfi_ext_mcycle;
+
+  cheriot_top #(
+    .PMPEnable        ( PMPEnable        ),
+    .PMPGranularity   ( PMPGranularity   ),
+    .PMPNumRegions    ( PMPNumRegions    ),
+    .MHPMCounterNum   ( MHPMCounterNum   ),
+    .MHPMCounterWidth ( MHPMCounterWidth ),
+    .RV32E            ( RV32E            ),
+    .RV32M            ( RV32M            ),
+    .RV32B            ( RV32B            ),
+    .RegFile          ( RegFile          ),
+    .BranchTargetALU  ( BranchTargetALU  ),
+    .ICache           ( ICache           ),
+    .ICacheECC        ( ICacheECC        ),
+    .BranchPredictor  ( BranchPredictor  ),
+    .DbgTriggerEn     ( DbgTriggerEn     ),
+    .DbgHwBreakNum    ( DbgHwBreakNum    ),
+    .WritebackStage   ( WritebackStage   ),
+    .SecureIbex       ( SecureIbex       ),
+    .ICacheScramble   ( ICacheScramble   ),
+    .RndCnstLfsrSeed  ( RndCnstLfsrSeed  ),
+    .RndCnstLfsrPerm  ( RndCnstLfsrPerm  ),
+    .DmHaltAddr       (DmHaltAddr       ),
+    .DmExceptionAddr  (DmExceptionAddr  ),
+    .CHERIoTEn        (CHERIoTEn),
+    .DataWidth        (DataWidth),
+    .HeapBase         (HeapBase   ),
+    .TSMapBase        (TSMapBase  ),
+    .TSMapSize        (TSMapSize),
+    .MemCapFmt        (MemCapFmt   ),
+    .CheriPPLBC       (CheriPPLBC),
+    .CheriSBND2       (CheriSBND2),
+    .CheriTBRE        (CheriTBRE),
+    .CheriStkZ        (CheriStkZ)
+  ) u_cheriot_top (
+    .clk_i,
+    .rst_ni,
+
+    .test_en_i,
+    .scan_rst_ni,
+    .ram_cfg_i,
+
+    .cheri_pmode_i,
+    .cheri_tsafe_en_i,
+    .hart_id_i,
+    .boot_addr_i,
+
+    .instr_req_o,
+    .instr_gnt_i,
+    .instr_rvalid_i,
+    .instr_addr_o,
+    .instr_rdata_i,
+    .instr_rdata_intg_i,
+    .instr_err_i,
+
+    .data_req_o,
+    .data_is_cap_o,
+    .data_gnt_i,
+    .data_rvalid_i,
+    .data_we_o,
+    .data_be_o,
+    .data_addr_o,
+    .data_wdata_o,
+    .data_wdata_intg_o,
+    .data_rdata_i,
+    .data_rdata_intg_i,
+    .data_err_i,
+
+    .tsmap_cs_o,
+    .tsmap_addr_o,
+    .tsmap_rdata_i,
+    .tsmap_rdata_intg_i,
+    .mmreg_corein_i,
+    .mmreg_coreout_o,
+
+    .irq_software_i,
+    .irq_timer_i,
+    .irq_external_i,
+    .irq_fast_i,
+    .irq_nm_i,
+
+    .scramble_key_valid_i,
+    .scramble_key_i,
+    .scramble_nonce_i,
+    .scramble_req_o,
+
+    .debug_req_i,
+    .crash_dump_o,
+    .double_fault_seen_o,
+
+`ifdef RVFI
+    .rvfi_valid,
+    .rvfi_order,
+    .rvfi_insn,
+    .rvfi_trap,
+    .rvfi_halt,
+    .rvfi_intr,
+    .rvfi_mode,
+    .rvfi_ixl,
+    .rvfi_rs1_addr,
+    .rvfi_rs2_addr,
+    .rvfi_rs3_addr,
+    .rvfi_rs1_rdata,
+    .rvfi_rs1_rcap,
+    .rvfi_rs2_rdata,
+    .rvfi_rs2_rcap,
+    .rvfi_rs3_rdata,
+    .rvfi_rd_addr,
+    .rvfi_rd_wdata,
+    .rvfi_rd_wcap,
+    .rvfi_pc_rdata,
+    .rvfi_pc_wdata,
+    .rvfi_mem_addr,
+    .rvfi_mem_rmask,
+    .rvfi_mem_wmask,
+    .rvfi_mem_rdata,
+    .rvfi_mem_wdata,
+    .rvfi_mem_rcap,
+    .rvfi_mem_wcap,
+    .rvfi_mem_is_cap,
+    .rvfi_ext_mip,
+    .rvfi_ext_nmi,
+    .rvfi_ext_debug_req,
+    .rvfi_ext_mcycle,
+`endif
+    .fetch_enable_i,
+    .core_sleep_o,
+    .alert_major_bus_o(),
+    .alert_major_internal_o(),
+    .alert_minor_o()
+  );
+
+`ifdef RVFI
+  cheriot_tracer #(
+    .DataWidth        (DataWidth)
+  ) u_cheriot_tracer (
+    .clk_i,
+    .rst_ni,
+
+    .cheri_pmode_i,
+    .cheri_tsafe_en_i,
+    .hart_id_i,
+
+    .rvfi_valid,
+    .rvfi_order,
+    .rvfi_insn,
+    .rvfi_trap,
+    .rvfi_halt,
+    .rvfi_intr,
+    .rvfi_mode,
+    .rvfi_ixl,
+    .rvfi_rs1_addr,
+    .rvfi_rs2_addr,
+    .rvfi_rs3_addr,
+    .rvfi_rs1_rdata,
+    .rvfi_rs2_rdata,
+    .rvfi_rs3_rdata,
+    .rvfi_rs1_rcap,
+    .rvfi_rs2_rcap,
+    .rvfi_rd_wcap,
+    .rvfi_rd_addr,
+    .rvfi_rd_wdata,
+    .rvfi_pc_rdata,
+    .rvfi_pc_wdata,
+    .rvfi_mem_addr,
+    .rvfi_mem_rmask,
+    .rvfi_mem_wmask,
+    .rvfi_mem_rdata,
+    .rvfi_mem_wdata,
+    .rvfi_mem_rcap,
+    .rvfi_mem_wcap,
+    .rvfi_mem_is_cap
+  );
+`endif
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_tracer.sv b/hw/ip/cheriot-ibex/rtl/cheriot_tracer.sv
new file mode 100644
index 0000000..2f08ba9
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_tracer.sv
@@ -0,0 +1,1410 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Trace executed instructions in simulation
+ *
+ * This tracer takes execution information from the RISC-V Formal Interface (RVFI) and
+ * produces a text file with a human-readable trace.
+ *
+ * All traced instructions are written to a log file. By default, the log file is named
+ * trace_core_<HARTID>.log, with <HARTID> being the 8 digit hart ID of the core being traced.
+ *
+ * The file name base, defaulting to "trace_core" can be set using the "cheriot_tracer_file_base"
+ * plusarg passed to the simulation, e.g. "+cheriot_tracer_file_base=ibex_my_trace". The exact syntax
+ * of passing plusargs to a simulation depends on the simulator.
+ *
+ * The creation of the instruction trace is enabled by default but can be disabled for a simulation.
+ * This behaviour is controlled by the plusarg "cheriot_tracer_enable". Use "cheriot_tracer_enable=0" to
+ * disable the tracer.
+ *
+ * The trace contains six columns, separated by tabs:
+ * - The simulation time
+ * - The clock cycle count since reset
+ * - The program counter (PC)
+ * - The instruction
+ * - The decoded instruction in the same format as objdump, together with the accessed registers and
+ *   read/written memory values. Jumps and branches show the target address.
+ *   This column may be omitted if the instruction does not decode into a long form.
+ * - Accessed registers and memory locations.
+ *
+ * Significant effort is spent to make the decoding produced by this tracer as similar as possible
+ * to the one produced by objdump. This simplifies the correlation between the static program
+ * information from the objdump-generated disassembly, and the runtime information from this tracer.
+ */
+
+module cheriot_tracer import cheri_pkg::*; # (
+  parameter int unsigned DataWidth = 32,
+  parameter bit CheriCapIT8        = 1'b0
+) (
+  input logic        clk_i,
+  input logic        rst_ni,
+
+  input logic        cheri_pmode_i,
+  input logic        cheri_tsafe_en_i,
+  input logic [31:0] hart_id_i,
+
+  // RVFI as described at https://github.com/SymbioticEDA/riscv-formal/blob/master/docs/rvfi.md
+  // The standard interface does not have _i/_o suffixes. For consistency with the standard the
+  // signals in this module don't have the suffixes either.
+  input logic        rvfi_valid,
+  input logic [63:0] rvfi_order,
+  input logic [31:0] rvfi_insn,
+  input logic        rvfi_trap,
+  input logic        rvfi_halt,
+  input logic        rvfi_intr,
+  input logic [ 1:0] rvfi_mode,
+  input logic [ 1:0] rvfi_ixl,
+  input logic [ 4:0] rvfi_rs1_addr,
+  input logic [ 4:0] rvfi_rs2_addr,
+  input logic [ 4:0] rvfi_rs3_addr,
+  input logic [31:0] rvfi_rs1_rdata,
+  input reg_cap_t    rvfi_rs1_rcap,
+  input logic [31:0] rvfi_rs2_rdata,
+  input reg_cap_t    rvfi_rs2_rcap,
+  input logic [31:0] rvfi_rs3_rdata,
+  input logic [ 4:0] rvfi_rd_addr,
+  input logic [31:0] rvfi_rd_wdata,
+  input reg_cap_t    rvfi_rd_wcap,
+  input logic [31:0] rvfi_pc_rdata,
+  input logic [31:0] rvfi_pc_wdata,
+  input logic [31:0] rvfi_mem_addr,
+  input logic [ 3:0] rvfi_mem_rmask,
+  input logic [ 3:0] rvfi_mem_wmask,
+  input logic [DataWidth-1:0] rvfi_mem_rdata,
+  input logic [DataWidth-1:0] rvfi_mem_wdata,
+  input logic        rvfi_mem_is_cap,
+  input reg_cap_t    rvfi_mem_rcap,
+  input reg_cap_t    rvfi_mem_wcap
+);
+
+// synthesis translate_off
+
+  // These signals are part of RVFI. Most are not used in this module currently (rvfi_trap and
+  // rvfi_intr are read by printbuffer_dumpline()). Assigning them to unused_* signals marks them
+  // explicitly as unused for the benefit of linters, including Verilator lint, and keeping them
+  // on the interface makes it easier to change the tracer in the future.
+  logic [63:0] unused_rvfi_order = rvfi_order;
+  logic        unused_rvfi_trap = rvfi_trap;
+  logic        unused_rvfi_halt = rvfi_halt;
+  logic        unused_rvfi_intr = rvfi_intr;
+  logic [ 1:0] unused_rvfi_mode = rvfi_mode;
+  logic [ 1:0] unused_rvfi_ixl = rvfi_ixl;
+
+  import cheriot_tracer_pkg::*;
+
+  int          file_handle;
+  string       file_name;
+
+  int unsigned cycle;
+  string       decoded_str;
+  logic        insn_is_compressed;
+  logic        rvfi_mem_wdata_bit32;
+
+  // Data items accessed during this instruction: one bit each, see printbuffer_dumpline()
+  localparam logic [9:0] RS1 = (1 << 0);   // print rs1 register data
+  localparam logic [9:0] RS2 = (1 << 1);   // print rs2 register data
+  localparam logic [9:0] RS3 = (1 << 2);   // print rs3 register data
+  localparam logic [9:0] RD  = (1 << 3);   // print rd write data
+  localparam logic [9:0] MEM = (1 << 4);   // print memory address and store/load data
+  localparam logic [9:0] CS1 = (1 << 5);   // print rs1 data plus its capability bits
+  localparam logic [9:0] CS2 = (1 << 6);   // print rs2 data plus its capability bits
+  localparam logic [9:0] CD  = (1 << 7);   // print rd write data plus its capability bits
+  localparam logic [9:0] MEMC = (1 << 8);  // print memory address, data and capability bits
+  localparam logic [9:0] MEM2 = (1 << 9);  // NOTE(review): not tested in printbuffer_dumpline()
+  logic [9:0] data_accessed;               // bit mask built from the flags above
+
+  // Defaults to 1; overridden by the "cheriot_tracer_enable" plusarg when that is supplied.
+  logic trace_log_enable;
+  initial begin
+    if (!$value$plusargs("cheriot_tracer_enable=%b", trace_log_enable)) begin
+      // Plusarg absent: fall back to the default.
+      trace_log_enable = 1'b1;
+    end else if (trace_log_enable == 1'b0) begin
+      $display("%m: Instruction trace disabled.");
+    end
+  end
+
+  function automatic void printbuffer_dumpline();
+    string       insn_hex;   // raw instruction word rendered as hex digits
+    string       insn_text;  // decoded_str, optionally prefixed with a "-->" or "==>" marker
+    logic [32:0] cap33;      // scratch for the reg2memcap_* conversion results
+
+    // Nothing opened yet: build the file name, open the log and write the column header.
+    if (file_handle == 0) begin
+      string file_name_base = "trace_core";
+      void'($value$plusargs("cheriot_tracer_file_base=%s", file_name_base));
+      $sformat(file_name, "%s_%h.log", file_name_base, hart_id_i);
+      $display("%m: Writing execution trace to %s", file_name);
+      file_handle = $fopen(file_name, "w");
+      $fwrite(file_handle,
+              "Time\tCycle\tPC\tInsn\tDecoded instruction\tRegister and memory contents\n");
+    end
+
+    // A compressed instruction is shown as four hex digits (16 bit word) behind four spaces, so
+    // it lines up with the 8 hex digits (32 bit word) of an uncompressed one.
+    if (insn_is_compressed) begin
+      $sformat(insn_hex, "    %4h", rvfi_insn[15:0]);
+    end else begin
+      $sformat(insn_hex, "%8h", rvfi_insn);
+    end
+
+    if (rvfi_trap)      insn_text = {"-->", decoded_str};
+    else if (rvfi_intr) insn_text = {"==>", decoded_str};
+    else                insn_text = decoded_str;
+
+    $fwrite(file_handle, "%15t\t%d\t%h\t%s\t%s\t",
+            $time, cycle, rvfi_pc_rdata, insn_hex, insn_text);
+
+    if (|(data_accessed & RS1)) begin
+      $fwrite(file_handle, " %s:0x%08x", reg_addr_to_str(rvfi_rs1_addr), rvfi_rs1_rdata);
+    end
+    if (|(data_accessed & CS1)) begin
+      cap33 = CheriCapIT8 ? reg2memcap_it8_fmt0(rvfi_rs1_rcap) : reg2memcap_fmt0(rvfi_rs1_rcap);
+      $fwrite(file_handle, " %s:0x%08x+0x%09x", reg_addr_to_str(rvfi_rs1_addr), rvfi_rs1_rdata, cap33);
+    end
+    if (|(data_accessed & RS2)) begin
+      $fwrite(file_handle, " %s:0x%08x", reg_addr_to_str(rvfi_rs2_addr), rvfi_rs2_rdata);
+    end
+    if (|(data_accessed & CS2)) begin
+      cap33 = CheriCapIT8 ? reg2memcap_it8_fmt0(rvfi_rs2_rcap) : reg2memcap_fmt0(rvfi_rs2_rcap);
+      $fwrite(file_handle, " %s:0x%08x+0x%09x", reg_addr_to_str(rvfi_rs2_addr), rvfi_rs2_rdata, cap33);
+    end
+    if (|(data_accessed & RS3)) begin
+      $fwrite(file_handle, " %s:0x%08x", reg_addr_to_str(rvfi_rs3_addr), rvfi_rs3_rdata);
+    end
+    if (|(data_accessed & RD)) begin
+      $fwrite(file_handle, " %s=0x%08x", reg_addr_to_str(rvfi_rd_addr), rvfi_rd_wdata);
+    end
+
+    if (|(data_accessed & CD)) begin
+      cap33 = CheriCapIT8 ? reg2memcap_it8_fmt0(rvfi_rd_wcap) : reg2memcap_fmt0(rvfi_rd_wcap);
+      $fwrite(file_handle, " %s=0x%08x+0x%09x", reg_addr_to_str(rvfi_rd_addr), rvfi_rd_wdata, cap33);
+    end
+
+    if (|(data_accessed & MEM)) begin
+      $fwrite(file_handle, " PA:0x%08x", rvfi_mem_addr);
+      if (rvfi_mem_wmask == 4'b0001) begin
+        $fwrite(file_handle, " store:0x%1b??????%02x", rvfi_mem_wdata_bit32, rvfi_mem_wdata[7:0]);
+      end else if (rvfi_mem_wmask == 4'b0011) begin
+        $fwrite(file_handle, " store:0x%1b????%04x", rvfi_mem_wdata_bit32, rvfi_mem_wdata[15:0]);
+      end else if (rvfi_mem_wmask != 4'b0000) begin
+        $fwrite(file_handle, " store:0x%09x", rvfi_mem_wdata);
+      end
+      if (rvfi_mem_rmask != 4'b0000) begin
+        $fwrite(file_handle, " load:0x%08x", rvfi_mem_rdata);
+      end
+    end
+
+    if (|(data_accessed & MEMC)) begin
+      $fwrite(file_handle, " PA:0x%08x", rvfi_mem_addr);
+
+      if (rvfi_mem_wmask != 0) begin
+        cap33 = CheriCapIT8 ? reg2memcap_it8_fmt0(rvfi_mem_wcap) : reg2memcap_fmt0(rvfi_mem_wcap);
+        $fwrite(file_handle, " store:0x%09x+0x%09x", rvfi_mem_wdata, cap33);
+      end else begin
+        cap33 = CheriCapIT8 ? reg2memcap_it8_fmt0(rvfi_mem_rcap) : reg2memcap_fmt0(rvfi_mem_rcap);
+        $fwrite(file_handle, " load:0x%09x+0x%09x", rvfi_mem_rdata, cap33);
+      end
+    end
+
+    $fwrite(file_handle, "\n");
+  endfunction
+
+
+  // Format register address with "x" prefix, right-aligned to a fixed width of 3 characters.
+  function automatic string reg_addr_to_str(input logic [4:0] addr);
+    // Single-digit addresses get a leading space so the result is always 3 characters wide.
+    if (addr < 10) begin
+      return $sformatf(" x%0d", addr);
+    end
+    return $sformatf("x%0d", addr);
+  endfunction
+
+  // Get a SCR name for a CHERI SCR address; unnamed addresses map to "scr<N>" without padding.
+  function automatic string get_scr_name(input logic [4:0] scr_addr);
+    unique case (scr_addr)
+      5'd27:   return "ztopc";
+      5'd28:   return "mtcc";
+      5'd29:   return "mtdc";
+      5'd30:   return "mscratchc";
+      5'd31:   return "mepcc";
+      default: return $sformatf("scr%0d", scr_addr);  // %0d: plain %d would pad to 2 columns
+    endcase
+  endfunction
+
+  // Get a CSR name for a CSR address.
+  //
+  // Looks up the 12-bit CSR address in the table below and returns the matching name string.
+  // Addresses that are not listed are returned as a hexadecimal string with a "0x" prefix.
+  // Note that the case labels are written in decimal (e.g. 12'd768 is address 0x300).
+  function automatic string get_csr_name(input logic [11:0] csr_addr);
+    unique case (csr_addr)
+      // u* names (0x000..0x044) and fflags/frm/fcsr (0x001..0x003)
+      12'd0: return "ustatus";
+      12'd4: return "uie";
+      12'd5: return "utvec";
+      12'd64: return "uscratch";
+      12'd65: return "uepc";
+      12'd66: return "ucause";
+      12'd67: return "utval";
+      12'd68: return "uip";
+      12'd1: return "fflags";
+      12'd2: return "frm";
+      12'd3: return "fcsr";
+      // cycle/time/instret and hpmcounter3..31 (0xC00..0xC1F)
+      12'd3072: return "cycle";
+      12'd3073: return "time";
+      12'd3074: return "instret";
+      12'd3075: return "hpmcounter3";
+      12'd3076: return "hpmcounter4";
+      12'd3077: return "hpmcounter5";
+      12'd3078: return "hpmcounter6";
+      12'd3079: return "hpmcounter7";
+      12'd3080: return "hpmcounter8";
+      12'd3081: return "hpmcounter9";
+      12'd3082: return "hpmcounter10";
+      12'd3083: return "hpmcounter11";
+      12'd3084: return "hpmcounter12";
+      12'd3085: return "hpmcounter13";
+      12'd3086: return "hpmcounter14";
+      12'd3087: return "hpmcounter15";
+      12'd3088: return "hpmcounter16";
+      12'd3089: return "hpmcounter17";
+      12'd3090: return "hpmcounter18";
+      12'd3091: return "hpmcounter19";
+      12'd3092: return "hpmcounter20";
+      12'd3093: return "hpmcounter21";
+      12'd3094: return "hpmcounter22";
+      12'd3095: return "hpmcounter23";
+      12'd3096: return "hpmcounter24";
+      12'd3097: return "hpmcounter25";
+      12'd3098: return "hpmcounter26";
+      12'd3099: return "hpmcounter27";
+      12'd3100: return "hpmcounter28";
+      12'd3101: return "hpmcounter29";
+      12'd3102: return "hpmcounter30";
+      12'd3103: return "hpmcounter31";
+      // "h" variants of the counters above (0xC80..0xC9F)
+      12'd3200: return "cycleh";
+      12'd3201: return "timeh";
+      12'd3202: return "instreth";
+      12'd3203: return "hpmcounter3h";
+      12'd3204: return "hpmcounter4h";
+      12'd3205: return "hpmcounter5h";
+      12'd3206: return "hpmcounter6h";
+      12'd3207: return "hpmcounter7h";
+      12'd3208: return "hpmcounter8h";
+      12'd3209: return "hpmcounter9h";
+      12'd3210: return "hpmcounter10h";
+      12'd3211: return "hpmcounter11h";
+      12'd3212: return "hpmcounter12h";
+      12'd3213: return "hpmcounter13h";
+      12'd3214: return "hpmcounter14h";
+      12'd3215: return "hpmcounter15h";
+      12'd3216: return "hpmcounter16h";
+      12'd3217: return "hpmcounter17h";
+      12'd3218: return "hpmcounter18h";
+      12'd3219: return "hpmcounter19h";
+      12'd3220: return "hpmcounter20h";
+      12'd3221: return "hpmcounter21h";
+      12'd3222: return "hpmcounter22h";
+      12'd3223: return "hpmcounter23h";
+      12'd3224: return "hpmcounter24h";
+      12'd3225: return "hpmcounter25h";
+      12'd3226: return "hpmcounter26h";
+      12'd3227: return "hpmcounter27h";
+      12'd3228: return "hpmcounter28h";
+      12'd3229: return "hpmcounter29h";
+      12'd3230: return "hpmcounter30h";
+      12'd3231: return "hpmcounter31h";
+      // s* names (0x100..0x180)
+      12'd256: return "sstatus";
+      12'd258: return "sedeleg";
+      12'd259: return "sideleg";
+      12'd260: return "sie";
+      12'd261: return "stvec";
+      12'd262: return "scounteren";
+      12'd320: return "sscratch";
+      12'd321: return "sepc";
+      12'd322: return "scause";
+      12'd323: return "stval";
+      12'd324: return "sip";
+      12'd384: return "satp";
+      // m* identification, status and trap names (0xF11..0xF14, 0x300..0x344)
+      12'd3857: return "mvendorid";
+      12'd3858: return "marchid";
+      12'd3859: return "mimpid";
+      12'd3860: return "mhartid";
+      12'd768: return "mstatus";
+      12'd769: return "misa";
+      12'd770: return "medeleg";
+      12'd771: return "mideleg";
+      12'd772: return "mie";
+      12'd773: return "mtvec";
+      12'd774: return "mcounteren";
+      12'd832: return "mscratch";
+      12'd833: return "mepc";
+      12'd834: return "mcause";
+      12'd835: return "mtval";
+      12'd836: return "mip";
+      // pmpcfg0..3 (0x3A0..0x3A3) and pmpaddr0..15 (0x3B0..0x3BF)
+      12'd928: return "pmpcfg0";
+      12'd929: return "pmpcfg1";
+      12'd930: return "pmpcfg2";
+      12'd931: return "pmpcfg3";
+      12'd944: return "pmpaddr0";
+      12'd945: return "pmpaddr1";
+      12'd946: return "pmpaddr2";
+      12'd947: return "pmpaddr3";
+      12'd948: return "pmpaddr4";
+      12'd949: return "pmpaddr5";
+      12'd950: return "pmpaddr6";
+      12'd951: return "pmpaddr7";
+      12'd952: return "pmpaddr8";
+      12'd953: return "pmpaddr9";
+      12'd954: return "pmpaddr10";
+      12'd955: return "pmpaddr11";
+      12'd956: return "pmpaddr12";
+      12'd957: return "pmpaddr13";
+      12'd958: return "pmpaddr14";
+      12'd959: return "pmpaddr15";
+      // mcycle/minstret and mhpmcounter3..31 (0xB00..0xB1F); 0xB01 has no entry
+      12'd2816: return "mcycle";
+      12'd2818: return "minstret";
+      12'd2819: return "mhpmcounter3";
+      12'd2820: return "mhpmcounter4";
+      12'd2821: return "mhpmcounter5";
+      12'd2822: return "mhpmcounter6";
+      12'd2823: return "mhpmcounter7";
+      12'd2824: return "mhpmcounter8";
+      12'd2825: return "mhpmcounter9";
+      12'd2826: return "mhpmcounter10";
+      12'd2827: return "mhpmcounter11";
+      12'd2828: return "mhpmcounter12";
+      12'd2829: return "mhpmcounter13";
+      12'd2830: return "mhpmcounter14";
+      12'd2831: return "mhpmcounter15";
+      12'd2832: return "mhpmcounter16";
+      12'd2833: return "mhpmcounter17";
+      12'd2834: return "mhpmcounter18";
+      12'd2835: return "mhpmcounter19";
+      12'd2836: return "mhpmcounter20";
+      12'd2837: return "mhpmcounter21";
+      12'd2838: return "mhpmcounter22";
+      12'd2839: return "mhpmcounter23";
+      12'd2840: return "mhpmcounter24";
+      12'd2841: return "mhpmcounter25";
+      12'd2842: return "mhpmcounter26";
+      12'd2843: return "mhpmcounter27";
+      12'd2844: return "mhpmcounter28";
+      12'd2845: return "mhpmcounter29";
+      12'd2846: return "mhpmcounter30";
+      12'd2847: return "mhpmcounter31";
+      // "h" variants of the counters above (0xB80..0xB9F)
+      12'd2944: return "mcycleh";
+      12'd2946: return "minstreth";
+      12'd2947: return "mhpmcounter3h";
+      12'd2948: return "mhpmcounter4h";
+      12'd2949: return "mhpmcounter5h";
+      12'd2950: return "mhpmcounter6h";
+      12'd2951: return "mhpmcounter7h";
+      12'd2952: return "mhpmcounter8h";
+      12'd2953: return "mhpmcounter9h";
+      12'd2954: return "mhpmcounter10h";
+      12'd2955: return "mhpmcounter11h";
+      12'd2956: return "mhpmcounter12h";
+      12'd2957: return "mhpmcounter13h";
+      12'd2958: return "mhpmcounter14h";
+      12'd2959: return "mhpmcounter15h";
+      12'd2960: return "mhpmcounter16h";
+      12'd2961: return "mhpmcounter17h";
+      12'd2962: return "mhpmcounter18h";
+      12'd2963: return "mhpmcounter19h";
+      12'd2964: return "mhpmcounter20h";
+      12'd2965: return "mhpmcounter21h";
+      12'd2966: return "mhpmcounter22h";
+      12'd2967: return "mhpmcounter23h";
+      12'd2968: return "mhpmcounter24h";
+      12'd2969: return "mhpmcounter25h";
+      12'd2970: return "mhpmcounter26h";
+      12'd2971: return "mhpmcounter27h";
+      12'd2972: return "mhpmcounter28h";
+      12'd2973: return "mhpmcounter29h";
+      12'd2974: return "mhpmcounter30h";
+      12'd2975: return "mhpmcounter31h";
+      // mhpmevent3..31 (0x323..0x33F)
+      12'd803: return "mhpmevent3";
+      12'd804: return "mhpmevent4";
+      12'd805: return "mhpmevent5";
+      12'd806: return "mhpmevent6";
+      12'd807: return "mhpmevent7";
+      12'd808: return "mhpmevent8";
+      12'd809: return "mhpmevent9";
+      12'd810: return "mhpmevent10";
+      12'd811: return "mhpmevent11";
+      12'd812: return "mhpmevent12";
+      12'd813: return "mhpmevent13";
+      12'd814: return "mhpmevent14";
+      12'd815: return "mhpmevent15";
+      12'd816: return "mhpmevent16";
+      12'd817: return "mhpmevent17";
+      12'd818: return "mhpmevent18";
+      12'd819: return "mhpmevent19";
+      12'd820: return "mhpmevent20";
+      12'd821: return "mhpmevent21";
+      12'd822: return "mhpmevent22";
+      12'd823: return "mhpmevent23";
+      12'd824: return "mhpmevent24";
+      12'd825: return "mhpmevent25";
+      12'd826: return "mhpmevent26";
+      12'd827: return "mhpmevent27";
+      12'd828: return "mhpmevent28";
+      12'd829: return "mhpmevent29";
+      12'd830: return "mhpmevent30";
+      12'd831: return "mhpmevent31";
+      // tselect/tdata1..3 (0x7A0..0x7A3) and dcsr/dpc/dscratch (0x7B0..0x7B2)
+      12'd1952: return "tselect";
+      12'd1953: return "tdata1";
+      12'd1954: return "tdata2";
+      12'd1955: return "tdata3";
+      12'd1968: return "dcsr";
+      12'd1969: return "dpc";
+      12'd1970: return "dscratch";
+      // h* names (0x200..0x244)
+      12'd512: return "hstatus";
+      12'd514: return "hedeleg";
+      12'd515: return "hideleg";
+      12'd516: return "hie";
+      12'd517: return "htvec";
+      12'd576: return "hscratch";
+      12'd577: return "hepc";
+      12'd578: return "hcause";
+      12'd579: return "hbadaddr";
+      12'd580: return "hip";
+      // mbase..mdbound (0x380..0x385), mcountinhibit (0x320)
+      12'd896: return "mbase";
+      12'd897: return "mbound";
+      12'd898: return "mibase";
+      12'd899: return "mibound";
+      12'd900: return "mdbase";
+      12'd901: return "mdbound";
+      12'd800: return "mcountinhibit";
+      // mshwm, mshwmb and cdbgctrl (0xBC1, 0xBC2, 0xBC4)
+      12'd3009: return "mshwm";
+      12'd3010: return "mshwmb";
+      12'd3012: return "cdbgctrl";
+      // Anything not listed above is reported by its raw address in hex.
+      default: return $sformatf("0x%x", csr_addr);
+    endcase
+  endfunction
+
+  // Decode an instruction that has no operands to print: the decoded string is just the
+  // mnemonic. data_accessed is left untouched.
+  function automatic void decode_mnemonic(input string mnemonic);
+    begin
+      decoded_str = mnemonic;
+    end
+  endfunction
+
+  // R-type: prints "<mnemonic>\t<rd>,<rs1>,<rs2>" and marks RS1, RS2 and RD as accessed.
+  function automatic void decode_r_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tx%0d,x%0d,x%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs1_addr,
+                            rvfi_rs2_addr);
+    data_accessed = RS1 | RS2 | RD;
+  endfunction
+
+  // Single-source register form: prints "<mnemonic>\t<rd>,<rs1>" and marks RS1 and RD as accessed.
+  function automatic void decode_r1_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tx%0d,x%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs1_addr);
+    data_accessed = RS1 | RD;
+  endfunction
+
+  // Three-source register form used for cmix/cmov. Operands are printed in the order
+  // rd, rs2, rs1, rs3; all three sources and RD are marked as accessed.
+  function automatic void decode_r_cmixcmov_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tx%0d,x%0d,x%0d,x%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs2_addr,
+                            rvfi_rs1_addr,
+                            rvfi_rs3_addr);
+    data_accessed = RS1 | RS2 | RS3 | RD;
+  endfunction
+
+  // Three-source register form used for funnel shifts. Operands are printed in the order
+  // rd, rs1, rs3, rs2; all three sources and RD are marked as accessed.
+  function automatic void decode_r_funnelshift_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tx%0d,x%0d,x%0d,x%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs1_addr,
+                            rvfi_rs3_addr,
+                            rvfi_rs2_addr);
+    data_accessed = RS1 | RS2 | RS3 | RD;
+  endfunction
+
+  // I-type: prints "<mnemonic>\t<rd>,<rs1>,<imm>" with the 12-bit immediate sign-extended and
+  // shown as a signed decimal.
+  function automatic void decode_i_insn(input string mnemonic);
+    logic [31:0] imm_sext;
+
+    // Sign-extend instruction bits [31:20] to 32 bits.
+    imm_sext = {{20 {rvfi_insn[31]}}, rvfi_insn[31:20]};
+
+    decoded_str = $sformatf("%s\tx%0d,x%0d,%0d", mnemonic, rvfi_rd_addr, rvfi_rs1_addr,
+                            $signed(imm_sext));
+    data_accessed = RS1 | RD;
+  endfunction
+
+  // Immediate shifts (SLLI, SRLI, SRAI, SROI, SLOI, RORI): prints rd, rs1 and the 5-bit shift
+  // amount taken from instruction bits [24:20], in hex.
+  function automatic void decode_i_shift_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tx%0d,x%0d,0x%0x",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs1_addr,
+                            rvfi_insn[24:20]);
+    data_accessed = RS1 | RD;
+  endfunction
+
+  // Immediate funnel shift (fsri): prints rd, rs1, rs3 and the 6-bit shift amount taken from
+  // instruction bits [25:20], in hex.
+  function automatic void decode_i_funnelshift_insn( input string mnemonic);
+    decoded_str = $sformatf("%s\tx%0d,x%0d,x%0d,0x%0x",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs1_addr,
+                            rvfi_rs3_addr,
+                            rvfi_insn[25:20]);
+    data_accessed = RS1 | RS3 | RD;
+  endfunction
+
+  // JALR: prints "<rd>,<offset>(<rs1>)" with a signed decimal offset. When cheri_pmode_i is set
+  // the instruction is shown as "CH.c<mnemonic>" with capability register names and CS1/CD are
+  // marked as accessed; otherwise the integer form with RS1/RD is used.
+  function automatic void decode_i_jalr_insn(input string mnemonic);
+    logic [31:0] offset;
+
+    // Sign-extended 12-bit immediate, shared by both variants.
+    offset = {{20 {rvfi_insn[31]}}, rvfi_insn[31:20]};
+
+    if (cheri_pmode_i) begin
+      // CH.cjalr
+      decoded_str = $sformatf("CH.c%s\tc%0d,%0d(c%0d)", mnemonic, rvfi_rd_addr,
+                              $signed(offset), rvfi_rs1_addr);
+      data_accessed = CS1 | CD;
+    end else begin
+      // jalr
+      decoded_str = $sformatf("%s\tx%0d,%0d(x%0d)", mnemonic, rvfi_rd_addr,
+                              $signed(offset), rvfi_rs1_addr);
+      data_accessed = RS1 | RD;
+    end
+  endfunction
+
+  // U-type: prints rd and the raw 20-bit upper immediate (instruction bits [31:12]) in hex.
+  function automatic void decode_u_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tx%0d,0x%0x",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_insn[31:12]);
+    data_accessed = RD;
+  endfunction
+
+  // JAL: prints the link register and the jump target, which is taken from rvfi_pc_wdata.
+  // When cheri_pmode_i is set the instruction is shown as "CH.cjal" with a capability link
+  // register (CD); otherwise the supplied mnemonic and an integer link register (RD) are used.
+  function automatic void decode_j_insn(input string mnemonic);
+    if (cheri_pmode_i) begin
+      decoded_str = $sformatf("%s\tc%0d,%0x", "CH.cjal", rvfi_rd_addr, rvfi_pc_wdata);
+      data_accessed = CD;
+    end else begin
+      decoded_str = $sformatf("%s\tx%0d,%0x", mnemonic, rvfi_rd_addr, rvfi_pc_wdata);
+      data_accessed = RD;
+    end
+  endfunction
+
+  // Conditional branches: prints rs1, rs2 and the branch target in hex.
+  // The target is computed from the current PC plus the B-type immediate, because
+  // rvfi_pc_wdata cannot be used for conditional jumps.
+  function automatic void decode_b_insn(input string mnemonic);
+    logic [31:0] offset;
+    logic [31:0] target;
+
+    // B-type immediate: sign bit, then insn[7], insn[30:25], insn[11:8] and an implicit 0 LSB.
+    offset = $signed({ {19 {rvfi_insn[31]}}, rvfi_insn[31], rvfi_insn[7],
+                       rvfi_insn[30:25], rvfi_insn[11:8], 1'b0 });
+    target = rvfi_pc_rdata + offset;
+
+    decoded_str = $sformatf("%s\tx%0d,x%0d,%0x",
+                            mnemonic, rvfi_rs1_addr, rvfi_rs2_addr, target);
+    data_accessed = RS1 | RS2;
+  endfunction
+
+  // CSR instructions: prints rd, the CSR name and either rs1 (register form, insn[14] == 0) or
+  // the 5-bit immediate from insn[19:15] (immediate form). RD is always marked as accessed;
+  // RS1 only for the register form.
+  function automatic void decode_csr_insn(input string mnemonic);
+    string csr_name;
+
+    csr_name = get_csr_name(rvfi_insn[31:20]);
+
+    if (!rvfi_insn[14]) begin
+      // Register operand
+      data_accessed = RD | RS1;
+      decoded_str = $sformatf("%s\tx%0d,%s,x%0d",
+                              mnemonic, rvfi_rd_addr, csr_name, rvfi_rs1_addr);
+    end else begin
+      // Immediate operand, zero-extended
+      data_accessed = RD;
+      decoded_str = $sformatf("%s\tx%0d,%s,%0d",
+                              mnemonic, rvfi_rd_addr, csr_name, {27'b0, rvfi_insn[19:15]});
+    end
+  endfunction
+
+  // Compressed CR format. With rs2 == 0 this is a register jump (c.jr / c.jalr, selected by
+  // insn[12]); when cheri_pmode_i is set the jump is printed with its "c.CH." name and a
+  // capability register. With rs2 != 0 it is a two-operand register instruction.
+  function automatic void decode_cr_insn(input string mnemonic);
+    if (rvfi_rs2_addr == 5'b0) begin
+      if (rvfi_insn[12] == 1'b1) begin
+        // Linking jump
+        if (cheri_pmode_i) begin
+          // C.CH.JALR
+          decoded_str = $sformatf("%s\tc%0d", "c.CH.cjalr", rvfi_rs1_addr);
+          data_accessed = CS1 | CD;
+        end else begin
+          // C.JALR
+          decoded_str = $sformatf("%s\tx%0d", mnemonic, rvfi_rs1_addr);
+          data_accessed = RS1 | RD;
+        end
+      end else begin
+        // Non-linking jump
+        if (cheri_pmode_i) begin
+          // C.CH.JR
+          decoded_str = $sformatf("%s\tc%0d", "c.CH.cjr" , rvfi_rs1_addr);
+          data_accessed = CS1;
+        end else begin
+          // C.JR
+          decoded_str = $sformatf("%s\tx%0d", mnemonic, rvfi_rs1_addr);
+          data_accessed = RS1;
+        end
+      end
+    end else begin
+      decoded_str = $sformatf("%s\tx%0d,x%0d", mnemonic, rvfi_rd_addr, rvfi_rs2_addr);
+      data_accessed = RS1 | RS2 | RD; // RS1 == RD
+    end
+  endfunction
+
+  // C.LI: prints rd and the 6-bit immediate {insn[12], insn[6:2]} as a signed decimal.
+  function automatic void decode_ci_cli_insn(input string mnemonic);
+    logic [5:0] imm6;
+
+    imm6 = {rvfi_insn[12], rvfi_insn[6:2]};
+
+    decoded_str = $sformatf("%s\tx%0d,%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            $signed(imm6));
+    data_accessed = RD;
+  endfunction
+
+  // C.ADDI: prints rd and the 6-bit immediate {insn[12], insn[6:2]} as a signed decimal.
+  // RS1 and RD are both marked as accessed.
+  function automatic void decode_ci_caddi_insn(input string mnemonic);
+    logic [5:0] imm6;
+
+    imm6 = {rvfi_insn[12], rvfi_insn[6:2]};
+
+    decoded_str = $sformatf("%s\tx%0d,%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            $signed(imm6));
+    data_accessed = RS1 | RD;
+  endfunction
+
+  // C.ADDI16SP: prints rd and the 10-bit immediate (a multiple of 16) as a signed decimal.
+  // When cheri_pmode_i is set the instruction is printed as "c.CH.cinc16csp" with a
+  // capability register and CS1/CD accessed; otherwise the integer form is used.
+  function automatic void decode_ci_caddi16sp_insn(input string mnemonic);
+    logic [9:0] imm10;
+
+    // Immediate bits are scattered over the encoding; the low four bits are always zero.
+    imm10 = {rvfi_insn[12], rvfi_insn[4:3], rvfi_insn[5], rvfi_insn[2], rvfi_insn[6], 4'b0};
+
+    if (cheri_pmode_i) begin
+      decoded_str = $sformatf("%s\tc%0d,%0d", "c.CH.cinc16csp", rvfi_rd_addr, $signed(imm10));
+      data_accessed = CS1 | CD;
+    end else begin
+      decoded_str = $sformatf("%s\tx%0d,%0d", mnemonic, rvfi_rd_addr, $signed(imm10));
+      data_accessed = RS1 | RD;
+    end
+  endfunction
+
+  // C.LUI: prints rd and the 6-bit immediate {insn[12], insn[6:2]} sign-extended to 20 bits,
+  // in hex.
+  function automatic void decode_ci_clui_insn(input string mnemonic);
+    logic [5:0] imm6;
+
+    imm6 = {rvfi_insn[12], rvfi_insn[6:2]};
+
+    decoded_str = $sformatf("%s\tx%0d,0x%0x",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            20'($signed(imm6)));
+    data_accessed = RD;
+  endfunction
+
+  // C.SLLI: prints rd and the 6-bit shift amount {insn[12], insn[6:2]} in hex.
+  function automatic void decode_ci_cslli_insn(input string mnemonic);
+    logic [5:0] amount;
+
+    amount = {rvfi_insn[12], rvfi_insn[6:2]};
+
+    decoded_str = $sformatf("%s\tx%0d,0x%0x",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            amount);
+    data_accessed = RS1 | RD;
+  endfunction
+
+  // C.ADDI4SPN: prints rd, the stack pointer and the 10-bit unsigned immediate (a multiple
+  // of 4) in decimal. When cheri_pmode_i is set the operands are printed as capability
+  // registers ("csp") and CD/CS1 are marked as accessed; otherwise "x2" is printed and only RD
+  // is marked.
+  function automatic void decode_ciw_insn(input string mnemonic);
+    logic [9:0] uimm10;
+
+    // Immediate bits are scattered over the encoding; the low two bits are always zero.
+    uimm10 = {rvfi_insn[10:7], rvfi_insn[12:11], rvfi_insn[5], rvfi_insn[6], 2'b00};
+
+    if (cheri_pmode_i) begin
+      // c.CH.incaddr4spn
+      decoded_str = $sformatf("%s\tc%0d,csp,%0d", mnemonic, rvfi_rd_addr, uimm10);
+      data_accessed = CD | CS1;
+    end else begin
+      // c.addi4spn
+      decoded_str = $sformatf("%s\tx%0d,x2,%0d", mnemonic, rvfi_rd_addr, uimm10);
+      data_accessed = RD;
+    end
+  endfunction
+
+  // Compressed right shifts (c.srli / c.srai): prints rs1 and the 6-bit shift amount
+  // {insn[12], insn[6:2]} in hex.
+  function automatic void decode_cb_sr_insn(input string mnemonic);
+    logic [5:0] amount;
+
+    amount = {rvfi_insn[12], rvfi_insn[6:2]};
+
+    decoded_str = $sformatf("%s\tx%0d,0x%0x",
+                            mnemonic,
+                            rvfi_rs1_addr,
+                            amount);
+    data_accessed = RS1 | RD;
+  endfunction
+
+  // Compressed CB format, selected by insn[15:13]:
+  //   110 / 111 - C.BEQZ / C.BNEZ: prints rs1 and the branch target in hex
+  //   100       - C.ANDI: prints rd and the sign-extended immediate in decimal
+  //   otherwise - prints rs1 and {insn[12], insn[6:2], 2'b00} in hex
+  function automatic void decode_cb_insn(input string mnemonic);
+    logic [7:0]  field;
+    logic [31:0] target;
+
+    case (rvfi_insn[15:13])
+      3'b110, 3'b111: begin
+        // C.BNEZ and C.BEQZ
+        // We cannot use rvfi_pc_wdata for conditional jumps, so add the offset to the PC here.
+        field = {rvfi_insn[12], rvfi_insn[6:5], rvfi_insn[2], rvfi_insn[11:10], rvfi_insn[4:3]};
+        target = rvfi_pc_rdata + 32'($signed({field, 1'b0}));
+        decoded_str = $sformatf("%s\tx%0d,%0x", mnemonic, rvfi_rs1_addr, target);
+        data_accessed = RS1;
+      end
+      3'b100: begin
+        // C.ANDI
+        field = {{2{rvfi_insn[12]}}, rvfi_insn[12], rvfi_insn[6:2]};
+        decoded_str = $sformatf("%s\tx%0d,%0d", mnemonic, rvfi_rd_addr, $signed(field));
+        data_accessed = RS1 | RD; // RS1 == RD
+      end
+      default: begin
+        field = {rvfi_insn[12], rvfi_insn[6:2], 2'b00};
+        decoded_str = $sformatf("%s\tx%0d,0x%0x", mnemonic, rvfi_rs1_addr, field);
+        data_accessed = RS1;
+      end
+    endcase
+  endfunction
+
+  // Compressed two-operand register arithmetic: prints "<rd>,<rs2>". rs1 is the same register
+  // as rd, so RS1, RS2 and RD are all marked as accessed.
+  function automatic void decode_cs_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tx%0d,x%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs2_addr);
+    data_accessed = RS1 | RS2 | RD; // RS1 == RD
+  endfunction
+
+  // Compressed jumps: prints the jump target taken from rvfi_pc_wdata, in hex.
+  // insn[15:13] == 001 is C.JAL, which also marks the link register as accessed (CD when
+  // cheri_pmode_i is set, RD otherwise); anything else is C.J, which accesses no registers.
+  // When cheri_pmode_i is set the "c.CH." names are printed instead of the supplied mnemonic.
+  function automatic void decode_cj_insn(input string mnemonic);
+    string shown_name;
+
+    shown_name = mnemonic;
+
+    if (rvfi_insn[15:13] == 3'b001) begin
+      // C.JAL
+      if (cheri_pmode_i) begin
+        shown_name = "c.CH.cjal";
+        data_accessed = CD;
+      end else begin
+        data_accessed = RD;
+      end
+    end else begin
+      // C.J
+      if (cheri_pmode_i) begin
+        shown_name = "c.CH.cj";
+      end
+    end
+
+    decoded_str = $sformatf("%s\t%0x", shown_name, rvfi_pc_wdata);
+  endfunction
+
+  // Decode a compressed load and print "<rd>,<offset>(<base>)" with an unsigned decimal offset.
+  //
+  // insn[15:13] == 011 selects the capability loads: c.clc in quadrant C0 (using the c.ld
+  // encoding) and c.clcsp in quadrant C2 (using the c.ldsp encoding). Everything else is
+  // treated as a word load: C.LW in quadrant C0, C.LWSP otherwise. For word loads the base
+  // register is printed as a capability register when cheri_pmode_i is set.
+  function automatic void decode_compressed_load_insn(input string mnemonic);
+    // 9 bits wide: the c.clcsp offset is built from 6 instruction bits scaled by 8, so it
+    // occupies bits [8:3]. An 8-bit variable would silently drop offset bit 8.
+    logic [8:0] imm;
+
+    if ((rvfi_insn[15:13] == 3'b011) && (rvfi_insn[1:0] == OPCODE_C0))  begin
+      // CHERI: c.clc, use RV64 c.ld encoding
+      imm = {1'b0, rvfi_insn[6:5], rvfi_insn[12:10], 3'b000};
+      data_accessed = CS1 | CD | MEMC;
+      decoded_str = $sformatf("%s\tc%0d,%0d(c%0d)", mnemonic, rvfi_rd_addr, imm, rvfi_rs1_addr);
+    end else if ((rvfi_insn[15:13] == 3'b011) && (rvfi_insn[1:0] == OPCODE_C2))  begin
+      // CHERI: c.clcsp, RV32: c.ldsp
+      imm = {rvfi_insn[4:2], rvfi_insn[12], rvfi_insn[6:5], 3'b000};
+      data_accessed = CS1 | CD | MEMC;
+      decoded_str = $sformatf("%s\tc%0d,%0d(c%0d)", mnemonic, rvfi_rd_addr, imm, rvfi_rs1_addr);
+    end else begin
+      if (rvfi_insn[1:0] == OPCODE_C0) begin
+        // C.LW
+        imm = {2'b00, rvfi_insn[5], rvfi_insn[12:10], rvfi_insn[6], 2'b00};
+      end else begin
+        // C.LWSP
+        imm = {1'b0, rvfi_insn[3:2], rvfi_insn[12], rvfi_insn[6:4], 2'b00};
+      end
+      if (cheri_pmode_i) begin
+        data_accessed = CS1 | RD | MEM;
+        decoded_str = $sformatf("%s\tx%0d,%0d(c%0d)", mnemonic, rvfi_rd_addr, imm, rvfi_rs1_addr);
+      end else begin
+        data_accessed = RS1 | RD | MEM;
+        decoded_str = $sformatf("%s\tx%0d,%0d(x%0d)", mnemonic, rvfi_rd_addr, imm, rvfi_rs1_addr);
+      end
+    end
+  endfunction
+
+  // Decode a compressed store and print "<rs2>,<offset>(<base>)" with an unsigned decimal offset.
+  //
+  // insn[15:13] == 111 selects the capability stores: c.csc in quadrant C0 (using the c.sd
+  // encoding) and c.cscsp in quadrant C2 (using the c.sdsp encoding). Everything else is
+  // treated as a word store: C.SW in quadrant C0, C.SWSP otherwise. For word stores the base
+  // register is printed as a capability register when cheri_pmode_i is set.
+  function automatic void decode_compressed_store_insn(input string mnemonic);
+    // 9 bits wide: the c.cscsp offset is built from 6 instruction bits scaled by 8, so it
+    // occupies bits [8:3]. An 8-bit variable would silently drop offset bit 8.
+    logic [8:0] imm;
+
+    if ((rvfi_insn[15:13] == 3'b111) && (rvfi_insn[1:0] == OPCODE_C0)) begin
+      // CHERI: c.csc, use RV64 c.sd encoding
+      imm = {1'b0, rvfi_insn[6:5], rvfi_insn[12:10], 3'b000};
+      data_accessed = CS1 | CS2 | MEMC;
+      decoded_str = $sformatf("%s\tc%0d,%0d(c%0d)", mnemonic, rvfi_rs2_addr, imm, rvfi_rs1_addr);
+    end else if ((rvfi_insn[15:13] == 3'b111) && (rvfi_insn[1:0] == OPCODE_C2)) begin
+      // CHERI: c.cscsp, RV32: c.sdsp
+      imm = {rvfi_insn[9:7], rvfi_insn[12:10], 3'b000};
+      data_accessed = CS1 | CS2 | MEMC;
+      decoded_str = $sformatf("%s\tc%0d,%0d(c%0d)", mnemonic, rvfi_rs2_addr, imm, rvfi_rs1_addr);
+    end else begin
+      if (rvfi_insn[1:0] == OPCODE_C0) begin
+        // C.SW
+        imm = {2'b00, rvfi_insn[5], rvfi_insn[12:10], rvfi_insn[6], 2'b00};
+      end else begin
+        // C.SWSP
+        imm = {1'b0, rvfi_insn[8:7], rvfi_insn[12:9], 2'b00};
+      end
+      if (cheri_pmode_i) begin
+        data_accessed = CS1 | RS2 | MEM;
+        decoded_str = $sformatf("%s\tx%0d,%0d(c%0d)", mnemonic, rvfi_rs2_addr, imm, rvfi_rs1_addr);
+      end else begin
+        data_accessed = RS1 | RS2 | MEM;
+        decoded_str = $sformatf("%s\tx%0d,%0d(x%0d)", mnemonic, rvfi_rs2_addr, imm, rvfi_rs1_addr);
+      end
+    end
+  endfunction
+
+  // Decode an uncompressed load and print "<rd>,<offset>(<base>)" with a signed decimal offset.
+  //
+  // insn[14:12] selects the access: byte/half/word loads (signed and unsigned variants) get a
+  // "c"-prefixed mnemonic when cheri_pmode_i is set, 011 is the capability load "CH.clc", and
+  // any other value is reported as "INVALID".
+  function automatic void decode_load_insn();
+    string       mnemonic;
+    logic [13:0] offset;
+    logic [2:0]  funct3;
+    logic        cap_load;
+
+    // An if/else chain is used here on purpose. The equivalent "unique case" with an early
+    // return in its default arm gives wrong results in Verilator < 4.020.
+    // See https://github.com/lowRISC/ibex/issues/372 and
+    // https://www.veripool.org/issues/1536-Verilator-Misoptimization-in-if-and-case-with-default-statement-inside-a-function
+    funct3   = rvfi_insn[14:12];
+    cap_load = 1'b0;
+
+    if (funct3 == 3'b000) begin
+      mnemonic = cheri_pmode_i ? "clb" : "lb";
+    end else if (funct3 == 3'b001) begin
+      mnemonic = cheri_pmode_i ? "clh" :"lh";
+    end else if (funct3 == 3'b010) begin
+      mnemonic = cheri_pmode_i ? "clw" :"lw";
+    end else if (funct3 == 3'b011) begin
+      mnemonic = "CH.clc";
+      cap_load = 1'b1;
+    end else if (funct3 == 3'b100) begin
+      mnemonic = cheri_pmode_i ? "clbu" :"lbu";
+    end else if (funct3 == 3'b101) begin
+      mnemonic = cheri_pmode_i ? "clhu" :"lhu";
+    end else begin
+      decode_mnemonic("INVALID");
+      return;
+    end
+
+    // 12-bit I-type immediate, with the sign bit replicated into the extra upper bits.
+    offset = {{3{rvfi_insn[31]}},rvfi_insn[30:20]};
+
+    if (cap_load) begin
+      // Capability destination, capability base, capability-wide memory access
+      decoded_str = $sformatf("%s\tc%0d,%0d(c%0d)",
+                              mnemonic, rvfi_rd_addr, $signed(offset), rvfi_rs1_addr);
+      data_accessed = CD | CS1 | MEMC;
+    end else if (cheri_pmode_i) begin
+      // Integer destination, capability base
+      decoded_str = $sformatf("%s\tx%0d,%0d(c%0d)",
+                              mnemonic, rvfi_rd_addr, $signed(offset), rvfi_rs1_addr);
+      data_accessed = RD | CS1 | MEM;
+    end else begin
+      // Integer destination, integer base
+      decoded_str = $sformatf("%s\tx%0d,%0d(x%0d)",
+                              mnemonic, rvfi_rd_addr, $signed(offset), rvfi_rs1_addr);
+      data_accessed = RD | RS1 | MEM;
+    end
+  endfunction
+
+  // Decode an uncompressed store and print "<rs2>,<offset>(<base>)" with a signed decimal
+  // offset.
+  //
+  // insn[13:12] selects the access: byte/half/word stores get a "c"-prefixed mnemonic when
+  // cheri_pmode_i is set and 11 is the capability store "CH.csc". Encodings with insn[14] set
+  // are reported as "INVALID".
+  function automatic void decode_store_insn();
+    string       mnemonic;
+    logic        cap_store;
+    logic [13:0] offset;
+
+    cap_store = 1'b0;
+    unique case (rvfi_insn[13:12])
+      2'b11: begin
+        mnemonic = "CH.csc";
+        cap_store = 1'b1;
+      end
+      2'b10: mnemonic = cheri_pmode_i ? "csw" : "sw";
+      2'b01: mnemonic = cheri_pmode_i ? "csh" : "sh";
+      2'b00: mnemonic = cheri_pmode_i ? "csb" : "sb";
+      default: begin
+        decode_mnemonic("INVALID");
+        return;
+      end
+    endcase
+
+    // 12-bit S-type immediate, with the sign bit replicated into the extra upper bits.
+    offset = {{3{rvfi_insn[31]}},rvfi_insn[30:25], rvfi_insn[11:7]};
+
+    if (!rvfi_insn[14]) begin
+      // regular store
+      if (cap_store) begin
+        // Capability source, capability base, capability-wide memory access
+        decoded_str = $sformatf("%s\tc%0d,%0d(c%0d)",
+                                mnemonic, rvfi_rs2_addr, $signed(offset), rvfi_rs1_addr);
+        data_accessed = CS1 | CS2 | MEMC;
+      end else if (cheri_pmode_i) begin
+        // Integer source, capability base
+        decoded_str = $sformatf("%s\tx%0d,%0d(c%0d)",
+                                mnemonic, rvfi_rs2_addr, $signed(offset), rvfi_rs1_addr);
+        data_accessed = CS1 | RS2 | MEM;
+      end else begin
+        // Integer source, integer base
+        decoded_str = $sformatf("%s\tx%0d,%0d(x%0d)",
+                                mnemonic, rvfi_rs2_addr, $signed(offset), rvfi_rs1_addr);
+        data_accessed = RS1 | RS2 | MEM;
+      end
+    end else begin
+      decode_mnemonic("INVALID");
+    end
+  endfunction
+
+  // Build the operand string for one half of a FENCE instruction: one letter per set bit, in
+  // the fixed order "i" (bit 3), "o" (bit 2), "r" (bit 1), "w" (bit 0). Returns an empty
+  // string when no bit is set.
+  function automatic string get_fence_description(logic [3:0] bits);
+    string letters;
+
+    letters = "";
+    if (bits[3]) letters = {letters, "i"};
+    if (bits[2]) letters = {letters, "o"};
+    if (bits[1]) letters = {letters, "r"};
+    if (bits[0]) letters = {letters, "w"};
+
+    return letters;
+  endfunction
+
+  // FENCE: prints "fence\t<pred>,<succ>", where the predecessor set comes from insn[27:24]
+  // and the successor set from insn[23:20].
+  function automatic void decode_fence();
+    decoded_str = $sformatf("fence\t%s,%s",
+                            get_fence_description(rvfi_insn[27:24]),
+                            get_fence_description(rvfi_insn[23:20]));
+  endfunction
+
+  // CHERI instruction with an integer destination and an integer source: prints "<xd>,<xs1>".
+  function automatic void decode_cheri_rd_rs1_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tx%0d,x%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs1_addr);
+    data_accessed = RS1 | RD;
+  endfunction
+
+  // CHERI instruction with an integer destination and a capability source: prints "<xd>,<cs1>".
+  function automatic void decode_cheri_rd_cs1_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tx%0d,c%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs1_addr);
+    data_accessed = CS1 | RD;
+  endfunction
+
+  // CHERI instruction with a capability destination and a capability source: prints
+  // "<cd>,<cs1>".
+  function automatic void decode_cheri_cd_cs1_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tc%0d,c%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs1_addr);
+    data_accessed = CS1 | CD;
+  endfunction
+
+  // CHERI instruction with an integer destination and two capability sources: prints
+  // "<xd>,<cs1>,<cs2>".
+  function automatic void decode_cheri_rd_cs1_cs2_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tx%0d,c%0d,c%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs1_addr,
+                            rvfi_rs2_addr);
+    data_accessed = CS2 | CS1 | RD;
+  endfunction
+
+  // CHERI instruction with a capability destination and two capability sources: prints
+  // "<cd>,<cs1>,<cs2>".
+  function automatic void decode_cheri_cd_cs1_cs2_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tc%0d,c%0d,c%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs1_addr,
+                            rvfi_rs2_addr);
+    data_accessed = CS2 | CS1 | CD;
+  endfunction
+
+  // CHERI instruction with a capability destination, a capability source and an integer
+  // source: prints "<cd>,<cs1>,<xs2>".
+  function automatic void decode_cheri_cd_cs1_rs2_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tc%0d,c%0d,x%0d",
+                            mnemonic,
+                            rvfi_rd_addr,
+                            rvfi_rs1_addr,
+                            rvfi_rs2_addr);
+    data_accessed = RS2 | CS1 | CD;
+  endfunction
+
+  // Decode cincaddrimm / csetboundsimm: prints "<cd>,<cs1>,<imm>" with the 12-bit immediate
+  // from instruction bits [31:20] in decimal.
+  //
+  // insn[14:12] == 001 is cincaddrimm, whose immediate is printed as a signed value; any other
+  // value is treated as csetboundsimm, whose immediate is printed unsigned.
+  function automatic void decode_cheri_cd_cs1_imm_insn(input string mnemonic);
+    // Exactly the 12 immediate bits. Keeping sign-replicated upper bits here would corrupt the
+    // unsigned csetboundsimm print whenever insn[31] is set.
+    logic [11:0] imm;
+
+    data_accessed =  CS1 | CD;
+
+    imm = rvfi_insn[31:20];
+
+    if (rvfi_insn[14:12] == 3'b001) // cincaddrimm: signed immediate
+      decoded_str = $sformatf("%s\tc%0d,c%0d,%0d", mnemonic, rvfi_rd_addr, rvfi_rs1_addr, $signed(imm));
+    else                            // csetboundsimm: unsigned immediate
+      decoded_str = $sformatf("%s\tc%0d,c%0d,%0d", mnemonic, rvfi_rd_addr, rvfi_rs1_addr, imm);
+
+  endfunction
+
+  // Decode AUIPC / CH.auipcc: prints the destination register and the raw 20-bit upper
+  // immediate (instruction bits [31:12], not shifted) in hex.
+  //
+  // When cheri_pmode_i is set the instruction is shown as "CH.auipcc" with a capability
+  // destination; otherwise it is shown as "auipc" with an integer destination.
+  function automatic void decode_cheri_auipcc_insn();
+    logic [31:0] imm;
+
+    // Zero-extended copy of the U-type immediate field.
+    imm = rvfi_insn[31:12];
+
+    if (cheri_pmode_i) begin
+      data_accessed = CD;
+      decoded_str = $sformatf("%s\tc%0d,0x%0x", "CH.auipcc", rvfi_rd_addr, imm);
+    end else begin
+      // The non-CHERI form is printed with an integer register name, so flag the integer
+      // destination (RD), consistent with how decode_j_insn and decode_i_jalr_insn handle
+      // their non-CHERI variants.
+      data_accessed = RD;
+      decoded_str = $sformatf("%s\tx%0d,0x%0x", "auipc", rvfi_rd_addr, imm);
+    end
+
+  endfunction
+
+
+  // Decode CH.auicgp: prints the capability destination and the raw 20-bit upper immediate
+  // (instruction bits [31:12], not shifted) in hex. CD and CS1 are marked as accessed.
+  function automatic void decode_cheri_auicgp_insn();
+    logic [31:0] upper_imm;
+
+    // Zero-extended copy of the U-type immediate field.
+    upper_imm = rvfi_insn[31:12];
+
+    decoded_str = $sformatf("%s\tc%0d,0x%0x", "CH.auicgp", rvfi_rd_addr, upper_imm);
+    data_accessed =  CD | CS1;
+  endfunction
+
+
+  // CHERI instruction with two capability sources and no destination: prints "<cs1>,<cs2>".
+  function automatic void decode_cheri_cs1_cs2_insn(input string mnemonic);
+    decoded_str = $sformatf("%s\tc%0d,c%0d",
+                            mnemonic,
+                            rvfi_rs1_addr,
+                            rvfi_rs2_addr);
+    data_accessed = CS2 | CS1;
+  endfunction
+
+  // Decode a special capability register access. The SCR is named from insn[24:20] and the
+  // printed form depends on which register operands are x0:
+  //   rd == 0         -> "CH.cspecialw  <scr>,<cs1>"
+  //   rs1 == 0        -> "CH.cspecialr  <cd>,<scr>"
+  //   neither is zero -> "CH.cspecialrw <cd>,<scr>,<cs1>"
+  // CS1 and CD are marked as accessed in every case.
+  function automatic void decode_cheri_scrrw_insn();
+    string scr_name;
+
+    data_accessed = CS1 | CD;
+    scr_name = get_scr_name(rvfi_insn[24:20]);
+
+    if (rvfi_rd_addr == 0) begin
+      // Write only
+      decoded_str = $sformatf("%s\t%s,c%0d", "CH.cspecialw", scr_name, rvfi_rs1_addr);
+    end else if (rvfi_rs1_addr == 0) begin
+      // Read only
+      decoded_str = $sformatf("%s\tc%0d,%s", "CH.cspecialr", rvfi_rd_addr, scr_name);
+    end else begin
+      // Read and write
+      decoded_str = $sformatf("%s\tc%0d,%s,c%0d", "CH.cspecialrw", rvfi_rd_addr, scr_name,
+                              rvfi_rs1_addr);
+    end
+  endfunction
+
+  // cycle counter
+  // Cleared asynchronously while rst_ni is low; otherwise incremented by one on every rising
+  // edge of clk_i.
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      cycle <= 0;
+    end else begin
+      cycle <= cycle + 1;
+    end
+  end
+
+  // close output file for writing
+  // Runs once at the end of simulation. A zero handle is skipped
+  // (presumably it means the file was never opened; confirm against where file_handle is set).
+  final begin
+    if (file_handle != 32'h0) begin
+      $fclose(file_handle);
+    end
+  end
+  // log execution
+  // On every rising edge of clk_i, call printbuffer_dumpline() when both rvfi_valid and
+  // trace_log_enable are set.
+  always_ff @(posedge clk_i) begin
+    if (rvfi_valid && trace_log_enable) begin
+      printbuffer_dumpline();
+    end
+  end
+
+  // Expose bit 32 of the store data as a separate signal. The bit is only selected when
+  // DataWidth is 33; for any other width the signal is tied to 0 so that no out-of-range bit
+  // select is elaborated (assumes rvfi_mem_wdata is DataWidth bits wide - TODO confirm).
+  if (DataWidth == 33) begin
+    assign rvfi_mem_wdata_bit32 = rvfi_mem_wdata[32];
+  end else begin
+    assign rvfi_mem_wdata_bit32 = 1'b0;
+  end
+
+  //always_comb begin
+  // change to always @* to get rid of VCS warnings about dynamic type and sensitivity list
+  always @* begin
+    decoded_str = "";
+    data_accessed = 5'h0;
+    insn_is_compressed = 0;
+
+    // Check for compressed instructions
+    if (rvfi_insn[1:0] != 2'b11) begin
+      insn_is_compressed = 1;
+      // Separate case to avoid overlapping decoding
+      if (rvfi_insn[15:13] == INSN_CMV[15:13] && rvfi_insn[1:0] == OPCODE_C2) begin
+        if (rvfi_insn[12] == INSN_CADD[12]) begin
+          if (rvfi_insn[11:2] == INSN_CEBREAK[11:2]) begin
+            decode_mnemonic("c.ebreak");
+          end else if (rvfi_insn[6:2] == INSN_CJALR[6:2]) begin
+            decode_cr_insn("c.jalr");
+          end else begin
+            decode_cr_insn("c.add");
+          end
+        end else begin
+          if (rvfi_insn[6:2] == INSN_CJR[6:2]) begin
+            decode_cr_insn("c.jr");
+          end else begin
+            decode_cr_insn("c.mv");
+          end
+        end
+      end else begin
+        unique casez (rvfi_insn[15:0])
+          // C0 Opcodes
+          INSN_CADDI4SPN: begin
+            if (rvfi_insn[12:2] == 11'h0) begin
+              // Align with pseudo-mnemonic used by GNU binutils and LLVM's MC layer
+              decode_mnemonic("c.unimp");
+            end else begin
+              decode_ciw_insn("c.addi4spn");
+            end
+          end
+          INSN_CLW:        decode_compressed_load_insn("c.lw");
+          INSN_CSW:        decode_compressed_store_insn("c.sw");
+          INSN_CCLC:       decode_compressed_load_insn("c.CH.clc");
+          INSN_CCSC:       decode_compressed_store_insn("c.CH.csc");
+          // C1 Opcodes
+          INSN_CADDI:      decode_ci_caddi_insn("c.addi");
+          INSN_CJAL:       decode_cj_insn("c.jal");
+          INSN_CJ:         decode_cj_insn("c.j");
+          INSN_CLI:        decode_ci_cli_insn("c.li");
+          INSN_CLUI: begin
+            // These two instructions share opcode
+            if (rvfi_insn[11:7] == 5'd2) begin
+              decode_ci_caddi16sp_insn("c.addi16sp");
+            end else begin
+              decode_ci_clui_insn("c.lui");
+            end
+          end
+          INSN_CSRLI:      decode_cb_sr_insn("c.srli");
+          INSN_CSRAI:      decode_cb_sr_insn("c.srai");
+          INSN_CANDI:      decode_cb_insn("c.andi");
+          INSN_CSUB:       decode_cs_insn("c.sub");
+          INSN_CXOR:       decode_cs_insn("c.xor");
+          INSN_COR:        decode_cs_insn("c.or");
+          INSN_CAND:       decode_cs_insn("c.and");
+          INSN_CBEQZ:      decode_cb_insn("c.beqz");
+          INSN_CBNEZ:      decode_cb_insn("c.bnez");
+          // C2 Opcodes
+          INSN_CSLLI:      decode_ci_cslli_insn("c.slli");
+          INSN_CLWSP:      decode_compressed_load_insn("c.lwsp");
+          INSN_SWSP:       decode_compressed_store_insn("c.swsp");
+          INSN_CCLCSP:     decode_compressed_load_insn("c.CH.clcsp");
+          INSN_CCSCSP:     decode_compressed_store_insn("c.CH.cscsp");
+          default:         decode_mnemonic("INVALID");
+        endcase
+      end
+    end else begin
+      unique casez (rvfi_insn)
+        // Regular opcodes
+        INSN_LUI:        decode_u_insn("lui");
+        // INSN_AUIPC:      decode_u_insn("auipc");
+        INSN_JAL:        decode_j_insn("jal");
+        INSN_JALR:       decode_i_jalr_insn("jalr");
+        // BRANCH
+        INSN_BEQ:        decode_b_insn("beq");
+        INSN_BNE:        decode_b_insn("bne");
+        INSN_BLT:        decode_b_insn("blt");
+        INSN_BGE:        decode_b_insn("bge");
+        INSN_BLTU:       decode_b_insn("bltu");
+        INSN_BGEU:       decode_b_insn("bgeu");
+        // OPIMM
+        INSN_ADDI: begin
+          if (rvfi_insn == 32'h00_00_00_13) begin
+            // TODO: objdump doesn't decode this as nop currently, even though it would be helpful
+            // Decide what to do here: diverge from objdump, or make the trace less readable to
+            // users.
+            //decode_mnemonic("nop");
+            decode_i_insn("addi");
+          end else begin
+            decode_i_insn("addi");
+          end
+        end
+        INSN_SLTI:       decode_i_insn("slti");
+        INSN_SLTIU:      decode_i_insn("sltiu");
+        INSN_XORI:       decode_i_insn("xori");
+        INSN_ORI:        decode_i_insn("ori");
+        // Unlike the ratified v.1.0.0 bitmanip extension, the v.0.94 draft extension continues to
+        // define the pseudo-instruction
+        //   zext.b rd rs = andi rd, rs, 255.
+        // However, for now the tracer doesn't emit this due to a lack of support in the LLVM and
+        // GCC toolchains. Enabling this functionality when the time is right is tracked in
+        // https://github.com/lowRISC/ibex/issues/1228
+        INSN_ANDI:       decode_i_insn("andi");
+        // INSN_ANDI:begin
+          // casez (rvfi_insn)
+            // INSN_ZEXTB:  decode_r1_insn("zext.b");
+            // default:     decode_i_insn("andi");
+          // endcase
+        // end
+        INSN_SLLI:       decode_i_shift_insn("slli");
+        INSN_SRLI:       decode_i_shift_insn("srli");
+        INSN_SRAI:       decode_i_shift_insn("srai");
+        // OP
+        INSN_ADD:        decode_r_insn("add");
+        INSN_SUB:        decode_r_insn("sub");
+        INSN_SLL:        decode_r_insn("sll");
+        INSN_SLT:        decode_r_insn("slt");
+        INSN_SLTU:       decode_r_insn("sltu");
+        INSN_XOR:        decode_r_insn("xor");
+        INSN_SRL:        decode_r_insn("srl");
+        INSN_SRA:        decode_r_insn("sra");
+        INSN_OR:         decode_r_insn("or");
+        INSN_AND:        decode_r_insn("and");
+        // SYSTEM (CSR manipulation)
+        INSN_CSRRW:      decode_csr_insn("csrrw");
+        INSN_CSRRS:      decode_csr_insn("csrrs");
+        INSN_CSRRC:      decode_csr_insn("csrrc");
+        INSN_CSRRWI:     decode_csr_insn("csrrwi");
+        INSN_CSRRSI:     decode_csr_insn("csrrsi");
+        INSN_CSRRCI:     decode_csr_insn("csrrci");
+        // SYSTEM (others)
+        INSN_ECALL:      decode_mnemonic("ecall");
+        INSN_EBREAK:     decode_mnemonic("ebreak");
+        INSN_MRET:       decode_mnemonic("mret");
+        INSN_DRET:       decode_mnemonic("dret");
+        INSN_WFI:        decode_mnemonic("wfi");
+        // RV32M
+        INSN_PMUL:       decode_r_insn("mul");
+        INSN_PMUH:       decode_r_insn("mulh");
+        INSN_PMULHSU:    decode_r_insn("mulhsu");
+        INSN_PMULHU:     decode_r_insn("mulhu");
+        INSN_DIV:        decode_r_insn("div");
+        INSN_DIVU:       decode_r_insn("divu");
+        INSN_REM:        decode_r_insn("rem");
+        INSN_REMU:       decode_r_insn("remu");
+        // LOAD & STORE
+        INSN_LOAD:       decode_load_insn();
+        INSN_STORE:      decode_store_insn();
+        // MISC-MEM
+        INSN_FENCE:      decode_fence();
+        INSN_FENCEI:     decode_mnemonic("fence.i");
+        // RV32B - ZBA
+        INSN_SH1ADD:     decode_r_insn("sh1add");
+        INSN_SH2ADD:     decode_r_insn("sh2add");
+        INSN_SH3ADD:     decode_r_insn("sh3add");
+        // RV32B - ZBB
+        INSN_RORI:       decode_i_shift_insn("rori");
+        INSN_ROL:        decode_r_insn("rol");
+        INSN_ROR:        decode_r_insn("ror");
+        INSN_MIN:        decode_r_insn("min");
+        INSN_MAX:        decode_r_insn("max");
+        INSN_MINU:       decode_r_insn("minu");
+        INSN_MAXU:       decode_r_insn("maxu");
+        INSN_XNOR:       decode_r_insn("xnor");
+        INSN_ORN:        decode_r_insn("orn");
+        INSN_ANDN:       decode_r_insn("andn");
+        // The ratified v.1.0.0 bitmanip extension defines the pseudo-instruction
+        //   zext.h rd rs = pack rd, rs, zero.
+        // However, for now the tracer doesn't emit this due to a lack of support in the LLVM and
+        // GCC toolchains. Enabling this functionality when the time is right is tracked in
+        // https://github.com/lowRISC/ibex/issues/1228
+        INSN_PACK:       decode_r_insn("pack");
+        // INSN_PACK: begin
+          // casez (rvfi_insn)
+            // INSN_ZEXTH:  decode_r1_insn("zext.h");
+            // default:     decode_r_insn("pack");
+          // endcase
+        // end
+        INSN_PACKH:      decode_r_insn("packh");
+        INSN_PACKU:      decode_r_insn("packu");
+        INSN_CLZ:        decode_r1_insn("clz");
+        INSN_CTZ:        decode_r1_insn("ctz");
+        INSN_CPOP:       decode_r1_insn("cpop");
+        INSN_SEXTB:      decode_r1_insn("sext.b");
+        INSN_SEXTH:      decode_r1_insn("sext.h");
+        // RV32B - ZBS
+        INSN_BCLRI:     decode_i_insn("bclri");
+        INSN_BSETI:     decode_i_insn("bseti");
+        INSN_BINVI:     decode_i_insn("binvi");
+        INSN_BEXTI:     decode_i_insn("bexti");
+        INSN_BCLR:      decode_r_insn("bclr");
+        INSN_BSET:      decode_r_insn("bset");
+        INSN_BINV:      decode_r_insn("binv");
+        INSN_BEXT:      decode_r_insn("bext");
+        // RV32B - ZBE
+        INSN_BDECOMPRESS: decode_r_insn("bdecompress");
+        INSN_BCOMPRESS:   decode_r_insn("bcompress");
+        // RV32B - ZBP
+        INSN_GREV:       decode_r_insn("grev");
+        INSN_GREVI: begin
+          unique casez (rvfi_insn)
+            INSN_REV_P:  decode_r1_insn("rev.p");
+            INSN_REV2_N: decode_r1_insn("rev2.n");
+            INSN_REV_N:  decode_r1_insn("rev.n");
+            INSN_REV4_B: decode_r1_insn("rev4.b");
+            INSN_REV2_B: decode_r1_insn("rev2.b");
+            INSN_REV_B:  decode_r1_insn("rev.b");
+            INSN_REV8_H: decode_r1_insn("rev8.h");
+            INSN_REV4_H: decode_r1_insn("rev4.h");
+            INSN_REV2_H: decode_r1_insn("rev2.h");
+            INSN_REV_H:  decode_r1_insn("rev.h");
+            INSN_REV16:  decode_r1_insn("rev16");
+            INSN_REV8:   decode_r1_insn("rev8");
+            INSN_REV4:   decode_r1_insn("rev4");
+            INSN_REV2:   decode_r1_insn("rev2");
+            INSN_REV:    decode_r1_insn("rev");
+            default:     decode_i_insn("grevi");
+          endcase
+        end
+        INSN_GORC:       decode_r_insn("gorc");
+        INSN_GORCI: begin
+          unique casez (rvfi_insn)
+            INSN_ORC_P:  decode_r1_insn("orc.p");
+            INSN_ORC2_N: decode_r1_insn("orc2.n");
+            INSN_ORC_N:  decode_r1_insn("orc.n");
+            INSN_ORC4_B: decode_r1_insn("orc4.b");
+            INSN_ORC2_B: decode_r1_insn("orc2.b");
+            INSN_ORC_B:  decode_r1_insn("orc.b");
+            INSN_ORC8_H: decode_r1_insn("orc8.h");
+            INSN_ORC4_H: decode_r1_insn("orc4.h");
+            INSN_ORC2_H: decode_r1_insn("orc2.h");
+            INSN_ORC_H:  decode_r1_insn("orc.h");
+            INSN_ORC16:  decode_r1_insn("orc16");
+            INSN_ORC8:   decode_r1_insn("orc8");
+            INSN_ORC4:   decode_r1_insn("orc4");
+            INSN_ORC2:   decode_r1_insn("orc2");
+            INSN_ORC:    decode_r1_insn("orc");
+            default:     decode_i_insn("gorci");
+          endcase
+        end
+        INSN_SHFL:       decode_r_insn("shfl");
+        INSN_SHFLI: begin
+          unique casez (rvfi_insn)
+            INSN_ZIP_N:  decode_r1_insn("zip.n");
+            INSN_ZIP2_B: decode_r1_insn("zip2.b");
+            INSN_ZIP_B:  decode_r1_insn("zip.b");
+            INSN_ZIP4_H: decode_r1_insn("zip4.h");
+            INSN_ZIP2_H: decode_r1_insn("zip2.h");
+            INSN_ZIP_H:  decode_r1_insn("zip.h");
+            INSN_ZIP8:   decode_r1_insn("zip8");
+            INSN_ZIP4:   decode_r1_insn("zip4");
+            INSN_ZIP2:   decode_r1_insn("zip2");
+            INSN_ZIP:    decode_r1_insn("zip");
+            default:     decode_i_insn("shfli");
+          endcase
+        end
+        INSN_UNSHFL:       decode_r_insn("unshfl");
+        INSN_UNSHFLI: begin
+          unique casez (rvfi_insn)
+            INSN_UNZIP_N:  decode_r1_insn("unzip.n");
+            INSN_UNZIP2_B: decode_r1_insn("unzip2.b");
+            INSN_UNZIP_B:  decode_r1_insn("unzip.b");
+            INSN_UNZIP4_H: decode_r1_insn("unzip4.h");
+            INSN_UNZIP2_H: decode_r1_insn("unzip2.h");
+            INSN_UNZIP_H:  decode_r1_insn("unzip.h");
+            INSN_UNZIP8:   decode_r1_insn("unzip8");
+            INSN_UNZIP4:   decode_r1_insn("unzip4");
+            INSN_UNZIP2:   decode_r1_insn("unzip2");
+            INSN_UNZIP:    decode_r1_insn("unzip");
+            default:       decode_i_insn("unshfli");
+          endcase
+        end
+        INSN_XPERM_N:    decode_r_insn("xperm_n");
+        INSN_XPERM_B:    decode_r_insn("xperm_b");
+        INSN_XPERM_H:    decode_r_insn("xperm_h");
+        INSN_SLO:        decode_r_insn("slo");
+        INSN_SRO:        decode_r_insn("sro");
+        INSN_SLOI:       decode_i_shift_insn("sloi");
+        INSN_SROI:       decode_i_shift_insn("sroi");
+
+        // RV32B - ZBT
+        INSN_CMIX:       decode_r_cmixcmov_insn("cmix");
+        INSN_CMOV:       decode_r_cmixcmov_insn("cmov");
+        INSN_FSR:        decode_r_funnelshift_insn("fsr");
+        INSN_FSL:        decode_r_funnelshift_insn("fsl");
+        INSN_FSRI:       decode_i_funnelshift_insn("fsri");
+
+        // RV32B - ZBF
+        INSN_BFP:        decode_r_insn("bfp");
+
+        // RV32B - ZBC
+        INSN_CLMUL:      decode_r_insn("clmul");
+        INSN_CLMULR:     decode_r_insn("clmulr");
+        INSN_CLMULH:     decode_r_insn("clmulh");
+
+        // RV32B - ZBR
+        INSN_CRC32_B:    decode_r1_insn("crc32.b");
+        INSN_CRC32_H:    decode_r1_insn("crc32.h");
+        INSN_CRC32_W:    decode_r1_insn("crc32.w");
+        INSN_CRC32C_B:   decode_r1_insn("crc32c.b");
+        INSN_CRC32C_H:   decode_r1_insn("crc32c.h");
+        INSN_CRC32C_W:   decode_r1_insn("crc32c.w");
+
+        // CHERI, get fields
+        INSN_CHGETPERM:    decode_cheri_rd_cs1_insn("CH.cgetperm");
+        INSN_CHGETTYPE:    decode_cheri_rd_cs1_insn("CH.cgettype");
+        INSN_CHGETBASE:    decode_cheri_rd_cs1_insn("CH.cgetbase");
+        INSN_CHGETTOP:     decode_cheri_rd_cs1_insn("CH.cgettop");
+        INSN_CHGETLEN:     decode_cheri_rd_cs1_insn("CH.cgetlen");
+        INSN_CHGETTAG:     decode_cheri_rd_cs1_insn("CH.cgettag");
+        INSN_CHGETSEALED:  decode_cheri_rd_cs1_insn("CH.cgetseald");
+        INSN_CHGETADDR:    decode_cheri_rd_cs1_insn("CH.cgetaddr");
+        INSN_CHGETHIGH:     decode_cheri_rd_cs1_insn("CH.cgethigh");
+
+        INSN_CHSEAL:       decode_cheri_cd_cs1_cs2_insn("CH.cseal");
+        INSN_CHUNSEAL:     decode_cheri_cd_cs1_cs2_insn("CH.cunseal");
+        INSN_CHANDPERM:    decode_cheri_cd_cs1_rs2_insn("CH.candperm");
+        INSN_CHSETADDR:    decode_cheri_cd_cs1_rs2_insn("CH.csetaddr");
+        INSN_CHINCADDR:    decode_cheri_cd_cs1_rs2_insn("CH.cincaddr");
+        INSN_CHINCADDRIMM: decode_cheri_cd_cs1_imm_insn("CH.cincaddrimm");
+        INSN_CHSETBOUNDS:  decode_cheri_cd_cs1_rs2_insn("CH.csetbounds");
+        INSN_CHSETBOUNDSEX:  decode_cheri_cd_cs1_rs2_insn("CH.csetboundsexact");
+        INSN_CHSETBOUNDSRNDN: decode_cheri_cd_cs1_rs2_insn("CH.csetboundsrounddown");
+
+        INSN_CHSETBOUNDSIMM: decode_cheri_cd_cs1_imm_insn("CH.csetboundsimm");
+        INSN_CHCLEARTAG:     decode_cheri_cd_cs1_insn("CH.ccleartag");
+        INSN_CHCRRL:         decode_cheri_rd_rs1_insn("CH.crrl");
+        INSN_CHCRAM:         decode_cheri_rd_rs1_insn("CH.cram");
+
+        INSN_CHSUB:        decode_cheri_rd_cs1_cs2_insn("CH.csub");
+        INSN_CHMOVE:       decode_cheri_cd_cs1_insn("CH.cmove");
+        INSN_CHTESTSUB:    decode_cheri_rd_cs1_cs2_insn("CH.ctestsubset");
+        INSN_CHSETEQUAL:   decode_cheri_rd_cs1_cs2_insn("CH.csetequalexact");
+        INSN_CHSETHIGH:    decode_cheri_cd_cs1_rs2_insn("CH.csethigh");
+        //INSN_CHJALR:       decode_cheri_cd_cs1_insn("CH.jalr");
+        INSN_CHCSRRW:      decode_cheri_scrrw_insn();
+        INSN_AUIPC:        decode_cheri_auipcc_insn();
+        INSN_AUICGP:       decode_cheri_auicgp_insn();
+
+        default:         decode_mnemonic("INVALID");
+      endcase
+    end
+  end
+// synthesis translate_on
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_tracer_pkg.sv b/hw/ip/cheriot-ibex/rtl/cheriot_tracer_pkg.sv
new file mode 100644
index 0000000..ce0fed8
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_tracer_pkg.sv
@@ -0,0 +1,379 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Copyright 2017 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+package cheriot_tracer_pkg;
+  import cheriot_pkg::*;
+
+  parameter logic [1:0] OPCODE_C0 = 2'b00;  // bits [1:0] of the C0 patterns (INSN_CLW, ...)
+  parameter logic [1:0] OPCODE_C1 = 2'b01;  // bits [1:0] of the C1 patterns (INSN_CADDI, ...)
+  parameter logic [1:0] OPCODE_C2 = 2'b10;  // presumably bits [1:0] of the C2 patterns
+
+  // Instruction match patterns for the tracer's casez decode; '?' bits are don't-care.
+  parameter logic [31:0] INSN_LUI     = { 25'h?,                           {OPCODE_LUI  } };
+  parameter logic [31:0] INSN_AUIPC   = { 25'h?,                           {OPCODE_AUIPC} };
+  parameter logic [31:0] INSN_JAL     = { 25'h?,                           {OPCODE_JAL  } };
+  parameter logic [31:0] INSN_JALR    = { 17'h?,             3'b000, 5'h?, {OPCODE_JALR } };
+
+  // BRANCH: conditional branches, told apart only by the 3-bit field (printed as beq..bgeu)
+  parameter logic [31:0] INSN_BEQ     = { 17'h?,             3'b000, 5'h?, {OPCODE_BRANCH} };
+  parameter logic [31:0] INSN_BNE     = { 17'h?,             3'b001, 5'h?, {OPCODE_BRANCH} };
+  parameter logic [31:0] INSN_BLT     = { 17'h?,             3'b100, 5'h?, {OPCODE_BRANCH} };
+  parameter logic [31:0] INSN_BGE     = { 17'h?,             3'b101, 5'h?, {OPCODE_BRANCH} };
+  parameter logic [31:0] INSN_BLTU    = { 17'h?,             3'b110, 5'h?, {OPCODE_BRANCH} };
+  parameter logic [31:0] INSN_BGEU    = { 17'h?,             3'b111, 5'h?, {OPCODE_BRANCH} };
+
+  // OPIMM: register-immediate ops; slli/srli/srai additionally fix the upper 7 bits
+  parameter logic [31:0] INSN_ADDI    = { 17'h?,             3'b000, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_SLTI    = { 17'h?,             3'b010, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_SLTIU   = { 17'h?,             3'b011, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_XORI    = { 17'h?,             3'b100, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORI     = { 17'h?,             3'b110, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ANDI    = { 17'h?,             3'b111, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_SLLI    = { 7'b0000000, 10'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_SRLI    = { 7'b0000000, 10'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_SRAI    = { 7'b0100000, 10'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+
+  // OP: register-register ops, selected by the upper 7 bits together with the 3-bit field
+  parameter logic [31:0] INSN_ADD     = { 7'b0000000, 10'h?, 3'b000, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_SUB     = { 7'b0100000, 10'h?, 3'b000, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_SLL     = { 7'b0000000, 10'h?, 3'b001, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_SLT     = { 7'b0000000, 10'h?, 3'b010, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_SLTU    = { 7'b0000000, 10'h?, 3'b011, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_XOR     = { 7'b0000000, 10'h?, 3'b100, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_SRL     = { 7'b0000000, 10'h?, 3'b101, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_SRA     = { 7'b0100000, 10'h?, 3'b101, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_OR      = { 7'b0000000, 10'h?, 3'b110, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_AND     = { 7'b0000000, 10'h?, 3'b111, 5'h?, {OPCODE_OP} };
+
+  // SYSTEM: CSR accesses match on the 3-bit field only; ecall..wfi are fully specified words
+  parameter logic [31:0] INSN_CSRRW   = { 17'h?,             3'b001, 5'h?, {OPCODE_SYSTEM} };
+  parameter logic [31:0] INSN_CSRRS   = { 17'h?,             3'b010, 5'h?, {OPCODE_SYSTEM} };
+  parameter logic [31:0] INSN_CSRRC   = { 17'h?,             3'b011, 5'h?, {OPCODE_SYSTEM} };
+  parameter logic [31:0] INSN_CSRRWI  = { 17'h?,             3'b101, 5'h?, {OPCODE_SYSTEM} };
+  parameter logic [31:0] INSN_CSRRSI  = { 17'h?,             3'b110, 5'h?, {OPCODE_SYSTEM} };
+  parameter logic [31:0] INSN_CSRRCI  = { 17'h?,             3'b111, 5'h?, {OPCODE_SYSTEM} };
+  parameter logic [31:0] INSN_ECALL   = { 12'b000000000000,         13'b0, {OPCODE_SYSTEM} };
+  parameter logic [31:0] INSN_EBREAK  = { 12'b000000000001,         13'b0, {OPCODE_SYSTEM} };
+  parameter logic [31:0] INSN_MRET    = { 12'b001100000010,         13'b0, {OPCODE_SYSTEM} };
+  parameter logic [31:0] INSN_DRET    = { 12'b011110110010,         13'b0, {OPCODE_SYSTEM} };
+  parameter logic [31:0] INSN_WFI     = { 12'b000100000101,         13'b0, {OPCODE_SYSTEM} };
+
+  // RV32M: all share upper bits 7'b0000001; INSN_PMUL/INSN_PMUH are printed as mul/mulh
+  parameter logic [31:0] INSN_DIV     = { 7'b0000001, 10'h?, 3'b100, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_DIVU    = { 7'b0000001, 10'h?, 3'b101, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_REM     = { 7'b0000001, 10'h?, 3'b110, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_REMU    = { 7'b0000001, 10'h?, 3'b111, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_PMUL    = { 7'b0000001, 10'h?, 3'b000, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_PMUH    = { 7'b0000001, 10'h?, 3'b001, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_PMULHSU = { 7'b0000001, 10'h?, 3'b010, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_PMULHU  = { 7'b0000001, 10'h?, 3'b011, 5'h?, {OPCODE_OP} };
+
+  // RV32B (bitmanip) patterns, grouped by sub-extension
+  // ZBA: sh1add/sh2add/sh3add; upper bits 7'b0010000 are shared with INSN_SLO/INSN_SRO
+  parameter logic [31:0] INSN_SH1ADD = { 7'b0010000, 10'h?, 3'b010, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_SH2ADD = { 7'b0010000, 10'h?, 3'b100, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_SH3ADD = { 7'b0010000, 10'h?, 3'b110, 5'h?, {OPCODE_OP} };
+
+  // ZBB: rori, clz/ctz/cpop, sext.b/sext.h, rol/ror, min/max[u], xnor/orn/andn, pack[h|u]
+  // Only log2(XLEN) bits of the immediate are used. For RV32, this means only the bits in
+  // instr[24:20] are effectively used. Whenever instr[26] is set, sroi/rori is instead decoded as
+  // fsri.
+  parameter logic [31:0] INSN_RORI = { 5'b01100  , 1'b0, 11'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_CLZ  = { 12'b011000000000, 5'h?,  3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_CTZ  = { 12'b011000000001, 5'h?,  3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_CPOP = { 12'b011000000010, 5'h?,  3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_SEXTB = { 12'b011000000100, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_SEXTH = { 12'b011000000101, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+
+  // The zext.h and zext.b pseudo-instructions are defined in the ratified v.1.0.0 and draft v.0.94
+  // specifications of the bitmanip extension, respectively. They are currently not emitted by the
+  // tracer due to a lack of support in the LLVM and GCC toolchains. Enabling this functionality
+  // when the time is right is tracked in https://github.com/lowRISC/ibex/issues/1228
+  // zext.b -- pseudo-instruction: andi rd, rs, 255
+  // parameter logic [31:0] INSN_ZEXTB =
+  //     { 4'b0000, 8'b11111111, 5'h?, 3'b111, 5'h?, {OPCODE_OP_IMM} };
+  // zext.h -- pseudo-instruction: pack rd, rs, zero
+  // parameter logic [31:0] INSN_ZEXTH = { 7'b0000100, 5'b00000, 5'h?, 3'b100, 5'h?, {OPCODE_OP} };
+
+  parameter logic [31:0] INSN_ROL   = { 7'b0110000, 10'h?, 3'b001, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_ROR   = { 7'b0110000, 10'h?, 3'b101, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_MIN   = { 7'b0000101, 10'h?, 3'b100, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_MAX   = { 7'b0000101, 10'h?, 3'b110, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_MINU  = { 7'b0000101, 10'h?, 3'b101, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_MAXU  = { 7'b0000101, 10'h?, 3'b111, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_XNOR  = { 7'b0100000, 10'h?, 3'b100, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_ORN   = { 7'b0100000, 10'h?, 3'b110, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_ANDN  = { 7'b0100000, 10'h?, 3'b111, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_PACK  = { 7'b0000100, 10'h?, 3'b100, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_PACKU = { 7'b0100100, 10'h?, 3'b100, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_PACKH = { 7'b0000100, 10'h?, 3'b111, 5'h?, {OPCODE_OP} };
+
+  // ZBS: bclr/bset/binv/bext; immediate forms use OPCODE_OP_IMM, register forms OPCODE_OP
+  parameter logic [31:0] INSN_BCLRI = { 5'b01001, 12'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_BSETI = { 5'b00101, 12'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_BINVI = { 5'b01101, 12'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  // Only log2(XLEN) bits of the immediate are used. For RV32, this means only the bits in
+  // instr[24:20] are effectively used. Whenever instr[26] is set, bexti is instead decoded as fsri.
+  parameter logic [31:0] INSN_BEXTI = { 5'b01001, 1'b0, 11'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+
+  parameter logic [31:0] INSN_BCLR = { 7'b0100100, 10'h?, 3'b001, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_BSET = { 7'b0010100, 10'h?, 3'b001, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_BINV = { 7'b0110100, 10'h?, 3'b001, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_BEXT = { 7'b0100100, 10'h?, 3'b101, 5'h?, {OPCODE_OP} };
+
+  // ZBP
+  // grevi
+  // Only log2(XLEN) bits of the immediate are used. For RV32, this means only the bits in
+  // instr[24:20] are effectively used. Whenever instr[26] is set, grevi is instead decoded as fsri.
+  parameter logic [31:0] INSN_GREVI = { 5'b01101, 1'b0, 11'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  // grevi -- pseudo-instructions: INSN_GREVI with instr[24:20] fixed; tracer falls back to grevi
+  parameter logic [31:0] INSN_REV_P =
+      { 5'b01101, 1'b0, 1'b?, 5'b00001, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV2_N =
+      { 5'b01101, 1'b0, 1'b?, 5'b00010, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV_N =
+      { 5'b01101, 1'b0, 1'b?, 5'b00011, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV4_B =
+      { 5'b01101, 1'b0, 1'b?, 5'b00100, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV2_B =
+      { 5'b01101, 1'b0, 1'b?, 5'b00110, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV_B =
+      { 5'b01101, 1'b0, 1'b?, 5'b00111, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV8_H =
+      { 5'b01101, 1'b0, 1'b?, 5'b01000, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV4_H =
+      { 5'b01101, 1'b0, 1'b?, 5'b01100, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV2_H =
+      { 5'b01101, 1'b0, 1'b?, 5'b01110, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV_H =
+      { 5'b01101, 1'b0, 1'b?, 5'b01111, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV16 =
+      { 5'b01101, 1'b0, 1'b?, 5'b10000, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV8 =
+      { 5'b01101, 1'b0, 1'b?, 5'b11000, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV4 =
+      { 5'b01101, 1'b0, 1'b?, 5'b11100, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV2 =
+      { 5'b01101, 1'b0, 1'b?, 5'b11110, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_REV =
+      { 5'b01101, 1'b0, 1'b?, 5'b11111, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  // gorci
+  // Only log2(XLEN) bits of the immediate are used. For RV32, this means only the bits in
+  // instr[24:20] are effectively used. Whenever instr[26] is set, gorci is instead decoded as fsri.
+  parameter logic [31:0] INSN_GORCI = { 5'b00101, 1'b0, 11'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  // gorci -- pseudo-instructions: INSN_GORCI with instr[24:20] fixed; tracer falls back to gorci
+  parameter logic [31:0] INSN_ORC_P =
+      { 5'b00101, 1'b0, 1'b?, 5'b00001, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC2_N =
+      { 5'b00101, 1'b0, 1'b?, 5'b00010, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC_N =
+      { 5'b00101, 1'b0, 1'b?, 5'b00011, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC4_B =
+      { 5'b00101, 1'b0, 1'b?, 5'b00100, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC2_B =
+      { 5'b00101, 1'b0, 1'b?, 5'b00110, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC_B =
+      { 5'b00101, 1'b0, 1'b?, 5'b00111, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC8_H =
+      { 5'b00101, 1'b0, 1'b?, 5'b01000, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC4_H =
+      { 5'b00101, 1'b0, 1'b?, 5'b01100, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC2_H =
+      { 5'b00101, 1'b0, 1'b?, 5'b01110, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC_H =
+      { 5'b00101, 1'b0, 1'b?, 5'b01111, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC16 =
+      { 5'b00101, 1'b0, 1'b?, 5'b10000, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC8 =
+      { 5'b00101, 1'b0, 1'b?, 5'b11000, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC4 =
+      { 5'b00101, 1'b0, 1'b?, 5'b11100, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC2 =
+      { 5'b00101, 1'b0, 1'b?, 5'b11110, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ORC =
+      { 5'b00101, 1'b0, 1'b?, 5'b11111, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  // shfli
+  parameter logic [31:0] INSN_SHFLI = { 6'b000010, 11'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  // shfli -- pseudo-instructions: INSN_SHFLI with instr[23:20] fixed; tracer falls back to shfli
+  parameter logic [31:0] INSN_ZIP_N =
+      { 6'b000010, 2'h?, 4'b0001, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ZIP2_B =
+      { 6'b000010, 2'h?, 4'b0010, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ZIP_B =
+      { 6'b000010, 2'h?, 4'b0011, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ZIP4_H =
+      { 6'b000010, 2'h?, 4'b0100, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ZIP2_H =
+      { 6'b000010, 2'h?, 4'b0110, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ZIP_H =
+      { 6'b000010, 2'h?, 4'b0111, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ZIP8 =
+      { 6'b000010, 2'h?, 4'b1000, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ZIP4 =
+      { 6'b000010, 2'h?, 4'b1100, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ZIP2 =
+      { 6'b000010, 2'h?, 4'b1110, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_ZIP =
+      { 6'b000010, 2'h?, 4'b1111, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  // unshfli (instr[26] is 0 in this pattern, so it does not overlap INSN_FSRI)
+  parameter logic [31:0] INSN_UNSHFLI = { 6'b000010, 11'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  // unshfli -- pseudo-instructions: INSN_UNSHFLI with instr[23:20] fixed; default is unshfli
+  parameter logic [31:0] INSN_UNZIP_N =
+      { 6'b000010, 2'h?, 4'b0001, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_UNZIP2_B =
+      { 6'b000010, 2'h?, 4'b0010, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_UNZIP_B =
+      { 6'b000010, 2'h?, 4'b0011, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_UNZIP4_H =
+      { 6'b000010, 2'h?, 4'b0100, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_UNZIP2_H =
+      { 6'b000010, 2'h?, 4'b0110, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_UNZIP_H =
+      { 6'b000010, 2'h?, 4'b0111, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_UNZIP8 =
+      { 6'b000010, 2'h?, 4'b1000, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_UNZIP4 =
+      { 6'b000010, 2'h?, 4'b1100, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_UNZIP2 =
+      { 6'b000010, 2'h?, 4'b1110, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_UNZIP =
+      { 6'b000010, 2'h?, 4'b1111, 5'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+
+  parameter logic [31:0] INSN_GREV   = { 7'b0110100, 10'h?, 3'b101, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_GORC   = { 7'b0010100, 10'h?, 3'b101, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_SHFL   = { 7'b0000100, 10'h?, 3'b001, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_UNSHFL = { 7'b0000100, 10'h?, 3'b101, 5'h?, {OPCODE_OP} };
+
+  parameter logic [31:0] INSN_XPERM_N = { 7'b0010100, 10'h?, 3'b010, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_XPERM_B = { 7'b0010100, 10'h?, 3'b100, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_XPERM_H = { 7'b0010100, 10'h?, 3'b110, 5'h?, {OPCODE_OP} };
+
+  parameter logic [31:0] INSN_SLO    = { 7'b0010000, 10'h?, 3'b001, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_SRO    = { 7'b0010000, 10'h?, 3'b101, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_SLOI   = { 5'b00100        , 12'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  // Only log2(XLEN) bits of the immediate are used. For RV32, this means only the bits in
+  // instr[24:20] are effectively used. Whenever instr[26] is set, sroi/rori is instead decoded as
+  // fsri (see INSN_FSRI). INSN_SLOI above uses 3'b001, which INSN_FSRI does not cover.
+  parameter logic [31:0] INSN_SROI   = { 5'b00100  , 1'b0, 11'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+
+  // ZBE: printed as bdecompress/bcompress by the tracer
+  parameter logic [31:0] INSN_BDECOMPRESS = {7'b0100100, 10'h?, 3'b110, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_BCOMPRESS   = {7'b0000100, 10'h?, 3'b110, 5'h?, {OPCODE_OP} };
+
+  // ZBT: instr[31:27] is don't-care in all of these (presumably the rs3 field -- TODO confirm)
+  parameter logic [31:0] INSN_FSRI = { 5'h?, 1'b1, 11'h?, 3'b101, 5'h?, {OPCODE_OP_IMM} };
+
+  parameter logic [31:0] INSN_CMIX = {5'h?, 2'b11, 10'h?, 3'b001, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_CMOV = {5'h?, 2'b11, 10'h?, 3'b101, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_FSL  = {5'h?, 2'b10, 10'h?, 3'b001, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_FSR  = {5'h?, 2'b10, 10'h?, 3'b101, 5'h?, {OPCODE_OP} };
+
+  // ZBF: upper bits 7'b0100100 also appear in INSN_BCLR/BEXT/BDECOMPRESS/PACKU (3-bit field differs)
+  parameter logic [31:0] INSN_BFP  = {7'b0100100, 10'h?, 3'b111, 5'h?, {OPCODE_OP} };
+
+  // ZBC: clmul/clmulr/clmulh; upper bits 7'b0000101 are shared with INSN_MIN/MAX/MINU/MAXU
+  parameter logic [31:0] INSN_CLMUL  = {7'b0000101, 10'h?, 3'b001, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_CLMULR = {7'b0000101, 10'h?, 3'b010, 5'h?, {OPCODE_OP} };
+  parameter logic [31:0] INSN_CLMULH = {7'b0000101, 10'h?, 3'b011, 5'h?, {OPCODE_OP} };
+
+  // ZBR: crc32[c].{b,h,w}; instr[24:20] is fixed and the tracer prints them via decode_r1_insn
+  parameter logic [31:0] INSN_CRC32_B  =
+      {7'b0110000, 5'b10000, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_CRC32_H  =
+      {7'b0110000, 5'b10001, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_CRC32_W  =
+      {7'b0110000, 5'b10010, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_CRC32C_B =
+      {7'b0110000, 5'b11000, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_CRC32C_H =
+      {7'b0110000, 5'b11001, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+  parameter logic [31:0] INSN_CRC32C_W =
+      {7'b0110000, 5'b11010, 5'h?, 3'b001, 5'h?, {OPCODE_OP_IMM} };
+
+  // LOAD & STORE
+  parameter logic [31:0] INSN_LOAD    = {25'h?,                            {OPCODE_LOAD } };
+  parameter logic [31:0] INSN_STORE   = {25'h?,                            {OPCODE_STORE} };
+
+  // MISC-MEM
+  parameter logic [31:0] INSN_FENCE   = { 17'h?,             3'b000, 5'h?, {OPCODE_MISC_MEM} };
+  parameter logic [31:0] INSN_FENCEI  = { 17'h0,             3'b001, 5'h0, {OPCODE_MISC_MEM} };
+
+  // Compressed Instructions
+  // C0
+  parameter logic [15:0] INSN_CADDI4SPN  = { 3'b000,       11'h?,                    {OPCODE_C0} };
+  parameter logic [15:0] INSN_CLW        = { 3'b010,       11'h?,                    {OPCODE_C0} };
+  parameter logic [15:0] INSN_CSW        = { 3'b110,       11'h?,                    {OPCODE_C0} };
+  parameter logic [15:0] INSN_CCLC       = { 3'b011,       11'h?,                    {OPCODE_C0} };
+  parameter logic [15:0] INSN_CCSC       = { 3'b111,       11'h?,                    {OPCODE_C0} };
+
+  // C1
+  parameter logic [15:0] INSN_CADDI      = { 3'b000,       11'h?,                    {OPCODE_C1} };
+  parameter logic [15:0] INSN_CJAL       = { 3'b001,       11'h?,                    {OPCODE_C1} };
+  parameter logic [15:0] INSN_CJ         = { 3'b101,       11'h?,                    {OPCODE_C1} };
+  parameter logic [15:0] INSN_CLI        = { 3'b010,       11'h?,                    {OPCODE_C1} };
+  parameter logic [15:0] INSN_CLUI       = { 3'b011,       11'h?,                    {OPCODE_C1} };
+  parameter logic [15:0] INSN_CBEQZ      = { 3'b110,       11'h?,                    {OPCODE_C1} };
+  parameter logic [15:0] INSN_CBNEZ      = { 3'b111,       11'h?,                    {OPCODE_C1} };
+  parameter logic [15:0] INSN_CSRLI      = { 3'b100, 1'h?, 2'b00, 8'h?,              {OPCODE_C1} };
+  parameter logic [15:0] INSN_CSRAI      = { 3'b100, 1'h?, 2'b01, 8'h?,              {OPCODE_C1} };
+  parameter logic [15:0] INSN_CANDI      = { 3'b100, 1'h?, 2'b10, 8'h?,              {OPCODE_C1} };
+  parameter logic [15:0] INSN_CSUB       = { 3'b100, 1'b0, 2'b11, 3'h?, 2'b00, 3'h?, {OPCODE_C1} };
+  parameter logic [15:0] INSN_CXOR       = { 3'b100, 1'b0, 2'b11, 3'h?, 2'b01, 3'h?, {OPCODE_C1} };
+  parameter logic [15:0] INSN_COR        = { 3'b100, 1'b0, 2'b11, 3'h?, 2'b10, 3'h?, {OPCODE_C1} };
+  parameter logic [15:0] INSN_CAND       = { 3'b100, 1'b0, 2'b11, 3'h?, 2'b11, 3'h?, {OPCODE_C1} };
+
+  // C2
+  parameter logic [15:0] INSN_CSLLI      = { 3'b000,       11'h?,                    {OPCODE_C2} };
+  parameter logic [15:0] INSN_CLWSP      = { 3'b010,       11'h?,                    {OPCODE_C2} };
+  parameter logic [15:0] INSN_SWSP       = { 3'b110,       11'h?,                    {OPCODE_C2} };
+  parameter logic [15:0] INSN_CMV        = { 3'b100, 1'b0, 10'h?,                    {OPCODE_C2} };
+  parameter logic [15:0] INSN_CADD       = { 3'b100, 1'b1, 10'h?,                    {OPCODE_C2} };
+  parameter logic [15:0] INSN_CEBREAK    = { 3'b100, 1'b1,        5'h0,  5'h0,       {OPCODE_C2} };
+  parameter logic [15:0] INSN_CJR        = { 3'b100, 1'b0,        5'h0,  5'h0,       {OPCODE_C2} };
+  parameter logic [15:0] INSN_CJALR      = { 3'b100, 1'b1,        5'h?,  5'h0,       {OPCODE_C2} };
+  parameter logic [15:0] INSN_CCLCSP     = { 3'b011,       11'h?,                    {OPCODE_C2} };  // FLWSP
+  parameter logic [15:0] INSN_CCSCSP     = { 3'b111,       11'h?,                    {OPCODE_C2} };  // FSWSP
+
+  // 32-bit CHERI instructions
+  parameter logic [31:0] INSN_CHGETPERM    = {7'h7f, 5'h0, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHGETTYPE    = {7'h7f, 5'h1, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHGETBASE    = {7'h7f, 5'h2, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHGETHIGH    = {7'h7f, 5'h17, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHGETTOP     = {7'h7f, 5'h18, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHGETLEN     = {7'h7f, 5'h3, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHGETTAG     = {7'h7f, 5'h4, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHGETSEALED  = {7'h7f, 5'h5, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHGETADDR    = {7'h7f, 5'hf, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+
+  parameter logic [31:0] INSN_CHSEAL          = {7'h0b, 10'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHUNSEAL        = {7'h0c, 10'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHANDPERM       = {7'h0d, 10'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHSETADDR       = {7'h10, 10'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHINCADDR       = {7'h11, 10'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHINCADDRIMM    = {12'h?, 5'h?,  3'b001, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHSETBOUNDS     = {7'h08, 10'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHSETBOUNDSEX   = {7'h09, 10'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHSETBOUNDSRNDN = {7'h0a, 10'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHSETBOUNDSIMM  = {12'h?, 5'h?,  3'b010, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHCLEARTAG      = {7'h7f, 5'hb, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHCRRL          = {7'h7f, 5'h8, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHCRAM          = {7'h7f, 5'h9, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+
+  parameter logic [31:0] INSN_CHSUB      = {7'h14, 5'h?, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHMOVE     = {7'h7f, 5'ha, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHTESTSUB  = {7'h20, 5'h?, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHSETEQUAL = {7'h21, 5'h?, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_CHSETHIGH  = {7'h16, 5'h?, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+
+  parameter logic [31:0] INSN_CHJALR   = {7'h7f, 5'hc, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+
+  parameter logic [31:0] INSN_CHCSRRW = {7'h01, 5'h?, 5'h?, 3'b000, 5'h?, {OPCODE_CHERI} };
+  parameter logic [31:0] INSN_AUICGP  = { 25'h?,                          {OPCODE_AUICGP} };
+
+endpackage
diff --git a/hw/ip/cheriot-ibex/rtl/cheriot_wb_stage.sv b/hw/ip/cheriot-ibex/rtl/cheriot_wb_stage.sv
new file mode 100644
index 0000000..8ff5461
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriot_wb_stage.sv
@@ -0,0 +1,269 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Writeback Stage
+ *
+ * Writeback is an optional third pipeline stage. It writes data back to the register file that was
+ * produced in the ID/EX stage or awaits a response to a load/store (LSU writes direct to register
+ * file for load data). If the writeback stage is not present (WritebackStage == 0) this acts as
+ * a simple passthrough to write data direct to the register file.
+ */
+
+`include "prim_assert.sv"
+`include "dv_fcov_macros.svh"
+
+
+module cheriot_wb_stage import cheri_pkg::*; #(
+  parameter bit ResetAll       = 1'b0,  // 1: WB payload registers get an async reset; 0: only wb_valid_q is reset
+  parameter bit WritebackStage = 1'b0   // 1: registered writeback stage; 0: combinational pass-through
+) (
+  input  logic                     clk_i,
+  input  logic                     rst_ni,  // active-low asynchronous reset
+
+  input  logic                     en_wb_i,  // ID/EX values are captured into the WB registers when set
+  input  cheriot_pkg::wb_instr_type_e instr_type_wb_i,  // compared against WB_INSTR_LOAD/STORE/OTHER below
+  input  logic [31:0]              pc_id_i,
+  input  logic                     instr_is_compressed_id_i,
+  input  logic                     instr_perf_count_id_i,
+  input  logic                     instr_is_cheri_i,  // selects the cheri_rf_* write path over rf_*_id_i
+  input  logic                     cheri_load_i,   // treated like a load when tracking completion/hazards
+  input  logic                     cheri_store_i,  // treated like a store when tracking completion
+
+  output logic                     ready_wb_o,  // WB is empty or finishing; constant 1 without a WB stage
+  output logic                     rf_write_wb_o,
+  output logic                     outstanding_load_wb_o,
+  output logic                     outstanding_store_wb_o,
+  output logic [31:0]              pc_wb_o,
+  output logic                     perf_instr_ret_wb_o,
+  output logic                     perf_instr_ret_compressed_wb_o,
+  output logic                     perf_instr_ret_wb_spec_o,
+  output logic                     perf_instr_ret_compressed_wb_spec_o,
+
+  input  logic [4:0]               rf_waddr_id_i,
+  input  logic [31:0]              rf_wdata_id_i,
+  input  logic                     rf_we_id_i,
+
+  input  logic                     cheri_rf_we_i,     // write enable used when instr_is_cheri_i is set
+  input  logic [31:0]              cheri_rf_wdata_i,  // write data used when instr_is_cheri_i is set
+  input  reg_cap_t                 cheri_rf_wcap_i,   // capability written alongside cheri_rf_wdata_i
+
+  input  logic [31:0]              rf_wdata_lsu_i,
+  input  reg_cap_t                 rf_wcap_lsu_i,
+  input  logic                     rf_we_lsu_i,
+
+  output logic [31:0]              rf_wdata_fwd_wb_o,  // forwarding data (flopped values only, see below)
+  output reg_cap_t                 rf_wcap_fwd_wb_o,   // forwarding capability matching rf_wdata_fwd_wb_o
+
+  output logic [4:0]               rf_waddr_wb_o,
+  output logic [31:0]              rf_wdata_wb_o,
+  output reg_cap_t                 rf_wcap_wb_o,
+  output logic                     rf_we_wb_o,
+
+  input logic                      lsu_resp_valid_i,  // LSU response received; completes the instruction in WB
+  input logic                      lsu_resp_err_i,    // with lsu_resp_valid_i: suppresses the retire count
+
+  output logic                     instr_done_wb_o  // valid WB instruction completes this cycle
+);
+
+  import cheriot_pkg::*;
+
+  // 0 == RF write from ID
+  // 1 == RF write from LSU
+  logic [31:0] rf_wdata_wb_mux    [2];
+  logic [1:0]  rf_wdata_wb_mux_we;
+
+  reg_cap_t   rf_wcap_wb;  // capability accompanying mux input 0 (RF write from ID)
+
+  if (WritebackStage) begin : g_writeback_stage  // registered writeback stage
+    logic [31:0]    rf_wdata_wb_q;
+    logic           rf_we_wb_q;
+    logic [4:0]     rf_waddr_wb_q;
+
+    logic           wb_done;
+
+    logic           wb_valid_q;
+    logic [31:0]    wb_pc_q;
+    logic           wb_compressed_q;
+    logic           wb_count_q;
+    wb_instr_type_e wb_instr_type_q;
+
+    logic           wb_valid_d;
+
+    logic           wb_is_cheri_q;
+    logic           wb_cheri_load_q, wb_cheri_store_q;
+    logic           cheri_rf_we_q;
+    logic [31:0]    cheri_rf_wdata_q;
+    reg_cap_t       cheri_rf_wcap_q;
+
+    // Stage becomes valid if an instruction enters for ID/EX and valid is cleared when instruction
+    // is done
+    assign wb_valid_d = (en_wb_i & ready_wb_o) | (wb_valid_q & ~wb_done);
+
+    // Writeback for non load/store instructions always completes in a cycle (so instantly done)
+    // Writeback for load/store must wait for response to be received by the LSU
+    // Signal only relevant if wb_valid_q set
+
+    // Note: cheri_load/store do not just come from the decoder, they also include bound/permission check results
+    assign wb_done = (wb_instr_type_q == WB_INSTR_OTHER && ~wb_cheri_load_q && ~wb_cheri_store_q) | lsu_resp_valid_i;
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin  // valid bit is always reset, independent of ResetAll
+      if (!rst_ni) begin
+        wb_valid_q <= 1'b0;
+      end else begin
+        wb_valid_q <= wb_valid_d;
+      end
+    end
+
+    if (ResetAll) begin : g_wb_regs_ra  // payload registers with asynchronous reset
+      always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (!rst_ni) begin
+          rf_we_wb_q      <= '0;
+          rf_waddr_wb_q   <= '0;
+          rf_wdata_wb_q   <= '0;
+          wb_instr_type_q <= wb_instr_type_e'(0);
+          wb_pc_q         <= '0;
+          wb_compressed_q <= '0;
+          wb_count_q      <= '0;
+
+          wb_is_cheri_q       <= 1'b0;
+          wb_cheri_load_q     <= 1'b0;
+          wb_cheri_store_q    <= 1'b0;
+          cheri_rf_we_q       <= 1'b0;
+          cheri_rf_wdata_q    <= 32'h0;
+          cheri_rf_wcap_q     <= NULL_REG_CAP;
+        end else if (en_wb_i) begin  // capture ID/EX outputs only when an instruction enters WB
+          rf_we_wb_q      <= rf_we_id_i;
+          rf_waddr_wb_q   <= rf_waddr_id_i;
+          rf_wdata_wb_q   <= rf_wdata_id_i;
+          wb_instr_type_q <= instr_type_wb_i;
+          wb_pc_q         <= pc_id_i;
+          wb_compressed_q <= instr_is_compressed_id_i;
+          wb_count_q      <= instr_perf_count_id_i;
+
+          wb_is_cheri_q       <= instr_is_cheri_i;
+          wb_cheri_load_q     <= cheri_load_i;
+          wb_cheri_store_q    <= cheri_store_i;
+          cheri_rf_we_q       <= cheri_rf_we_i;
+          cheri_rf_wdata_q    <= cheri_rf_wdata_i;
+          cheri_rf_wcap_q     <= cheri_rf_wcap_i;
+        end
+      end
+    end else begin : g_wb_regs_nr  // same payload registers, no reset
+      always_ff @(posedge clk_i) begin
+        if (en_wb_i) begin
+          rf_we_wb_q      <= rf_we_id_i;
+          rf_waddr_wb_q   <= rf_waddr_id_i;
+          rf_wdata_wb_q   <= rf_wdata_id_i;
+          wb_instr_type_q <= instr_type_wb_i;
+          wb_pc_q         <= pc_id_i;
+          wb_compressed_q <= instr_is_compressed_id_i;
+          wb_count_q      <= instr_perf_count_id_i;
+
+          wb_is_cheri_q       <= instr_is_cheri_i;
+          wb_cheri_load_q     <= cheri_load_i;
+          wb_cheri_store_q    <= cheri_store_i;
+          cheri_rf_we_q       <= cheri_rf_we_i;
+          cheri_rf_wdata_q    <= cheri_rf_wdata_i;
+          cheri_rf_wcap_q     <= cheri_rf_wcap_i;
+        end
+      end
+    end
+
+    assign rf_waddr_wb_o         = rf_waddr_wb_q;
+    assign rf_wdata_wb_mux[0]    = wb_is_cheri_q ? cheri_rf_wdata_q : rf_wdata_wb_q;
+    assign rf_wdata_wb_mux_we[0] = (wb_is_cheri_q ? cheri_rf_we_q : rf_we_wb_q) & wb_valid_q;  // gated by valid
+
+    assign ready_wb_o = ~wb_valid_q | wb_done;  // empty, or the current instruction completes this cycle
+
+    // This is used for determining RF read hazards & forwarding in ID/EX
+    // Instruction in writeback will be writing to register file if either rf_we is set or writeback
+    // is awaiting load data.
+    assign rf_write_wb_o = wb_valid_q & (rf_we_wb_q | cheri_rf_we_q | (wb_instr_type_q == WB_INSTR_LOAD) | wb_cheri_load_q);
+
+    assign outstanding_load_wb_o  = wb_valid_q & ((wb_instr_type_q == WB_INSTR_LOAD)  | wb_cheri_load_q);
+    assign outstanding_store_wb_o = wb_valid_q & ((wb_instr_type_q == WB_INSTR_STORE) | wb_cheri_store_q);
+
+    assign pc_wb_o = wb_pc_q;
+
+    assign instr_done_wb_o = wb_valid_q & wb_done;
+
+    // Increment instruction retire counters for valid instructions which are not lsu errors.
+    // Speculative versions of the signals do not factor in exceptions and whether the instruction
+    // is done yet. These are used to get correct values for instructions reading the relevant
+    // performance counters in the ID stage.
+    assign perf_instr_ret_wb_spec_o            = wb_count_q;
+    assign perf_instr_ret_compressed_wb_spec_o = perf_instr_ret_wb_spec_o & wb_compressed_q;
+    assign perf_instr_ret_wb_o                 = instr_done_wb_o & wb_count_q &
+                                                 ~(lsu_resp_valid_i & lsu_resp_err_i);
+    assign perf_instr_ret_compressed_wb_o      = perf_instr_ret_wb_o & wb_compressed_q;
+
+    // Forward data that will be written to the RF back to ID to resolve data hazards. The flopped
+    // rf_wdata_wb_q is used rather than rf_wdata_wb_o as the latter includes read data from memory
+    // that returns too late to be used on the forwarding path.
+    assign rf_wdata_fwd_wb_o = wb_is_cheri_q ? cheri_rf_wdata_q : rf_wdata_wb_q;
+    assign rf_wcap_fwd_wb_o  = wb_is_cheri_q ? cheri_rf_wcap_q : NULL_REG_CAP;
+    assign rf_wcap_wb        = (wb_is_cheri_q && (~wb_cheri_load_q)) ? cheri_rf_wcap_q : NULL_REG_CAP;  // CHERI loads: null here; rf_wcap_wb_o takes rf_wcap_lsu_i on an LSU write
+
+  end else begin : g_bypass_wb  // no writeback stage: purely combinational
+    // without writeback stage just pass through register write signals
+    assign rf_waddr_wb_o         = rf_waddr_id_i;
+    assign rf_wdata_wb_mux[0]    = instr_is_cheri_i ? cheri_rf_wdata_i : rf_wdata_id_i;
+    assign rf_wdata_wb_mux_we[0] = instr_is_cheri_i ? cheri_rf_we_i : rf_we_id_i;
+    assign rf_wcap_wb            = (instr_is_cheri_i && (~cheri_load_i)) ? cheri_rf_wcap_i : NULL_REG_CAP;
+
+    // Increment instruction retire counters for valid instructions which are not lsu errors.
+    // The speculative signals are always 0 when no writeback stage is present as the raw counter
+    // values will be correct.
+    assign perf_instr_ret_wb_spec_o            = 1'b0;
+    assign perf_instr_ret_compressed_wb_spec_o = 1'b0;
+    assign perf_instr_ret_wb_o                 = instr_perf_count_id_i & en_wb_i &
+                                                 ~(lsu_resp_valid_i & lsu_resp_err_i);
+    assign perf_instr_ret_compressed_wb_o      = perf_instr_ret_wb_o & instr_is_compressed_id_i;
+
+    // ready needs to be constant 1 without writeback stage (otherwise ID/EX stage will stall)
+    assign ready_wb_o    = 1'b1;
+
+    // Unused Writeback stage only IO & wiring
+    // Assign inputs and internal wiring to unused signals to satisfy lint checks
+    // Tie-off outputs to constant values
+    logic           unused_clk;
+    logic           unused_rst;
+    wb_instr_type_e unused_instr_type_wb;
+    logic [31:0]    unused_pc_id;
+
+    assign unused_clk            = clk_i;
+    assign unused_rst            = rst_ni;
+    assign unused_instr_type_wb  = instr_type_wb_i;
+    assign unused_pc_id          = pc_id_i;  // NOTE(review): cheri_store_i is also unread in this branch but has no unused_* tie-off - confirm lint is clean
+
+    assign outstanding_load_wb_o  = 1'b0;
+    assign outstanding_store_wb_o = 1'b0;
+    assign pc_wb_o                = '0;
+    assign rf_write_wb_o          = 1'b0;
+    assign rf_wdata_fwd_wb_o      = 32'b0;
+    assign rf_wcap_fwd_wb_o       = NULL_REG_CAP;
+    assign instr_done_wb_o        = 1'b0;
+  end
+
+  assign rf_wdata_wb_mux[1]    = rf_wdata_lsu_i;
+  assign rf_wdata_wb_mux_we[1] = rf_we_lsu_i;
+
+  // RF write data can come from ID results (all RF writes that aren't because of loads will come
+  // from here) or the LSU (RF writes for load data)
+  assign rf_wdata_wb_o = ({32{rf_wdata_wb_mux_we[0]}} & rf_wdata_wb_mux[0]) |  // AND-OR mux; relies on the
+                         ({32{rf_wdata_wb_mux_we[1]}} & rf_wdata_wb_mux[1]);   // enables being one-hot (asserted below)
+  assign rf_we_wb_o    = |rf_wdata_wb_mux_we;
+
+  assign rf_wcap_wb_o  = rf_wdata_wb_mux_we[0] ? rf_wcap_wb :  // ID-sourced write takes priority
+                         (rf_wdata_wb_mux_we[1] ? rf_wcap_lsu_i : NULL_REG_CAP);  // else LSU capability, else null
+
+  `DV_FCOV_SIGNAL_GEN_IF(logic, wb_valid, g_writeback_stage.wb_valid_q, WritebackStage)
+
+  `ASSERT(RFWriteFromOneSourceOnly, $onehot0(rf_wdata_wb_mux_we))
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriotc_top.sv b/hw/ip/cheriot-ibex/rtl/cheriotc_top.sv
new file mode 100644
index 0000000..5d949bf
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriotc_top.sv
@@ -0,0 +1,465 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+`ifdef RISCV_FORMAL
+  `define RVFI
+`endif
+
+`include "prim_assert.sv"
+
+
+/**
+ * Top level module of the ibex RISC-V core
+ */
+module cheriot_top import cheriot_pkg::*; import cheri_pkg::*; #(
+  parameter int unsigned DmHaltAddr       = 32'h1A110800,
+  parameter int unsigned DmExceptionAddr  = 32'h1A110808,
+  parameter bit          DbgTriggerEn     = 1'b1,
+  parameter int unsigned DbgHwBreakNum    = 2,
+  parameter int unsigned MHPMCounterNum   = 0,
+  parameter int unsigned MHPMCounterWidth = 40,
+  parameter bit          RV32E            = 1'b0,   // also selects the 16-entry register file below
+  parameter rv32b_e      RV32B            = RV32BNone,  // NOTE(review): not forwarded; the core instance hard-codes RV32BNone - confirm intentional
+  parameter rv32m_e      RV32M            = RV32MFast, 
+  parameter bit          WritebackStage   = 1'b1,
+  parameter bit          BranchPredictor  = 1'b0,
+  parameter bit          SecureIbex       = 1'b0,   // placeholder for TB compatibility; not forwarded (core instance uses 1'b0)
+  parameter bit          CHERIoTEn        = 1'b1,
+  parameter int unsigned DataWidth        = 33,     // NOTE(review): forwarded to the core, but the data ports below are fixed at [32:0]
+  parameter int unsigned HeapBase         = 32'h2001_0000,
+  parameter int unsigned TSMapBase        = 32'h2002_f000, // 4kB default
+  parameter int unsigned TSMapSize        = 1024,
+  parameter bit          MemCapFmt        = 1'b0,
+  parameter bit          CheriPPLBC       = 1'b1,   // forwarded to both the core and the register file
+  parameter bit          CheriSBND2       = 1'b0,
+  parameter bit          CheriTBRE        = 1'b1,
+  parameter bit          CheriStkZ        = 1'b1,
+  parameter int unsigned MMRegDinW        = 128,    // width of mmreg_corein_i
+  parameter int unsigned MMRegDoutW       = 64,     // width of mmreg_coreout_o
+  parameter bit          CheriCapIT8      = 1'b0
+) (
+  // Clock and Reset
+  input  logic                         clk_i,
+  input  logic                         rst_ni,
+
+  input  logic                         test_en_i,     // enable all clock gates for testing (unused while the clock gate is commented out)
+  input  prim_ram_1p_pkg::ram_1p_cfg_t ram_cfg_i,     // not used in this module
+
+  input  logic                         cheri_pmode_i,
+  input  logic                         cheri_tsafe_en_i,
+  input  logic [31:0]                  hart_id_i,
+  input  logic [31:0]                  boot_addr_i,
+
+  // Instruction memory interface
+  output logic                         instr_req_o,
+  input  logic                         instr_gnt_i,
+  input  logic                         instr_rvalid_i,
+  output logic [31:0]                  instr_addr_o,
+  input  logic [31:0]                  instr_rdata_i,
+  input  logic [6:0]                   instr_rdata_intg_i,  // not used in this module
+  input  logic                         instr_err_i,
+
+  // Data memory interface
+  output logic                         data_req_o,
+  output logic                         data_is_cap_o,
+  input  logic                         data_gnt_i,
+  input  logic                         data_rvalid_i,
+  output logic                         data_we_o,
+  output logic [3:0]                   data_be_o,
+  output logic [31:0]                  data_addr_o,
+  output logic [32:0]                  data_wdata_o,
+  output logic [6:0]                   data_wdata_intg_o,   // tied to 7'h0 below
+  input  logic [32:0]                  data_rdata_i,
+  input  logic [6:0]                   data_rdata_intg_i,   // not used in this module
+  input  logic                         data_err_i,
+
+  // TS map memory interface
+  output logic                         tsmap_cs_o,
+  output logic [15:0]                  tsmap_addr_o,
+  input  logic [31:0]                  tsmap_rdata_i,
+  input  logic [MMRegDinW-1:0]         mmreg_corein_i,
+  output logic [MMRegDoutW-1:0]        mmreg_coreout_o,
+  output logic                         cheri_fatal_err_o,
+
+  // Interrupt inputs
+  input  logic                         irq_software_i,
+  input  logic                         irq_timer_i,
+  input  logic                         irq_external_i,
+  input  logic [14:0]                  irq_fast_i,
+  input  logic                         irq_nm_i,       // non-maskable interrupt
+
+  // Scrambling Interface
+  input  logic                         scramble_key_valid_i,  // not used in this module
+  input  logic [SCRAMBLE_KEY_W-1:0]    scramble_key_i,        // not used in this module
+  input  logic [SCRAMBLE_NONCE_W-1:0]  scramble_nonce_i,      // not used in this module
+  output logic                         scramble_req_o,        // tied to 0 at the end of the module
+
+  // Debug Interface
+  input  logic                         debug_req_i,
+  output crash_dump_t                  crash_dump_o,
+  output logic                         double_fault_seen_o,
+
+  // RISC-V Formal Interface
+  // Does not comply with the coding standards of _i/_o suffixes, but follows
+  // the convention of RISC-V Formal Interface Specification.
+`ifdef RVFI
+  output logic                         rvfi_valid,
+  output logic [63:0]                  rvfi_order,
+  output logic [31:0]                  rvfi_insn,
+  output logic                         rvfi_trap,
+  output logic                         rvfi_halt,
+  output logic                         rvfi_intr,
+  output logic [ 1:0]                  rvfi_mode,
+  output logic [ 1:0]                  rvfi_ixl,
+  output logic [ 4:0]                  rvfi_rs1_addr,
+  output logic [ 4:0]                  rvfi_rs2_addr,
+  output logic [ 4:0]                  rvfi_rs3_addr,
+  output logic [31:0]                  rvfi_rs1_rdata,
+  output reg_cap_t                     rvfi_rs1_rcap,
+  output logic [31:0]                  rvfi_rs2_rdata,
+  output reg_cap_t                     rvfi_rs2_rcap,
+  output logic [31:0]                  rvfi_rs3_rdata,
+  output logic [ 4:0]                  rvfi_rd_addr,
+  output logic [31:0]                  rvfi_rd_wdata,
+  output reg_cap_t                     rvfi_rd_wcap,
+  output logic [31:0]                  rvfi_pc_rdata,
+  output logic [31:0]                  rvfi_pc_wdata,
+  output logic [31:0]                  rvfi_mem_addr,
+  output logic [ 3:0]                  rvfi_mem_rmask,
+  output logic [ 3:0]                  rvfi_mem_wmask,
+  output logic [32:0]                  rvfi_mem_rdata,
+  output logic [32:0]                  rvfi_mem_wdata,
+  output logic                         rvfi_mem_is_cap,
+  output reg_cap_t                     rvfi_mem_rcap,
+  output reg_cap_t                     rvfi_mem_wcap,
+  output logic [31:0]                  rvfi_ext_mip,
+  output logic                         rvfi_ext_nmi,
+  output logic                         rvfi_ext_debug_req,
+  output logic [63:0]                  rvfi_ext_mcycle,
+`endif
+
+  // CPU Control Signals
+  input  fetch_enable_t                fetch_enable_i,
+  output logic                         core_sleep_o,
+  output logic                         alert_minor_o,
+  output logic                         alert_major_internal_o,
+  output logic                         alert_major_bus_o,     // tied to 1'b0 below
+
+
+  // DFT bypass controls
+  input logic                          scan_rst_ni            // not used in this module
+);
+
+  localparam bit          ResetAll          = 1'b1;  // forwarded to the core's ResetAll parameter
+  localparam int unsigned RegFileDataWidth  = 32;
+
+  // Clock signals
+  logic                        clk;
+  logic                        core_busy_d, core_busy_q;
+  logic                        clock_en;
+  logic                        irq_pending;
+  // Core <-> Register file signals
+  logic [4:0]                  rf_raddr_a;
+  logic [4:0]                  rf_raddr_b;
+  logic [4:0]                  rf_waddr_wb;
+  logic                        rf_we_wb;
+  logic [RegFileDataWidth-1:0] rf_wdata_wb_ecc;
+  logic [RegFileDataWidth-1:0] rf_rdata_a_ecc, rf_rdata_a_ecc_buf;
+  logic [RegFileDataWidth-1:0] rf_rdata_b_ecc, rf_rdata_b_ecc_buf;
+  reg_cap_t                    rf_rcap_a, rf_rcap_b;
+  reg_cap_t                    rf_wcap;
+
+  logic [31:0]   rf_reg_rdy;
+  logic [4:0]    rf_trvk_addr;
+  logic          rf_trvk_en;
+  logic          rf_trvk_clrtag;
+  logic [4:0]    rf_trsv_addr;
+  logic          rf_trsv_en;
+
+  fetch_enable_t fetch_enable_buf;
+
+  /////////////////////
+  // Main clock gate //
+  /////////////////////
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin  // registers the core's busy indication
+    if (!rst_ni) begin
+      core_busy_q <= 1'b0;
+    end else begin
+      core_busy_q <= core_busy_d;
+    end
+  end
+
+  assign clock_en     = core_busy_q | debug_req_i | irq_pending | irq_nm_i;  // only drives core_sleep_o while the gate is bypassed
+  assign core_sleep_o = ~clock_en;
+
+  // Clock gating is intentionally bypassed for now (kliu): the core is clocked directly from clk_i.
+  assign clk = clk_i;
+
+//  prim_clock_gating core_clock_gate_i (
+//    .clk_i    (clk_i),
+//    .en_i     (clock_en),
+//    .test_en_i(test_en_i),
+//    .clk_o    (clk)
+//  );
+
+  ////////////////////////
+  // Core instantiation //
+  ////////////////////////
+
+`ifdef FPGA
+  // Buffer security critical signals to prevent synthesis optimisation removing them
+  prim_buf #(.Width($bits(fetch_enable_t))) u_fetch_enable_buf (
+    .in_i (fetch_enable_i),
+    .out_o(fetch_enable_buf)
+  );
+
+  prim_buf #(.Width(RegFileDataWidth)) u_rf_rdata_a_ecc_buf (
+    .in_i (rf_rdata_a_ecc),
+    .out_o(rf_rdata_a_ecc_buf)
+  );
+
+  prim_buf #(.Width(RegFileDataWidth)) u_rf_rdata_b_ecc_buf (
+    .in_i (rf_rdata_b_ecc),
+    .out_o(rf_rdata_b_ecc_buf)
+  );
+`else
+  assign fetch_enable_buf = fetch_enable_i;  // without FPGA defined the *_buf signals are plain wires
+  assign rf_rdata_a_ecc_buf = rf_rdata_a_ecc;
+  assign rf_rdata_b_ecc_buf = rf_rdata_b_ecc;
+`endif
+
+  cheriot_core #(
+    .PMPEnable        (1'b0),
+    .PMPGranularity   (0),
+    .PMPNumRegions    (4),
+    .MHPMCounterNum   (MHPMCounterNum  ),
+    .MHPMCounterWidth (MHPMCounterWidth),
+    .RV32E            (RV32E),
+    .RV32M            (RV32M),
+    .RV32B            (RV32BNone),  // NOTE(review): the module's RV32B parameter is ignored here
+    .BranchTargetALU  (1'b1),
+    .ICache           (1'b0),       // instruction cache disabled; ic_* ports below are left open
+    .ICacheECC        (1'b0),
+    .BusSizeECC       (BUS_SIZE),
+    .TagSizeECC       (IC_TAG_SIZE),
+    .LineSizeECC      (IC_LINE_SIZE),
+    .BranchPredictor  (BranchPredictor),
+    .DbgTriggerEn     (DbgTriggerEn),
+    .DbgHwBreakNum    (DbgHwBreakNum),
+    .WritebackStage   (WritebackStage),
+    .ResetAll         (ResetAll),
+    .RndCnstLfsrSeed  (RndCnstLfsrSeedDefault),
+    .RndCnstLfsrPerm  (RndCnstLfsrPermDefault),
+    .SecureIbex       (1'b0),       // the module's SecureIbex parameter is a placeholder and is not used
+    .DummyInstructions(1'b0),
+    .RegFileECC       (1'b0),
+    .RegFileDataWidth (RegFileDataWidth),
+    .DmHaltAddr       (DmHaltAddr),
+    .DmExceptionAddr  (DmExceptionAddr),
+    .CHERIoTEn        (CHERIoTEn),
+    .DataWidth        (DataWidth),
+    .HeapBase         (HeapBase),
+    .TSMapBase        (TSMapBase),
+    .TSMapSize        (TSMapSize),
+    .MemCapFmt        (MemCapFmt),
+    .CheriPPLBC       (CheriPPLBC),
+    .CheriSBND2       (CheriSBND2),
+    .CheriTBRE        (CheriTBRE),
+    .CheriStkZ        (CheriStkZ),
+    .MMRegDinW        (MMRegDinW),
+    .MMRegDoutW       (MMRegDoutW),
+    .CheriCapIT8      (CheriCapIT8)
+  ) u_cheriot_core (
+    .clk_i      (clk),
+    .rst_ni     (rst_ni),
+
+    .cheri_pmode_i  (cheri_pmode_i),
+    .cheri_tsafe_en_i  (cheri_tsafe_en_i),
+    .hart_id_i      (hart_id_i    ) ,
+    .boot_addr_i    (boot_addr_i  ) ,
+
+    .instr_req_o    (instr_req_o   ),
+    .instr_gnt_i    (instr_gnt_i   ),
+    .instr_rvalid_i (instr_rvalid_i),
+    .instr_addr_o   (instr_addr_o  ),
+    .instr_rdata_i  (instr_rdata_i ),
+    .instr_err_i    (instr_err_i   ),
+
+    .data_req_o     (data_req_o    ),
+    .data_is_cap_o  (data_is_cap_o ),
+    .data_gnt_i     (data_gnt_i    ),
+    .data_rvalid_i  (data_rvalid_i ),
+    .data_we_o      (data_we_o     ),
+    .data_be_o      (data_be_o     ),
+    .data_addr_o    (data_addr_o   ),
+    .data_wdata_o   (data_wdata_o  ),
+    .data_rdata_i   (data_rdata_i  ),
+    .data_err_i     (data_err_i    ),
+
+    .dummy_instr_id_o (),
+    .rf_raddr_a_o     (rf_raddr_a),
+    .rf_raddr_b_o     (rf_raddr_b),
+    .rf_waddr_wb_o    (rf_waddr_wb),
+    .rf_we_wb_o       (rf_we_wb),
+    .rf_wdata_wb_ecc_o(rf_wdata_wb_ecc),
+    .rf_rdata_a_ecc_i (rf_rdata_a_ecc_buf),
+    .rf_rdata_b_ecc_i (rf_rdata_b_ecc_buf),
+    .rf_wcap_wb_o     (rf_wcap),
+    .rf_rcap_a_i      (rf_rcap_a),
+    .rf_rcap_b_i      (rf_rcap_b),
+    .rf_reg_rdy_i     (rf_reg_rdy),
+    .rf_trsv_en_o     (rf_trsv_en),
+    .rf_trsv_addr_o   (rf_trsv_addr),
+    .rf_trvk_addr_o   (rf_trvk_addr),
+    .rf_trvk_en_o     (rf_trvk_en    ),
+    .rf_trvk_clrtag_o (rf_trvk_clrtag),
+    .rf_trvk_par_o    (),  // left open; the register file's trvk_par_i is tied to 7'h0
+    .rf_trsv_par_o    (),  // left open; the register file's trsv_par_i is tied to 7'h0
+    .tsmap_cs_o       (tsmap_cs_o   ),
+    .tsmap_addr_o     (tsmap_addr_o ),
+    .tsmap_rdata_i    (tsmap_rdata_i),
+    .mmreg_corein_i   (mmreg_corein_i),
+    .mmreg_coreout_o  (mmreg_coreout_o),
+    .cheri_fatal_err_o(cheri_fatal_err_o),
+
+    .irq_software_i (irq_software_i),
+    .irq_timer_i    (irq_timer_i   ),
+    .irq_external_i (irq_external_i),
+    .irq_fast_i     (irq_fast_i    ),
+    .irq_nm_i       (irq_nm_i      ),
+    .irq_pending_o(irq_pending),
+
+    .debug_req_i,
+    .crash_dump_o,
+    .double_fault_seen_o,
+
+`ifdef RVFI
+    .rvfi_valid,
+    .rvfi_order,
+    .rvfi_insn,
+    .rvfi_trap,
+    .rvfi_halt,
+    .rvfi_intr,
+    .rvfi_mode,
+    .rvfi_ixl,
+    .rvfi_rs1_addr,
+    .rvfi_rs2_addr,
+    .rvfi_rs3_addr,
+    .rvfi_rs1_rdata,
+    .rvfi_rs1_rcap,
+    .rvfi_rs2_rdata,
+    .rvfi_rs2_rcap,
+    .rvfi_rs3_rdata,
+    .rvfi_rd_addr,
+    .rvfi_rd_wdata,
+    .rvfi_rd_wcap,
+    .rvfi_pc_rdata,
+    .rvfi_pc_wdata,
+    .rvfi_mem_addr,
+    .rvfi_mem_rmask,
+    .rvfi_mem_wmask,
+    .rvfi_mem_rdata,
+    .rvfi_mem_wdata,
+    .rvfi_mem_is_cap,
+    .rvfi_mem_rcap,
+    .rvfi_mem_wcap,
+    .rvfi_ext_mip,
+    .rvfi_ext_nmi,
+    .rvfi_ext_debug_req,
+    .rvfi_ext_mcycle,
+`endif
+
+    .fetch_enable_i(fetch_enable_buf),
+    .alert_minor_o(alert_minor_o),
+    .alert_major_o(alert_major_internal_o),
+    .icache_inval_o(),
+    .core_busy_o   (core_busy_d),
+    .ic_scr_key_valid_i (1'b0),
+    .ic_data_rdata_i    (),  // NOTE(review): *_i port left unconnected (ICache is disabled) - confirm a tie-off is not required
+    .ic_data_wdata_o    (),
+    .ic_data_addr_o     (),
+    .ic_data_write_o    (),
+    .ic_data_req_o      (),
+    .ic_tag_rdata_i     (),  // NOTE(review): *_i port left unconnected (ICache is disabled) - confirm a tie-off is not required
+    .ic_tag_wdata_o     (),
+    .ic_tag_addr_o      (),
+    .ic_tag_write_o     (), 
+    .ic_tag_req_o       ()
+  );
+
+  assign data_wdata_intg_o = 7'h0;  // write-data integrity output is not generated here
+  assign alert_major_bus_o = 1'b0;  // bus alert output is never raised here
+
+  /////////////////////////////////
+  // Register file Instantiation //
+  /////////////////////////////////
+  if (RV32E) begin  // RV32E: 16 registers; otherwise identical to the instance in the else branch
+    cheri_regfile #(
+      .NREGS(16),
+      .NCAPS(16),
+      .CheriPPLBC(CheriPPLBC)
+    ) register_file_i (
+      .clk_i         (clk),
+      .rst_ni        (rst_ni),
+      .raddr_a_i     (rf_raddr_a),
+      .rdata_a_o     (rf_rdata_a_ecc),
+      .rcap_a_o      (rf_rcap_a),
+      .raddr_b_i     (rf_raddr_b),
+      .rdata_b_o     (rf_rdata_b_ecc),
+      .rcap_b_o      (rf_rcap_b),
+      .waddr_a_i     (rf_waddr_wb),
+      .wdata_a_i     (rf_wdata_wb_ecc),
+      .wcap_a_i      (rf_wcap),
+      .we_a_i        (rf_we_wb),
+      .reg_rdy_o     (rf_reg_rdy),
+      .trvk_addr_i   (rf_trvk_addr),
+      .trvk_en_i     (rf_trvk_en),
+      .trvk_clrtag_i (rf_trvk_clrtag),
+      .trsv_addr_i   (rf_trsv_addr),
+      .trsv_en_i     (rf_trsv_en),
+      .trsv_par_i    (7'h0),
+      .trvk_par_i    (7'h0),
+      .par_rst_ni    (1'b0),  // NOTE(review): tied low; if active-low (as the _ni suffix suggests) this is held asserted - confirm
+      .alert_o       ()
+    );
+  end else begin  // full register set: 32 registers, still NCAPS = 16
+    cheri_regfile #(
+      .NREGS(32),
+      .NCAPS(16),
+      .CheriPPLBC(CheriPPLBC)
+    ) register_file_i (
+      .clk_i         (clk),
+      .rst_ni        (rst_ni),
+      .raddr_a_i     (rf_raddr_a),
+      .rdata_a_o     (rf_rdata_a_ecc),
+      .rcap_a_o      (rf_rcap_a),
+      .raddr_b_i     (rf_raddr_b),
+      .rdata_b_o     (rf_rdata_b_ecc),
+      .rcap_b_o      (rf_rcap_b),
+      .waddr_a_i     (rf_waddr_wb),
+      .wdata_a_i     (rf_wdata_wb_ecc),
+      .wcap_a_i      (rf_wcap),
+      .we_a_i        (rf_we_wb),
+      .reg_rdy_o     (rf_reg_rdy),
+      .trvk_addr_i   (rf_trvk_addr),
+      .trvk_en_i     (rf_trvk_en),
+      .trvk_clrtag_i (rf_trvk_clrtag),
+      .trsv_addr_i   (rf_trsv_addr),
+      .trsv_en_i     (rf_trsv_en),
+      .trsv_par_i    (7'h0),
+      .trvk_par_i    (7'h0),
+      .par_rst_ni    (1'b0),  // NOTE(review): tied low; if active-low (as the _ni suffix suggests) this is held asserted - confirm
+      .alert_o       ()
+    );
+  end
+
+  assign scramble_req_o = 0;  // no scrambling key is ever requested by this module
+
+endmodule
diff --git a/hw/ip/cheriot-ibex/rtl/cheriotc_top_tracing.sv b/hw/ip/cheriot-ibex/rtl/cheriotc_top_tracing.sv
new file mode 100644
index 0000000..5840322
--- /dev/null
+++ b/hw/ip/cheriot-ibex/rtl/cheriotc_top_tracing.sv
@@ -0,0 +1,316 @@
+// Copyright Microsoft Corporation
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Top level module of the CHERIoT Ibex RISC-V core with tracing enabled (cheriot_top + tracer)
+ */
+
+module cheriot_top_tracing import cheriot_pkg::*; import cheri_pkg::*; #(
+  parameter int unsigned DmHaltAddr       = 32'h1A110800,
+  parameter int unsigned DmExceptionAddr  = 32'h1A110808,
+  parameter bit          RV32E            = 1'b0,
+  parameter bit          CheriTBRE        = 1'b1,
+  parameter bit          CheriStkZ        = 1'b1,
+  parameter int unsigned HeapBase         = 32'h2001_0000,
+  parameter int unsigned TSMapBase        = 32'h2004_0000, // 4kB default
+  parameter int unsigned TSMapSize        = 1024,          // in words
+  parameter int unsigned MMRegDinW        = 128,
+  parameter int unsigned MMRegDoutW       = 64,
+  parameter int unsigned DataWidth        = 33,     // a parameter so a testbench can defparam it
+  parameter bit          CheriCapIT8      = 1'b0
+) (
+  // Clock and Reset
+  input  logic                         clk_i,
+  input  logic                         rst_ni,
+
+  input  logic                         test_en_i,     // enable all clock gates for testing
+  input  logic                         scan_rst_ni,
+  input  prim_ram_1p_pkg::ram_1p_cfg_t ram_cfg_i,
+
+  input  logic                         cheri_pmode_i,
+  input  logic                         cheri_tsafe_en_i,
+  input  logic [31:0]                  hart_id_i,
+  input  logic [31:0]                  boot_addr_i,
+
+  // Instruction memory interface
+  output logic                         instr_req_o,
+  input  logic                         instr_gnt_i,
+  input  logic                         instr_rvalid_i,
+  output logic [31:0]                  instr_addr_o,
+  input  logic [31:0]                  instr_rdata_i,
+  input  logic [6:0]                   instr_rdata_intg_i,
+  input  logic                         instr_err_i,
+
+  // Data memory interface
+  output logic                         data_req_o,
+  output logic                         data_is_cap_o,
+  input  logic                         data_gnt_i,
+  input  logic                         data_rvalid_i,
+  output logic                         data_we_o,
+  output logic [3:0]                   data_be_o,
+  output logic [31:0]                  data_addr_o,
+  output logic [32:0]                  data_wdata_o,
+  output logic [6:0]                   data_wdata_intg_o,
+  input  logic [32:0]                  data_rdata_i,
+  input  logic [6:0]                   data_rdata_intg_i,
+  input  logic                         data_err_i,
+
+  // TS map memory interface
+  output logic                         tsmap_cs_o,
+  output logic [15:0]                  tsmap_addr_o,
+  input  logic [31:0]                  tsmap_rdata_i,
+  input  logic [6:0]                   tsmap_rdata_intg_i,   // not connected to u_cheriot_top
+  input  logic [MMRegDinW-1:0]         mmreg_corein_i,
+  output logic [MMRegDoutW-1:0]        mmreg_coreout_o,
+  output logic                         cheri_fatal_err_o,
+
+  // Interrupt inputs
+  input  logic                         irq_software_i,
+  input  logic                         irq_timer_i,
+  input  logic                         irq_external_i,
+  input  logic [14:0]                  irq_fast_i,
+  input  logic                         irq_nm_i,       // non-maskable interrupt
+
+  // Scrambling Interface
+  input  logic                         scramble_key_valid_i,
+  input  logic [SCRAMBLE_KEY_W-1:0]    scramble_key_i,
+  input  logic [SCRAMBLE_NONCE_W-1:0]  scramble_nonce_i,
+  output logic                         scramble_req_o,
+
+  // Debug Interface
+  input  logic                         debug_req_i,
+  output crash_dump_t                  crash_dump_o,
+  output logic                         double_fault_seen_o,
+
+  // CPU Control Signals
+  input  fetch_enable_t                fetch_enable_i,
+  output logic                         core_sleep_o
+);
+
+  // RVFI nets: connected to u_cheriot_top and u_cheriot_tracer only when RVFI is defined.
+  logic        rvfi_valid;
+  logic [63:0] rvfi_order;
+  logic [31:0] rvfi_insn;
+  logic        rvfi_trap;
+  logic        rvfi_halt;
+  logic        rvfi_intr;
+  logic [ 1:0] rvfi_mode;
+  logic [ 1:0] rvfi_ixl;
+  logic [ 4:0] rvfi_rs1_addr;
+  logic [ 4:0] rvfi_rs2_addr;
+  logic [ 4:0] rvfi_rs3_addr;
+  logic [31:0] rvfi_rs1_rdata;
+  reg_cap_t    rvfi_rs1_rcap;
+  reg_cap_t    rvfi_rs2_rcap;
+  logic [31:0] rvfi_rs2_rdata;
+  logic [31:0] rvfi_rs3_rdata;
+  logic [ 4:0] rvfi_rd_addr;
+  logic [31:0] rvfi_rd_wdata;
+  reg_cap_t    rvfi_rd_wcap;
+  logic [31:0] rvfi_pc_rdata;
+  logic [31:0] rvfi_pc_wdata;
+  logic [31:0] rvfi_mem_addr;
+  logic [ 3:0] rvfi_mem_rmask;
+  logic [ 3:0] rvfi_mem_wmask;
+  logic [DataWidth-1:0] rvfi_mem_rdata;
+  logic [DataWidth-1:0] rvfi_mem_wdata;
+  logic        rvfi_mem_is_cap;
+  reg_cap_t     rvfi_mem_rcap;
+  reg_cap_t     rvfi_mem_wcap;
+  logic [31:0] rvfi_ext_mip;
+  logic        rvfi_ext_nmi;
+  logic        rvfi_ext_debug_req;
+  logic [63:0] rvfi_ext_mcycle;
+
+  logic [31:0] unused_rvfi_ext_mip;
+  logic        unused_rvfi_ext_nmi;
+  logic        unused_rvfi_ext_debug_req;
+  logic [63:0] unused_rvfi_ext_mcycle;
+
+  // u_cheriot_tracer has no ports for the rvfi_ext_* signals, so they are assigned to unused_*
+  // nets here; other modules may still probe down into the hierarchy to observe them.
+  assign unused_rvfi_ext_mip = rvfi_ext_mip;
+  assign unused_rvfi_ext_nmi = rvfi_ext_nmi;
+  assign unused_rvfi_ext_debug_req = rvfi_ext_debug_req;
+  assign unused_rvfi_ext_mcycle = rvfi_ext_mcycle;
+
+  cheriot_top #(  // module parameters are forwarded; the remaining values are fixed here
+    .DmHaltAddr       (DmHaltAddr       ),
+    .DmExceptionAddr  (DmExceptionAddr  ),
+    .MHPMCounterNum   (13  ),
+    .MHPMCounterWidth (40),
+    .DbgTriggerEn     (1'b1),
+    .DbgHwBreakNum    (4),
+    .RV32E            (RV32E),
+    .RV32B            (RV32BFull),
+    .WritebackStage   (1'b1),
+    .BranchPredictor  (1'b0),
+    .CHERIoTEn        (1'b1),
+    .DataWidth        (DataWidth),
+    .HeapBase         (HeapBase ),
+    .TSMapBase        (TSMapBase),
+    .TSMapSize        (TSMapSize),
+    .MemCapFmt        (1'b0),
+    .CheriPPLBC       (1'b1),
+    .CheriSBND2       (1'b0),
+    .CheriTBRE        (CheriTBRE),
+    .CheriStkZ        (CheriStkZ),
+    .MMRegDinW        (MMRegDinW),
+    .MMRegDoutW       (MMRegDoutW),
+    .CheriCapIT8      (CheriCapIT8)
+  ) u_cheriot_top (
+    .clk_i,
+    .rst_ni,
+
+    .test_en_i,
+    .scan_rst_ni,
+    .ram_cfg_i,
+
+    .cheri_pmode_i,
+    .cheri_tsafe_en_i,
+    .hart_id_i,
+    .boot_addr_i,
+
+    .instr_req_o,
+    .instr_gnt_i,
+    .instr_rvalid_i,
+    .instr_addr_o,
+    .instr_rdata_i,
+    .instr_rdata_intg_i,
+    .instr_err_i,
+
+    .data_req_o,
+    .data_is_cap_o,
+    .data_gnt_i,
+    .data_rvalid_i,
+    .data_we_o,
+    .data_be_o,
+    .data_addr_o,
+    .data_wdata_o,
+    .data_wdata_intg_o,
+    .data_rdata_i,
+    .data_rdata_intg_i,
+    .data_err_i,
+
+    .tsmap_cs_o,
+    .tsmap_addr_o,
+    .tsmap_rdata_i,
+    .mmreg_corein_i,
+    .mmreg_coreout_o,
+    .cheri_fatal_err_o,
+
+    .irq_software_i,
+    .irq_timer_i,
+    .irq_external_i,
+    .irq_fast_i,
+    .irq_nm_i,
+
+    .scramble_key_valid_i,
+    .scramble_key_i,
+    .scramble_nonce_i,
+    .scramble_req_o,
+
+    .debug_req_i,
+    .crash_dump_o,
+    .double_fault_seen_o,
+
+`ifdef RVFI
+    .rvfi_valid,
+    .rvfi_order,
+    .rvfi_insn,
+    .rvfi_trap,
+    .rvfi_halt,
+    .rvfi_intr,
+    .rvfi_mode,
+    .rvfi_ixl,
+    .rvfi_rs1_addr,
+    .rvfi_rs2_addr,
+    .rvfi_rs3_addr,
+    .rvfi_rs1_rdata,
+    .rvfi_rs1_rcap,
+    .rvfi_rs2_rdata,
+    .rvfi_rs2_rcap,
+    .rvfi_rs3_rdata,
+    .rvfi_rd_addr,
+    .rvfi_rd_wdata,
+    .rvfi_rd_wcap,
+    .rvfi_pc_rdata,
+    .rvfi_pc_wdata,
+    .rvfi_mem_addr,
+    .rvfi_mem_rmask,
+    .rvfi_mem_wmask,
+    .rvfi_mem_rdata,
+    .rvfi_mem_wdata,
+    .rvfi_mem_rcap,
+    .rvfi_mem_wcap,
+    .rvfi_mem_is_cap,
+    .rvfi_ext_mip,
+    .rvfi_ext_nmi,
+    .rvfi_ext_debug_req,
+    .rvfi_ext_mcycle,
+`endif
+    .fetch_enable_i,
+    .core_sleep_o,
+    .alert_major_bus_o(),       // alert outputs are left unconnected in this wrapper
+    .alert_major_internal_o(),
+    .alert_minor_o()
+  );
+
+// cheriot_tracer relies on the signals from the RISC-V Formal Interface (RVFI)
+// synthesis translate_off
+`ifndef RVFI
+   $fatal(1, "Fatal error: RVFI needs to be defined globally.");  // first arg: finish_number
+`endif
+
+`ifdef RVFI
+  cheriot_tracer #(
+    .DataWidth        (DataWidth),
+    .CheriCapIT8      (CheriCapIT8)
+  ) u_cheriot_tracer (
+    .clk_i,
+    .rst_ni,
+
+    .cheri_pmode_i,
+    .cheri_tsafe_en_i,
+    .hart_id_i,
+
+    .rvfi_valid,
+    .rvfi_order,
+    .rvfi_insn,
+    .rvfi_trap,
+    .rvfi_halt,
+    .rvfi_intr,
+    .rvfi_mode,
+    .rvfi_ixl,
+    .rvfi_rs1_addr,
+    .rvfi_rs2_addr,
+    .rvfi_rs3_addr,
+    .rvfi_rs1_rdata,
+    .rvfi_rs2_rdata,
+    .rvfi_rs3_rdata,
+    .rvfi_rs1_rcap,
+    .rvfi_rs2_rcap,
+    .rvfi_rd_wcap,
+    .rvfi_rd_addr,
+    .rvfi_rd_wdata,
+    .rvfi_pc_rdata,
+    .rvfi_pc_wdata,
+    .rvfi_mem_addr,
+    .rvfi_mem_rmask,
+    .rvfi_mem_wmask,
+    .rvfi_mem_rdata,
+    .rvfi_mem_wdata,
+    .rvfi_mem_rcap,
+    .rvfi_mem_wcap,
+    .rvfi_mem_is_cap
+  );
+`endif
+
+// synthesis translate_on
+
+endmodule