Add RVV to tracing

- Plumb the write-back output of RvvCore (rd_rob2rt_o) into the
  RetirementBuffer.
- If RVV is enabled, the RetirementBuffer data path is widened to hold a
  full VLEN-wide register value.
- Pass the vector write-back data along to rvviTrace as well.

Change-Id: I418a9963e336ef9b916fc268371cc8339d0a88a8
diff --git a/hdl/chisel/src/kelvin/Interfaces.scala b/hdl/chisel/src/kelvin/Interfaces.scala
index f6c9a09..9b6e1ba 100644
--- a/hdl/chisel/src/kelvin/Interfaces.scala
+++ b/hdl/chisel/src/kelvin/Interfaces.scala
@@ -176,7 +176,7 @@
     val pc = UInt(32.W)
     val inst = UInt(32.W)
     val idx = UInt(p.retirementBufferIdxWidth.W)
-    val data = UInt(32.W)
+    val data = if (p.enableRvv) UInt(p.rvvVlen.W) else UInt(32.W)
   }))
 }
 
@@ -237,6 +237,11 @@
   val data  = Input(UInt(32.W))
 }
 
+class VectorWriteDataIO(p: Parameters) extends Bundle {
+  val addr  = Input(UInt(5.W))  // vector register index (before the regfile base is added)
+  val data  = Input(UInt(p.rvvVlen.W))  // one full vector register; same width as Rob2Rt.w_data
+}
+
 class FabricIO(p: Parameters) extends Bundle {
     val readDataAddr = Output(Valid(UInt(p.axi2AddrBits.W)))
     val readData = Input(Valid(UInt(p.axi2DataBits.W)))
diff --git a/hdl/chisel/src/kelvin/Parameters.scala b/hdl/chisel/src/kelvin/Parameters.scala
index 10f3b56..cb03f20 100644
--- a/hdl/chisel/src/kelvin/Parameters.scala
+++ b/hdl/chisel/src/kelvin/Parameters.scala
@@ -105,11 +105,14 @@
   val floatPulpDivsqrt = 0
 
   // Retirement buffer
+  val floatRegfileBaseAddr = 32
+  val rvvRegfileBaseAddr = 64
+  val rvvRegCount = 32
   val retirementBufferSize = 8
   def retirementBufferIdxWidth: Int = {
     val scalarRegCount = 32
     val floatRegCount = (if (enableFloat) { 32 } else { 0 })
-    log2Ceil(scalarRegCount + floatRegCount + 1)
+    log2Ceil((if (enableRvv) rvvRegfileBaseAddr + rvvRegCount else scalarRegCount + floatRegCount) + 1)  // highest index + 1 sentinel
   }
 
   // L0ICache Fetch unit.
diff --git a/hdl/chisel/src/kelvin/RetirementBuffer.scala b/hdl/chisel/src/kelvin/RetirementBuffer.scala
index 11cd32a..851e7f7 100644
--- a/hdl/chisel/src/kelvin/RetirementBuffer.scala
+++ b/hdl/chisel/src/kelvin/RetirementBuffer.scala
@@ -25,12 +25,16 @@
     val writeDataScalar = Input(Vec(p.instructionLanes + 2, Valid(new RegfileWriteDataIO)))
     val writeAddrFloat = Option.when(p.enableFloat)(Input(new RegfileWriteAddrIO))
     val writeDataFloat = Option.when(p.enableFloat)(Input(Vec(2, Valid(new RegfileWriteDataIO))))
+    val writeAddrVector = Option.when(p.enableRvv)(Input(Vec(p.instructionLanes, new RegfileWriteAddrIO)))
+    val writeDataVector = Option.when(p.enableRvv)(Input(Vec(p.instructionLanes, Valid(new VectorWriteDataIO(p)))))
     val nSpace = Output(UInt(32.W))
     val debug = Output(new RetirementBufferDebugIO(p))
   })
+  // writeAddrVector only exists when RVV is enabled; when it does, mark it
+  // dontTouch so the ports are kept in the generated design.
+  io.writeAddrVector.foreach(dontTouch(_))
 
   val idxWidth = p.retirementBufferIdxWidth
-  val floatRegisterBase = 32.U
   val noWriteRegIdx = ~0.U(idxWidth.W)
   class Instruction extends Bundle {
     val addr = UInt(32.W) // Memory address
@@ -58,12 +62,16 @@
     val scalarValid = io.writeAddrScalar(i).valid
     val scalarAddr = io.writeAddrScalar(i).addr
 
+    val vectorValid = io.writeAddrVector.map(x => x(i).valid).getOrElse(false.B)
+    val vectorAddr = io.writeAddrVector.map(x => x(i).addr).getOrElse(0.U)
+
     val instr = Wire(new Instruction)
     instr.addr := io.inst(i).bits.addr
     instr.inst := io.inst(i).bits.inst
     instr.idx := MuxCase(noWriteRegIdx, Seq(
-      floatValid -> (floatAddr +& floatRegisterBase),
+      floatValid -> (floatAddr +& p.floatRegfileBaseAddr.U),
       (scalarValid && scalarAddr =/= 0.U) -> scalarAddr,
+      vectorValid -> (vectorAddr +& p.rvvRegfileBaseAddr.U),  // no "=/= 0" filter: v0 is writable, unlike x0
     ))
     instr
   }
@@ -84,10 +92,11 @@
   // Maintain a re-order buffer of instruction completion result.
   // The order and alignment of these buffers should correspond to the
   // output of `instBuffer`.
-  val resultBuffer = RegInit(VecInit(Seq.fill(bufferSize)(MakeInvalid(UInt(32.W)))))
+  val dataWidth = if (p.enableRvv) p.rvvVlen else 32  // a full vector register (VLEN bits) when RVV is on
+  val resultBuffer = RegInit(VecInit(Seq.fill(bufferSize)(MakeInvalid(UInt(dataWidth.W)))))
   // Compute update based on register writeback.
   // Note: The shift when committing instructions will be handled in a later block.
-  val resultUpdate = Wire(Vec(bufferSize, Valid(UInt(32.W))))
+  val resultUpdate = Wire(Vec(bufferSize, Valid(UInt(dataWidth.W))))
 
   for (i <- 0 until bufferSize) {
     val bufferEntry = instBuffer.io.dataOut(i)
@@ -95,7 +104,10 @@
     val scalarWriteIdxMap = io.writeDataScalar.map(
         x => x.valid && (x.bits.addr === bufferEntry.idx))
     val floatWriteIdxMap = io.writeDataFloat.map(y => y.map(
-        x => x.valid && ((x.bits.addr +& floatRegisterBase) ===
+        x => x.valid && ((x.bits.addr +& p.floatRegfileBaseAddr.U) ===
+            bufferEntry.idx))).getOrElse(Seq(false.B))
+    val vectorWriteIdxMap = io.writeDataVector.map(y => y.map(
+        x => x.valid && ((x.bits.addr +& p.rvvRegfileBaseAddr.U) ===
             bufferEntry.idx))).getOrElse(Seq(false.B))
     // Check if this entry is an operation that doesn't require a register write (e.g., a store).
     val nonWritingInstr = bufferEntry.idx === noWriteRegIdx
@@ -103,19 +115,24 @@
     val validBufferEntry = (i.U < instBuffer.io.nEnqueued) && (!resultBuffer(i).valid)
 
     // If the entry is active and its data dependency is met (or it has no dependency)...
-    val updated = (validBufferEntry && (scalarWriteIdxMap.reduce(_|_) || floatWriteIdxMap.reduce(_|_) || nonWritingInstr))
+    val updated = (validBufferEntry && (scalarWriteIdxMap.reduce(_|_) || floatWriteIdxMap.reduce(_|_) || vectorWriteIdxMap.reduce(_|_) || nonWritingInstr))
     // Find the index of the first write port that provides the needed data.
     val scalarWriteIdx = PriorityEncoder(scalarWriteIdxMap)
     val floatWriteIdx = PriorityEncoder(floatWriteIdxMap)
+    val vectorWriteIdx = PriorityEncoder(vectorWriteIdxMap)
     // Select the actual data from the winning write port.
     val writeDataScalar = io.writeDataScalar(scalarWriteIdx).bits.data
     val writeDataFloat = io.writeDataFloat.map(x => x(floatWriteIdx).bits.data).getOrElse(0.U)
+    val writeDataVector = io.writeDataVector.map(x => x(vectorWriteIdx).bits.data).getOrElse(0.U)
     // If updated, mark this buffer entry as complete for the next cycle.
     resultUpdate(i).valid := Mux(updated, true.B, resultBuffer(i).valid) // true.B
     // Select the correct write-back data to store, if updated (FP has priority).
+    val sdata = if (p.enableRvv) Cat(0.U((p.lsuDataBits - 32).W), writeDataScalar) else writeDataScalar
+    val fdata = if (p.enableRvv) Cat(0.U((p.lsuDataBits - 32).W), writeDataFloat) else writeDataFloat
     resultUpdate(i).bits := Mux(updated, MuxCase(0.U, Seq(
-      floatWriteIdxMap.reduce(_|_) -> writeDataFloat,
-      scalarWriteIdxMap.reduce(_|_) -> writeDataScalar,
+      floatWriteIdxMap.reduce(_|_) -> fdata,
+      vectorWriteIdxMap.reduce(_|_) -> writeDataVector,
+      scalarWriteIdxMap.reduce(_|_) -> sdata,
     )), resultBuffer(i).bits)
   }
 
diff --git a/hdl/chisel/src/kelvin/RvviTrace.scala b/hdl/chisel/src/kelvin/RvviTrace.scala
index fb72ee3..44b8d69 100644
--- a/hdl/chisel/src/kelvin/RvviTrace.scala
+++ b/hdl/chisel/src/kelvin/RvviTrace.scala
@@ -197,15 +197,13 @@
             x_wdata(i)(j) := MuxOR(x_wb_valid, wdata)
             x_wb(i)(j) := x_wb_valid
 
-            val f_wb_valid = valid && (wb_idx === j.U + (32.U))
+            val f_wb_valid = valid && (wb_idx === j.U + p.floatRegfileBaseAddr.U)
             f_wdata(i)(j) := MuxOR(f_wb_valid, wdata)
             f_wb(i)(j) := f_wb_valid
 
-            ///////////////////////////////////
-            // TODO(atv): This is just generally not tracked.
-            ///////////////////////////////////
-            v_wdata(i)(j) := 0.U.asTypeOf(rvviTraceBlackBox.io.v_wdata_i(i)(j))
-            v_wb(i)(j) := false.B
+            val v_wb_valid = valid && (wb_idx === j.U + p.rvvRegfileBaseAddr.U)
+            v_wdata(i)(j) := MuxOR(v_wb_valid, wdata)
+            v_wb(i)(j) := v_wb_valid
         }
 
         for (j <- 0 until 4096) {
diff --git a/hdl/chisel/src/kelvin/rvv/RvvCore.scala b/hdl/chisel/src/kelvin/rvv/RvvCore.scala
index 24ded3a..f9529df 100644
--- a/hdl/chisel/src/kelvin/rvv/RvvCore.scala
+++ b/hdl/chisel/src/kelvin/rvv/RvvCore.scala
@@ -40,6 +40,7 @@
             |    input [31:0] inst_GENI_bits_pc,
             |    input [1:0] inst_GENI_bits_opcode,
             |    input [24:0] inst_GENI_bits_bits,
+            |    input [4:0] inst_GENI_bits_vd,
             |""".stripMargin.replaceAll("GENI", i.toString)
     }
 
@@ -117,12 +118,29 @@
         |    output logic [3:0] queue_capacity,
         |""".stripMargin.replaceAll("VSTART_LEN", (log2Ceil(vlen) - 1).toString)
 
+    // Per-lane rd_rob2rt_o outputs. NOTE(review): vector_csr_* ports are 1 bit wide here; confirm vs RvvConfigState.
+    for (i <- 0 until instructionLanes) {
+        moduleInterface += """
+            |    output rd_rob2rt_o_GENI_w_valid,
+            |    output [4:0] rd_rob2rt_o_GENI_w_index,
+            |    output [VLEN_MSB:0] rd_rob2rt_o_GENI_w_data,
+            |    output rd_rob2rt_o_GENI_w_type,
+            |    output [VLENB_MSB:0] rd_rob2rt_o_GENI_vd_type,
+            |    output rd_rob2rt_o_GENI_trap_flag,
+            |    output rd_rob2rt_o_GENI_vector_csr_vl,
+            |    output rd_rob2rt_o_GENI_vector_csr_vstart,
+            |    output rd_rob2rt_o_GENI_vector_csr_ma,
+            |    output rd_rob2rt_o_GENI_vector_csr_ta,
+            |    output rd_rob2rt_o_GENI_vector_csr_xrm,
+            |    output rd_rob2rt_o_GENI_vector_csr_sew,
+            |    output rd_rob2rt_o_GENI_vector_csr_lmul,
+            |    output [VLENB_MSB:0] rd_rob2rt_o_GENI_vxsaturate,""".stripMargin.replaceAll("GENI", i.toString).replaceAll("VLENB_MSB", (vlen / 8 - 1).toString).replaceAll("VLEN_MSB", (vlen - 1).toString)
+    }
 
     // Remove last comma/linebreak
-    moduleInterface = moduleInterface.dropRight(2)
+    moduleInterface = moduleInterface.dropRight(1)
     moduleInterface += "\n);\n"
 
-    // Inst valid
     var coreInstantiation = "  logic [GENN-1:0] inst_valid;\n".replaceAll(
             "GENN", instructionLanes.toString)
     for (i <- 0 until instructionLanes) {
@@ -222,6 +240,8 @@
     coreInstantiation += """  RVVConfigState config_state;
         |""".stripMargin
 
+    coreInstantiation += "  ROB2RT_t [3:0] rd_rob2rt_o;\n"
+
     coreInstantiation += """  RvvCore#(.N (GENN)) core(
         |      .clk(clk),
         |      .rstn(rstn),
@@ -261,11 +281,28 @@
         |      .config_state_valid(configStateValid),
         |      .config_state(config_state),
         |      .rvv_idle(rvv_idle),
-        |      .queue_capacity(queue_capacity)
+        |      .queue_capacity(queue_capacity),
+        |      .rd_rob2rt_o(rd_rob2rt_o)
         |""".stripMargin.replaceAll("GENN", instructionLanes.toString)
     coreInstantiation += "  );\n"
 
-    // Connect temp outputs
+    for (i <- 0 until instructionLanes) {
+      coreInstantiation += """  assign rd_rob2rt_o_GENI_w_valid = rd_rob2rt_o[GENI].w_valid;
+      |  assign rd_rob2rt_o_GENI_w_index = rd_rob2rt_o[GENI].w_index;
+      |  assign rd_rob2rt_o_GENI_w_data = rd_rob2rt_o[GENI].w_data;
+      |  assign rd_rob2rt_o_GENI_w_type = rd_rob2rt_o[GENI].w_type;
+      |  assign rd_rob2rt_o_GENI_vd_type = rd_rob2rt_o[GENI].vd_type;
+      |  assign rd_rob2rt_o_GENI_trap_flag = rd_rob2rt_o[GENI].trap_flag;
+      |  assign rd_rob2rt_o_GENI_vector_csr_vl = rd_rob2rt_o[GENI].vector_csr.vl;
+      |  assign rd_rob2rt_o_GENI_vector_csr_vstart = rd_rob2rt_o[GENI].vector_csr.vstart;
+      |  assign rd_rob2rt_o_GENI_vector_csr_ma = rd_rob2rt_o[GENI].vector_csr.ma;
+      |  assign rd_rob2rt_o_GENI_vector_csr_ta = rd_rob2rt_o[GENI].vector_csr.ta;
+      |  assign rd_rob2rt_o_GENI_vector_csr_xrm = rd_rob2rt_o[GENI].vector_csr.xrm;
+      |  assign rd_rob2rt_o_GENI_vector_csr_sew = rd_rob2rt_o[GENI].vector_csr.sew;
+      |  assign rd_rob2rt_o_GENI_vector_csr_lmul = rd_rob2rt_o[GENI].vector_csr.lmul;
+      |  assign rd_rob2rt_o_GENI_vxsaturate = rd_rob2rt_o[GENI].vxsaturate;
+      |""".stripMargin.replaceAll("GENI", i.toString)
+    }
     for (i <- 0 until instructionLanes) {
       coreInstantiation += "  assign inst_GENI_ready = inst_ready[GENI];\n".replaceAll("GENI", i.toString)
     }
@@ -307,6 +344,8 @@
 
     val async_rd = Decoupled(new RegfileWriteDataIO)
 
+    val rd_rob2rt_o = Vec(4, new Rob2Rt(p))
+
     val vcsr_valid = Output(Bool())
     val vcsr_vstart = Output(UInt(7.W))
     val vcsr_xrm = Output(UInt(2.W))
@@ -329,6 +368,7 @@
     val rvv_idle = Output(Bool())
     val queue_capacity = Output(UInt(4.W))
   })
+  dontTouch(io.rd_rob2rt_o)
 
   // Resources must be sorted topologically by dependency DAG
   addResource("hdl/verilog/rvv/inc/rvv_backend_config.svh")
@@ -407,6 +447,7 @@
   rvvCoreWrapper.io.rs <> io.rs
   rvvCoreWrapper.io.rd <> io.rd
   rvvCoreWrapper.io.async_rd <> io.async_rd
+  rvvCoreWrapper.io.rd_rob2rt_o <> io.rd_rob2rt_o
 
   rvvCoreWrapper.io.vstart := Mux(
       io.csr.vstart_write.valid, io.csr.vstart_write.bits, vstart)
diff --git a/hdl/chisel/src/kelvin/rvv/RvvDecode.scala b/hdl/chisel/src/kelvin/rvv/RvvDecode.scala
index 0ac2602..092cb1a 100644
--- a/hdl/chisel/src/kelvin/rvv/RvvDecode.scala
+++ b/hdl/chisel/src/kelvin/rvv/RvvDecode.scala
@@ -36,6 +36,7 @@
   val pc = UInt(32.W)
   val opcode = RvvCompressedOpcode()
   val bits = UInt(25.W)
+  val vd = UInt(5.W)
 
   def funct6(): UInt = {
     bits(24, 19)
@@ -75,6 +76,13 @@
     // TODO(derekjchow): Add all cases that write scalar rd.
   }
 
+  def writesVectorRegister(): Bool = {
+    // Loads always land in a vector register and stores never do. ALU ops do
+    // too, except vset*, whose result goes to the scalar rd instead.
+    val isLoad = opcode === RvvCompressedOpcode.RVVLOAD
+    isLoad || (opcode === RvvCompressedOpcode.RVVALU && !isVset())
+  }
+
   override def toPrintable: Printable = {
     cf"[opcode=$opcode, bits=$bits%b]"
   }
@@ -110,6 +118,7 @@
       _.bits.opcode -> new_opcode.bits,
       _.bits.pc -> pc,
       _.bits.bits -> bits,
+      _.bits.vd -> bits(4, 0),
     )
   }
 }
diff --git a/hdl/chisel/src/kelvin/rvv/RvvInterface.scala b/hdl/chisel/src/kelvin/rvv/RvvInterface.scala
index 1f6e8e5..0b12ae5 100644
--- a/hdl/chisel/src/kelvin/rvv/RvvInterface.scala
+++ b/hdl/chisel/src/kelvin/rvv/RvvInterface.scala
@@ -69,6 +69,21 @@
 
     val rvv_idle = Output(Bool())
     val queue_capacity = Output(UInt(4.W))
+
+    // ROB to RT stage writes.
+    val rd_rob2rt_o = Vec(4, new Rob2Rt(p))
+}
+
+
+class Rob2Rt(p: Parameters) extends Bundle {  // One rd_rob2rt_o entry; fields mirror the ROB2RT_t members flattened by the RvvCore wrapper.
+  val w_valid = Bool()  // Feeds the valid of the RetirementBuffer vector write port (see SCore)
+  val w_index = UInt(5.W)  // Destination register index
+  val w_data = UInt(p.rvvVlen.W)  // Write-back data, VLEN bits wide
+  val w_type = Bool()  // 0 for VRF, 1 for XRF
+  val vd_type = UInt(p.rvvVlenb.W)
+  val trap_flag = Bool()
+  val vector_csr = new RvvConfigState(p)
+  val vxsaturate = UInt(p.rvvVlenb.W)
 }
 
 class RvvCsrIO(p: Parameters) extends Bundle {
@@ -78,4 +93,4 @@
   val vstart_write = Input(Valid(UInt(log2Ceil(p.rvvVlen).W)))
   val vxrm_write = Input(Valid(UInt(2.W)))
   val vxsat_write = Input(Valid(Bool()))
-}
\ No newline at end of file
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Decode.scala b/hdl/chisel/src/kelvin/scalar/Decode.scala
index c5f5f77..c2f1435 100644
--- a/hdl/chisel/src/kelvin/scalar/Decode.scala
+++ b/hdl/chisel/src/kelvin/scalar/Decode.scala
@@ -279,6 +279,7 @@
     val rdMark  = Vec(p.instructionLanes, Flipped(new RegfileWriteAddrIO))
     val busRead = Vec(p.instructionLanes, Flipped(new RegfileBusAddrIO))
     val rdMark_flt = Option.when(p.enableFloat)(Flipped(new RegfileWriteAddrIO))
+    val rvvRdMark = Option.when(p.enableRvv)(Vec(p.instructionLanes, Flipped(new RegfileWriteAddrIO)))
 
     // ALU interface.
     val alu = Vec(p.instructionLanes, Valid(new AluCmd))
@@ -753,6 +754,13 @@
       io.rdMark_flt.get.addr := rdAddr(i)
     }
 
+    // Mark the destination vector register when an RVV instruction fires on this lane.
+    io.rvvRdMark.foreach { rdMark =>
+      val rvvInst = d.rvv.get.bits
+      rdMark(i).valid := io.rvv.get(i).fire && rvvInst.writesVectorRegister()
+      rdMark(i).addr := rvvInst.vd
+    }
+
     // Register file bus address port.
     // Pointer chasing bypass if immediate is zero.
     // Load/Store immediate selection keys off bit5, and RET off bit6.
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala
index 86c6299..83f4c54 100644
--- a/hdl/chisel/src/kelvin/scalar/SCore.scala
+++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -73,6 +73,14 @@
       retirement_buffer.get.io.writeDataScalar(i) := regfile.io.writeData(i)
     })
     dispatch.io.retirement_buffer_nSpace.get := retirement_buffer.get.io.nSpace
+    if (p.enableRvv) {
+      retirement_buffer.get.io.writeAddrVector.get := dispatch.io.rvvRdMark.get
+      for ((wr, rob) <- retirement_buffer.get.io.writeDataVector.get.zip(io.rvvcore.get.rd_rob2rt_o)) {
+        wr.valid := rob.w_valid && !rob.w_type  // only VRF retirements (w_type: 0 = VRF, 1 = XRF)
+        wr.bits.addr := rob.w_index
+        wr.bits.data := rob.w_data
+      }
+    }
   }
 
   if (p.useDebugModule) {
diff --git a/hdl/verilog/rvv/design/RvvCore.sv b/hdl/verilog/rvv/design/RvvCore.sv
index 883dcc7..f7f1642 100644
--- a/hdl/verilog/rvv/design/RvvCore.sv
+++ b/hdl/verilog/rvv/design/RvvCore.sv
@@ -77,7 +77,10 @@
 
   // Idle
   output logic rvv_idle,
-  output logic [$clog2(2*N + 1)-1:0] queue_capacity
+  output logic [$clog2(2*N + 1)-1:0] queue_capacity,
+
+  // Writeback from reorder buffer
+  output ROB2RT_t [`NUM_RT_UOP-1:0] rd_rob2rt_o
 );
   logic [N-1:0] frontend_cmd_valid;
   RVVCmd [N-1:0] frontend_cmd_data;
@@ -185,6 +188,9 @@
     trap_valid_rvs2rvv = 0;
   end
 
+  ROB2RT_t [`NUM_RT_UOP-1:0] rd_rob2rt;
+  assign rd_rob2rt_o = rd_rob2rt;
+
   logic   [`ISSUE_LANE-1:0] insts_ready_cq2rvs;
   rvv_backend backend(
       .clk(clk),
@@ -213,7 +219,8 @@
       .vcsr_valid(vcsr_valid),
       .vector_csr(vector_csr),
       .vcsr_ready(vcsr_ready),
-      .rvv_idle(rvv_idle)
+      .rvv_idle(rvv_idle),
+      .rd_rob2rt_o(rd_rob2rt)
   );
 
 endmodule
diff --git a/hdl/verilog/rvv/design/rvv_backend.sv b/hdl/verilog/rvv/design/rvv_backend.sv
index 090412b..a4598f2 100755
--- a/hdl/verilog/rvv/design/rvv_backend.sv
+++ b/hdl/verilog/rvv/design/rvv_backend.sv
@@ -38,7 +38,9 @@
     vector_csr,
     vcsr_ready,
 
-    rvv_idle
+    rvv_idle,
+
+    rd_rob2rt_o
 );
 // global signal
     input   logic                                     clk;
@@ -81,6 +83,7 @@
 
 // rvv_backend is not active.(IDLE)
     output  logic                                     rvv_idle;
+    output ROB2RT_t [`NUM_RT_UOP-1:0]                 rd_rob2rt_o;
 
 `ifdef TB_BRINGUP
   // inst queue
@@ -993,6 +996,7 @@
 
   // rvv_backend IDLE
   assign rvv_idle = fifo_empty_cq2de&uq_empty&rob_empty;
+  assign rd_rob2rt_o = rd_rob2rt;
 
 `endif // TB_BRINGUP
 
diff --git a/tests/systemc/instruction_trace.cc b/tests/systemc/instruction_trace.cc
index 4521e0f..2dffa3b 100644
--- a/tests/systemc/instruction_trace.cc
+++ b/tests/systemc/instruction_trace.cc
@@ -79,7 +79,11 @@
     for (auto& in : retirement_buffer_) {
       if (in.completed) continue;
       if (valid && (addr == in.reg)) {
-        in.data = data;
+        in.data.resize(4);
+        in.data[0] = (data >> 24) & 0xff;
+        in.data[1] = (data >> 16) & 0xff;
+        in.data[2] = (data >> 8) & 0xff;
+        in.data[3] = data & 0xff;
         in.completed = true;
         break;
       }
@@ -104,7 +108,9 @@
   }
 }
 
-void InstructionTrace::TraceInstructionRaw(uint32_t pc, uint32_t inst, uint32_t reg, uint32_t data) {
+void InstructionTrace::TraceInstructionRaw(uint32_t pc, uint32_t inst,
+                                           uint32_t reg,
+                                           const std::vector<uint8_t>& data) {
   Instruction in(pc, inst, reg);
   in.data = data;
   committed_insts_.push_back(in);
@@ -112,6 +118,10 @@
 
 void InstructionTrace::PrintTrace() const {
   for (auto& inst : committed_insts_) {
-    printf("0x%08x,0x%08x,0x%02x,0x%08x\n", inst.pc, inst.inst, inst.reg, inst.data);
+    printf("0x%08x,0x%08x,0x%02x,0x", inst.pc, inst.inst, inst.reg);
+    for (auto d : inst.data) {
+      printf("%02x", d);
+    }
+    printf("\n");
   }
 }
diff --git a/tests/systemc/instruction_trace.h b/tests/systemc/instruction_trace.h
index 5912299..8699d8a 100644
--- a/tests/systemc/instruction_trace.h
+++ b/tests/systemc/instruction_trace.h
@@ -33,7 +33,8 @@
     const std::vector<uint32_t>& writeDataAddrs,
     const std::vector<uint32_t>& writeDataDatas,
     const std::vector<int>& executeRegBases);
-  void TraceInstructionRaw(uint32_t pc, uint32_t inst, uint32_t reg, uint32_t data);
+  void TraceInstructionRaw(uint32_t pc, uint32_t inst, uint32_t reg,
+                           const std::vector<uint8_t>& data);
   void PrintTrace() const;
 
   static const int kScalarBaseReg = 0;
@@ -49,13 +50,12 @@
       pc(pc),
       inst(inst),
       reg(reg),
-      data(0),
       completed(false) {}
 
     uint32_t pc;
     uint32_t inst;
     uint32_t reg;
-    uint32_t data;
+    std::vector<uint8_t> data;
     bool completed;
   };
   std::vector<Instruction> committed_insts_;
diff --git a/tests/verilator_sim/kelvin/core_mini_axi_tb.cc b/tests/verilator_sim/kelvin/core_mini_axi_tb.cc
index 90862c3..4ab9b61 100644
--- a/tests/verilator_sim/kelvin/core_mini_axi_tb.cc
+++ b/tests/verilator_sim/kelvin/core_mini_axi_tb.cc
@@ -397,13 +397,22 @@
 void CoreMiniAxi_tb::TraceInstructions() {
 #if (KP_useRetirementBuffer == true)
 #define TRACE_INSTRUCTION(x) do { \
-  uint32_t pc, inst, idx, data; \
+  uint32_t pc, inst, idx; \
   pc = debug_io_.rb_inst_##x##_bits_pc.read().get_word(0); \
   inst = debug_io_.rb_inst_##x##_bits_inst.read().get_word(0); \
   idx = debug_io_.rb_inst_##x##_bits_idx.read().get_word(0); \
-  data = debug_io_.rb_inst_##x##_bits_data.read().get_word(0); \
   if (debug_io_.rb_inst_##x##_valid.read()) { \
-    tracer_.TraceInstructionRaw(pc, inst, idx, data); \
+    auto data = debug_io_.rb_inst_##x##_bits_data.read(); \
+    std::vector<uint8_t> data_vec(data.length() / 8); \
+    int num_words = data.length() / 32; \
+    for (int i = 0; i < num_words; ++i) { \
+      uint32_t word = data.get_word((num_words - 1) - i); \
+      data_vec[i*4+0] = (word >> 24) & 0xff; \
+      data_vec[i*4+1] = (word >> 16) & 0xff; \
+      data_vec[i*4+2] = (word >> 8) & 0xff; \
+      data_vec[i*4+3] = word & 0xff; \
+    } \
+    tracer_.TraceInstructionRaw(pc, inst, idx, data_vec); \
   } \
 } while (0);
 REPEAT(TRACE_INSTRUCTION, KP_retirementBufferSize);
diff --git a/tests/verilator_sim/kelvin/core_mini_axi_tb.h b/tests/verilator_sim/kelvin/core_mini_axi_tb.h
index 40ebfbf..70d1a79 100644
--- a/tests/verilator_sim/kelvin/core_mini_axi_tb.h
+++ b/tests/verilator_sim/kelvin/core_mini_axi_tb.h
@@ -130,14 +130,20 @@
     sc_signal<sc_bv<32>> float_writeData_1_bits_data;
 #endif
 #if (KP_useRetirementBuffer == true)
+#if (KP_enableRvv == true)
+#define RB_DEBUG_IO_DATA_WIDTH KP_rvvVlen
+#else
+#define RB_DEBUG_IO_DATA_WIDTH 32
+#endif
 #define RB_DEBUG_IO(x) \
   sc_signal<bool> rb_inst_##x##_valid; \
   sc_signal<sc_bv<32>> rb_inst_##x##_bits_pc; \
   sc_signal<sc_bv<32>> rb_inst_##x##_bits_inst; \
   sc_signal<sc_bv<KP_retirementBufferIdxWidth>> rb_inst_##x##_bits_idx; \
-  sc_signal<sc_bv<32>> rb_inst_##x##_bits_data;
+  sc_signal<sc_bv<RB_DEBUG_IO_DATA_WIDTH>> rb_inst_##x##_bits_data;
   REPEAT(RB_DEBUG_IO, KP_retirementBufferSize);
 #undef RB_DEBUG_IO
+#undef RB_DEBUG_IO_DATA_WIDTH
 #endif
   };