Add minstret CSR

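- Track proxy signals for retiring instructions (scalar and vector
  register-file writes, scalar and vector stores, and taken branches)
  and use these to populate the value of the 64-bit minstret register.
- Expose the register through the minstret (0xB02) and minstreth
  (0xB82) CSRs.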

Change-Id: Idfe9046d17520463c27c5214e0e8d045eba8ea13
diff --git a/hdl/chisel/src/kelvin/scalar/Csr.scala b/hdl/chisel/src/kelvin/scalar/Csr.scala
index 346c9dc..9536f11 100644
--- a/hdl/chisel/src/kelvin/scalar/Csr.scala
+++ b/hdl/chisel/src/kelvin/scalar/Csr.scala
@@ -44,6 +44,14 @@
   val out = new CsrOutIO(p)
 }
 
+class CsrCounters(p: Parameters) extends Bundle {  // Event counts the CSR block adds to minstret.
+  val rfwriteCount = UInt(3.W)   // SCore: regfile.io.rfwriteCount (6 bits there; TODO confirm).
+  val storeCount = UInt(2.W)     // SCore: lsu.io.storeCount.
+  val branchCount = UInt(1.W)    // SCore: bru(0).io.taken.valid.
+  val vrfwriteCount = UInt(3.W)  // SCore: io.vcore.vrfwriteCount.
+  val vstoreCount = UInt(2.W)    // SCore: io.vcore.vstoreCount.
+}
+
 class CsrBruIO(p: Parameters) extends Bundle {
   val in = new Bundle {
     val mode   = Valid(Bool())
@@ -88,6 +96,8 @@
     // Vector core.
     val vcore = Input(new Bundle { val undef = Bool() })
 
+    val counters = Input(new CsrCounters(p))
+
     // Pipeline Control.
     val halted = Output(Bool())
     val fault  = Output(Bool())
@@ -132,6 +142,7 @@
   val mhartid   = RegInit(0.U(32.W))
 
   val mcycle    = RegInit(0.U(64.W))
+  val minstret  = RegInit(0.U(64.W))
 
   // 32-bit MXLEN, I,M,X extensions
   val misa      = RegInit(0x40801100.U(32.W))
@@ -170,7 +181,9 @@
   val mspEn       = index === 0x7E1.U
   // M-mode performance CSRs.
   val mcycleEn    = index === 0xB00.U
+  val minstretEn  = index === 0xB02.U
   val mcyclehEn   = index === 0xB80.U
+  val minstrethEn = index === 0xB82.U
   // M-mode information CSRs.
   val mvendoridEn = index === 0xF11.U
   val marchidEn   = index === 0xF12.U
@@ -231,6 +244,8 @@
               MuxOR(mspEn,        msp) |
               MuxOR(mcycleEn,     mcycle(31,0)) |
               MuxOR(mcyclehEn,    mcycle(63,32)) |
+              MuxOR(minstretEn,   minstret(31,0)) |
+              MuxOR(minstrethEn,  minstret(63,32)) |
               MuxOR(mvendoridEn,  mvendorid) |
               MuxOR(marchidEn,    marchid) |
               MuxOR(mimpidEn,     mimpid) |
@@ -274,6 +289,17 @@
   val mcycle_t = Cat(mcycle_th, mcycle_tl)
   mcycle := Mux(valid, mcycle_t, mcycle) + 1.U
 
+  // When `valid` is set, wdata replaces the addressed 32-bit half; io.counters is then added.
+  val minstret_th = Mux(minstrethEn, wdata, minstret(63,32))
+  val minstret_tl = Mux(minstretEn, wdata, minstret(31,0))
+  val minstret_t = Cat(minstret_th, minstret_tl)
+  minstret := Mux(valid, minstret_t, minstret) +
+    io.counters.rfwriteCount +
+    io.counters.storeCount +
+    io.counters.branchCount +
+    io.counters.vrfwriteCount +
+    io.counters.vstoreCount
+
   when (io.bru.in.mode.valid) {
     mode := io.bru.in.mode.bits
   }
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 92b8628..65a1751 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -107,6 +107,8 @@
 
     // Vector switch.
     val vldst = Output(Bool())
+
+    val storeCount = Output(UInt(2.W))
   })
 
   val lsu = new LsuOp()
@@ -222,6 +224,11 @@
   assert(!(io.ubus.valid && io.dbus.addr(31)))
   assert(!(io.ubus.valid && io.dbus.adrx(31)))
 
+  io.storeCount := PopCount(Cat(
+    io.dbus.valid && io.dbus.write,
+    io.ubus.valid && io.ubus.write
+  ))
+
   io.flush.valid  := ctrl.io.out.valid && (ctrl.io.out.bits.fencei || ctrl.io.out.bits.flushat || ctrl.io.out.bits.flushall)
   io.flush.all    := ctrl.io.out.bits.fencei || ctrl.io.out.bits.flushall
   io.flush.clean  := true.B
diff --git a/hdl/chisel/src/kelvin/scalar/Regfile.scala b/hdl/chisel/src/kelvin/scalar/Regfile.scala
index 4179602..6629505 100644
--- a/hdl/chisel/src/kelvin/scalar/Regfile.scala
+++ b/hdl/chisel/src/kelvin/scalar/Regfile.scala
@@ -95,6 +95,8 @@
       val regd = Output(UInt(32.W))
       val comb = Output(UInt(32.W))
     }
+
+    val rfwriteCount = Output(UInt(6.W))
   })
 
 
@@ -177,6 +179,17 @@
     }
   }
 
+  // We care if someone tried to write x0 (e.g. nop is encoded this way), but want
+  // it separate for above mentioned optimization.
+  val x0 =
+    (0 until 4).map(x =>
+      io.writeData(x).valid &&
+      io.writeData(x).addr === 0.U &&
+      !io.writeMask(x).valid) ++
+    (4 until 6).map(x => io.writeData(x).valid && io.writeData(x).addr === 0.U)
+
+  io.rfwriteCount := PopCount(writeValid) - writeValid(0) + PopCount(x0)
+
   // ***************************************************************************
   // Read ports with write forwarding.
   // ***************************************************************************
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala
index 0926de8..8dbdabd 100644
--- a/hdl/chisel/src/kelvin/scalar/SCore.scala
+++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -172,6 +172,13 @@
 
   io.iflush.valid := iflush
 
+  // Instruction counters
+  csr.io.counters.rfwriteCount := regfile.io.rfwriteCount
+  csr.io.counters.storeCount := lsu.io.storeCount
+  csr.io.counters.branchCount := bru(0).io.taken.valid
+  csr.io.counters.vrfwriteCount := io.vcore.vrfwriteCount
+  csr.io.counters.vstoreCount := io.vcore.vstoreCount
+
   // ---------------------------------------------------------------------------
   // Control Status Unit
   csr.io.csr <> io.csr
diff --git a/hdl/chisel/src/kelvin/vector/VCore.scala b/hdl/chisel/src/kelvin/vector/VCore.scala
index 029900a..58bbab6 100644
--- a/hdl/chisel/src/kelvin/vector/VCore.scala
+++ b/hdl/chisel/src/kelvin/vector/VCore.scala
@@ -39,6 +39,9 @@
 
   // Faults.
   val undef = Output(Bool())
+
+  val vrfwriteCount = Output(UInt(3.W))
+  val vstoreCount = Output(UInt(2.W))
 }
 
 class VCore(p: Parameters) extends Module {
@@ -72,6 +75,9 @@
   val vst    = VSt(p)
   val vrf    = VRegfile(p)
 
+  io.score.vrfwriteCount := vrf.io.vrfwriteCount
+  io.score.vstoreCount := vst.io.vstoreCount +& vldst.io.vstoreCount  // +& widens to 2 bits; + wraps 1+1 to 0.
+
   vinst.io.in <> io.score.vinst
   vinst.io.rs <> io.score.rs
   vinst.io.rd <> io.score.rd
diff --git a/hdl/chisel/src/kelvin/vector/VLdSt.scala b/hdl/chisel/src/kelvin/vector/VLdSt.scala
index df6fadc..d2d9853 100644
--- a/hdl/chisel/src/kelvin/vector/VLdSt.scala
+++ b/hdl/chisel/src/kelvin/vector/VLdSt.scala
@@ -41,8 +41,11 @@
     // Bus.
     val dbus = new DBusIO(p)
     val last = Output(Bool())
+
+    val vstoreCount = Output(UInt(1.W))
   })
 
+
   // A usable amount of outstanding transactions.
   val cmdqDepth = 8
 
@@ -253,6 +256,7 @@
   ctrl.io.in.bits.write := q.io.out.bits.IsStore()
   ctrl.io.in.bits.widx  := q.io.out.bits.vd.addr
   assert(!(ctrl.io.in.valid && !ctrl.io.in.ready))
+  io.vstoreCount := ctrl.io.in.valid && ctrl.io.in.ready  // NOTE(review): not gated on bits.write; confirm loads should count.
 
   data.io.in.valid := rdataEn
   data.io.in.bits.wdata := Swizzle(false, 8, rdataAshf, io.read.data)
diff --git a/hdl/chisel/src/kelvin/vector/VRegfile.scala b/hdl/chisel/src/kelvin/vector/VRegfile.scala
index e9f74a5..ac67ff0 100644
--- a/hdl/chisel/src/kelvin/vector/VRegfile.scala
+++ b/hdl/chisel/src/kelvin/vector/VRegfile.scala
@@ -125,6 +125,7 @@
     val conv = Flipped(new VRegfileConvIO(p))
     val transpose = Flipped(new VRegfileTransposeIO(p))
     val vrfsb = Flipped(new VRegfileScoreboardIO)
+    val vrfwriteCount = Output(UInt(3.W))
   })
 
   val segcnt = p.vectorBits / 32
@@ -176,6 +177,8 @@
     }
   }
 
+  io.vrfwriteCount := writevalid(0)
+
   // ---------------------------------------------------------------------------
   // Write ports.
   for (i <- 0 until writePorts) {
diff --git a/hdl/chisel/src/kelvin/vector/VSt.scala b/hdl/chisel/src/kelvin/vector/VSt.scala
index 9dcd9de..f730fec 100644
--- a/hdl/chisel/src/kelvin/vector/VSt.scala
+++ b/hdl/chisel/src/kelvin/vector/VSt.scala
@@ -42,6 +42,8 @@
 
     // Status.
     val nempty = Output(Bool())
+
+    val vstoreCount = Output(UInt(1.W))
   })
 
   // A usable depth of outstanding commands.
@@ -299,6 +301,8 @@
   assert(io.axi.addr.valid === io.axi.data.valid)
   assert(io.axi.addr.ready === io.axi.data.ready)
 
+  io.vstoreCount := ctrl.io.out.valid
+
   // ---------------------------------------------------------------------------
   // Active.
   io.active := q.io.active
diff --git a/tests/verilator_sim/kelvin/vldst_tb.cc b/tests/verilator_sim/kelvin/vldst_tb.cc
index 83e1edf..9539c6a 100644
--- a/tests/verilator_sim/kelvin/vldst_tb.cc
+++ b/tests/verilator_sim/kelvin/vldst_tb.cc
@@ -186,6 +186,7 @@
   sc_in<sc_bv<kVector / 8> > io_dbus_wmask;
   sc_out<sc_bv<kVector> > io_dbus_rdata;
   sc_in<bool> io_last;
+  sc_in<bool> io_vstoreCount;
 
   using Sysc_tb::Sysc_tb;
 
@@ -722,6 +723,7 @@
   sc_signal<sc_bv<kVector / 8> > io_dbus_wmask;
   sc_signal<sc_bv<kVector> > io_dbus_rdata;
   sc_signal<bool> io_last;
+  sc_signal<bool> io_vstoreCount;
 
   VLdSt_tb tb("VLdSt_tb", loops, true /* random */);
   VVLdSt ldst(name);
@@ -891,6 +893,7 @@
   BIND2(tb, ldst, io_dbus_wmask);
   BIND2(tb, ldst, io_dbus_rdata);
   BIND2(tb, ldst, io_last);
+  BIND2(tb, ldst, io_vstoreCount);
 
   if (trace) {
     tb.trace(ldst);
diff --git a/tests/verilator_sim/kelvin/vregfile_tb.cc b/tests/verilator_sim/kelvin/vregfile_tb.cc
index cf45955..25218d5 100644
--- a/tests/verilator_sim/kelvin/vregfile_tb.cc
+++ b/tests/verilator_sim/kelvin/vregfile_tb.cc
@@ -97,6 +97,7 @@
   sc_out<bool> io_vrfsb_set_valid;
   sc_out<sc_bv<128> > io_vrfsb_set_bits;
   sc_in<sc_bv<128> > io_vrfsb_data;
+  sc_in<sc_bv<3> > io_vrfwriteCount;
 
   using Sysc_tb::Sysc_tb;
 
@@ -594,6 +595,7 @@
   sc_signal<sc_bv<128> > io_vrfsb_set_bits;
   sc_signal<sc_bv<128> > io_vrfsb_data;
   sc_signal<bool> io_vrfsb_set_valid;
+  sc_signal<sc_bv<3> > io_vrfwriteCount;
 
   VRegfile_tb tb("VRegfile_tb", loops, true /*random*/);
   VVRegfile vrf(name);
@@ -684,6 +686,7 @@
   BIND2(tb, vrf, io_vrfsb_set_valid);
   BIND2(tb, vrf, io_vrfsb_set_bits);
   BIND2(tb, vrf, io_vrfsb_data);
+  BIND2(tb, vrf, io_vrfwriteCount);
 
   tb.start();
 }
diff --git a/tests/verilator_sim/kelvin/vst_tb.cc b/tests/verilator_sim/kelvin/vst_tb.cc
index f425a10..79567e7 100644
--- a/tests/verilator_sim/kelvin/vst_tb.cc
+++ b/tests/verilator_sim/kelvin/vst_tb.cc
@@ -92,6 +92,7 @@
   sc_in<bool> io_axi_resp_ready;
   sc_out<bool> io_axi_resp_valid;
   sc_in<bool> io_nempty;
+  sc_in<bool> io_vstoreCount;
   sc_out<sc_bv<7> > io_in_bits_0_bits_op;
   sc_out<sc_bv<3> > io_in_bits_0_bits_f2;
   sc_out<sc_bv<3> > io_in_bits_0_bits_sz;
@@ -572,6 +573,7 @@
   sc_signal<bool> io_axi_resp_ready;
   sc_signal<bool> io_axi_resp_valid;
   sc_signal<bool> io_nempty;
+  sc_signal<bool> io_vstoreCount;
   sc_signal<sc_bv<7> > io_in_bits_0_bits_op;
   sc_signal<sc_bv<3> > io_in_bits_0_bits_f2;
   sc_signal<sc_bv<3> > io_in_bits_0_bits_sz;
@@ -741,6 +743,7 @@
   BIND2(tb, st, io_axi_resp_ready);
   BIND2(tb, st, io_axi_resp_valid);
   BIND2(tb, st, io_nempty);
+  BIND2(tb, st, io_vstoreCount);
   BIND2(tb, st, io_in_bits_0_bits_op);
   BIND2(tb, st, io_in_bits_0_bits_f2);
   BIND2(tb, st, io_in_bits_0_bits_sz);