Respect queue counts in Lsu and Rvv.

Use the queue counts from Lsu and Rvv instead of
ready/valid to determine if an instruction can be issued.

Change-Id: I2f3915f702ab465c641a0930340e56c326905129
diff --git a/hdl/chisel/src/kelvin/rvv/RvvCore.scala b/hdl/chisel/src/kelvin/rvv/RvvCore.scala
index 43503b2..f039c91 100644
--- a/hdl/chisel/src/kelvin/rvv/RvvCore.scala
+++ b/hdl/chisel/src/kelvin/rvv/RvvCore.scala
@@ -113,6 +113,7 @@
         |    output [2:0] configSew,
         |    output [2:0] configLmul,
         |    output logic rvv_idle,
+        |    output logic [3:0] queue_capacity,
         |""".stripMargin.replaceAll("VSTART_LEN", (log2Ceil(vlen) - 1).toString)
 
 
@@ -257,7 +258,8 @@
         |      .vcsr_ready(vcsr_ready),
         |      .config_state_valid(configStateValid),
         |      .config_state(config_state),
-        |      .rvv_idle(rvv_idle)
+        |      .rvv_idle(rvv_idle),
+        |      .queue_capacity(queue_capacity)
         |""".stripMargin.replaceAll("GENN", instructionLanes.toString)
     coreInstantiation += "  );\n"
 
@@ -322,6 +324,7 @@
     val configSew = Output(UInt(3.W))
     val configLmul = Output(UInt(3.W))
     val rvv_idle = Output(Bool())
+    val queue_capacity = Output(UInt(4.W))
   })
 
   // Resources must be sorted topologically by dependency DAG
@@ -420,6 +423,7 @@
   io.configState.bits.sew     := rvvCoreWrapper.io.configSew
   io.configState.bits.lmul    := rvvCoreWrapper.io.configLmul
   io.rvv_idle                 := rvvCoreWrapper.io.rvv_idle
+  io.queue_capacity           := rvvCoreWrapper.io.queue_capacity
 
   val vstart_wdata = MuxCase(vstart, Seq(
       rvvCoreWrapper.io.vcsr_valid -> rvvCoreWrapper.io.vcsr_vstart,
@@ -436,3 +440,3 @@
  io.csr.vstart := vstart
  io.csr.vxrm := vxrm
 }
diff --git a/hdl/chisel/src/kelvin/rvv/RvvInterface.scala b/hdl/chisel/src/kelvin/rvv/RvvInterface.scala
index 6c68488..babad57 100644
--- a/hdl/chisel/src/kelvin/rvv/RvvInterface.scala
+++ b/hdl/chisel/src/kelvin/rvv/RvvInterface.scala
@@ -68,6 +68,7 @@
     val csr = new RvvCsrIO(p)
 
     val rvv_idle = Output(Bool())
+    val queue_capacity = Output(UInt(4.W))
 }
 
 class RvvCsrIO(p: Parameters) extends Bundle {
diff --git a/hdl/chisel/src/kelvin/scalar/Decode.scala b/hdl/chisel/src/kelvin/scalar/Decode.scala
index 5e98079..6e05ab4 100644
--- a/hdl/chisel/src/kelvin/scalar/Decode.scala
+++ b/hdl/chisel/src/kelvin/scalar/Decode.scala
@@ -192,7 +192,14 @@
   def isFloatStore(): Bool = {
     float.map(f => f.valid && f.bits.opcode === FloatOpcode.STOREFP).getOrElse(false.B)
   }
-  def isLsu(): Bool = { isScalarLoad() || isScalarStore() || vld || vst || flushat || flushall || isFloatLoad() || isFloatStore() }
+  def isLsu(): Bool = {
+      isScalarLoad() || isScalarStore() || vld || vst || flushat || flushall ||
+      isFloatLoad() || isFloatStore() || (if (p.enableRvv) {
+        rvv.get.valid && rvv.get.bits.isLoadStore()
+      } else {
+        false.B
+      })
+  }
   def isMul(): Bool = { mul || mulh || mulhsu || mulhu }
   def isDvu(): Bool = { div || divu || rem || remu }
   def isVector(): Bool = { vld || vst || viop || getvl || getmaxvl }
@@ -284,6 +291,7 @@
 
     // LSU interface.
     val lsu = Vec(p.instructionLanes, Decoupled(new LsuCmd(p)))
+    val lsuQueueCapacity = Input(UInt(3.W))
 
     // Multiplier interface.
     val mlu = Vec(p.instructionLanes, Decoupled(new MluCmd))
@@ -296,6 +304,7 @@
         Vec(p.instructionLanes, Decoupled(new RvvCompressedInstruction)))
     val rvvState = Option.when(p.enableRvv)(Input(Valid(new RvvConfigState(p))))
     val rvvIdle = Option.when(p.enableRvv)(Input(Bool()))
+    val rvvQueueCapacity = Option.when(p.enableRvv)(Input(UInt(4.W)))
 
     // Vector interface, to maintain interface compatibility with old dispatch
     // unit.
@@ -409,9 +418,9 @@
   val fence = decodedInsts.map(x => x.isFency() && (io.mactive || io.lsuActive))
 
   // ---------------------------------------------------------------------------
-  // Rvv interlock rules
+  // Rvv config interlock rules
   // RVV Load store unit requires valid config state on dispatch.
-  val rvvInterlock = if (p.enableRvv) {
+  val rvvConfigInterlock = if (p.enableRvv) {
     val configChange = decodedInsts.map(
         x => x.rvv.get.valid && x.rvv.get.bits.isVset())
     val configInvalid = configChange.scan(!io.rvvState.get.valid)(_ || _)
@@ -426,19 +435,29 @@
   }
 
   // ---------------------------------------------------------------------------
-  // Load/Store interlock
-  // Only dispatch one RVV load/store per cycle.
-  // TODO(derekjchow): Relax this when LsuV2 can accept multiple ops
-  val rvvLsuInterlock = if (p.enableRvv) {
-    val isRvvLsu = decodedInsts.map(
-        x => x.rvv.get.valid && x.rvv.get.bits.isLoadStore())
-    val rvvSet = isRvvLsu.scan(false.B)(_ || _)
-    rvvSet.map(!_).take(p.instructionLanes)
+  // Rvv Interlock
+  val rvvInterlock = if (p.enableRvv) {
+    val isRvv = decodedInsts.map(x => x.rvv.get.valid)
+    val isRvvCount = isRvv.scan(0.U(4.W))(_+_)
+    (0 until p.instructionLanes).map(
+        i => isRvvCount(i) < io.rvvQueueCapacity.get)
   } else {
     Seq.fill(p.instructionLanes)(true.B)
   }
 
   // ---------------------------------------------------------------------------
+  // LSU Interlock
+  val isLsu = decodedInsts.map(x => x.isLsu())
+  val isLsuCount = isLsu.scan(0.U(4.W))(_+_)
+  val lsuInterlock = if (p.useLsuV2) {
+      (0 until p.instructionLanes).map(
+          i => isLsuCount(i) < io.lsuQueueCapacity)
+  } else {
+    // For LSU V1, backpressure from ready/valid handshake to interlock LSU.
+    Seq.fill(p.instructionLanes)(true.B)
+  }
+
+  // ---------------------------------------------------------------------------
   // Undef
   // Ensure undef op is only handled in the first slot
   val undefInterlock = (0 until p.instructionLanes).map(i =>
@@ -478,8 +497,9 @@
      !floatWriteAfterWrite(i) && // Avoid WAW hazards
      !branchInterlock(i) && // Only branch/alu can be dispatched after a branch
      !fence(i) &&           // Don't dispatch if fence interlocked
-      rvvInterlock(i) &&     // Rvv interlock rules
-      rvvLsuInterlock(i) &&  // Dispatch only one Rvv LsuOp
+      rvvConfigInterlock(i) &&     // Rvv config interlock rules
+      lsuInterlock(i) && // Ensure lsu instructions can be dispatched into queue
+      rvvInterlock(i) && // Ensure rvv instructions can be dispatched into queue
      !undefInterlock(i) &&     // Ensure undef is only dispatched from first slot
      io.retirement_buffer_nSpace.map(x => i.U < x).getOrElse(true.B) && // Retirement buffer needs space for our slot
      singleStepInterlock(i)
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index e3e390f..8cbe82c 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -57,6 +57,7 @@
     val rvvState = Option.when(p.enableRvv)(Input(Valid(new RvvConfigState(p))))
 
     val storeCount = Output(UInt(2.W))
+    val queueCapacity = Output(UInt(3.W))
     val active = Output(Bool())
   })
 }
@@ -943,6 +944,8 @@
   val io_rd_flt_pipe = Pipe(io_rd_flt_pre_pipe, p.lsuDelayPipelineLen)
   io.rd_flt := io_rd_flt_pipe
 
+  io.queueCapacity := 0.U
+
   assert(!ctrl.io.out.valid || PopCount(Cat(ctrl.io.out.bits.fldst, ctrl.io.out.bits.sldst, ctrl.io.out.bits.vldst)) <= 1.U)
   assert(!data.io.out.valid || PopCount(Cat(data.io.out.bits.fldst, data.io.out.bits.sldst)) <= 1.U)
 }
@@ -953,6 +956,7 @@
   io.storeCount := 0.U
 
   val opQueue = Module(new Queue(new LsuUOp(p), 4))
+  io.queueCapacity := opQueue.entries.U - opQueue.io.count
 
   // Flush state
   // DispatchV2 will only flush on first slot, when LSU is inactive.
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala
index 2a7d1ad..23a2048 100644
--- a/hdl/chisel/src/kelvin/scalar/SCore.scala
+++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -120,6 +120,7 @@
   dispatch.io.halted := csr.io.halted || csr.io.wfi || csr.io.dm.map(_.debug_mode).getOrElse(false.B)
   dispatch.io.mactive := io.vcore.map(_.mactive).getOrElse(false.B)
   dispatch.io.lsuActive := lsu.io.active
+  dispatch.io.lsuQueueCapacity := lsu.io.queueCapacity
   dispatch.io.scoreboard.comb := regfile.io.scoreboard.comb
   dispatch.io.scoreboard.regd := regfile.io.scoreboard.regd
   dispatch.io.branchTaken := branchTaken
@@ -422,6 +423,7 @@
     dispatch.io.rvv.get <> io.rvvcore.get.inst
     dispatch.io.rvvState.get := io.rvvcore.get.configState
     dispatch.io.rvvIdle.get := io.rvvcore.get.rvv_idle
+    dispatch.io.rvvQueueCapacity.get := io.rvvcore.get.queue_capacity
 
     // Register inputs
     io.rvvcore.get.rs := regfile.io.readData
diff --git a/hdl/verilog/rvv/design/RvvCore.sv b/hdl/verilog/rvv/design/RvvCore.sv
index d1b9c8b..86416b5 100644
--- a/hdl/verilog/rvv/design/RvvCore.sv
+++ b/hdl/verilog/rvv/design/RvvCore.sv
@@ -76,11 +76,12 @@
  output RVVConfigState config_state,

  // Idle
-  output logic rvv_idle
+  output logic rvv_idle,
+  output logic [$clog2(2*N + 1)-1:0] queue_capacity
);
  logic [N-1:0] frontend_cmd_valid;
  RVVCmd [N-1:0] frontend_cmd_data;
-  logic [$clog2(2*N + 1)-1:0] queue_capacity;
+  logic [$clog2(2*N + 1)-1:0] queue_capacity_internal;
  RvvFrontEnd#(.N(N)) frontend(
      .clk(clk),
      .rstn(rstn),
@@ -97,19 +100,20 @@
      .reg_write_data_o(reg_write_data),
      .cmd_valid_o(frontend_cmd_valid),
      .cmd_data_o(frontend_cmd_data),
-      .queue_capacity_i(queue_capacity),
+      .queue_capacity_i(queue_capacity_internal),
+      .queue_capacity_o(queue_capacity),
      .config_state_valid(config_state_valid),
      .config_state(config_state)
  );

  // Backpressure from backend fifo
  logic   [$clog2(`CQ_DEPTH):0] remaining_count_cq2rvs;
  // Back-pressure frontend
  always_comb begin
    if (remaining_count_cq2rvs > 2*N) begin
-      queue_capacity = 2*N;
+      queue_capacity_internal = 2*N;
    end else begin
-      queue_capacity = remaining_count_cq2rvs;
+      queue_capacity_internal = remaining_count_cq2rvs;
    end
  end
 
diff --git a/hdl/verilog/rvv/design/RvvFrontEnd.sv b/hdl/verilog/rvv/design/RvvFrontEnd.sv
index c37894b..5ef24c5 100644
--- a/hdl/verilog/rvv/design/RvvFrontEnd.sv
+++ b/hdl/verilog/rvv/design/RvvFrontEnd.sv
@@ -49,6 +49,7 @@
   output logic [N-1:0] cmd_valid_o,
   output RVVCmd [N-1:0] cmd_data_o,
   input logic [CAPACITYBITS-1:0] queue_capacity_i,  // Number of elements that can be enqueued
+  output logic [CAPACITYBITS-1:0] queue_capacity_o,
 
   // Config state
   output config_state_valid,
@@ -62,8 +63,9 @@
   RVVConfigState config_state_q;
 
   // Instructions to assemble into commands
-  logic [N-1:0] valid_inst_q;
-  RVVInstruction inst_q [N-1:0];
+  logic [N-1:0] valid_inst_q;     // If the instruction in this slot is valid
+  count_t valid_inst_count_q;     // The sum of valid_inst_q
+  RVVInstruction inst_q [N-1:0];  // The instruction in the slot
 
   // Backpressure
   count_t valid_in_psum [N:0];
@@ -85,22 +87,33 @@
   assign config_state_valid = config_state_reduction;
   assign config_state = config_state_q;
 
+  logic [CAPACITYBITS-1:0] queue_capacity;
+  assign queue_capacity_o = queue_capacity;
+  always_comb begin
+    queue_capacity = queue_capacity_i - valid_inst_count_q;
+  end
+
   logic inst_accepted [N-1:0];
+  count_t valid_inst_count_d;
   always_comb begin
     for (int i = 0; i < N; i++) begin
-      inst_accepted[i] = (valid_in_psum[i] < queue_capacity_i) && inst_valid_i[i];
+      inst_accepted[i] = (valid_in_psum[i] < queue_capacity) && inst_valid_i[i];
       inst_ready_o[i] = inst_accepted[i];
     end
+    valid_inst_count_d = (valid_in_psum[N] < queue_capacity) ?
+        valid_in_psum[N] : queue_capacity;
   end
 
   always_ff @(posedge clk or negedge rstn) begin
     if (!rstn) begin
       for (int i = 0; i < N; i++) begin
         valid_inst_q[i] <= 0;
+        valid_inst_count_q <= 0;
       end;
     end else begin
       for (int i = 0; i < N; i++) begin
         valid_inst_q[i] <= inst_accepted[i];
+        valid_inst_count_q <= valid_inst_count_d;
         inst_q[i] <= inst_data_i[i];
       end
     end