Allow LSU to accept multiple instructions in one cycle.

Change-Id: I6ff3d6e795182bcd921c40e24bce6af4aeda4e77
diff --git a/hdl/chisel/src/common/Aligner.scala b/hdl/chisel/src/common/Aligner.scala
index 79feb4e..936ce63 100644
--- a/hdl/chisel/src/common/Aligner.scala
+++ b/hdl/chisel/src/common/Aligner.scala
@@ -81,4 +81,24 @@
     })
     addResource("hdl/verilog/rvv/design/Aligner.sv")
     setInline(s"$desiredName.sv", GenerateAlignerSource(t, n))
+}
+
+/** Factory that wraps a sequence of `ValidIO` lanes in an [[Aligner]] module. */
+object Aligner {
+    /** Instantiates an `Aligner` sized to `in` and returns its outputs.
+      *
+      * Each payload is flattened with `asUInt` on the way into the module and
+      * cast back with `asTypeOf` on the way out. The element type is sampled
+      * from `in(0)`; all lanes are assumed to share it (not checked here).
+      *
+      * @param in input lanes; must be non-empty
+      * @return a `Vec` built from the module's outputs, each cast back to `T`
+      */
+    def apply[T <: Data](in: Seq[ValidIO[T]]): Vec[ValidIO[T]] = {
+        require(in.nonEmpty, "Aligner requires at least one input lane")
+        val t = chiselTypeOf(in(0).bits)
+        val aligner = Module(new Aligner(t, in.length))
+        aligner.io.in := in.map(v => v.map(_.asUInt))
+        VecInit(aligner.io.out.map(v => v.map(_.asTypeOf(t))))
+    }
 }
\ No newline at end of file
diff --git a/hdl/chisel/src/kelvin/BUILD b/hdl/chisel/src/kelvin/BUILD
index c410861..e8ad752 100644
--- a/hdl/chisel/src/kelvin/BUILD
+++ b/hdl/chisel/src/kelvin/BUILD
@@ -326,6 +326,8 @@
         ":retirement_buffer",
         ":rvvi_trace",
         "//hdl/chisel/src/common",
+        "//hdl/chisel/src/common:aligner",
+        "//hdl/chisel/src/common:circular_buffer_multi",
         "//hdl/chisel/src/common:fp",
         "//hdl/chisel/src/common:instruction_buffer",
         "//hdl/chisel/src/common:scatter_gather",
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 8cbe82c..baa0958 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -955,8 +955,9 @@
   io.vldst := 0.U
   io.storeCount := 0.U
 
-  val opQueue = Module(new Queue(new LsuUOp(p), 4))
-  io.queueCapacity := opQueue.entries.U - opQueue.io.count
+  val opQueue = Module(new CircularBufferMulti(new LsuUOp(p), p.instructionLanes, 4))
+  opQueue.io.flush := false.B
+  io.queueCapacity := opQueue.io.nSpace
 
   // Flush state
   // DispatchV2 will only flush on first slot, when LSU is inactive.
@@ -977,26 +978,25 @@
   ))
 
-  // Accept one instruction per cycle.
+  // Accept multiple instructions per cycle, limited by free queue entries.
-  // TODO(derekjchow): Accept multiple when primitives are ready.
-  val canAccept = opQueue.io.enq.ready
-  val queueSpace = Mux(canAccept, 1.U, 0.U)
+  val queueSpace = opQueue.io.nSpace
   val validSum = io.req.map(_.valid).scan(
       0.U(log2Ceil(p.instructionLanes + 1).W))(_+_)
-
   for (i <- 0 until p.instructionLanes) {
     io.req(i).ready := (validSum(i) < queueSpace) && !flushCmd.valid
   }
 
   val ops = (0 until p.instructionLanes).map(i =>
-      LsuUOp(p, i, io.req(i).bits, io.busPort, io.busPort_flt, io.rvvState))
-  val enq = MuxCase(
-      MakeInvalid(new LsuUOp(p)),
-      (0 until p.instructionLanes).map(i =>
-          ((io.req(i).fire && !io.req(i).bits.op.isOneOf(LsuOp.FENCEI, LsuOp.FLUSHAT, LsuOp.FLUSHALL)) -> MakeValid(true.B, ops(i)))))
-  opQueue.io.enq.valid := enq.valid
-  opQueue.io.enq.bits := enq.bits
+    MakeValid(
+        io.req(i).fire && !LsuOp.isFlush(io.req(i).bits.op),
+        LsuUOp(p, i, io.req(i).bits, io.busPort, io.busPort_flt, io.rvvState))
+  )
+  val alignedOps = Aligner(ops)
 
-  val nextSlot = LsuSlot.fromLsuUOp(opQueue.io.deq.bits, p, 16)
+  opQueue.io.enqValid := PopCount(alignedOps.map(_.valid))
+  opQueue.io.enqData := alignedOps.map(_.bits)
+  assert(opQueue.io.enqValid <= opQueue.io.nSpace)
+
+  val nextSlot = LsuSlot.fromLsuUOp(opQueue.io.dataOut(0), p, 16)
 
   // Tracks if a read has been fired last cycle.
   val readFired = RegInit(MakeInvalid(new LsuRead(32 - nextSlot.elemBits)))
@@ -1142,7 +1142,7 @@
   val writebackUpdatedSlot = slot.writebackUpdate(writebackFired)
 
   // TODO(derekjchow): Improve timing?
-  opQueue.io.deq.ready := slot.slotIdle()
+  opQueue.io.deqReady := Mux(slot.slotIdle() && (opQueue.io.nEnqueued > 0.U), 1.U, 0.U)
 
   // ==========================================================================
   // State transition
@@ -1152,7 +1152,7 @@
     // Move to inactive if error.
     io.fault.valid -> LsuSlot.inactive(p, 16),
     // When inactive, dequeue if possible
-    (slot.slotIdle() && opQueue.io.deq.valid) -> nextSlot,
+    (slot.slotIdle() && (opQueue.io.nEnqueued > 0.U)) -> nextSlot,
     // Vector update.
     slot.pendingVector -> vectorUpdatedSlot,
     // Active transaction update.
@@ -1163,6 +1163,6 @@
 
   slot := slotNext
 
-  io.active := !slot.slotIdle() || (opQueue.io.count =/= 0.U)
+  io.active := !slot.slotIdle() || (opQueue.io.nEnqueued =/= 0.U)
 }