Merge "Disable assert in FifoX."
diff --git a/hdl/chisel/src/kelvin/scalar/Decode.scala b/hdl/chisel/src/kelvin/scalar/Decode.scala
index a6d131f..fc0018d 100644
--- a/hdl/chisel/src/kelvin/scalar/Decode.scala
+++ b/hdl/chisel/src/kelvin/scalar/Decode.scala
@@ -200,7 +200,7 @@
     val lsu = Decoupled(new LsuCmd)
 
     // Multiplier interface.
-    val mlu = Valid(new MluCmd)
+    val mlu = Decoupled(new MluCmd)
 
     // Divide interface.
     val dvu = Decoupled(new DvuCmd)
diff --git a/hdl/chisel/src/kelvin/scalar/Mlu.scala b/hdl/chisel/src/kelvin/scalar/Mlu.scala
index 4459933..e221851 100644
--- a/hdl/chisel/src/kelvin/scalar/Mlu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Mlu.scala
@@ -43,86 +43,94 @@
   val op = MluOp()
 }
 
+class MluStage1(p: Parameters) extends Bundle {  // Payload queued between request arbitration and the multiply stage.
+  val rd = UInt(5.W)  // Destination register index, taken from the winning request's addr.
+  val op = MluOp()  // Operation of the winning request.
+  val sel = UInt(p.instructionLanes.W)  // One-hot mask of the arbitrated lane; picks that lane's rs1/rs2 data.
+}
+
+class MluStage2(p: Parameters) extends Bundle {  // Payload queued between the multiply stage and the io.rd output; p is unused here.
+  val rd = UInt(5.W)  // Destination register index, carried through from stage 1.
+  val mul = UInt(32.W)  // Selected 32-bit slice of the product, or the saturated value for DMULH/DMULHR.
+  val round = UInt(1.W)  // Rounding increment; added to mul when the result is driven onto io.rd.
+}
+
 class Mlu(p: Parameters) extends Module {
   val io = IO(new Bundle {
     // Decode cycle.
-    val req = Flipped(Vec(p.instructionLanes, Valid(new MluCmd)))
+    val req = Vec(p.instructionLanes, Flipped(Decoupled(new MluCmd)))  // One ready/valid request port per lane.
 
     // Execute cycle.
     val rs1 = Vec(p.instructionLanes, Flipped(new RegfileReadDataIO))
     val rs2 = Vec(p.instructionLanes, Flipped(new RegfileReadDataIO))
-    val rd  = Valid(Flipped(new RegfileWriteDataIO))
+    val rd  = Decoupled(Flipped(new RegfileWriteDataIO))  // Result port with ready/valid handshake.
   })
 
-  val op = Reg(MluOp())
-  val valid1 = RegInit(false.B)
-  val valid2 = RegInit(false.B)
-  val addr1 = Reg(UInt(5.W))
-  val addr2 = Reg(UInt(5.W))
-  val sel = Reg(UInt(p.instructionLanes.W))
+  // Stage 1: arbitrate between the lanes' requests and queue the winner's op, rd and lane select.
+  val arb = Module(new Arbiter(new MluCmd, p.instructionLanes))  // Chisel Arbiter: lowest-indexed valid input wins.
+  arb.io.in <> io.req
 
-  val valids = io.req.map(_.valid)
-  assert(valids.length == p.instructionLanes)
-  valid1 := io.req.map(_.valid).reduce(_||_)
-  valid2 := valid1
+  val stage1 = Wire(Decoupled(new MluStage1(p)))
+  stage1.valid := arb.io.out.valid
+  stage1.bits.rd := arb.io.out.bits.addr
+  stage1.bits.op := arb.io.out.bits.op
+  stage1.bits.sel := UIntToOH(arb.io.chosen)  // One-hot encoding of the winning lane index.
+  arb.io.out.ready := stage1.ready
+  val stage2Input = Queue(stage1, 1, true)  // Single-entry queue with pipe = true.
 
-  when (valids.reduce(_||_)) {
-    val idx = PriorityEncoder(valids)
-    op := io.req(idx).bits.op
-    addr1 := io.req(idx).bits.addr
-    sel := (1.U << idx)
-  }
+  // Stage 2: select the chosen lane's operands and compute the product combinationally.
+  val valid2in = stage2Input.valid
+  val op2in = stage2Input.bits.op
+  val addr2in = stage2Input.bits.rd
+  val sel2in = stage2Input.bits.sel
 
-  val rs1 = (0 until p.instructionLanes).map(x => MuxOR(valid1 & sel(x), io.rs1(x).data)).reduce(_ | _)
-  val rs2 = (0 until p.instructionLanes).map(x => MuxOR(valid1 & sel(x), io.rs2(x).data)).reduce(_ | _)
 
+  val rs1 = (0 until p.instructionLanes).map(x => MuxOR(valid2in & sel2in(x), io.rs1(x).data)).reduce(_ | _)
+  val rs2 = (0 until p.instructionLanes).map(x => MuxOR(valid2in & sel2in(x), io.rs2(x).data)).reduce(_ | _)
+  // NOTE(review): operands above are not registered; confirm io.rs1/io.rs2 stay valid if this stage stalls.
+  val rs2signed = op2in.isOneOf(MluOp.MULH, MluOp.MULHR, MluOp.DMULH, MluOp.DMULHR)
+  val rs1signed = op2in.isOneOf(MluOp.MULHSU, MluOp.MULHSUR) || rs2signed
+  val rs1s = Cat(rs1signed && rs1(31), rs1).asSInt  // Prepend the sign bit for signed operands, 0 otherwise.
+  val rs2s = Cat(rs2signed && rs2(31), rs2).asSInt
+  val prod = rs1s * rs2s
+  assert(prod.getWidth == 66)
+
+  val round = prod(30) && op2in.isOneOf(MluOp.DMULHR) ||
+              prod(31) && (op2in.isOneOf(MluOp.MULHR, MluOp.MULHSUR, MluOp.MULHUR))
+  // DMULH/DMULHR saturation: low 30 bits of both operands zero and top two bits matching the patterns below.
+  val maxneg = 2.U(2.W)
+  val halfneg = 1.U(2.W)
+  val sat = rs1(29,0) === 0.U && rs2(29,0) === 0.U &&
+            (rs1(31,30) === maxneg && rs2(31,30) === maxneg ||
+              rs1(31,30) === maxneg && rs2(31,30) === halfneg ||
+              rs2(31,30) === maxneg && rs1(31,30) === halfneg)
+  // Result slice per op; any op not listed below yields 0.
+  val mul = MuxCase(0.U(32.W), Seq(
+    (op2in === MluOp.MUL) -> prod(31, 0),
+    op2in.isOneOf(MluOp.MULH, MluOp.MULHSU, MluOp.MULHU, MluOp.MULHR, MluOp.MULHSUR, MluOp.MULHUR) -> prod(63,32),
+    op2in.isOneOf(MluOp.DMULH, MluOp.DMULHR) -> Mux(sat, Mux(prod(65), 0x7fffffff.U(32.W), Cat(1.U(1.W), 0.U(31.W))), prod(62,31))
+  ))
+
+  val stage2 = Wire(Decoupled(new MluStage2(p)))
+  stage2.valid := valid2in
+  stage2.bits.rd := addr2in
+  stage2.bits.mul := mul
+  stage2.bits.round := round
+  stage2Input.ready := stage2.ready
+  val stage3 = Queue(stage2, 1, true)  // Single-entry queue with pipe = true; registers the multiplier result.
+
+  // Stage 3: drive the queued result onto io.rd, adding the rounding bit.
   // Multiplier has a registered output.
-  val mul2 = Reg(UInt(32.W))
-  val round2 = Reg(UInt(1.W))
+  stage3.ready := io.rd.ready
 
-  when (valid1) {
-    val rs2signed = op.isOneOf(MluOp.MULH, MluOp.MULHR, MluOp.DMULH, MluOp.DMULHR)
-    val rs1signed = op.isOneOf(MluOp.MULHSU, MluOp.MULHSUR) || rs2signed
-    val rs1s = Cat(rs1signed && rs1(31), rs1).asSInt
-    val rs2s = Cat(rs2signed && rs2(31), rs2).asSInt
-    val prod = rs1s.asSInt * rs2s.asSInt
-    assert(prod.getWidth == 66)
-
-    addr2 := addr1
-    round2 := prod(30) && op.isOneOf(MluOp.DMULHR) ||
-              prod(31) && (op.isOneOf(MluOp.MULHR, MluOp.MULHSUR, MluOp.MULHUR))
-
-    when (op === MluOp.MUL) {
-      mul2 := prod(31,0)
-    } .elsewhen (op.isOneOf(MluOp.MULH, MluOp.MULHSU, MluOp.MULHU, MluOp.MULHR, MluOp.MULHSUR, MluOp.MULHUR)) {
-      mul2 := prod(63,32)
-    } .elsewhen (op.isOneOf(MluOp.DMULH, MluOp.DMULHR)) {
-      val maxneg = 2.U(2.W)
-      val halfneg = 1.U(2.W)
-      val sat = rs1(29,0) === 0.U && rs2(29,0) === 0.U &&
-                (rs1(31,30) === maxneg && rs2(31,30) === maxneg ||
-                 rs1(31,30) === maxneg && rs2(31,30) === halfneg ||
-                 rs2(31,30) === maxneg && rs1(31,30) === halfneg)
-      when (sat) {
-        when (prod(65)) {
-          mul2 := 0x7fffffff.U(32.W)
-        } .otherwise {
-          mul2 := Cat(1.U(1.W), 0.U(31.W))
-        }
-      } .otherwise {
-        mul2 := prod(62,31)
-      }
-    }
-  }
-
-  io.rd.valid := valid2
-  io.rd.bits.addr  := addr2
-  io.rd.bits.data  := mul2 + round2
+  io.rd.valid     := stage3.valid
+  io.rd.bits.addr := stage3.bits.rd
+  io.rd.bits.data := stage3.bits.mul + stage3.bits.round
 
   // Assertions.
   for (i <- 0 until p.instructionLanes) {
-    assert(!(valid1 && sel(i) && !io.rs1(i).valid))
-    assert(!(valid1 && sel(i) && !io.rs2(i).valid))
+    assert(!(valid2in && sel2in(i) && !io.rs1(i).valid))  // Selected lane's rs1 read must be valid in stage 2.
+    assert(!(valid2in && sel2in(i) && !io.rs2(i).valid))  // Selected lane's rs2 read must be valid in stage 2.
   }
 }
 
diff --git a/hdl/chisel/src/kelvin/scalar/MluTest.scala b/hdl/chisel/src/kelvin/scalar/MluTest.scala
index cab5940..f1374d0 100644
--- a/hdl/chisel/src/kelvin/scalar/MluTest.scala
+++ b/hdl/chisel/src/kelvin/scalar/MluTest.scala
@@ -47,6 +47,7 @@
 
         dut.clock.step()
         dut.io.req(0).valid.poke(false.B)
+        dut.io.rd.ready.poke(true.B)
 
         dut.clock.step()
         assertResult(1) { dut.io.rd.valid.peekInt() }
@@ -55,6 +56,7 @@
 
         dut.clock.step()
         assertResult(0) { dut.io.rd.valid.peekInt() }
+        dut.io.rd.ready.poke(false.B)
     }
   }
 }
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala
index 1df6db2..bc1600c 100644
--- a/hdl/chisel/src/kelvin/scalar/SCore.scala
+++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -191,7 +191,7 @@
   // ---------------------------------------------------------------------------
   // Multiplier Unit
   for (i <- 0 until p.instructionLanes) {
-    mlu.io.req(i) := decode(i).io.mlu
+    mlu.io.req(i) <> decode(i).io.mlu
     mlu.io.rs1(i) := regfile.io.readData(2 * i)
     mlu.io.rs2(i) := regfile.io.readData((2 * i) + 1)
   }
@@ -256,10 +256,13 @@
   }
 
   val mluDvuOffset = p.instructionLanes
-  regfile.io.writeData(mluDvuOffset).valid := mlu.io.rd.valid || dvu.io.rd.valid
-  regfile.io.writeData(mluDvuOffset).bits.addr := Mux(mlu.io.rd.valid, mlu.io.rd.bits.addr, dvu.io.rd.bits.addr)
-  regfile.io.writeData(mluDvuOffset).bits.data := Mux(mlu.io.rd.valid, mlu.io.rd.bits.data, dvu.io.rd.bits.data)
-  assert(!(mlu.io.rd.valid && (dvu.io.rd.valid && dvu.io.rd.ready)))  // TODO: stall dvu on mlu write
+  val arb = Module(new Arbiter(new RegfileWriteDataIO, 2))
+  arb.io.in(0) <> mlu.io.rd
+  arb.io.in(1) <> dvu.io.rd
+  arb.io.out.ready := true.B
+  regfile.io.writeData(mluDvuOffset).valid := arb.io.out.valid
+  regfile.io.writeData(mluDvuOffset).bits.addr := arb.io.out.bits.addr
+  regfile.io.writeData(mluDvuOffset).bits.data := arb.io.out.bits.data
 
   val lsuOffset = p.instructionLanes + 1
   regfile.io.writeData(lsuOffset).valid := lsu.io.rd.valid