Disable MLU decoding on taken branches

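- Interlock the multiply in Decode (gate `mulEn` on `serializeIn.brcond`)
  instead of masking the MLU write in the regfile (`writeMask(4)` /
  `mluInvalidate`), to account for branch prediction and serialization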

Fix: 295921340
Change-Id: Ia703e5a79f7768cc5ab9b24edf3197b6f848d9fe
diff --git a/hdl/chisel/src/kelvin/scalar/Decode.scala b/hdl/chisel/src/kelvin/scalar/Decode.scala
index aab824f..646b22c 100644
--- a/hdl/chisel/src/kelvin/scalar/Decode.scala
+++ b/hdl/chisel/src/kelvin/scalar/Decode.scala
@@ -148,7 +148,7 @@
                     io.scoreboard.comb(rs2Addr) && (isStore || vldst))
 
   // Interlock mul, only one lane accepted.
-  val mulEn = !isMul || !io.serializeIn.mul
+  val mulEn = (!isMul || !io.serializeIn.mul) && !io.serializeIn.brcond
 
 
   // Vector extension interlock.
@@ -350,7 +350,6 @@
                                              io.inst.inst(11,7) === 0.U)
 
   // SB,SH,SW   0100011
-  // FSW        0100111 //TODO(hoangm)
   val storeSelect = io.inst.inst(6,3) === 4.U && io.inst.inst(1,0) === 3.U
   io.busRead.immen := !d.io.flushat
   io.busRead.immed := Cat(d.io.imm12(31,5),
diff --git a/hdl/chisel/src/kelvin/scalar/Regfile.scala b/hdl/chisel/src/kelvin/scalar/Regfile.scala
index cf17187..2b554b0 100644
--- a/hdl/chisel/src/kelvin/scalar/Regfile.scala
+++ b/hdl/chisel/src/kelvin/scalar/Regfile.scala
@@ -72,7 +72,7 @@
     // Execute cycle.
     val readData = Vec(8, new RegfileReadDataIO)
     val writeData = Vec(6, new RegfileWriteDataIO)
-    val writeMask = Vec(5, new Bundle {val valid = Input(Bool())})
+    val writeMask = Vec(4, new Bundle {val valid = Input(Bool())})
     val scoreboard = new Bundle {
       val regd = Output(UInt(32.W))
       val comb = Output(UInt(32.W))
@@ -141,8 +141,7 @@
 
   for (i <- 1 until 32) {
     val valid = Cat(io.writeData(5).valid && io.writeData(5).addr === i.U,
-                    io.writeData(4).valid && io.writeData(4).addr === i.U &&
-                      !io.writeMask(4).valid,
+                    io.writeData(4).valid && io.writeData(4).addr === i.U,
                     io.writeData(3).valid && io.writeData(3).addr === i.U &&
                       !io.writeMask(3).valid,
                     io.writeData(2).valid && io.writeData(2).addr === i.U &&
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala
index 90b953c..dcc5248 100644
--- a/hdl/chisel/src/kelvin/scalar/SCore.scala
+++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -197,10 +197,6 @@
   mlu.io.rs2(2) := regfile.io.readData(5)
   mlu.io.rs2(3) := regfile.io.readData(7)
 
-  // On taken branches, multicycle MLU execute must be masked
-  val mluInvalidate = RegInit(false.B)
-  mluInvalidate := branchTaken
-
   // ---------------------------------------------------------------------------
   // Divide Unit
   dvu.io.req <> decode(0).io.dvu
@@ -265,7 +261,6 @@
                                      bru(1).io.taken.valid
   regfile.io.writeMask(3).valid := regfile.io.writeMask(2).valid ||
                                      bru(2).io.taken.valid
-  regfile.io.writeMask(4).valid := mluInvalidate
 
   // ---------------------------------------------------------------------------
   // Vector Extension
diff --git a/tests/verilator_sim/BUILD b/tests/verilator_sim/BUILD
index a974e32..cd33115 100644
--- a/tests/verilator_sim/BUILD
+++ b/tests/verilator_sim/BUILD
@@ -68,6 +68,7 @@
 
 cc_test(
     name = "l1dcache_tb",
+    size = "large",
     srcs = [
         "kelvin/l1dcache_tb.cc",
     ],