Remove explicit valid from RegfileWriteDataIO

- Wrap with Valid() (or Decoupled(), for Dvu) at instantiation sites
  instead, when needed.

Change-Id: I5bbf0bcbbeacd7c163a1bb042085a95e5a1fa235
diff --git a/hdl/chisel/src/kelvin/scalar/Alu.scala b/hdl/chisel/src/kelvin/scalar/Alu.scala
index ed63314..6b85de4 100644
--- a/hdl/chisel/src/kelvin/scalar/Alu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Alu.scala
@@ -58,7 +58,7 @@
     // Execute cycle.
     val rs1 = Flipped(new RegfileReadDataIO)
     val rs2 = Flipped(new RegfileReadDataIO)
-    val rd  = Flipped(new RegfileWriteDataIO)
+    val rd  = Valid(Flipped(new RegfileWriteDataIO))
   })
 
   val valid = RegInit(false.B)
@@ -80,9 +80,9 @@
   val shamt = rs2(4,0)
 
   io.rd.valid := valid
-  io.rd.addr  := addr
+  io.rd.bits.addr  := addr
 
-  io.rd.data  := MuxLookup(op, 0.U)(Seq(
+  io.rd.bits.data  := MuxLookup(op, 0.U)(Seq(
       AluOp.ADD  -> (rs1 + rs2),
       AluOp.SUB  -> (rs1 - rs2),
       AluOp.SLT  -> (rs1.asSInt < rs2.asSInt),
diff --git a/hdl/chisel/src/kelvin/scalar/Bru.scala b/hdl/chisel/src/kelvin/scalar/Bru.scala
index 9a3d624..15f2cbe 100644
--- a/hdl/chisel/src/kelvin/scalar/Bru.scala
+++ b/hdl/chisel/src/kelvin/scalar/Bru.scala
@@ -90,7 +90,7 @@
     val csr = new CsrBruIO(p)
     val rs1 = Input(new RegfileReadDataIO)
     val rs2 = Input(new RegfileReadDataIO)
-    val rd  = Flipped(new RegfileWriteDataIO)
+    val rd  = Valid(Flipped(new RegfileWriteDataIO))
     val taken = new BranchTakenIO(p)
     val target = Flipped(new RegfileBranchTargetIO)
     val interlock = Output(Bool())
@@ -175,8 +175,8 @@
   io.taken.value := stateReg.bits.target
 
   io.rd.valid := stateReg.valid && stateReg.bits.linkValid
-  io.rd.addr := stateReg.bits.linkAddr
-  io.rd.data := stateReg.bits.linkData
+  io.rd.bits.addr := stateReg.bits.linkAddr
+  io.rd.bits.data := stateReg.bits.linkData
 
   // Undefined Fault.
   val undefFault = stateReg.valid && (op === BruOp.UNDEF)
diff --git a/hdl/chisel/src/kelvin/scalar/Csr.scala b/hdl/chisel/src/kelvin/scalar/Csr.scala
index 02e5e67..7c0a34a 100644
--- a/hdl/chisel/src/kelvin/scalar/Csr.scala
+++ b/hdl/chisel/src/kelvin/scalar/Csr.scala
@@ -97,7 +97,7 @@
 
     // Execute cycle.
     val rs1 = Flipped(new RegfileReadDataIO)
-    val rd  = Flipped(new RegfileWriteDataIO)
+    val rd  = Valid(Flipped(new RegfileWriteDataIO))
     val bru = Flipped(new CsrBruIO(p))
 
     // Vector core.
@@ -348,8 +348,8 @@
 
   // Write port.
   io.rd.valid := req.valid
-  io.rd.addr  := req.bits.addr
-  io.rd.data  := rdata
+  io.rd.bits.addr  := req.bits.addr
+  io.rd.bits.data  := rdata
 
   // Assertions.
   assert(!(req.valid && !io.rs1.valid))
diff --git a/hdl/chisel/src/kelvin/scalar/Dvu.scala b/hdl/chisel/src/kelvin/scalar/Dvu.scala
index 312589c..2eabb10 100644
--- a/hdl/chisel/src/kelvin/scalar/Dvu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Dvu.scala
@@ -45,12 +45,7 @@
     // Execute cycle.
     val rs1 = Flipped(new RegfileReadDataIO)
     val rs2 = Flipped(new RegfileReadDataIO)
-    val rd  = new Bundle {  // RegfileWriteDataIO
-      val valid = Output(Bool())
-      val ready = Input(Bool())
-      val addr  = Output(UInt(5.W))
-      val data  = Output(UInt(32.W))
-    }
+    val rd  = Decoupled(new RegfileWriteDataIO)
   })
 
   // This implemention differs to common::idiv by supporting early termination,
@@ -146,8 +141,8 @@
   val rem = Mux(signed2r, ~remain + 1.U, remain)
 
   io.rd.valid := count(5)
-  io.rd.addr := addr2
-  io.rd.data := Mux(divide2, div, rem)
+  io.rd.bits.addr := addr2
+  io.rd.bits.data := Mux(divide2, div, rem)
 }
 
 object EmitDvu extends App {
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 0da8e36..8d88202 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -93,7 +93,7 @@
     val busPort = Flipped(new RegfileBusPortIO(p))
 
     // Execute cycle(s).
-    val rd = Flipped(new RegfileWriteDataIO)
+    val rd = Valid(Flipped(new RegfileWriteDataIO))
 
     // Cached interface.
     val dbus = new DBusIO(p)
@@ -297,8 +297,8 @@
 
   // pass-through
   io.rd.valid := rvalid && data.io.out.bits.iload
-  io.rd.addr  := data.io.out.bits.index
-  io.rd.data  := rdata
+  io.rd.bits.addr  := data.io.out.bits.index
+  io.rd.bits.data  := rdata
 
   assert(!ctrl.io.out.valid || PopCount(Cat(ctrl.io.out.bits.sldst, ctrl.io.out.bits.vldst, ctrl.io.out.bits.suncd)) <= 1.U)
   assert(!data.io.out.valid || PopCount(Cat(data.io.out.bits.sldst, data.io.out.bits.suncd)) <= 1.U)
diff --git a/hdl/chisel/src/kelvin/scalar/Mlu.scala b/hdl/chisel/src/kelvin/scalar/Mlu.scala
index 335036f..4459933 100644
--- a/hdl/chisel/src/kelvin/scalar/Mlu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Mlu.scala
@@ -51,7 +51,7 @@
     // Execute cycle.
     val rs1 = Vec(p.instructionLanes, Flipped(new RegfileReadDataIO))
     val rs2 = Vec(p.instructionLanes, Flipped(new RegfileReadDataIO))
-    val rd  = Flipped(new RegfileWriteDataIO)
+    val rd  = Valid(Flipped(new RegfileWriteDataIO))
   })
 
   val op = Reg(MluOp())
@@ -116,8 +116,8 @@
   }
 
   io.rd.valid := valid2
-  io.rd.addr  := addr2
-  io.rd.data  := mul2 + round2
+  io.rd.bits.addr  := addr2
+  io.rd.bits.data  := mul2 + round2
 
   // Assertions.
   for (i <- 0 until p.instructionLanes) {
diff --git a/hdl/chisel/src/kelvin/scalar/MluTest.scala b/hdl/chisel/src/kelvin/scalar/MluTest.scala
index 80f11c4..cab5940 100644
--- a/hdl/chisel/src/kelvin/scalar/MluTest.scala
+++ b/hdl/chisel/src/kelvin/scalar/MluTest.scala
@@ -50,8 +50,8 @@
 
         dut.clock.step()
         assertResult(1) { dut.io.rd.valid.peekInt() }
-        assertResult(13) { dut.io.rd.addr.peekInt() }
-        assertResult(4) { dut.io.rd.data.peekInt() }
+        assertResult(13) { dut.io.rd.bits.addr.peekInt() }
+        assertResult(4) { dut.io.rd.bits.data.peekInt() }
 
         dut.clock.step()
         assertResult(0) { dut.io.rd.valid.peekInt() }
diff --git a/hdl/chisel/src/kelvin/scalar/Regfile.scala b/hdl/chisel/src/kelvin/scalar/Regfile.scala
index 7397bb7..63e688f 100644
--- a/hdl/chisel/src/kelvin/scalar/Regfile.scala
+++ b/hdl/chisel/src/kelvin/scalar/Regfile.scala
@@ -50,7 +50,6 @@
 }
 
 class RegfileWriteDataIO extends Bundle {
-  val valid = Input(Bool())
   val addr  = Input(UInt(5.W))
   val data  = Input(UInt(32.W))
 }
@@ -89,7 +88,10 @@
 
     // Execute cycle.
     val readData = Vec(p.instructionLanes * 2, new RegfileReadDataIO)
-    val writeData = Vec(p.instructionLanes + 2, new RegfileWriteDataIO)
+    val writeData = Vec(p.instructionLanes + 2, new Bundle {
+      val valid = Input(Bool())
+      val bits = new RegfileWriteDataIO
+    })
     val writeMask = Vec(p.instructionLanes, new Bundle {val valid = Input(Bool())})
     val scoreboard = new Bundle {
       val regd = Output(UInt(32.W))
@@ -115,7 +117,7 @@
       .map(x => MuxOR(x.valid, UIntToOH(x.addr, 32))).reduce(_|_)
 
   val scoreboard_clr0 = io.writeData
-      .map(x => MuxOR(x.valid, UIntToOH(x.addr, 32))).reduce(_|_)
+      .map(x => MuxOR(x.valid, UIntToOH(x.bits.addr, 32))).reduce(_|_)
 
   val scoreboard_clr = Cat(scoreboard_clr0(31,1), 0.U(1.W))
 
@@ -150,12 +152,12 @@
 
   for (i <- 1 until 32) {
     val valid = Cat(
-      Array(io.writeData(p.instructionLanes + 1).valid && io.writeData(p.instructionLanes + 1).addr === i.U,
-            io.writeData(p.instructionLanes).valid && io.writeData(p.instructionLanes).addr === i.U) ++
-            (0 until p.instructionLanes).reverse.map(x => io.writeData(x).valid && io.writeData(x).addr === i.U && !io.writeMask(x).valid)
+      Array(io.writeData(p.instructionLanes + 1).valid && io.writeData(p.instructionLanes + 1).bits.addr === i.U,
+            io.writeData(p.instructionLanes).valid && io.writeData(p.instructionLanes).bits.addr === i.U) ++
+            (0 until p.instructionLanes).reverse.map(x => io.writeData(x).valid && io.writeData(x).bits.addr === i.U && !io.writeMask(x).valid)
     )
 
-    val data  = (0 until p.instructionLanes + 2).map(x => MuxOR(valid(x), io.writeData(x).data)).reduce(_|_)
+    val data  = (0 until p.instructionLanes + 2).map(x => MuxOR(valid(x), io.writeData(x).bits.data)).reduce(_|_)
 
     writeValid(i) := valid =/= 0.U
     writeData(i)  := data
@@ -174,9 +176,9 @@
   val x0 =
     (0 until p.instructionLanes).map(x =>
       io.writeData(x).valid &&
-      io.writeData(x).addr === 0.U &&
+      io.writeData(x).bits.addr === 0.U &&
       !io.writeMask(x).valid) ++
-    (p.instructionLanes until p.instructionLanes + 2).map(x => io.writeData(x).valid && io.writeData(x).addr === 0.U)
+    (p.instructionLanes until p.instructionLanes + 2).map(x => io.writeData(x).valid && io.writeData(x).bits.addr === 0.U)
 
   io.rfwriteCount := PopCount(writeValid) - writeValid(0) + PopCount(x0)
 
@@ -247,8 +249,8 @@
       // Delay the failure a cycle for debugging purposes.
       val write_fail = RegInit(false.B)
       write_fail := io.writeData(i).valid && io.writeData(j).valid &&
-                    io.writeData(i).addr === io.writeData(j).addr &&
-                    io.writeData(i).addr =/= 0.U
+                    io.writeData(i).bits.addr === io.writeData(j).bits.addr &&
+                    io.writeData(i).bits.addr =/= 0.U
       assert(!write_fail)
     }
   }
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala
index 03c8dd7..1df6db2 100644
--- a/hdl/chisel/src/kelvin/scalar/SCore.scala
+++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -219,8 +219,8 @@
     regfile.io.busAddr(i) := decode(i).io.busRead
 
     val csr0Valid = if (i == 0) csr.io.rd.valid else false.B
-    val csr0Addr  = if (i == 0) csr.io.rd.addr else 0.U
-    val csr0Data  = if (i == 0) csr.io.rd.data else 0.U
+    val csr0Addr  = if (i == 0) csr.io.rd.bits.addr else 0.U
+    val csr0Data  = if (i == 0) csr.io.rd.bits.data else 0.U
 
 
     regfile.io.writeData(i).valid := csr0Valid ||
@@ -229,20 +229,20 @@
                                         io.vcore.get.rd(i).valid
                                       } else { false.B })
 
-    regfile.io.writeData(i).addr :=
+    regfile.io.writeData(i).bits.addr :=
         MuxOR(csr0Valid, csr0Addr) |
-        MuxOR(alu(i).io.rd.valid, alu(i).io.rd.addr) |
-        MuxOR(bru(i).io.rd.valid, bru(i).io.rd.addr) |
+        MuxOR(alu(i).io.rd.valid, alu(i).io.rd.bits.addr) |
+        MuxOR(bru(i).io.rd.valid, bru(i).io.rd.bits.addr) |
         (if (p.enableVector) {
-           MuxOR(io.vcore.get.rd(i).valid, io.vcore.get.rd(i).addr)
+           MuxOR(io.vcore.get.rd(i).valid, io.vcore.get.rd(i).bits.addr)
          } else { false.B })
 
-    regfile.io.writeData(i).data :=
+    regfile.io.writeData(i).bits.data :=
         MuxOR(csr0Valid, csr0Data) |
-        MuxOR(alu(i).io.rd.valid, alu(i).io.rd.data) |
-        MuxOR(bru(i).io.rd.valid, bru(i).io.rd.data) |
+        MuxOR(alu(i).io.rd.valid, alu(i).io.rd.bits.data) |
+        MuxOR(bru(i).io.rd.valid, bru(i).io.rd.bits.data) |
         (if (p.enableVector) {
-           MuxOR(io.vcore.get.rd(i).valid, io.vcore.get.rd(i).data)
+           MuxOR(io.vcore.get.rd(i).valid, io.vcore.get.rd(i).bits.data)
          } else { false.B })
 
     if (p.enableVector) {
@@ -257,14 +257,14 @@
 
   val mluDvuOffset = p.instructionLanes
   regfile.io.writeData(mluDvuOffset).valid := mlu.io.rd.valid || dvu.io.rd.valid
-  regfile.io.writeData(mluDvuOffset).addr := Mux(mlu.io.rd.valid, mlu.io.rd.addr, dvu.io.rd.addr)
-  regfile.io.writeData(mluDvuOffset).data := Mux(mlu.io.rd.valid, mlu.io.rd.data, dvu.io.rd.data)
+  regfile.io.writeData(mluDvuOffset).bits.addr := Mux(mlu.io.rd.valid, mlu.io.rd.bits.addr, dvu.io.rd.bits.addr)
+  regfile.io.writeData(mluDvuOffset).bits.data := Mux(mlu.io.rd.valid, mlu.io.rd.bits.data, dvu.io.rd.bits.data)
   assert(!(mlu.io.rd.valid && (dvu.io.rd.valid && dvu.io.rd.ready)))  // TODO: stall dvu on mlu write
 
   val lsuOffset = p.instructionLanes + 1
   regfile.io.writeData(lsuOffset).valid := lsu.io.rd.valid
-  regfile.io.writeData(lsuOffset).addr  := lsu.io.rd.addr
-  regfile.io.writeData(lsuOffset).data  := lsu.io.rd.data
+  regfile.io.writeData(lsuOffset).bits.addr  := lsu.io.rd.bits.addr
+  regfile.io.writeData(lsuOffset).bits.data  := lsu.io.rd.bits.data
 
   val writeMask = bru.map(_.io.taken.valid).scan(false.B)(_||_)
   for (i <- 0 until p.instructionLanes) {
diff --git a/hdl/chisel/src/kelvin/vector/VCore.scala b/hdl/chisel/src/kelvin/vector/VCore.scala
index b1252e7..2dc343a 100644
--- a/hdl/chisel/src/kelvin/vector/VCore.scala
+++ b/hdl/chisel/src/kelvin/vector/VCore.scala
@@ -34,7 +34,7 @@
 
   // Execute cycle.
   val rs = Vec(p.instructionLanes * 2, Flipped(new RegfileReadDataIO))
-  val rd = Vec(p.instructionLanes, Flipped(new RegfileWriteDataIO))
+  val rd = Vec(p.instructionLanes, Valid(Flipped(new RegfileWriteDataIO)))
 
   // Status.
   val mactive = Output(Bool())
diff --git a/hdl/chisel/src/kelvin/vector/VInst.scala b/hdl/chisel/src/kelvin/vector/VInst.scala
index a273bd9..ebd5976 100644
--- a/hdl/chisel/src/kelvin/vector/VInst.scala
+++ b/hdl/chisel/src/kelvin/vector/VInst.scala
@@ -68,7 +68,7 @@
 
     // Execute cycle.
     val rs = Vec(p.instructionLanes * 2, Flipped(new RegfileReadDataIO))
-    val rd = Vec(p.instructionLanes, Flipped(new RegfileWriteDataIO))
+    val rd = Vec(p.instructionLanes, Valid(Flipped(new RegfileWriteDataIO)))
 
     // Vector interface.
     val out = new VectorInstructionIO(p)
@@ -236,9 +236,9 @@
 
   for (i <- 0 until p.instructionLanes) {
     io.rd(i).valid := getvl(i) || getmaxvl(i) || vld_u(i) || vst_u(i) || vst_q(i)
-    io.rd(i).addr := rdAddr(i)
+    io.rd(i).bits.addr := rdAddr(i)
 
-    io.rd(i).data :=
+    io.rd(i).bits.data :=
         MuxOR(getvl(i), getvlValue(i)) |
         MuxOR(getmaxvl(i), getmaxvlValue(i)) |
         MuxOR(vld_u(i) || vst_u(i) || vst_q(i), lsuAdder(i))