Parameterize Kelvin over instructionLanes
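- Much of Kelvin was hard-coded to operate on 4 instruction lanes; refactor that logic to size itself from the instructionLanes value in Parameters.
- Replace the fixed-width Fifo4, Fifo4e and Fifo4x4 modules with the lane-parameterized FifoX, FifoXe and FifoIxO.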
Change-Id: I1957d87b6f355d815380a88c28d210c1c8eec737
diff --git a/hdl/chisel/src/common/BUILD b/hdl/chisel/src/common/BUILD
index 5bab0ba..2805f14 100644
--- a/hdl/chisel/src/common/BUILD
+++ b/hdl/chisel/src/common/BUILD
@@ -18,9 +18,9 @@
chisel_library(
name = "common",
srcs = [
- "Fifo4e.scala",
- "Fifo4.scala",
- "Fifo4x4.scala",
+ "FifoXe.scala",
+ "FifoX.scala",
+ "FifoIxO.scala",
"Fifo.scala",
"IDiv.scala",
"Library.scala",
diff --git a/hdl/chisel/src/common/Fifo4.scala b/hdl/chisel/src/common/Fifo4.scala
deleted file mode 100644
index a01963f..0000000
--- a/hdl/chisel/src/common/Fifo4.scala
+++ /dev/null
@@ -1,192 +0,0 @@
-// Copyright 2023 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package common
-
-import chisel3._
-import chisel3.util._
-import _root_.circt.stage.ChiselStage
-
-object Fifo4 {
- def apply[T <: Data](t: T, n: Int) = {
- Module(new Fifo4(t, n))
- }
-}
-
-// 4way decode, used for Fifo4 style input controls.
-object Fifo4Valid {
- def apply(in: UInt): (UInt, UInt, UInt, UInt) = {
- assert(in.getWidth == 4)
-
- val in0 = Cat(in(3,0) === 8.U, // 8
- in(2,0) === 4.U, // 4, 12
- in(1,0) === 2.U, // 2, 6, 10, 14
- in(0)) // 1, 3, 5, 7, 9, 11, 13, 15
-
- val in1 = Cat(in(3,0) === 12.U ||
- in(3,0) === 10.U ||
- in(3,0) === 9.U, // 9, 10, 12
- in(2,0) === 6.U ||
- in(2,0) === 5.U, // 5, 6, 13, 14
- in(1,0) === 3.U, // 3, 7, 11, 15
- false.B)
-
- val in2 = Cat(in(3,0) === 14.U ||
- in(3,0) === 13.U ||
- in(3,0) === 11.U, // 11, 13, 14
- in(2,0) === 15.U ||
- in(2,0) === 7.U, // 7, 15
- false.B, false.B)
-
- val in3 = Cat(in(3,0) === 15.U, // 15
- false.B, false.B, false.B)
-
- (in0.asUInt, in1.asUInt, in2.asUInt, in3.asUInt)
- }
-}
-
-class Fifo4[T <: Data](t: T, n: Int) extends Module {
- val io = IO(new Bundle {
- val in = Flipped(Decoupled(Vec(4, Valid(t))))
- val out = Decoupled(t)
- val count = Output(UInt(log2Ceil(n+1).W))
- })
-
- val m = n - 1 // n = Mem(n-1) + Slice
-
- def Increment(a: UInt, b: UInt): UInt = {
- val c = a +& b
- val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
- d
- }
-
- val mem = Mem(m, t)
- val mslice = Slice(t, false, true)
-
- val in0pos = RegInit(0.U(log2Ceil(m).W))
- val in1pos = RegInit(1.U(log2Ceil(m).W))
- val in2pos = RegInit(2.U(log2Ceil(m).W))
- val in3pos = RegInit(3.U(log2Ceil(m).W))
- val outpos = RegInit(0.U(log2Ceil(m).W))
- val mcount = RegInit(0.U(log2Ceil(n+1).W))
-
- io.count := mcount + io.out.valid
-
- val ivalid = io.in.valid && io.in.ready
- val ovalid = mslice.io.in.valid && mslice.io.in.ready
-
- val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid,
- io.in.bits(1).valid, io.in.bits(0).valid).asUInt
-
- val icount = PopCount(iactive)
-
- // ---------------------------------------------------------------------------
- // Fifo Control.
- when (ivalid) {
- in0pos := Increment(in0pos, icount)
- in1pos := Increment(in1pos, icount)
- in2pos := Increment(in2pos, icount)
- in3pos := Increment(in3pos, icount)
- }
-
- when (ovalid) {
- outpos := Increment(outpos, 1.U)
- }
-
- val inc = MuxOR(ivalid, icount)
- val dec = mslice.io.in.valid && mslice.io.in.ready
-
- when (ivalid || ovalid) {
- mcount := mcount + inc - dec
- }
-
- // ---------------------------------------------------------------------------
- // Fifo Input.
- val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive)
-
- for (i <- 0 until m) {
- val valid = Cat(in0pos === i.U && in0valid(3) ||
- in1pos === i.U && in1valid(3) ||
- in2pos === i.U && in2valid(3) ||
- in3pos === i.U && in3valid(3),
- in0pos === i.U && in0valid(2) ||
- in1pos === i.U && in1valid(2) ||
- in2pos === i.U && in2valid(2),
- in0pos === i.U && in0valid(1) ||
- in1pos === i.U && in1valid(1),
- in0pos === i.U && in0valid(0))
-
- when (ivalid) {
- when (valid(0)) {
- mem(i) := io.in.bits(0).bits
- } .elsewhen (valid(1)) {
- mem(i) := io.in.bits(1).bits
- } .elsewhen (valid(2)) {
- mem(i) := io.in.bits(2).bits
- } .elsewhen (valid(3)) {
- mem(i) := io.in.bits(3).bits
- }
- }
- }
-
- mslice.io.in.valid := false.B
- mslice.io.in.bits := io.in.bits(0).bits // defaults
-
- when (mcount > 0.U) {
- when (io.out.ready) {
- mslice.io.in.valid := true.B
- }
- } .otherwise {
- when (ivalid && iactive =/= 0.U) {
- mslice.io.in.valid := true.B
- }
- }
-
- when (mcount > 0.U) {
- mslice.io.in.bits := mem(outpos)
- } .elsewhen (ivalid) {
- when (iactive(0)) {
- mslice.io.in.bits := io.in.bits(0).bits
- } .elsewhen (iactive(1)) {
- mslice.io.in.bits := io.in.bits(1).bits
- } .elsewhen (iactive(2)) {
- mslice.io.in.bits := io.in.bits(2).bits
- } .elsewhen (iactive(3)) {
- mslice.io.in.bits := io.in.bits(3).bits
- }
- }
-
- // ---------------------------------------------------------------------------
- // Valid Entries.
- val active = RegInit(0.U(m.W))
-
- val activeSet = MuxOR(ivalid,
- ((icount >= 1.U) << in0pos) | ((icount >= 2.U) << in1pos) |
- ((icount >= 3.U) << in2pos) | ((icount >= 4.U) << in3pos))
-
- val activeClr = MuxOR(mslice.io.in.valid && mslice.io.in.ready, 1.U << outpos)
-
- active := (active | activeSet) & ~activeClr
-
- // ---------------------------------------------------------------------------
- // Interface.
- io.in.ready := mcount <= (m.U - icount)
- io.out <> mslice.io.out
-
- assert(mcount <= m.U)
-}
-
-object EmitFifo4 extends App {
- ChiselStage.emitSystemVerilogFile(new Fifo4(UInt(8.W), 11), args)
-}
diff --git a/hdl/chisel/src/common/Fifo4e.scala b/hdl/chisel/src/common/Fifo4e.scala
deleted file mode 100644
index 392e7ee..0000000
--- a/hdl/chisel/src/common/Fifo4e.scala
+++ /dev/null
@@ -1,147 +0,0 @@
-// Copyright 2023 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package common
-
-import chisel3._
-import chisel3.util._
-import _root_.circt.stage.ChiselStage
-
-// Fifo4 with entry output and no output registration stage.
-
-object Fifo4e {
- def apply[T <: Data](t: T, n: Int) = {
- Module(new Fifo4e(t, n))
- }
-}
-
-class Fifo4e[T <: Data](t: T, n: Int) extends Module {
- val io = IO(new Bundle {
- val in = Flipped(Decoupled(Vec(4, Valid(t))))
- val out = Decoupled(t)
- val count = Output(UInt(log2Ceil(n+1).W))
- val entry = Output(Vec(n, Valid(t)))
- val nempty = Output(Bool())
- })
-
- def Increment(a: UInt, b: UInt): UInt = {
- val c = a +& b
- val d = Mux(c < n.U, c, c - n.U)(a.getWidth - 1, 0)
- d
- }
-
- val mem = Mem(n, t)
-
- val in0pos = RegInit(0.U(log2Ceil(n).W))
- val in1pos = RegInit(1.U(log2Ceil(n).W))
- val in2pos = RegInit(2.U(log2Ceil(n).W))
- val in3pos = RegInit(3.U(log2Ceil(n).W))
- val outpos = RegInit(0.U(log2Ceil(n).W))
- val mcount = RegInit(0.U(log2Ceil(n+1).W))
- val nempty = RegInit(false.B)
-
- io.count := mcount
- io.nempty := nempty
-
- val ivalid = io.in.valid && io.in.ready
- val ovalid = io.out.valid && io.out.ready
-
- val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid,
- io.in.bits(1).valid, io.in.bits(0).valid).asUInt
-
- val icount = PopCount(iactive)
-
- // ---------------------------------------------------------------------------
- // Fifo Control.
- when (ivalid) {
- in0pos := Increment(in0pos, icount)
- in1pos := Increment(in1pos, icount)
- in2pos := Increment(in2pos, icount)
- in3pos := Increment(in3pos, icount)
- }
-
- when (ovalid) {
- outpos := Increment(outpos, 1.U)
- }
-
- val inc = MuxOR(ivalid, icount)
- val dec = ovalid
-
- when (ivalid || ovalid) {
- val nxtcount = mcount + inc - dec
- mcount := nxtcount
- nempty := nxtcount =/= 0.U
- }
-
- // ---------------------------------------------------------------------------
- // Fifo Input.
- val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive)
-
- for (i <- 0 until n) {
- val valid = Cat(in0pos === i.U && in0valid(3) ||
- in1pos === i.U && in1valid(3) ||
- in2pos === i.U && in2valid(3) ||
- in3pos === i.U && in3valid(3),
- in0pos === i.U && in0valid(2) ||
- in1pos === i.U && in1valid(2) ||
- in2pos === i.U && in2valid(2),
- in0pos === i.U && in0valid(1) ||
- in1pos === i.U && in1valid(1),
- in0pos === i.U && in0valid(0))
-
- when (ivalid) {
- when (valid(0)) {
- mem(i) := io.in.bits(0).bits
- } .elsewhen (valid(1)) {
- mem(i) := io.in.bits(1).bits
- } .elsewhen (valid(2)) {
- mem(i) := io.in.bits(2).bits
- } .elsewhen (valid(3)) {
- mem(i) := io.in.bits(3).bits
- }
- }
- }
-
- // ---------------------------------------------------------------------------
- // Valid Entries.
- val active = RegInit(0.U(n.W))
-
- val activeSet = MuxOR(ivalid,
- ((icount >= 1.U) << in0pos) | ((icount >= 2.U) << in1pos) |
- ((icount >= 3.U) << in2pos) | ((icount >= 4.U) << in3pos))
-
- val activeClr = MuxOR(io.out.valid && io.out.ready, 1.U << outpos)
-
- when (io.in.valid && io.in.ready || io.out.valid && io.out.ready) {
- active := (active | activeSet) & ~activeClr
- }
-
- // ---------------------------------------------------------------------------
- // Interface.
- io.in.ready := mcount <= (n.U - icount)
-
- io.out.valid := mcount =/= 0.U
- io.out.bits := mem(outpos)
-
- assert(mcount <= n.U)
-
- for (i <- 0 until n) {
- io.entry(i).valid := active(i)
- io.entry(i).bits := mem(i)
- }
-}
-
-object EmitFifo4e extends App {
- ChiselStage.emitSystemVerilogFile(new Fifo4e(UInt(8.W), 10), args)
-}
diff --git a/hdl/chisel/src/common/Fifo4x4.scala b/hdl/chisel/src/common/Fifo4x4.scala
deleted file mode 100644
index 064af4b..0000000
--- a/hdl/chisel/src/common/Fifo4x4.scala
+++ /dev/null
@@ -1,198 +0,0 @@
-// Copyright 2023 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package common
-
-import chisel3._
-import chisel3.util._
-import _root_.circt.stage.ChiselStage
-
-object Fifo4x4 {
- def apply[T <: Data](t: T, n: Int) = {
- Module(new Fifo4x4(t, n))
- }
-}
-
-// Input accepted with a common handshake and per lane select.
-// Outputs are transacted independently, and ordered {[0], [0,1], [0,1,2], [0,1,2,3]}.
-// Outputs are not registered, assumes passes directly into shallow combinatorial.
-class Fifo4x4[T <: Data](t: T, n: Int) extends Module {
- val io = IO(new Bundle {
- val in = Flipped(Decoupled(Vec(4, Valid(t))))
- val out = Vec(4, Decoupled(t))
- val count = Output(UInt(log2Ceil(n+1).W))
- val nempty = Output(Bool())
- })
-
- val m = n
-
- val mb = log2Ceil(m)
- val n1b = log2Ceil(n + 1)
-
- def Increment(a: UInt, b: UInt): UInt = {
- val c = a +& b
- val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
- d
- }
-
- val mem = Reg(Vec(n, t))
-
- val inpos = Reg(Vec(4, UInt(mb.W))) // reset below
- val outpos = Reg(Vec(4, UInt(mb.W))) // reset below
-
- val mcount = RegInit(0.U(n1b.W))
- val nempty = RegInit(false.B)
- val inready = RegInit(false.B)
- val outvalid = RegInit(0.U(4.W))
-
- val ivalid = io.in.valid && io.in.ready
-
- val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid,
- io.in.bits(1).valid, io.in.bits(0).valid).asUInt
-
- val icount = (io.in.bits(0).valid +& io.in.bits(1).valid +&
- io.in.bits(2).valid +& io.in.bits(3).valid)(2,0)
-
- val oactiveBits = Cat(io.out(3).valid && io.out(3).ready,
- io.out(2).valid && io.out(2).ready,
- io.out(1).valid && io.out(1).ready,
- io.out(0).valid && io.out(0).ready)
-
- val ovalid = oactiveBits =/= 0.U
-
- val ocount = (oactiveBits(0) +& oactiveBits(1) +&
- oactiveBits(2) +& oactiveBits(3))(2,0)
-
- assert(!(oactiveBits(1) === 1.U && oactiveBits(0,0) =/= 1.U))
- assert(!(oactiveBits(2) === 1.U && oactiveBits(1,0) =/= 3.U))
- assert(!(oactiveBits(3) === 1.U && oactiveBits(2,0) =/= 7.U))
-
- val ovalidBits = Cat(io.out(3).valid, io.out(2).valid,
- io.out(1).valid, io.out(0).valid)
-
- assert(!(ovalidBits(1) === 1.U && ovalidBits(0,0) =/= 1.U))
- assert(!(ovalidBits(2) === 1.U && ovalidBits(1,0) =/= 3.U))
- assert(!(ovalidBits(3) === 1.U && ovalidBits(2,0) =/= 7.U))
-
- val oreadyBits = Cat(io.out(3).ready, io.out(2).ready,
- io.out(1).ready, io.out(0).ready)
-
- assert(!(oreadyBits(1) === 1.U && oreadyBits(0,0) =/= 1.U))
- assert(!(oreadyBits(2) === 1.U && oreadyBits(1,0) =/= 3.U))
- assert(!(oreadyBits(3) === 1.U && oreadyBits(2,0) =/= 7.U))
-
- // ---------------------------------------------------------------------------
- // Fifo Control.
- when (reset.asBool) {
- for (i <- 0 until 4) {
- inpos(i) := i.U
- }
- } .elsewhen (ivalid) {
- for (i <- 0 until 4) {
- inpos(i) := Increment(inpos(i), icount)
- }
- }
-
- when (reset.asBool) {
- for (i <- 0 until 4) {
- outpos(i) := i.U
- }
- } .elsewhen (ovalid) {
- for (i <- 0 until 4) {
- outpos(i) := Increment(outpos(i), ocount)
- }
- }
-
- val inc = MuxOR(ivalid, icount)
- val dec = MuxOR(ovalid, ocount)
-
- when (ivalid || ovalid) {
- val nxtmcount = mcount + inc - dec
- inready := nxtmcount <= (m.U - 4.U)
- mcount := nxtmcount
- nempty := nxtmcount =/= 0.U
- outvalid := Cat(nxtmcount >= 4.U,
- nxtmcount >= 3.U,
- nxtmcount >= 2.U,
- nxtmcount >= 1.U)
- } .otherwise {
- inready := mcount <= (m.U - 4.U)
- outvalid := Cat(mcount >= 4.U,
- mcount >= 3.U,
- mcount >= 2.U,
- mcount >= 1.U)
- }
-
- // ---------------------------------------------------------------------------
- // Fifo Input.
- val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive)
-
- for (i <- 0 until m) {
- val valid = Cat(inpos(0) === i.U && in0valid(3) ||
- inpos(1) === i.U && in1valid(3) ||
- inpos(2) === i.U && in2valid(3) ||
- inpos(3) === i.U && in3valid(3),
-
- inpos(0) === i.U && in0valid(2) ||
- inpos(1) === i.U && in1valid(2) ||
- inpos(2) === i.U && in2valid(2),
-
- inpos(0) === i.U && in0valid(1) ||
- inpos(1) === i.U && in1valid(1),
-
- inpos(0) === i.U && in0valid(0))
-
- if (true) {
- val data = MuxOR(valid(0), io.in.bits(0).bits.asUInt) |
- MuxOR(valid(1), io.in.bits(1).bits.asUInt) |
- MuxOR(valid(2), io.in.bits(2).bits.asUInt) |
- MuxOR(valid(3), io.in.bits(3).bits.asUInt)
-
- when (ivalid && valid =/= 0.U) {
- mem(i) := data.asTypeOf(t)
- }
- } else {
- when (ivalid) {
- when (valid(0)) {
- mem(i) := io.in.bits(0).bits
- } .elsewhen (valid(1)) {
- mem(i) := io.in.bits(1).bits
- } .elsewhen (valid(2)) {
- mem(i) := io.in.bits(2).bits
- } .elsewhen (valid(3)) {
- mem(i) := io.in.bits(3).bits
- }
- }
- }
- }
-
- // ---------------------------------------------------------------------------
- // Interface.
- io.in.ready := inready
-
- for (i <- 0 until 4) {
- io.out(i).valid := outvalid(i)
- io.out(i).bits := mem(outpos(i)) // TODO: VecAt()
- }
-
- io.count := mcount
-
- io.nempty := nempty
-
- assert(io.count <= m.U)
-}
-
-object EmitFifo4x4 extends App {
- ChiselStage.emitSystemVerilogFile(new Fifo4x4(UInt(32.W), 24), args)
-}
diff --git a/hdl/chisel/src/common/FifoIxO.scala b/hdl/chisel/src/common/FifoIxO.scala
new file mode 100644
index 0000000..5be7dd6
--- /dev/null
+++ b/hdl/chisel/src/common/FifoIxO.scala
@@ -0,0 +1,172 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package common
+
+import chisel3._
+import chisel3.util._
+import _root_.circt.stage.ChiselStage
+
+object FifoIxO {
+  // Factory wrapper: builds a FifoIxO with i input lanes, o output ports and
+  // depth n, and registers it as a hardware instance via Module(...).
+  def apply[T <: Data](t: T, i: Int, o: Int, n: Int) = Module(new FifoIxO(t, i, o, n))
+}
+
+// Multi-lane FIFO with `i` input lanes, `o` output ports and `n` entries of type `t`.
+// Input accepted with a common handshake and per lane select.
+// Outputs are transacted independently, and ordered {[0], [0,1], ..., [0,1,...,o-1]}.
+// Outputs are not registered; assumes they pass directly into shallow
+// combinational logic.
+class FifoIxO[T <: Data](t: T, i: Int, o: Int, n: Int /* depth */) extends Module {
+  val io = IO(new Bundle {
+    val in = Flipped(Decoupled(Vec(i, Valid(t))))
+    val out = Vec(o, Decoupled(t))
+    val count = Output(UInt(log2Ceil(n+1).W))
+    val nempty = Output(Bool())
+  })
+
+  val m = n
+
+  val mb = log2Ceil(m)
+  val n1b = log2Ceil(n + 1)
+
+  // Advances position `a` by `b`, wrapping back into [0, m) with one subtract.
+  def Increment(a: UInt, b: UInt): UInt = {
+    val c = a +& b
+    val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
+    d
+  }
+
+  val mem = Reg(Vec(n, t))
+
+  val inpos = Reg(Vec(i, UInt(mb.W)))  // reset below
+  val outpos = Reg(Vec(o, UInt(mb.W)))  // reset below
+
+  val mcount = RegInit(0.U(n1b.W))
+  val nempty = RegInit(false.B)
+  val inready = RegInit(false.B)
+  val outvalid = RegInit(0.U(o.W))
+
+  val ivalid = io.in.valid && io.in.ready
+
+  val iactive = Cat((0 until i).reverse.map(x => io.in.bits(x).valid)).asUInt
+
+  val icount = (io.in.bits.map(x => x.valid.asUInt).reduce(_ +& _))(log2Ceil(i),0)
+
+  val oactiveBits = Cat((0 until o).reverse.map(x => io.out(x).valid && io.out(x).ready))
+
+  val ovalid = oactiveBits =/= 0.U
+
+  val ocount = (0 until o).map(x => oactiveBits(x).asUInt).reduce(_ +& _)(log2Ceil(o),0)
+
+  // Port k may only transfer / be valid / be ready when all ports below it are.
+  for (k <- 1 until o) {
+    assert(!(oactiveBits(k) === 1.U && oactiveBits(k - 1, 0) =/= ((1 << k) - 1).U))
+  }
+
+  val ovalidBits = Cat((0 until o).reverse.map(x => io.out(x).valid))
+
+  for (k <- 1 until o) {
+    assert(!(ovalidBits(k) === 1.U && ovalidBits(k - 1, 0) =/= ((1 << k) - 1).U))
+  }
+
+  val oreadyBits = Cat((0 until o).reverse.map(x => io.out(x).ready))
+
+  for (k <- 1 until o) {
+    assert(!(oreadyBits(k) === 1.U && oreadyBits(k - 1, 0) =/= ((1 << k) - 1).U))
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Control.
+  // inpos/outpos have no RegInit; they are loaded with 0, 1, 2, ... under reset.
+  when (reset.asBool) {
+    for (lane <- 0 until i) {
+      inpos(lane) := lane.U
+    }
+  } .elsewhen (ivalid) {
+    for (lane <- 0 until i) {
+      inpos(lane) := Increment(inpos(lane), icount)
+    }
+  }
+
+  when (reset.asBool) {
+    for (port <- 0 until o) {
+      outpos(port) := port.U
+    }
+  } .elsewhen (ovalid) {
+    for (port <- 0 until o) {
+      outpos(port) := Increment(outpos(port), ocount)
+    }
+  }
+
+  val inc = MuxOR(ivalid, icount)
+  val dec = MuxOR(ovalid, ocount)
+
+  // inready/outvalid are registered, so they are derived from the next count.
+  when (ivalid || ovalid) {
+    val nxtmcount = mcount + inc - dec
+    inready := nxtmcount <= (m.U - i.U)
+    mcount := nxtmcount
+    nempty := nxtmcount =/= 0.U
+    outvalid := Cat((0 until o).reverse.map(x => nxtmcount >= (x + 1).U))
+  } .otherwise {
+    inready := mcount <= (m.U - i.U)
+    outvalid := Cat((0 until o).reverse.map(x => mcount >= (x + 1).U))
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Input.
+  val inxvalid = FifoXValid(iactive)
+
+  for (q <- 0 until m) {
+    // valid(x): input lane x is written to entry q this cycle, i.e. some write
+    // position inpos(y) points at q and lane x is the lane FifoXValid maps to y.
+    val valid = Cat(
+      (0 until i).reverse.map(x =>
+        if (x == 0) { inpos(0) === q.U && inxvalid(0)(0) } else {
+          (0 to x).map(y =>
+            inpos(y) === q.U && inxvalid(y)(x)
+          ).reduce(_ || _)
+        }
+      )
+    )
+
+    // OR of the gated lanes acts as the mux (assumes at most one valid bit set).
+    val data = (0 until i).map(x => MuxOR(valid(x), io.in.bits(x).bits.asUInt)).reduce(_ | _)
+
+    when (ivalid && valid =/= 0.U) {
+      mem(q) := data.asTypeOf(t)
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Interface.
+  io.in.ready := inready
+
+  for (port <- 0 until o) {
+    io.out(port).valid := outvalid(port)
+    io.out(port).bits := mem(outpos(port))  // TODO: VecAt()
+  }
+
+  io.count := mcount
+
+  io.nempty := nempty
+
+  assert(io.count <= m.U)
+}
+
+object EmitFifoIxO extends App { // Standalone entry point: emits SystemVerilog for one sample configuration.
+ ChiselStage.emitSystemVerilogFile(new FifoIxO(UInt(32.W), 4, 4, 24), args) // 32-bit entries, 4 in, 4 out, depth 24
+}
diff --git a/hdl/chisel/src/common/FifoX.scala b/hdl/chisel/src/common/FifoX.scala
new file mode 100644
index 0000000..ee3f041
--- /dev/null
+++ b/hdl/chisel/src/common/FifoX.scala
@@ -0,0 +1,163 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package common
+
+import chisel3._
+import chisel3.util._
+import _root_.circt.stage.ChiselStage
+
+object FifoX {
+  // Factory wrapper: builds a FifoX with x input lanes and depth n via Module(...).
+  def apply[T <: Data](t: T, x: Int, n: Int) =
+    Module(new FifoX(t, x, n))
+}
+
+// Xway decode, used for FifoX style input controls.
+// Entry k of the result is a mask with one bit per input lane: bit x is set
+// when lane x is valid and is the (k+1)-th valid lane counting up from lane 0,
+// so at most one bit of each mask is set.
+object FifoXValid {
+  def apply(in: UInt): Seq[UInt] = {
+    val lanes = in.getWidth
+    // One mask per write slot, lowest slot first.
+    Vector.tabulate(lanes) { k =>
+      Cat((0 until lanes).reverse.map { x =>
+        // A lane below k has too few lanes under it to be the (k+1)-th valid one.
+        if (x < k) false.B
+        else (PopCount(in(x, 0)) === (k + 1).U) && in(x)
+      })
+    }
+  }
+}
+
+class FifoX[T <: Data](t: T, x: Int, n: Int) extends Module { // t: entry type, x: input lanes, n: total depth
+ val io = IO(new Bundle {
+ val in = Flipped(Decoupled(Vec(x, Valid(t)))) // one handshake for all lanes; each lane has its own valid
+ val out = Decoupled(t)
+ val count = Output(UInt(log2Ceil(n+1).W))
+ })
+
+ val m = n - 1 // n = Mem(n-1) + Slice: the memory holds m entries, the output slice accounts for the last one
+
+ def Increment(a: UInt, b: UInt): UInt = { // a + b wrapped back into [0, m) with a single subtract
+ val c = a +& b
+ val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
+ d
+ }
+
+ val mem = Mem(m, t)
+ val mslice = Slice(t, false, true)
+
+ val inxpos = RegInit(VecInit((0 until x).map(x => x.U(log2Ceil(m).W)))) // write position k starts at k; the lambda's x shadows the lane-count parameter
+ val outpos = RegInit(0.U(log2Ceil(m).W))
+ val mcount = RegInit(0.U(log2Ceil(n+1).W))
+
+ io.count := mcount + io.out.valid // memory occupancy plus one when the output (driven by mslice) is valid
+
+ val ivalid = io.in.valid && io.in.ready
+ val ovalid = mslice.io.in.valid && mslice.io.in.ready
+
+ val iactive = Cat((0 until x).reverse.map(x => io.in.bits(x).valid)) // bit k = lane k valid
+
+ val icount = PopCount(iactive)
+
+ // ---------------------------------------------------------------------------
+ // Fifo Control: advance the write/read positions and the occupancy count.
+ when (ivalid) {
+ for (i <- 0 until x) {
+ inxpos(i) := Increment(inxpos(i), icount)
+ }
+ }
+
+ when (ovalid) {
+ outpos := Increment(outpos, 1.U)
+ }
+
+ val inc = MuxOR(ivalid, icount)
+ val dec = mslice.io.in.valid && mslice.io.in.ready // same expression as ovalid
+
+ when (ivalid || ovalid) {
+ mcount := mcount + inc - dec
+ }
+
+ // ---------------------------------------------------------------------------
+ // Fifo Input: steer each valid lane into the memory entry it lands on.
+ val inxvalid = FifoXValid(iactive)
+
+ for (i <- 0 until m) {
+ val valid = Cat( // bit q set when input lane q is written to entry i this cycle
+ (0 until x).reverse.map(q =>
+ if (q == 0) { inxpos(0) === i.U && inxvalid(0)(0) } else {
+ (0 to q).map(y =>
+ inxpos(y) === i.U && inxvalid(y)(q)
+ ).reduce(_ || _)
+ }
+ )
+ )
+
+ when (ivalid) {
+ when (PopCount(valid) >= 1.U) {
+ val idx = PriorityEncoder(valid) // lowest-numbered selected lane wins
+ mem(i) := io.in.bits(idx).bits
+ }
+ }
+ }
+
+ mslice.io.in.valid := false.B
+ mslice.io.in.bits := io.in.bits(0).bits // defaults
+
+ when (mcount > 0.U) { // memory non-empty: offer an entry to the slice when the consumer is ready
+ when (io.out.ready) {
+ mslice.io.in.valid := true.B
+ }
+ } .otherwise { // memory empty: an accepted input with at least one valid lane is offered directly
+ when (ivalid && iactive =/= 0.U) {
+ mslice.io.in.valid := true.B
+ }
+ }
+
+ when (mcount > 0.U) {
+ mslice.io.in.bits := mem(outpos)
+ } .elsewhen (ivalid) {
+ assert(PopCount(iactive) >= 1.U) // NOTE(review): fires on an accepted input with no valid lane, which the guard below otherwise tolerates - confirm intent
+ when (iactive =/= 0.U) {
+ val idx = PriorityEncoder(iactive) // lowest-numbered valid lane
+ mslice.io.in.bits := io.in.bits(idx).bits
+ }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Valid Entries: per-entry bitmap, set on write and cleared on read.
+ val active = RegInit(0.U(m.W)) // NOTE(review): only feeds its own next value here; no output of this module reads it
+
+ val activeSet = MuxOR(ivalid,
+ (0 until x).map(i => (icount >= (i + 1).U) << inxpos(i)).reduce(_ | _)
+ )
+
+ val activeClr = MuxOR(mslice.io.in.valid && mslice.io.in.ready, 1.U << outpos)
+
+ active := (active | activeSet) & ~activeClr
+
+ // ---------------------------------------------------------------------------
+ // Interface.
+ io.in.ready := mcount <= (m.U - icount) // ready only when every lane valid this cycle fits in the memory
+ io.out <> mslice.io.out
+
+ assert(mcount <= m.U)
+}
+
+object EmitFifoX extends App { // Standalone entry point: emits SystemVerilog for one sample configuration.
+ ChiselStage.emitSystemVerilogFile(new FifoX(UInt(8.W), 4, 11), args) // 8-bit entries, 4 input lanes, depth 11
+}
diff --git a/hdl/chisel/src/common/FifoXe.scala b/hdl/chisel/src/common/FifoXe.scala
new file mode 100644
index 0000000..587be62
--- /dev/null
+++ b/hdl/chisel/src/common/FifoXe.scala
@@ -0,0 +1,136 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package common
+
+import chisel3._
+import chisel3.util._
+import _root_.circt.stage.ChiselStage
+
+// FifoX with entry output and no output registration stage.
+
+object FifoXe {
+  // Factory wrapper: builds a FifoXe with x input lanes and depth n via Module(...).
+  def apply[T <: Data](t: T, x: Int, n: Int) =
+    Module(new FifoXe(t, x, n))
+}
+
+class FifoXe[T <: Data](t: T, x:Int, n: Int) extends Module { // t: entry type, x: input lanes, n: depth
+ val io = IO(new Bundle {
+ val in = Flipped(Decoupled(Vec(x, Valid(t)))) // one handshake for all lanes; each lane has its own valid
+ val out = Decoupled(t)
+ val count = Output(UInt(log2Ceil(n+1).W))
+ val entry = Output(Vec(n, Valid(t))) // every storage entry with its occupancy bit
+ val nempty = Output(Bool())
+ })
+
+ def Increment(a: UInt, b: UInt): UInt = { // a + b wrapped back into [0, n) with a single subtract
+ val c = a +& b
+ val d = Mux(c < n.U, c, c - n.U)(a.getWidth - 1, 0)
+ d
+ }
+
+ val mem = Mem(n, t)
+
+ val inxpos = RegInit(VecInit((0 until x).map(x => x.U((log2Ceil(n) + 1).W)))) // NOTE(review): one bit wider than outpos (log2Ceil(n)) - confirm intent
+ val outpos = RegInit(0.U(log2Ceil(n).W))
+ val mcount = RegInit(0.U(log2Ceil(n+1).W))
+ val nempty = RegInit(false.B)
+
+ io.count := mcount
+ io.nempty := nempty
+
+ val ivalid = io.in.valid && io.in.ready
+ val ovalid = io.out.valid && io.out.ready
+
+ val iactive = Cat((0 until x).reverse.map(x => io.in.bits(x).valid)) // bit k = lane k valid
+
+ val icount = PopCount(iactive)
+
+ // ---------------------------------------------------------------------------
+ // Fifo Control: advance the write/read positions and the occupancy count.
+ when (ivalid) {
+ for (i <- 0 until x) {
+ inxpos(i) := Increment(inxpos(i), icount)
+ }
+ }
+
+ when (ovalid) {
+ outpos := Increment(outpos, 1.U)
+ }
+
+ val inc = MuxOR(ivalid, icount)
+ val dec = ovalid
+
+ when (ivalid || ovalid) {
+ val nxtcount = mcount + inc - dec
+ mcount := nxtcount
+ nempty := nxtcount =/= 0.U // registered copy of "count is non-zero", updated together with mcount
+ }
+
+ // ---------------------------------------------------------------------------
+ // Fifo Input: steer each valid lane into the memory entry it lands on.
+ val inxvalid = FifoXValid(iactive)
+
+ for (i <- 0 until n) {
+ val valid = Cat( // bit q set when input lane q is written to entry i this cycle
+ (0 until x).reverse.map(q =>
+ if (q == 0) { inxpos(0) === i.U && inxvalid(0)(0) } else {
+ (0 to q).map(y =>
+ inxpos(y) === i.U && inxvalid(y)(q)
+ ).reduce(_ || _)
+ }
+ )
+ )
+
+ when (ivalid) {
+ when (PopCount(valid) >= 1.U) {
+ val idx = PriorityEncoder(valid) // lowest-numbered selected lane wins
+ mem(i) := io.in.bits(idx).bits
+ }
+ }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Valid Entries: per-entry bitmap, set on write and cleared on read.
+ val active = RegInit(0.U(n.W))
+
+ val activeSet = MuxOR(ivalid,
+ (0 until x).map(i => (icount >= (i + 1).U) << inxpos(i)).reduce(_ | _)
+ )
+
+ val activeClr = MuxOR(io.out.valid && io.out.ready, 1.U << outpos)
+
+ when (io.in.valid && io.in.ready || io.out.valid && io.out.ready) {
+ active := (active | activeSet) & ~activeClr
+ }
+
+ // ---------------------------------------------------------------------------
+ // Interface.
+ io.in.ready := mcount <= (n.U - icount) // ready only when every lane valid this cycle fits
+
+ io.out.valid := mcount =/= 0.U // output is driven straight from the memory, with no output register
+ io.out.bits := mem(outpos)
+
+ assert(mcount <= n.U)
+
+ for (i <- 0 until n) {
+ io.entry(i).valid := active(i)
+ io.entry(i).bits := mem(i)
+ }
+}
+
+object EmitFifoXe extends App { // Standalone entry point: emits SystemVerilog for one sample configuration.
+ ChiselStage.emitSystemVerilogFile(new FifoXe(UInt(8.W), 4, 10), args) // 8-bit entries, 4 input lanes, depth 10
+}
diff --git a/hdl/chisel/src/kelvin/Parameters.scala b/hdl/chisel/src/kelvin/Parameters.scala
index 5b4b14e..35b4929 100644
--- a/hdl/chisel/src/kelvin/Parameters.scala
+++ b/hdl/chisel/src/kelvin/Parameters.scala
@@ -56,6 +56,12 @@
val vectorCountBits = log2Ceil(vectorBits / 8) + 1 + 2 // +2 stripmine
+ val vectorAluCount = 2
+ val vectorReadPorts = (vectorAluCount * 3) + 1
+ val vectorWritePorts = 6
+ val vectorWhintPorts = 4
+ val vectorScalarPorts = 2
+
// Vector queue.
val vectorFifoDepth = 16
diff --git a/hdl/chisel/src/kelvin/scalar/Debug.scala b/hdl/chisel/src/kelvin/scalar/Debug.scala
index 4181680..d2123c6 100644
--- a/hdl/chisel/src/kelvin/scalar/Debug.scala
+++ b/hdl/chisel/src/kelvin/scalar/Debug.scala
@@ -21,13 +21,7 @@
// Debug signals for HDL development.
class DebugIO(p: Parameters) extends Bundle {
val en = Output(UInt(4.W))
- val addr0 = Output(UInt(32.W))
- val addr1 = Output(UInt(32.W))
- val addr2 = Output(UInt(32.W))
- val addr3 = Output(UInt(32.W))
- val inst0 = Output(UInt(32.W))
- val inst1 = Output(UInt(32.W))
- val inst2 = Output(UInt(32.W))
- val inst3 = Output(UInt(32.W))
+ val addr = Output(Vec(p.instructionLanes, UInt(32.W)))  // one 32-bit entry per instruction lane
+ val inst = Output(Vec(p.instructionLanes, UInt(32.W)))  // one 32-bit entry per instruction lane
val cycles = Output(UInt(32.W))
}
diff --git a/hdl/chisel/src/kelvin/scalar/Fetch.scala b/hdl/chisel/src/kelvin/scalar/Fetch.scala
index cb21c9d..d6de873 100644
--- a/hdl/chisel/src/kelvin/scalar/Fetch.scala
+++ b/hdl/chisel/src/kelvin/scalar/Fetch.scala
@@ -56,7 +56,7 @@
val csr = new CsrInIO(p)
val ibus = new IBusIO(p)
val inst = new FetchIO(p)
- val branch = Flipped(Vec(4, new BranchTakenIO(p)))
+ val branch = Flipped(Vec(p.instructionLanes, new BranchTakenIO(p)))
val linkPort = Flipped(new RegfileLinkPortIO)
val iflush = Flipped(new IFlushIO(p))
})
@@ -103,9 +103,9 @@
val l0data = Reg(Vec(indices, UInt(p.fetchDataBits.W)))
// Instruction outputs.
- val instValid = RegInit(VecInit(Seq.fill(4)(false.B)))
- val instAddr = Reg(Vec(4, UInt(p.instructionBits.W)))
- val instBits = Reg(Vec(4, UInt(p.instructionBits.W)))
+ val instValid = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+ val instAddr = Reg(Vec(p.instructionLanes, UInt(p.instructionBits.W)))
+ val instBits = Reg(Vec(p.instructionLanes, UInt(p.instructionBits.W)))
val instAligned0 = Cat(instAddr(0)(31, indexLsb), 0.U(indexLsb.W))
val instAligned1 = instAligned0 + Cat(1.U, 0.U(indexLsb.W))
@@ -135,71 +135,48 @@
(jal, target)
}
- val (preBranchTaken0, preBranchTarget0) =
- Predecode(instAddr(0), instBits(0))
- val (preBranchTaken1, preBranchTarget1) =
- Predecode(instAddr(1), instBits(1))
- val (preBranchTaken2, preBranchTarget2) =
- Predecode(instAddr(2), instBits(2))
- val (preBranchTaken3, preBranchTarget3) =
- Predecode(instAddr(3), instBits(3))
+ val preBranch = (0 until p.instructionLanes).map(x => Predecode(instAddr(x), instBits(x)))
+ val preBranchTakens = preBranch.map { case (taken, target) => taken }
+ val preBranchTargets = preBranch.map { case (taken, target) => target }
- val preBranchTaken = io.inst.lanes(0).valid && preBranchTaken0 ||
- io.inst.lanes(1).valid && preBranchTaken1 ||
- io.inst.lanes(2).valid && preBranchTaken2 ||
- io.inst.lanes(3).valid && preBranchTaken3
+ val preBranchTaken = (0 until p.instructionLanes).map(i =>
+ io.inst.lanes(i).valid && preBranchTakens(i)).reduce(_ || _)
- val preBranchTarget = Mux(preBranchTaken0, preBranchTarget0,
- Mux(preBranchTaken1, preBranchTarget1,
- Mux(preBranchTaken2, preBranchTarget2,
- preBranchTarget3)))
+ val preBranchTarget = MuxCase(
+ preBranchTargets(p.instructionLanes - 1),
+ (0 until p.instructionLanes - 1).map(i => preBranchTakens(i) -> preBranchTargets(i))
+ )
val preBranchTag = preBranchTarget(tagMsb, tagLsb)
val preBranchIndex = preBranchTarget(indexMsb, indexLsb)
- val branchTag0 = io.branch(0).value(tagMsb, tagLsb)
- val branchTag1 = io.branch(1).value(tagMsb, tagLsb)
- val branchTag2 = io.branch(2).value(tagMsb, tagLsb)
- val branchTag3 = io.branch(3).value(tagMsb, tagLsb)
- val branchIndex0 = io.branch(0).value(indexMsb, indexLsb)
- val branchIndex1 = io.branch(1).value(indexMsb, indexLsb)
- val branchIndex2 = io.branch(2).value(indexMsb, indexLsb)
- val branchIndex3 = io.branch(3).value(indexMsb, indexLsb)
+ val branchTags = io.branch.map(x => x.value(tagMsb, tagLsb))
+ val branchIndices = io.branch.map(x => x.value(indexMsb, indexLsb))
- val l0validB0 = l0valid(branchIndex0)
- val l0validB1 = l0valid(branchIndex1)
- val l0validB2 = l0valid(branchIndex2)
- val l0validB3 = l0valid(branchIndex3)
+ val l0valids = (0 until p.instructionLanes).map(x => l0valid(branchIndices(x)))
val l0validP = l0valid(preBranchIndex)
- val l0tagB0 = VecAt(l0tag, branchIndex0)
- val l0tagB1 = VecAt(l0tag, branchIndex1)
- val l0tagB2 = VecAt(l0tag, branchIndex2)
- val l0tagB3 = VecAt(l0tag, branchIndex3)
+ val l0tags = (0 until p.instructionLanes).map(x => VecAt(l0tag, branchIndices(x)))
val l0tagP = VecAt(l0tag, preBranchIndex)
- val reqB0 = io.branch(0).valid && !l0req(branchIndex0) &&
- (branchTag0 =/= l0tagB0 || !l0validB0)
- val reqB1 = io.branch(1).valid && !l0req(branchIndex1) &&
- (branchTag1 =/= l0tagB1 || !l0validB1) &&
- !io.branch(0).valid
- val reqB2 = io.branch(2).valid && !l0req(branchIndex2) &&
- (branchTag2 =/= l0tagB2 || !l0validB2) &&
- !io.branch(0).valid && !io.branch(1).valid
- val reqB3 = io.branch(3).valid && !l0req(branchIndex3) &&
- (branchTag3 =/= l0tagB3 || !l0validB3) &&
- !io.branch(0).valid && !io.branch(1).valid && !io.branch(2).valid
+ val reqBValid = (0 until p.instructionLanes).map(x =>
+ io.branch(x).valid && !l0req(branchIndices(x)) &&
+ (branchTags(x) =/= l0tags(x) || !l0valids(x)))
+ val prevValid = io.branch.map(_.valid).scan(false.B)(_||_)
+ val reqs = (0 until p.instructionLanes).map(x => reqBValid(x) && !prevValid(x))
+
val reqP = preBranchTaken && !l0req(preBranchIndex) && (preBranchTag =/= l0tagP || !l0validP)
val req0 = !match0 && !l0req(instIndex0)
val req1 = !match1 && !l0req(instIndex1)
- aslice.io.in.valid := (reqB0 || reqB1 || reqB2 || reqB3 || reqP || req0 || req1) && !io.iflush.valid
- aslice.io.in.bits := Mux(reqB0, Cat(io.branch(0).value(31,indexLsb), 0.U(indexLsb.W)),
- Mux(reqB1, Cat(io.branch(1).value(31,indexLsb), 0.U(indexLsb.W)),
- Mux(reqB2, Cat(io.branch(2).value(31,indexLsb), 0.U(indexLsb.W)),
- Mux(reqB3, Cat(io.branch(3).value(31,indexLsb), 0.U(indexLsb.W)),
- Mux(reqP, Cat(preBranchTarget(31,indexLsb), 0.U(indexLsb.W)),
- Mux(req0, instAligned0, instAligned1))))))
+ aslice.io.in.valid := (reqs ++ Seq(reqP, req0, req1)).reduce(_ || _) && !io.iflush.valid
+ aslice.io.in.bits := MuxCase(instAligned1,
+ (0 until p.instructionLanes).map(x => reqs(x) -> Cat(io.branch(x).value(31,indexLsb), 0.U(indexLsb.W))) ++
+ Array(
+ reqP -> Cat(preBranchTarget(31,indexLsb), 0.U(indexLsb.W)),
+ req0 -> instAligned0,
+ )
+ )
when (readAddrEn) {
readAddr := io.ibus.addr
@@ -253,25 +230,27 @@
// creates excessive timing pressure. We know that the match is either on
// the old line or the next line, so can late mux on lookups of prior.
// Widen the arithmetic paths and select from results.
- val fetchEn = Wire(Vec(4, Bool()))
+ val fetchEn = Wire(Vec(p.instructionLanes, Bool()))
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
fetchEn(i) := io.inst.lanes(i).valid && io.inst.lanes(i).ready
}
- val fsel = Cat(fetchEn(3),
- fetchEn(2) && !fetchEn(3),
- fetchEn(1) && !fetchEn(2) && !fetchEn(3),
- fetchEn(0) && !fetchEn(1) && !fetchEn(2) && !fetchEn(3),
- !fetchEn(0) && !fetchEn(1) && !fetchEn(2) && !fetchEn(3))
+ val fsela = Cat((0 until p.instructionLanes).reverse.map(x =>
+ (x until p.instructionLanes).map(y =>
+ (if (y == x) { fetchEn(y) } else { !fetchEn(y) })
+ ).reduce(_ && _)
+ ))
+ val fselb = (0 until p.instructionLanes).map(x => !fetchEn(x)).reduce(_ && _)
+ val fsel = Cat(fsela, fselb)
- val nxtInstAddrOffset = instAddr.map(x => x) ++ instAddr.map(x => x + 16.U)
- val nxtInstAddr = (0 until 4).map(i =>
- (0 until 5).map(
+ val nxtInstAddrOffset = instAddr.map(x => x) ++ instAddr.map(x => x + (p.instructionLanes * 4).U)
+ val nxtInstAddr = (0 until p.instructionLanes).map(i =>
+ (0 until (p.instructionLanes + 1)).map(
j => MuxOR(fsel(j), nxtInstAddrOffset(j + i))).reduce(_|_))
val nxtInstIndex0 = nxtInstAddr(0)(indexMsb, indexLsb)
- val nxtInstIndex1 = nxtInstAddr(3)(indexMsb, indexLsb)
+ val nxtInstIndex1 = nxtInstAddr(p.instructionLanes - 1)(indexMsb, indexLsb)
val readFwd0 =
readDataEn && readAddr(31,indexLsb) === instAligned0(31,indexLsb)
@@ -286,7 +265,7 @@
val nxtMatch1 =
Mux(instIndex0(0) === nxtInstIndex1(0), nxtMatch0Fwd, nxtMatch1Fwd)
- val nxtInstValid = Wire(Vec(4, Bool()))
+ val nxtInstValid = Wire(Vec(p.instructionLanes, Bool()))
val nxtInstBits0 = Mux(readFwd0, readData, VecAt(l0data, instIndex0))
val nxtInstBits1 = Mux(readFwd1, readData, VecAt(l0data, instIndex1))
@@ -301,23 +280,18 @@
def BranchMatchDe(valid: Bool, value: UInt):
(Bool, UInt, Vec[UInt], Vec[UInt]) = {
- val addr = VecInit(value,
- value + 4.U,
- value + 8.U,
- value + 12.U)
+ val addr = VecInit((0 until p.instructionLanes).map(x => value + (x * 4).U))
val match0 = l0valid(addr(0)(indexMsb,indexLsb)) &&
addr(0)(tagMsb,tagLsb) === VecAt(l0tag, addr(0)(indexMsb,indexLsb))
- val match1 = l0valid(addr(3)(indexMsb,indexLsb)) &&
- addr(3)(tagMsb,tagLsb) === VecAt(l0tag, addr(3)(indexMsb,indexLsb))
+ val match1 = l0valid(addr(p.instructionLanes - 1)(indexMsb,indexLsb)) &&
+ addr(p.instructionLanes - 1)(tagMsb,tagLsb) === VecAt(l0tag, addr(p.instructionLanes - 1)(indexMsb,indexLsb))
- val vvalid = VecInit(Mux(addr(0)(4,2) <= 7.U, match0, match1),
- Mux(addr(0)(4,2) <= 6.U, match0, match1),
- Mux(addr(0)(4,2) <= 5.U, match0, match1),
- Mux(addr(0)(4,2) <= 4.U, match0, match1))
+ val vvalid = VecInit((0 until p.instructionLanes).map(i =>
+ Mux(addr(0)(4,2) <= (7 - i).U, match0, match1))) // addr(4,2) is the word index in a fetch line; lane i uses match0 while index + i <= 7 (same rule as nxtInstValid).
val muxbits0 = VecAt(l0data, addr(0)(indexMsb,indexLsb))
- val muxbits1 = VecAt(l0data, addr(3)(indexMsb,indexLsb))
+ val muxbits1 = VecAt(l0data, addr(p.instructionLanes - 1)(indexMsb,indexLsb))
val muxbits = Wire(Vec(16, UInt(p.instructionBits.W)))
for (i <- 0 until 8) {
@@ -326,8 +300,8 @@
muxbits(i + 8) := muxbits1(31 + offset, offset)
}
- val bits = Wire(Vec(4, UInt(p.instructionBits.W)))
- for (i <- 0 until 4) {
+ val bits = Wire(Vec(p.instructionLanes, UInt(p.instructionBits.W)))
+ for (i <- 0 until p.instructionLanes) {
val idx = Cat(addr(0)(5) =/= addr(i)(5), addr(i)(4,2))
bits(i) := VecAt(muxbits, idx)
}
@@ -337,38 +311,26 @@
def BranchMatchEx(branch: Vec[BranchTakenIO]):
(Bool, UInt, Vec[UInt], Vec[UInt]) = {
- val valid = branch(0).valid || branch(1).valid ||
- branch(2).valid || branch(3).valid
+ val valid = branch.map(x => x.valid).reduce(_ || _)
- val addr = VecInit(Mux(branch(0).valid, branch(0).value,
- Mux(branch(1).valid, branch(1).value,
- Mux(branch(2).valid, branch(2).value,
- branch(3).value))),
- Mux(branch(0).valid, branch(0).value + 4.U,
- Mux(branch(1).valid, branch(1).value + 4.U,
- Mux(branch(2).valid, branch(2).value + 4.U,
- branch(3).value + 4.U))),
- Mux(branch(0).valid, branch(0).value + 8.U,
- Mux(branch(1).valid, branch(1).value + 8.U,
- Mux(branch(2).valid, branch(2).value + 8.U,
- branch(3).value + 8.U))),
- Mux(branch(0).valid, branch(0).value + 12.U,
- Mux(branch(1).valid, branch(1).value + 12.U,
- Mux(branch(2).valid, branch(2).value + 12.U,
- branch(3).value + 12.U))))
+
+ val addr = VecInit((0 until branch.length).map(x =>
+ MuxCase(branch(branch.length - 1).value + (x * 4).U, (
+ (0 until branch.length - 1).map(y =>
+ branch(y).valid -> (branch(y).value + (x * 4).U)
+ )
+ ))))
val match0 = l0valid(addr(0)(indexMsb,indexLsb)) &&
addr(0)(tagMsb,tagLsb) === VecAt(l0tag, addr(0)(indexMsb,indexLsb))
- val match1 = l0valid(addr(3)(indexMsb,indexLsb)) &&
- addr(3)(tagMsb,tagLsb) === VecAt(l0tag, addr(3)(indexMsb,indexLsb))
+ val match1 = l0valid(addr(branch.length - 1)(indexMsb,indexLsb)) &&
+ addr(branch.length - 1)(tagMsb,tagLsb) === VecAt(l0tag, addr(branch.length - 1)(indexMsb,indexLsb))
- val vvalid = VecInit(Mux(addr(0)(4,2) <= 7.U, match0, match1),
- Mux(addr(0)(4,2) <= 6.U, match0, match1),
- Mux(addr(0)(4,2) <= 5.U, match0, match1),
- Mux(addr(0)(4,2) <= 4.U, match0, match1))
+ val vvalid = VecInit((0 until branch.length).map(i =>
+ Mux(addr(0)(4,2) <= (7 - i).U, match0, match1))) // addr(4,2) is the word index in a fetch line; lane i uses match0 while index + i <= 7 (same rule as nxtInstValid).
val muxbits0 = VecAt(l0data, addr(0)(indexMsb,indexLsb))
- val muxbits1 = VecAt(l0data, addr(3)(indexMsb,indexLsb))
+ val muxbits1 = VecAt(l0data, addr(branch.length - 1)(indexMsb,indexLsb))
val muxbits = Wire(Vec(16, UInt(p.instructionBits.W)))
for (i <- 0 until 8) {
@@ -377,8 +339,8 @@
muxbits(i + 8) := muxbits1(31 + offset, offset)
}
- val bits = Wire(Vec(4, UInt(p.instructionBits.W)))
- for (i <- 0 until 4) {
+ val bits = Wire(Vec(branch.length, UInt(p.instructionBits.W)))
+ for (i <- 0 until branch.length) {
val idx = Cat(addr(0)(5) =/= addr(i)(5), addr(i)(4,2))
bits(i) := VecAt(muxbits, idx)
}
@@ -399,21 +361,17 @@
(jal || ret || bxx, target)
}
- val (brchTakenDe0, brchTargetDe0) = PredecodeDe(instAddr(0), instBits(0))
- val (brchTakenDe1, brchTargetDe1) = PredecodeDe(instAddr(1), instBits(1))
- val (brchTakenDe2, brchTargetDe2) = PredecodeDe(instAddr(2), instBits(2))
- val (brchTakenDe3, brchTargetDe3) = PredecodeDe(instAddr(3), instBits(3))
+ val brchDe = (0 until p.instructionLanes).map(x => PredecodeDe(instAddr(x), instBits(x)))
+ val brchTakensDe = brchDe.map { case (taken, target) => taken }
+ val brchTargetsDe = brchDe.map { case (taken, target) => target }
- val brchTakenDeOr =
- io.inst.lanes(0).valid && io.inst.lanes(0).ready && brchTakenDe0 ||
- io.inst.lanes(1).valid && io.inst.lanes(1).ready && brchTakenDe1 ||
- io.inst.lanes(2).valid && io.inst.lanes(2).ready && brchTakenDe2 ||
- io.inst.lanes(3).valid && io.inst.lanes(3).ready && brchTakenDe3
+ val brchTakenDeOr = (0 until p.instructionLanes).map(x =>
+ io.inst.lanes(x).ready && io.inst.lanes(x).valid && brchTakensDe(x)
+ ).reduce(_ || _)
- val brchTargetDe = Mux(brchTakenDe0, brchTargetDe0,
- Mux(brchTakenDe1, brchTargetDe1,
- Mux(brchTakenDe2, brchTargetDe2,
- brchTargetDe3)))
+ val brchTargetDe = MuxCase(brchTargetsDe(p.instructionLanes - 1),
+ (0 until p.instructionLanes - 1).map(x => brchTakensDe(x) -> brchTargetsDe(x))
+ )
val (brchTakenDe, brchValidDe, brchAddrDe, brchBitsDe) =
BranchMatchDe(brchTakenDeOr, brchTargetDe)
@@ -421,21 +379,27 @@
val (brchTakenEx, brchValidEx, brchAddrEx, brchBitsEx) =
BranchMatchEx(io.branch)
+
val brchValidDeMask =
- Cat(!brchTakenDe0 && !brchTakenDe1 && !brchTakenDe2,
- !brchTakenDe0 && !brchTakenDe1,
- !brchTakenDe0,
- true.B)
+ Cat((0 until p.instructionLanes).reverse.map(x =>
+ if (x == 0) { true.B } else {
+ (0 until x).map(y =>
+ !brchTakensDe(y)
+ ).reduce(_ && _)
+ }
+ ))
- val brchFwd = Cat(
- brchTakenDe3 && !brchTakenDe0 && !brchTakenDe1 && !brchTakenDe2,
- brchTakenDe2 && !brchTakenDe0 && !brchTakenDe1,
- brchTakenDe1 && !brchTakenDe0,
- brchTakenDe0)
+ val brchFwd =
+ Cat((0 until p.instructionLanes).reverse.map(x =>
+ brchTakensDe(x) && (if (x == 0) { true.B } else { (0 until x).map(y => !brchTakensDe(y)).reduce(_ && _) })
+ ))
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
// 1, 11, 111, ...
- nxtInstValid(i) := Mux(nxtInstAddr(0)(4,2) <= (7 - i).U, nxtMatch0, nxtMatch1)
+ nxtInstValid(i) := Mux(
+ nxtInstAddr(0)(4,2) <= (7 - i).U,
+ nxtMatch0,
+ nxtMatch1)
val nxtInstValidUInt = nxtInstValid.asUInt
instValid(i) := Mux(brchTakenEx, brchValidEx(i,0) === ~0.U((i+1).W),
@@ -457,14 +421,11 @@
// This pattern of separate when() blocks requires resets after the data.
when (reset.asBool) {
val addr = Cat(io.csr.value(0)(31,2), 0.U(2.W))
- instAddr(0) := addr
- instAddr(1) := addr + 4.U
- instAddr(2) := addr + 8.U
- instAddr(3) := addr + 12.U
+ instAddr := (0 until p.instructionLanes).map(i => addr + (4 * i).U)
}
// Outputs
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
io.inst.lanes(i).valid := instValid(i) & brchValidDeMask(i)
io.inst.lanes(i).addr := instAddr(i)
io.inst.lanes(i).inst := instBits(i)
@@ -472,23 +433,19 @@
}
// Assertions.
- assert(instAddr(0) + 4.U === instAddr(1))
- assert(instAddr(0) + 8.U === instAddr(2))
- assert(instAddr(0) + 12.U === instAddr(3))
+ for (i <- 1 until p.instructionLanes) {
+ assert(instAddr(0) + (4 * i).U === instAddr(i))
+ }
- assert(fsel.getWidth == 5)
+ assert(fsel.getWidth == (p.instructionLanes + 1))
assert(PopCount(fsel) <= 1.U)
val instValidUInt = instValid.asUInt
- assert(!(!instValidUInt(0) && (instValidUInt(3,1) =/= 0.U)))
- assert(!(!instValidUInt(1) && (instValidUInt(3,2) =/= 0.U)))
- assert(!(!instValidUInt(2) && (instValidUInt(3,3) =/= 0.U)))
-
- val instLanesReady = Cat(io.inst.lanes(3).ready, io.inst.lanes(2).ready,
- io.inst.lanes(1).ready, io.inst.lanes(0).ready)
- assert(!(!instLanesReady(0) && (instLanesReady(3,1) =/= 0.U)))
- assert(!(!instLanesReady(1) && (instLanesReady(3,2) =/= 0.U)))
- assert(!(!instLanesReady(2) && (instLanesReady(3,3) =/= 0.U)))
+ val instLanesReady = Cat((0 until p.instructionLanes).reverse.map(x => io.inst.lanes(x).ready))
+ for (i <- 0 until p.instructionLanes - 1) {
+ assert(!(!instValidUInt(i) && (instValidUInt(p.instructionLanes - 1, i + 1) =/= 0.U)))
+ assert(!(!instLanesReady(i) && (instLanesReady(p.instructionLanes - 1, i + 1) =/= 0.U)))
+ }
}
object EmitFetch extends App {
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 60aa158..b13364d 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -92,8 +92,8 @@
class Lsu(p: Parameters) extends Module {
val io = IO(new Bundle {
// Decode cycle.
- val req = Vec(4, new LsuIO(p))
- val busPort = Flipped(new RegfileBusPortIO)
+ val req = Vec(p.instructionLanes, new LsuIO(p))
+ val busPort = Flipped(new RegfileBusPortIO(p))
// Execute cycle(s).
val rd = Flipped(new RegfileWriteDataIO)
@@ -115,16 +115,18 @@
// AXI Queues.
val n = 8
- val ctrl = Fifo4(new LsuCtrl(p), n)
+ val ctrl = FifoX(new LsuCtrl(p), p.instructionLanes, n)
val data = Slice(new LsuReadData(p), true, true)
// Match and mask.
- val ctrlready = Cat(ctrl.io.count <= (n - 4).U,
- ctrl.io.count <= (n - 3).U,
- ctrl.io.count <= (n - 2).U,
- ctrl.io.count <= (n - 1).U)
+ // Lane i may enqueue only when at least (i + 1) FIFO slots are free, so
+ // ctrlready(0) is the least restrictive check (count <= n - 1) and the
+ // last lane the most restrictive. This keeps the same per-lane ordering
+ // as the previous Cat(...)-based bit vector, whose bit 0 was lane 0.
+ val ctrlready = (0 until p.instructionLanes).map(i =>
+ ctrl.io.count <= (n - 1 - i).U)
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
io.req(i).ready := ctrlready(i) && data.io.in.ready
}
@@ -137,7 +139,7 @@
ctrl.io.in.valid := io.req.map(_.valid).reduce(_||_)
val uncacheable = p.m.filter(x => !x.cacheable)
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
val uncached = io.busPort.addr(i)(31) ||
(if (uncacheable.length > 0) uncacheable.map(x => (io.busPort.addr(i) >= x.memStart.U) && (io.busPort.addr(i) < (x.memStart + x.memSize).U)).reduce(_||_) else false.B)
diff --git a/hdl/chisel/src/kelvin/scalar/Mlu.scala b/hdl/chisel/src/kelvin/scalar/Mlu.scala
index 349104d..b7ad953 100644
--- a/hdl/chisel/src/kelvin/scalar/Mlu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Mlu.scala
@@ -47,11 +47,11 @@
class Mlu(p: Parameters) extends Module {
val io = IO(new Bundle {
// Decode cycle.
- val req = Vec(4, new MluIO(p))
+ val req = Vec(p.instructionLanes, new MluIO(p))
// Execute cycle.
- val rs1 = Vec(4, Flipped(new RegfileReadDataIO))
- val rs2 = Vec(4, Flipped(new RegfileReadDataIO))
+ val rs1 = Vec(p.instructionLanes, Flipped(new RegfileReadDataIO))
+ val rs2 = Vec(p.instructionLanes, Flipped(new RegfileReadDataIO))
val rd = Flipped(new RegfileWriteDataIO)
})
@@ -62,41 +62,25 @@
val valid2 = RegInit(false.B)
val addr1 = Reg(UInt(5.W))
val addr2 = Reg(UInt(5.W))
- val sel = Reg(UInt(4.W))
+ val sel = Reg(UInt(p.instructionLanes.W))
+ val valids = io.req.map(_.valid)
+ assert(valids.length == p.instructionLanes)
valid1 := io.req.map(_.valid).reduce(_||_)
valid2 := valid1
- when (io.req(0).valid) {
- op := io.req(0).op
- addr1 := io.req(0).addr
- sel := 1.U
- } .elsewhen (io.req(1).valid) {
- op := io.req(1).op
- addr1 := io.req(1).addr
- sel := 2.U
- } .elsewhen (io.req(2).valid) {
- op := io.req(2).op
- addr1 := io.req(2).addr
- sel := 4.U
- } .elsewhen (io.req(3).valid) {
- op := io.req(3).op
- addr1 := io.req(3).addr
- sel := 8.U
+ when (valids.reduce(_||_)) {
+ val idx = PriorityEncoder(valids)
+ op := io.req(idx).op
+ addr1 := io.req(idx).addr
+ sel := (1.U << idx)
} .otherwise {
op := 0.U
sel := 0.U
}
- val rs1 = MuxOR(valid1 & sel(0), io.rs1(0).data) |
- MuxOR(valid1 & sel(1), io.rs1(1).data) |
- MuxOR(valid1 & sel(2), io.rs1(2).data) |
- MuxOR(valid1 & sel(3), io.rs1(3).data)
-
- val rs2 = MuxOR(valid1 & sel(0), io.rs2(0).data) |
- MuxOR(valid1 & sel(1), io.rs2(1).data) |
- MuxOR(valid1 & sel(2), io.rs2(2).data) |
- MuxOR(valid1 & sel(3), io.rs2(3).data)
+ val rs1 = (0 until p.instructionLanes).map(x => MuxOR(valid1 & sel(x), io.rs1(x).data)).reduce(_ | _)
+ val rs2 = (0 until p.instructionLanes).map(x => MuxOR(valid1 & sel(x), io.rs2(x).data)).reduce(_ | _)
// Multiplier has a registered output.
val mul2 = Reg(UInt(32.W))
@@ -142,7 +126,7 @@
io.rd.data := mul2 + round2
// Assertions.
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
assert(!(valid1 && sel(i) && !io.rs1(i).valid))
assert(!(valid1 && sel(i) && !io.rs2(i).valid))
}
diff --git a/hdl/chisel/src/kelvin/scalar/Regfile.scala b/hdl/chisel/src/kelvin/scalar/Regfile.scala
index 6dfdd00..7397bb7 100644
--- a/hdl/chisel/src/kelvin/scalar/Regfile.scala
+++ b/hdl/chisel/src/kelvin/scalar/Regfile.scala
@@ -62,9 +62,9 @@
val immed = Input(UInt(32.W))
}
-class RegfileBusPortIO extends Bundle {
- val addr = Output(Vec(4, UInt(32.W)))
- val data = Output(Vec(4, UInt(32.W)))
+class RegfileBusPortIO(p: Parameters) extends Bundle {
+ val addr = Output(Vec(p.instructionLanes, UInt(32.W)))
+ val data = Output(Vec(p.instructionLanes, UInt(32.W)))
}
class RegfileLinkPortIO extends Bundle {
@@ -79,18 +79,18 @@
class Regfile(p: Parameters) extends Module {
val io = IO(new Bundle {
// Decode cycle.
- val readAddr = Vec(8, new RegfileReadAddrIO)
- val readSet = Vec(8, new RegfileReadSetIO)
- val writeAddr = Vec(4, new RegfileWriteAddrIO)
- val busAddr = Vec(4, new RegfileBusAddrIO)
- val target = Vec(4, new RegfileBranchTargetIO)
+ val readAddr = Vec(p.instructionLanes * 2, new RegfileReadAddrIO)
+ val readSet = Vec(p.instructionLanes * 2, new RegfileReadSetIO)
+ val writeAddr = Vec(p.instructionLanes, new RegfileWriteAddrIO)
+ val busAddr = Vec(p.instructionLanes, new RegfileBusAddrIO)
+ val target = Vec(p.instructionLanes, new RegfileBranchTargetIO)
val linkPort = new RegfileLinkPortIO
- val busPort = new RegfileBusPortIO
+ val busPort = new RegfileBusPortIO(p)
// Execute cycle.
- val readData = Vec(8, new RegfileReadDataIO)
- val writeData = Vec(6, new RegfileWriteDataIO)
- val writeMask = Vec(4, new Bundle {val valid = Input(Bool())})
+ val readData = Vec(p.instructionLanes * 2, new RegfileReadDataIO)
+ val writeData = Vec(p.instructionLanes + 2, new RegfileWriteDataIO)
+ val writeMask = Vec(p.instructionLanes, new Bundle {val valid = Input(Bool())})
val scoreboard = new Bundle {
val regd = Output(UInt(32.W))
val comb = Output(UInt(32.W))
@@ -130,11 +130,11 @@
// ***************************************************************************
// The read port response.
// ***************************************************************************
- val readDataReady = RegInit(VecInit(Seq.fill(8){false.B}))
- val readDataBits = Reg(Vec(8, UInt(32.W)))
- val nxtReadDataBits = Wire(Vec(8, UInt(32.W)))
+ val readDataReady = RegInit(VecInit(Seq.fill(p.instructionLanes * 2){false.B}))
+ val readDataBits = Reg(Vec(p.instructionLanes * 2, UInt(32.W)))
+ val nxtReadDataBits = Wire(Vec(p.instructionLanes * 2, UInt(32.W)))
- for (i <- 0 until 8) {
+ for (i <- 0 until (p.instructionLanes * 2)) {
io.readData(i).valid := readDataReady(i)
io.readData(i).data := readDataBits(i)
}
@@ -149,18 +149,13 @@
writeData(0) := 0.U // regfile(0) is optimized away
for (i <- 1 until 32) {
- val valid = Cat(io.writeData(5).valid && io.writeData(5).addr === i.U,
- io.writeData(4).valid && io.writeData(4).addr === i.U,
- io.writeData(3).valid && io.writeData(3).addr === i.U &&
- !io.writeMask(3).valid,
- io.writeData(2).valid && io.writeData(2).addr === i.U &&
- !io.writeMask(2).valid,
- io.writeData(1).valid && io.writeData(1).addr === i.U &&
- !io.writeMask(1).valid,
- io.writeData(0).valid && io.writeData(0).addr === i.U &&
- !io.writeMask(0).valid)
+ val valid = Cat(
+ Array(io.writeData(p.instructionLanes + 1).valid && io.writeData(p.instructionLanes + 1).addr === i.U,
+ io.writeData(p.instructionLanes).valid && io.writeData(p.instructionLanes).addr === i.U) ++
+ (0 until p.instructionLanes).reverse.map(x => io.writeData(x).valid && io.writeData(x).addr === i.U && !io.writeMask(x).valid)
+ )
- val data = (0 until 6).map(x => MuxOR(valid(x), io.writeData(x).data)).reduce(_|_)
+ val data = (0 until p.instructionLanes + 2).map(x => MuxOR(valid(x), io.writeData(x).data)).reduce(_|_)
writeValid(i) := valid =/= 0.U
writeData(i) := data
@@ -177,21 +172,21 @@
// We care if someone tried to write x0 (e.g. nop is encoded this way), but want
// it separate for above mentioned optimization.
val x0 =
- (0 until 4).map(x =>
+ (0 until p.instructionLanes).map(x =>
io.writeData(x).valid &&
io.writeData(x).addr === 0.U &&
!io.writeMask(x).valid) ++
- (4 until 6).map(x => io.writeData(x).valid && io.writeData(x).addr === 0.U)
+ (p.instructionLanes until p.instructionLanes + 2).map(x => io.writeData(x).valid && io.writeData(x).addr === 0.U)
io.rfwriteCount := PopCount(writeValid) - writeValid(0) + PopCount(x0)
// ***************************************************************************
// Read ports with write forwarding.
// ***************************************************************************
- val rdata = Wire(Vec(8, UInt(32.W)))
- val wdata = Wire(Vec(8, UInt(32.W)))
- val rwdata = Wire(Vec(8, UInt(32.W)))
- for (i <- 0 until 8) {
+ val rdata = Wire(Vec((p.instructionLanes * 2), UInt(32.W)))
+ val wdata = Wire(Vec((p.instructionLanes * 2), UInt(32.W)))
+ val rwdata = Wire(Vec((p.instructionLanes * 2), UInt(32.W)))
+ for (i <- 0 until (p.instructionLanes * 2)) {
val idx = io.readAddr(i).addr
val write = VecAt(writeValid, idx)
rdata(i) := VecAt(regfile, idx)
@@ -199,7 +194,7 @@
rwdata(i) := Mux(write, wdata(i), rdata(i))
}
- for (i <- 0 until 8) {
+ for (i <- 0 until (p.instructionLanes * 2)) {
val setValid = io.readSet(i).valid
val setValue = io.readSet(i).value
@@ -215,23 +210,22 @@
}
// Bus port priority encoded address.
- val busAddr = Wire(Vec(4, UInt(32.W)))
- val busValid = Cat(io.busAddr(3).valid, io.busAddr(2).valid,
- io.busAddr(1).valid, io.busAddr(0).valid)
+ val busAddr = Wire(Vec(p.instructionLanes, UInt(32.W)))
+ val busValid = Cat((0 until p.instructionLanes).reverse.map(x => io.busAddr(x).valid))
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
busAddr(i) := Mux(io.busAddr(i).bypass, rwdata(2 * i),
Mux(io.busAddr(i).immen, rdata(2 * i) + io.busAddr(i).immed,
rdata(2 * i)))
}
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
io.busPort.addr(i) := busAddr(i)
io.busPort.data(i) := nxtReadDataBits(2 * i + 1)
}
// Branch target address combinatorial.
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
io.target(i).data := busAddr(i)
}
@@ -244,12 +238,12 @@
// ***************************************************************************
// Assertions.
// ***************************************************************************
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
assert(busAddr(i).getWidth == p.lsuAddrBits)
}
- for (i <- 0 until 6) {
- for (j <- (i+1) until 6) {
+ for (i <- 0 until p.instructionLanes + 2) {
+ for (j <- (i + 1) until p.instructionLanes + 2) {
// Delay the failure a cycle for debugging purposes.
val write_fail = RegInit(false.B)
write_fail := io.writeData(i).valid && io.writeData(j).valid &&
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala
index 3f0f678..d9e2c32 100644
--- a/hdl/chisel/src/kelvin/scalar/SCore.scala
+++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -50,9 +50,9 @@
// The functional units that make up the core.
val regfile = Regfile(p)
val fetch = Fetch(p)
- val decode = Seq(Decode(p, 0), Decode(p, 1), Decode(p, 2), Decode(p, 3))
- val alu = Seq.fill(4)(Alu(p))
- val bru = Seq.fill(4)(Bru(p))
+ val decode = (0 until p.instructionLanes).map(x => Decode(p, x)) // One decoder per instruction lane, indexed by lane number.
+ val alu = Seq.fill(p.instructionLanes)(Alu(p))
+ val bru = Seq.fill(p.instructionLanes)(Bru(p))
val csr = Csr(p)
val lsu = Lsu(p)
val mlu = Mlu(p)
@@ -77,15 +77,15 @@
io.dflush.clean := lsu.io.flush.clean
lsu.io.flush.ready := io.dflush.ready
- assert(!bru(1).io.iflush)
- assert(!bru(2).io.iflush)
- assert(!bru(3).io.iflush)
+ for (i <- 1 until p.instructionLanes) {
+ assert(!bru(i).io.iflush)
+ }
// ---------------------------------------------------------------------------
// Fetch
fetch.io.csr := io.csr.in
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
fetch.io.branch(i) := bru(i).io.taken
}
@@ -97,7 +97,7 @@
// Decode
val mask = VecInit(decode.map(_.io.inst.ready).scan(true.B)(_ && _))
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
decode(i).io.inst.valid := fetch.io.inst.lanes(i).valid && mask(i)
fetch.io.inst.lanes(i).ready := decode(i).io.inst.ready && mask(i)
decode(i).io.inst.addr := fetch.io.inst.lanes(i).addr
@@ -110,31 +110,31 @@
// Interlock based on regfile write port dependencies.
decode(0).io.interlock := bru(0).io.interlock
- decode(1).io.interlock := decode(0).io.interlock
- decode(2).io.interlock := decode(1).io.interlock
- decode(3).io.interlock := decode(2).io.interlock
+ for (i <- 1 until p.instructionLanes) {
+ decode(i).io.interlock := decode(i - 1).io.interlock
+ }
// Serialize opcodes with only one pipeline.
decode(0).io.serializeIn.defaults()
- decode(1).io.serializeIn := decode(0).io.serializeOut
- decode(2).io.serializeIn := decode(1).io.serializeOut
- decode(3).io.serializeIn := decode(2).io.serializeOut
+ for (i <- 1 until p.instructionLanes) {
+ decode(i).io.serializeIn := decode(i - 1).io.serializeOut
+ }
// In decode update multi-issue scoreboard state.
val scoreboard_spec = decode.map(_.io.scoreboard.spec).scan(0.U)(_|_)
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
decode(i).io.scoreboard.comb := regfile.io.scoreboard.comb | scoreboard_spec(i)
decode(i).io.scoreboard.regd := regfile.io.scoreboard.regd | scoreboard_spec(i)
}
decode(0).io.mactive := io.vcore.mactive
- decode(1).io.mactive := false.B
- decode(2).io.mactive := false.B
- decode(3).io.mactive := false.B
+ for (i <- 1 until p.instructionLanes) {
+ decode(i).io.mactive := false.B
+ }
// ---------------------------------------------------------------------------
// ALU
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
alu(i).io.req := decode(i).io.alu
alu(i).io.rs1 := regfile.io.readData(2 * i + 0)
alu(i).io.rs2 := regfile.io.readData(2 * i + 1)
@@ -142,7 +142,7 @@
// ---------------------------------------------------------------------------
// Branch Unit
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
bru(i).io.req := decode(i).io.bru
bru(i).io.rs1 := regfile.io.readData(2 * i + 0)
bru(i).io.rs2 := regfile.io.readData(2 * i + 1)
@@ -150,9 +150,9 @@
}
bru(0).io.csr <> csr.io.bru
- bru(1).io.csr.defaults()
- bru(2).io.csr.defaults()
- bru(3).io.csr.defaults()
+ for (i <- 1 until p.instructionLanes) {
+ bru(i).io.csr.defaults()
+ }
io.iflush.valid := iflush
@@ -181,7 +181,7 @@
// Load/Store Unit
lsu.io.busPort := regfile.io.busPort
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
lsu.io.req(i).valid := decode(i).io.lsu.valid
lsu.io.req(i).store := decode(i).io.lsu.store
lsu.io.req(i).addr := decode(i).io.lsu.addr
@@ -191,7 +191,7 @@
// ---------------------------------------------------------------------------
// Multiplier Unit
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
mlu.io.req(i) := decode(i).io.mlu
mlu.io.rs1(i) := regfile.io.readData(2 * i)
mlu.io.rs2(i) := regfile.io.readData((2 * i) + 1)
@@ -205,13 +205,13 @@
dvu.io.rd.ready := !mlu.io.rd.valid
// TODO: make port conditional on pipeline index.
- for (i <- 1 until 4) {
+ for (i <- 1 until p.instructionLanes) {
decode(i).io.dvu.ready := false.B
}
// ---------------------------------------------------------------------------
// Register File
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
regfile.io.readAddr(2 * i + 0) := decode(i).io.rs1Read
regfile.io.readAddr(2 * i + 1) := decode(i).io.rs2Read
regfile.io.readSet(2 * i + 0) := decode(i).io.rs1Set
@@ -245,27 +245,29 @@
io.vcore.rd(i).valid) <= 1.U)
}
- regfile.io.writeData(4).valid := mlu.io.rd.valid || dvu.io.rd.valid
- regfile.io.writeData(4).addr := Mux(mlu.io.rd.valid, mlu.io.rd.addr, dvu.io.rd.addr)
- regfile.io.writeData(4).data := Mux(mlu.io.rd.valid, mlu.io.rd.data, dvu.io.rd.data)
+ val mluDvuOffset = p.instructionLanes
+ regfile.io.writeData(mluDvuOffset).valid := mlu.io.rd.valid || dvu.io.rd.valid
+ regfile.io.writeData(mluDvuOffset).addr := Mux(mlu.io.rd.valid, mlu.io.rd.addr, dvu.io.rd.addr)
+ regfile.io.writeData(mluDvuOffset).data := Mux(mlu.io.rd.valid, mlu.io.rd.data, dvu.io.rd.data)
assert(!(mlu.io.rd.valid && (dvu.io.rd.valid && dvu.io.rd.ready))) // TODO: stall dvu on mlu write
- regfile.io.writeData(5).valid := lsu.io.rd.valid
- regfile.io.writeData(5).addr := lsu.io.rd.addr
- regfile.io.writeData(5).data := lsu.io.rd.data
+ val lsuOffset = p.instructionLanes + 1
+ regfile.io.writeData(lsuOffset).valid := lsu.io.rd.valid
+ regfile.io.writeData(lsuOffset).addr := lsu.io.rd.addr
+ regfile.io.writeData(lsuOffset).data := lsu.io.rd.data
val writeMask = bru.map(_.io.taken.valid).scan(false.B)(_||_)
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
regfile.io.writeMask(i).valid := writeMask(i)
}
// ---------------------------------------------------------------------------
// Vector Extension
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
io.vcore.vinst(i) <> decode(i).io.vinst
}
- for (i <- 0 until 8) {
+ for (i <- 0 until p.instructionLanes * 2) {
io.vcore.rs(i) := regfile.io.readData(i)
}
@@ -301,36 +303,23 @@
cycles := cycles + 1.U
io.debug.cycles := cycles
- val debugEn = RegInit(0.U(4.W))
- val debugAddr = Reg(Vec(4, UInt(32.W)))
- val debugInst = Reg(Vec(4, UInt(32.W)))
+ val debugEn = RegInit(0.U(p.instructionLanes.W))
+ val debugAddr = Reg(Vec(p.instructionLanes, UInt(32.W)))
+ val debugInst = Reg(Vec(p.instructionLanes, UInt(32.W)))
- val debugBrch =
- Cat(bru(0).io.taken.valid || bru(1).io.taken.valid || bru(2).io.taken.valid,
- bru(0).io.taken.valid || bru(1).io.taken.valid,
- bru(0).io.taken.valid,
- false.B)
+ val debugBrch = Cat(bru.map(_.io.taken.valid).scan(false.B)(_ || _).take(p.instructionLanes).reverse) // bit i = some lane below i took a branch; bit 0 is always false
- debugEn := Cat(fetch.io.inst.lanes(3).valid && fetch.io.inst.lanes(3).ready && !branchTaken,
- fetch.io.inst.lanes(2).valid && fetch.io.inst.lanes(2).ready && !branchTaken,
- fetch.io.inst.lanes(1).valid && fetch.io.inst.lanes(1).ready && !branchTaken,
- fetch.io.inst.lanes(0).valid && fetch.io.inst.lanes(0).ready && !branchTaken)
+ debugEn := Cat(fetch.io.inst.lanes.map(x => x.valid && x.ready && !branchTaken).reverse) // Cat is MSB-first: reverse so lane 0 lands in bit 0
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
debugAddr(i) := fetch.io.inst.lanes(i).addr
debugInst(i) := fetch.io.inst.lanes(i).inst
}
io.debug.en := debugEn & ~debugBrch
- io.debug.addr0 := debugAddr(0)
- io.debug.addr1 := debugAddr(1)
- io.debug.addr2 := debugAddr(2)
- io.debug.addr3 := debugAddr(3)
- io.debug.inst0 := debugInst(0)
- io.debug.inst1 := debugInst(1)
- io.debug.inst2 := debugInst(2)
- io.debug.inst3 := debugInst(3)
+ io.debug.addr <> debugAddr
+ io.debug.inst <> debugInst
}
object EmitSCore extends App {
diff --git a/hdl/chisel/src/kelvin/vector/VAlu.scala b/hdl/chisel/src/kelvin/vector/VAlu.scala
index 03eae95..03f6f36 100644
--- a/hdl/chisel/src/kelvin/vector/VAlu.scala
+++ b/hdl/chisel/src/kelvin/vector/VAlu.scala
@@ -30,15 +30,15 @@
class VAlu(p: Parameters) extends Module {
val io = IO(new Bundle {
// Instructions.
- val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+ val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits))))
val active = Output(UInt(64.W))
// VRegfile.
val vrfsb = Input(UInt(128.W))
- val read = Vec(7, new VRegfileReadIO(p))
- val write = Vec(4, new VRegfileWriteIO(p))
- val whint = Vec(4, new VRegfileWhintIO(p))
- val scalar = Vec(2, new VRegfileScalarIO(p))
+ val read = Vec(p.vectorReadPorts, new VRegfileReadIO(p))
+ val write = Vec(p.vectorWritePorts - 2, new VRegfileWriteIO(p))
+ val whint = Vec(p.vectorWhintPorts, new VRegfileWhintIO(p))
+ val scalar = Vec(p.vectorScalarPorts, new VRegfileScalarIO(p))
// Testbench signals.
val read_0_ready = Output(Bool())
@@ -56,26 +56,26 @@
// ---------------------------------------------------------------------------
// Tie-offs.
- for (i <- 0 until 7) {
+ for (i <- 0 until io.read.length) {
io.read(i).valid := false.B
io.read(i).addr := 0.U
io.read(i).tag := 0.U
}
- for (i <- 0 until 4) {
+ for (i <- 0 until io.write.length) {
io.write(i).valid := false.B
io.write(i).addr := 0.U
io.write(i).data := 0.U
}
- for (i <- 0 until 4) {
+ for (i <- 0 until io.whint.length) {
io.whint(i).valid := false.B
io.whint(i).addr := 0.U
}
// ---------------------------------------------------------------------------
// Opcode checks.
- for (i <- 0 until 4) {
+ for (i <- 0 until io.in.bits.length) {
when (io.in.valid && io.in.ready) {
when (io.in.bits(i).valid) {
val op = io.in.bits(i).bits.op
@@ -254,8 +254,8 @@
active
}
- val q0 = VCmdq(cmdqDepth, new VAluCmdq, Fin0, Fout, Factive)
- val q1 = VCmdq(cmdqDepth, new VAluCmdq, Fin1, Fout, Factive)
+ val q0 = VCmdq(p, cmdqDepth, new VAluCmdq, Fin0, Fout, Factive)
+ val q1 = VCmdq(p, cmdqDepth, new VAluCmdq, Fin1, Fout, Factive)
q0.io.in.valid := io.in.valid && q1.io.in.ready
q1.io.in.valid := io.in.valid && q0.io.in.ready
@@ -278,20 +278,19 @@
// ---------------------------------------------------------------------------
// ALU Selection interleaving.
val alureg = RegInit(false.B)
- val alusel = Wire(Vec(5, Bool()))
+ val alusel = Wire(Vec(p.instructionLanes + 1, Bool()))
// Toggle if previous was valid and was not a synchronized dual command.
alusel(0) := alureg
- alusel(1) := Mux(io.in.bits(0).valid && !io.in.bits(0).bits.cmdsync, !alusel(0), alusel(0))
- alusel(2) := Mux(io.in.bits(1).valid && !io.in.bits(1).bits.cmdsync, !alusel(1), alusel(1))
- alusel(3) := Mux(io.in.bits(2).valid && !io.in.bits(2).bits.cmdsync, !alusel(2), alusel(2))
- alusel(4) := Mux(io.in.bits(3).valid && !io.in.bits(3).bits.cmdsync, !alusel(3), alusel(3))
-
- when (io.in.valid && io.in.ready) {
- alureg := alusel(4)
+ for (i <- 0 until p.instructionLanes) {
+ alusel(i + 1) := Mux(io.in.bits(i).valid && !io.in.bits(i).bits.cmdsync, !alusel(i), alusel(i))
}
- for (i <- 0 until 4) {
+ when (io.in.valid && io.in.ready) {
+ alureg := alusel(alusel.length - 1)
+ }
+
+ for (i <- 0 until p.instructionLanes) {
q0.io.in.bits(i).valid := io.in.bits(i).valid && (alusel(i) === 0.U || io.in.bits(i).bits.cmdsync)
q1.io.in.bits(i).valid := io.in.bits(i).valid && (alusel(i) === 1.U || io.in.bits(i).bits.cmdsync)
}
diff --git a/hdl/chisel/src/kelvin/vector/VCmdq.scala b/hdl/chisel/src/kelvin/vector/VCmdq.scala
index 20e29b3..261ba63 100644
--- a/hdl/chisel/src/kelvin/vector/VCmdq.scala
+++ b/hdl/chisel/src/kelvin/vector/VCmdq.scala
@@ -27,14 +27,14 @@
// <factive> returns the activation status for decode dependencies.
object VCmdq {
- def apply[T <: Data](n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) = {
- Module(new VCmdq(n, t, fin, fout, factive))
+ def apply[T <: Data](p: Parameters, n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) = {
+ Module(new VCmdq(p, n, t, fin, fout, factive))
}
}
-class VCmdq[T <: Data](n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) extends Module {
+class VCmdq[T <: Data](p: Parameters, n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) extends Module {
val io = IO(new Bundle {
- val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+ val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits))))
val out = Decoupled(t)
val active = Output(UInt(64.W))
val nempty = Output(Bool())
@@ -45,7 +45,7 @@
val m = Output(Bool()) // stripmine
}
- val f = Fifo4e(new VCmdqWrapper, n)
+ val f = FifoXe(new VCmdqWrapper, p.instructionLanes, n)
val active = RegInit(0.U(64.W))
@@ -65,7 +65,7 @@
f.io.in.valid := io.in.valid
io.in.ready := f.io.in.ready
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
f.io.in.bits(i).valid := io.in.bits(i).valid
f.io.in.bits(i).bits.tin := fin(io.in.bits(i).bits)
f.io.in.bits(i).bits.m := io.in.bits(i).bits.m
@@ -118,14 +118,10 @@
when (io.in.valid && io.in.ready || io.out.valid && io.out.ready) {
val fvalid = MuxOR(f.io.in.valid && f.io.in.ready,
- Cat(f.io.in.bits(3).valid, f.io.in.bits(2).valid,
- f.io.in.bits(1).valid, f.io.in.bits(0).valid))
+ Cat((0 until p.instructionLanes).reverse.map(x => f.io.in.bits(x).valid)))
- active :=
- MuxOR(fvalid(0), factive(f.io.in.bits(0).bits.tin, f.io.in.bits(0).bits.m, step0)) |
- MuxOR(fvalid(1), factive(f.io.in.bits(1).bits.tin, f.io.in.bits(1).bits.m, step0)) |
- MuxOR(fvalid(2), factive(f.io.in.bits(2).bits.tin, f.io.in.bits(2).bits.m, step0)) |
- MuxOR(fvalid(3), factive(f.io.in.bits(3).bits.tin, f.io.in.bits(3).bits.m, step0)) |
+ active := (0 until p.instructionLanes).map(x =>
+ MuxOR(fvalid(x), factive(f.io.in.bits(x).bits.tin, f.io.in.bits(x).bits.m, step0))).reduce(_|_) |
ValueActive()
}
@@ -180,5 +176,6 @@
active
}
- ChiselStage.emitSystemVerilogFile(new VCmdq(8, new VCmdqTestBundle, VCmdqTestFin, VCmdqTestFout, VCmdqTestFactive), args)
+ val p = kelvin.Parameters()
+ ChiselStage.emitSystemVerilogFile(new VCmdq(p, 8, new VCmdqTestBundle, VCmdqTestFin, VCmdqTestFout, VCmdqTestFactive), args)
}
diff --git a/hdl/chisel/src/kelvin/vector/VConvCtrl.scala b/hdl/chisel/src/kelvin/vector/VConvCtrl.scala
index 1e017a4..ebea853 100644
--- a/hdl/chisel/src/kelvin/vector/VConvCtrl.scala
+++ b/hdl/chisel/src/kelvin/vector/VConvCtrl.scala
@@ -30,7 +30,7 @@
class VConvCtrl(p: Parameters) extends Module {
val io = IO(new Bundle {
// Instructions.
- val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+ val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits))))
val active = Output(UInt(64.W))
// RegisterFile.
@@ -160,7 +160,7 @@
active
}
- val q = VCmdq(cmdqDepth, new VConvCtrlCmdq, Fin, Fout, Factive)
+ val q = VCmdq(p, cmdqDepth, new VConvCtrlCmdq, Fin, Fout, Factive)
q.io.in <> io.in
diff --git a/hdl/chisel/src/kelvin/vector/VCore.scala b/hdl/chisel/src/kelvin/vector/VCore.scala
index 58bbab6..919cb27 100644
--- a/hdl/chisel/src/kelvin/vector/VCore.scala
+++ b/hdl/chisel/src/kelvin/vector/VCore.scala
@@ -28,11 +28,11 @@
class VCoreIO(p: Parameters) extends Bundle {
// Decode cycle.
- val vinst = Vec(4, new VInstIO)
+ val vinst = Vec(p.instructionLanes, new VInstIO)
// Execute cycle.
- val rs = Vec(8, Flipped(new RegfileReadDataIO))
- val rd = Vec(4, Flipped(new RegfileWriteDataIO))
+ val rs = Vec(p.instructionLanes * 2, Flipped(new RegfileReadDataIO))
+ val rd = Vec(p.instructionLanes, Flipped(new RegfileWriteDataIO))
// Status.
val mactive = Output(Bool())
@@ -97,7 +97,7 @@
vinst.io.out.stall := vdec.io.stall // decode backpressure
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
vdec.io.in.bits(i) := vinst.io.out.lane(i)
}
@@ -105,24 +105,24 @@
// ---------------------------------------------------------------------------
// VRegfile.
- for (i <- 0 until 7) {
+ for (i <- 0 until vrf.readPorts) {
vrf.io.read(i).valid := false.B
vrf.io.read(i).addr := 0.U
vrf.io.read(i).tag := 0.U
}
- for (i <- 0 until 6) {
+ for (i <- 0 until vrf.writePorts) {
vrf.io.write(i).valid := false.B
vrf.io.write(i).addr := 0.U
vrf.io.write(i).data := 0.U
}
- for (i <- 0 until 4) {
+ for (i <- 0 until vrf.whintPorts) {
vrf.io.whint(i).valid := false.B
vrf.io.whint(i).addr := 0.U
}
- for (i <- 0 until 2) {
+ for (i <- 0 until vrf.scalarPorts) {
vrf.io.scalar(i).valid := false.B
vrf.io.scalar(i).data := 0.U
}
@@ -133,43 +133,38 @@
// ---------------------------------------------------------------------------
// VALU.
- val aluvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).alu,
- vdec.io.out(2).valid && vdec.io.cmdq(2).alu,
- vdec.io.out(1).valid && vdec.io.cmdq(1).alu,
- vdec.io.out(0).valid && vdec.io.cmdq(0).alu)
+ val aluvalid = (0 until p.instructionLanes).map(x => vdec.io.out(x).valid && vdec.io.cmdq(x).alu)
+ val aluready = (0 until p.instructionLanes).map(x => valu.io.in.ready && vdec.io.cmdq(x).alu)
- val aluready = Cat(valu.io.in.ready && vdec.io.cmdq(3).alu,
- valu.io.in.ready && vdec.io.cmdq(2).alu,
- valu.io.in.ready && vdec.io.cmdq(1).alu,
- valu.io.in.ready && vdec.io.cmdq(0).alu)
+ valu.io.in.valid := aluvalid.reduce(_ || _)
- valu.io.in.valid := aluvalid =/= 0.U
-
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
valu.io.in.bits(i).valid := aluvalid(i)
valu.io.in.bits(i).bits := vdec.io.out(i).bits
}
- for (i <- 0 until 7) {
+ for (i <- 0 until vrf.readPorts) {
vrf.io.read(i).valid := valu.io.read(i).valid
vrf.io.read(i).addr := valu.io.read(i).addr
vrf.io.read(i).tag := valu.io.read(i).tag
}
- for (i <- 0 until 7) {
+ for (i <- 0 until vrf.readPorts) {
valu.io.read(i).data := vrf.io.read(i).data
}
- for (i <- 0 until 4) {
+ for (i <- 0 until vrf.writePorts - 2) {
vrf.io.write(i).valid := valu.io.write(i).valid
vrf.io.write(i).addr := valu.io.write(i).addr
vrf.io.write(i).data := valu.io.write(i).data
+ }
+ for (i <- 0 until vrf.whintPorts) {
vrf.io.whint(i).valid := valu.io.whint(i).valid
vrf.io.whint(i).addr := valu.io.whint(i).addr
}
- for (i <- 0 until 2) {
+ for (i <- 0 until vrf.scalarPorts) {
vrf.io.scalar(i).valid := valu.io.scalar(i).valid
vrf.io.scalar(i).data := valu.io.scalar(i).data
}
@@ -178,19 +173,12 @@
// ---------------------------------------------------------------------------
// VCONV.
- val convvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).conv,
- vdec.io.out(2).valid && vdec.io.cmdq(2).conv,
- vdec.io.out(1).valid && vdec.io.cmdq(1).conv,
- vdec.io.out(0).valid && vdec.io.cmdq(0).conv)
+ val convvalid = (0 until p.instructionLanes).map(x => vdec.io.out(x).valid && vdec.io.cmdq(x).conv)
+ val convready = (0 until p.instructionLanes).map(x => vconv.io.in.ready && vdec.io.cmdq(x).conv)
- val convready = Cat(vconv.io.in.ready && vdec.io.cmdq(3).conv,
- vconv.io.in.ready && vdec.io.cmdq(2).conv,
- vconv.io.in.ready && vdec.io.cmdq(1).conv,
- vconv.io.in.ready && vdec.io.cmdq(0).conv)
+ vconv.io.in.valid := convvalid.reduce(_ || _)
- vconv.io.in.valid := convvalid =/= 0.U
-
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
vconv.io.in.bits(i).valid := convvalid(i)
vconv.io.in.bits(i).bits := vdec.io.out(i).bits
}
@@ -201,25 +189,18 @@
// ---------------------------------------------------------------------------
// VLdSt.
- val ldstvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).ldst,
- vdec.io.out(2).valid && vdec.io.cmdq(2).ldst,
- vdec.io.out(1).valid && vdec.io.cmdq(1).ldst,
- vdec.io.out(0).valid && vdec.io.cmdq(0).ldst)
+ val ldstvalid = (0 until p.instructionLanes).map(x => vdec.io.out(x).valid && vdec.io.cmdq(x).ldst)
+ val ldstready = (0 until p.instructionLanes).map(x => vldst.io.in.ready && vdec.io.cmdq(x).ldst)
- val ldstready = Cat(vldst.io.in.ready && vdec.io.cmdq(3).ldst,
- vldst.io.in.ready && vdec.io.cmdq(2).ldst,
- vldst.io.in.ready && vdec.io.cmdq(1).ldst,
- vldst.io.in.ready && vdec.io.cmdq(0).ldst)
+ vldst.io.in.valid := ldstvalid.reduce(_ || _)
- vldst.io.in.valid := ldstvalid =/= 0.U
-
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
vldst.io.in.bits(i).valid := ldstvalid(i)
vldst.io.in.bits(i).bits := vdec.io.out(i).bits
}
vldst.io.read.ready := !vst.io.read.valid
- vldst.io.read.data := vrf.io.read(6).data
+ vldst.io.read.data := vrf.io.read(vrf.readPorts - 1).data
vldst.io.vrfsb := vrf.io.vrfsb.data
@@ -228,22 +209,12 @@
// ---------------------------------------------------------------------------
// VLd.
- val ldvalid = Wire(UInt(4.W))
- val ldready = Wire(UInt(4.W))
+ val ldvalid = (0 until p.instructionLanes).map(x => vdec.io.cmdq(x).ld && vdec.io.out(x).valid)
+ val ldready = (0 until p.instructionLanes).map(x => vdec.io.cmdq(x).ld && vld.io.in.ready)
- ldvalid := Cat(vdec.io.cmdq(3).ld && vdec.io.out(3).valid,
- vdec.io.cmdq(2).ld && vdec.io.out(2).valid,
- vdec.io.cmdq(1).ld && vdec.io.out(1).valid,
- vdec.io.cmdq(0).ld && vdec.io.out(0).valid)
+ vld.io.in.valid := ldvalid.reduce(_ || _)
- ldready := Cat(vdec.io.cmdq(3).ld && vld.io.in.ready,
- vdec.io.cmdq(2).ld && vld.io.in.ready,
- vdec.io.cmdq(1).ld && vld.io.in.ready,
- vdec.io.cmdq(0).ld && vld.io.in.ready)
-
- vld.io.in.valid := ldvalid =/= 0.U
-
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
vld.io.in.bits(i).valid := ldvalid(i)
vld.io.in.bits(i).bits := vdec.io.out(i).bits
}
@@ -252,22 +223,12 @@
// ---------------------------------------------------------------------------
// VSt.
- val stvalid = Wire(UInt(4.W))
- val stready = Wire(UInt(4.W))
+ val stvalid = (0 until p.instructionLanes).map(x => vdec.io.out(x).valid && vdec.io.cmdq(x).st)
+ val stready = (0 until p.instructionLanes).map(x => vst.io.in.ready && vdec.io.cmdq(x).st)
- stvalid := Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).st,
- vdec.io.out(2).valid && vdec.io.cmdq(2).st,
- vdec.io.out(1).valid && vdec.io.cmdq(1).st,
- vdec.io.out(0).valid && vdec.io.cmdq(0).st)
+ vst.io.in.valid := stvalid.reduce(_ || _)
- stready := Cat(vst.io.in.ready && vdec.io.cmdq(3).st,
- vst.io.in.ready && vdec.io.cmdq(2).st,
- vst.io.in.ready && vdec.io.cmdq(1).st,
- vst.io.in.ready && vdec.io.cmdq(0).st)
-
- vst.io.in.valid := stvalid =/= 0.U
-
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
vst.io.in.bits(i).valid := stvalid(i)
vst.io.in.bits(i).bits := vdec.io.out(i).bits
}
@@ -277,29 +238,29 @@
vst.io.vrfsb := vrf.io.vrfsb.data
vst.io.read.ready := true.B
- vst.io.read.data := vrf.io.read(6).data
+ vst.io.read.data := vrf.io.read(vrf.readPorts - 1).data
// ---------------------------------------------------------------------------
// Load write.
- vrf.io.write(4).valid := vldst.io.write.valid
- vrf.io.write(4).addr := vldst.io.write.addr
- vrf.io.write(4).data := vldst.io.write.data
+ vrf.io.write(vrf.writePorts - 2).valid := vldst.io.write.valid // second-to-last write port: VLdSt (ports below it are driven by the VALU)
+ vrf.io.write(vrf.writePorts - 2).addr := vldst.io.write.addr
+ vrf.io.write(vrf.writePorts - 2).data := vldst.io.write.data
- vrf.io.write(5).valid := vld.io.write.valid
- vrf.io.write(5).addr := vld.io.write.addr
- vrf.io.write(5).data := vld.io.write.data
+ vrf.io.write(vrf.writePorts - 1).valid := vld.io.write.valid // last write port: VLd
+ vrf.io.write(vrf.writePorts - 1).addr := vld.io.write.addr
+ vrf.io.write(vrf.writePorts - 1).data := vld.io.write.data
// ---------------------------------------------------------------------------
// Store read.
- vrf.io.read(6).valid := vst.io.read.valid || vldst.io.read.valid
- vrf.io.read(6).addr := Mux(vst.io.read.valid, vst.io.read.addr,
+ vrf.io.read(vrf.readPorts - 1).valid := vst.io.read.valid || vldst.io.read.valid
+ vrf.io.read(vrf.readPorts - 1).addr := Mux(vst.io.read.valid, vst.io.read.addr,
vldst.io.read.addr)
- vrf.io.read(6).tag := Mux(vst.io.read.valid, vst.io.read.tag,
+ vrf.io.read(vrf.readPorts - 1).tag := Mux(vst.io.read.valid, vst.io.read.tag,
vldst.io.read.tag)
// ---------------------------------------------------------------------------
// VDecode.
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
vdec.io.out(i).ready := aluready(i) || convready(i) || ldstready(i) ||
ldready(i) || stready(i)
}
diff --git a/hdl/chisel/src/kelvin/vector/VDecode.scala b/hdl/chisel/src/kelvin/vector/VDecode.scala
index fa48723..44d6afc 100644
--- a/hdl/chisel/src/kelvin/vector/VDecode.scala
+++ b/hdl/chisel/src/kelvin/vector/VDecode.scala
@@ -18,7 +18,7 @@
import chisel3._
import chisel3.util._
-import common.Fifo4x4
+import common.FifoIxO
import _root_.circt.stage.ChiselStage
object VDecode {
@@ -29,10 +29,10 @@
class VDecode(p: Parameters) extends Module {
val io = IO(new Bundle {
- val in = Flipped(Decoupled(Vec(4, Valid(new VectorInstructionLane))))
- val out = Vec(4, Decoupled(new VDecodeBits))
- val cmdq = Vec(4, Output(new VDecodeCmdq))
- val actv = Vec(4, Output(new VDecodeActive)) // used in testbench
+ val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VectorInstructionLane))))
+ val out = Vec(p.instructionLanes, Decoupled(new VDecodeBits))
+ val cmdq = Vec(p.instructionLanes, Output(new VDecodeCmdq))
+ val actv = Vec(p.instructionLanes, Output(new VDecodeActive)) // used in testbench
val stall = Output(Bool())
val active = Input(UInt(64.W))
val vrfsb = new VRegfileScoreboardIO
@@ -45,27 +45,24 @@
val enc = new VEncodeOp()
- val f = Fifo4x4(new VectorInstructionLane, depth)
+ val f = FifoIxO(new VectorInstructionLane, p.instructionLanes, p.instructionLanes, depth)
- val d = Seq(Module(new VDecodeInstruction(p)),
- Module(new VDecodeInstruction(p)),
- Module(new VDecodeInstruction(p)),
- Module(new VDecodeInstruction(p)))
+ val d = Seq.fill(p.instructionLanes)(Module(new VDecodeInstruction(p)))
- val e = Wire(Vec(4, new VDecodeBits))
+ val e = Wire(Vec(p.instructionLanes, new VDecodeBits))
- val valid = RegInit(VecInit(Seq.fill(4)(false.B)))
- val data = Reg(Vec(4, new VDecodeBits))
- val cmdq = Reg(Vec(4, new VDecodeCmdq))
- val actv = Wire(Vec(4, new VDecodeActive))
- val actv2 = Reg(Vec(4, new VDecodeActive2))
- val dataNxt = Wire(Vec(4, new VDecodeBits))
- val cmdqNxt = Wire(Vec(4, new VDecodeCmdq))
- val actvNxt = Wire(Vec(4, new VDecodeActive2))
+ val valid = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+ val data = Reg(Vec(p.instructionLanes, new VDecodeBits))
+ val cmdq = Reg(Vec(p.instructionLanes, new VDecodeCmdq))
+ val actv = Wire(Vec(p.instructionLanes, new VDecodeActive))
+ val actv2 = Reg(Vec(p.instructionLanes, new VDecodeActive2))
+ val dataNxt = Wire(Vec(p.instructionLanes, new VDecodeBits))
+ val cmdqNxt = Wire(Vec(p.instructionLanes, new VDecodeCmdq))
+ val actvNxt = Wire(Vec(p.instructionLanes, new VDecodeActive2))
// ---------------------------------------------------------------------------
// Decode.
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
d(i).io.in := f.io.out(i).bits
}
@@ -75,24 +72,11 @@
// write the read usage is occurring on.
val tagReg = RegInit(0.U(64.W))
- val tag0 = tagReg
- val tag1 = tag0 ^ d(0).io.actv.wactive
- val tag2 = tag1 ^ d(1).io.actv.wactive
- val tag3 = tag2 ^ d(2).io.actv.wactive
- val tag4 = tag3 ^ d(3).io.actv.wactive
-
- val tags = Seq(tag0, tag1, tag2, tag3, tag4)
+ val tags = (0 until p.instructionLanes).map(x => d(x).io.actv.wactive).scan(tagReg)(_ ^ _)
+ assert(tags.length == p.instructionLanes + 1)
// f.io.out is ordered, so can use a priority tree.
- when(f.io.out(3).valid && f.io.out(3).ready) {
- tagReg := tag4
- } .elsewhen(f.io.out(2).valid && f.io.out(2).ready) {
- tagReg := tag3
- } .elsewhen(f.io.out(1).valid && f.io.out(1).ready) {
- tagReg := tag2
- } .elsewhen(f.io.out(0).valid && f.io.out(0).ready) {
- tagReg := tag1
- }
+ tagReg := MuxCase(tags(0), (0 until p.instructionLanes).reverse.map(x => (f.io.out(x).valid && f.io.out(x).ready) -> tags(x + 1)))
def TagAddr(tag: UInt, v: VAddrTag): VAddrTag = {
assert(tag.getWidth == 64)
@@ -111,7 +95,7 @@
r
}
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
e(i) := d(i).io.out
e(i).vs := TagAddr(tags(i), d(i).io.out.vs)
e(i).vt := TagAddr(tags(i), d(i).io.out.vt)
@@ -123,34 +107,27 @@
// ---------------------------------------------------------------------------
// Undef. (io.in.ready ignored to signal as early as possible)
- io.undef := io.in.valid && (d(0).io.undef || d(1).io.undef || d(2).io.undef || d(3).io.undef)
+ io.undef := io.in.valid && d.map(x => x.io.undef).reduce(_ || _)
// ---------------------------------------------------------------------------
// Fifo.
f.io.in <> io.in
- val icount = MuxOR(io.in.valid, PopCount(Cat(io.in.bits(0).valid, io.in.bits(1).valid, io.in.bits(2).valid, io.in.bits(3).valid)))
- assert(icount.getWidth == 3)
+ val icount = MuxOR(io.in.valid,
+ PopCount(io.in.bits.map(_.valid))
+ )
- val ocount = PopCount(Cat(valid(0) && !(io.out(0).valid && io.out(0).ready),
- valid(1) && !(io.out(1).valid && io.out(1).ready),
- valid(2) && !(io.out(2).valid && io.out(2).ready),
- valid(3) && !(io.out(3).valid && io.out(3).ready)))
- assert(ocount.getWidth == 3)
+ val ocount = PopCount((0 until p.instructionLanes).map(x => valid(x) && !(io.out(x).valid && io.out(x).ready)))
- for (i <- 0 until 4) {
- f.io.out(i).ready := (i.U + ocount) < 4.U
+ for (i <- 0 until p.instructionLanes) {
+ f.io.out(i).ready := (i.U +& ocount) < p.instructionLanes.U // +& keeps the carry so the sum cannot wrap for non-power-of-two lane counts
}
// ---------------------------------------------------------------------------
// Valid.
- val fcount = PopCount(Cat(f.io.out(0).valid && f.io.out(0).ready,
- f.io.out(1).valid && f.io.out(1).ready,
- f.io.out(2).valid && f.io.out(2).ready,
- f.io.out(3).valid && f.io.out(3).ready))
- assert(fcount.getWidth == 3)
+ val fcount = PopCount(f.io.out.map(x => x.valid && x.ready))
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
valid(i) := (ocount + fcount) > i.U
}
@@ -159,41 +136,30 @@
io.stall := (f.io.count + icount) > (depth - guard).U
// ---------------------------------------------------------------------------
- // Dependencies.
- val depends = Wire(Vec(4, Bool()))
-
// Writes must not proceed past any outstanding reads or writes,
// or past any dispatching writes.
- val wactive0 = io.vrfsb.data(63, 0) | io.vrfsb.data(127, 64) | io.active
- val wactive1 = actv(0).ractive | actv(0).wactive | wactive0
- val wactive2 = actv(1).ractive | actv(1).wactive | wactive1
- val wactive3 = actv(2).ractive | actv(2).wactive | wactive2
- val wactive = VecInit(wactive0, wactive1, wactive2, wactive3)
+ val wactive = VecInit((0 until p.instructionLanes).map(x => actv(x).ractive | actv(x).wactive).scan(io.vrfsb.data(63,0) | io.vrfsb.data(127,64) | io.active)(_ | _))
// Reads must not proceed past any dispatching writes.
- val ractive0 = 0.U(64.W)
- val ractive1 = actv(0).wactive | ractive0
- val ractive2 = actv(1).wactive | ractive1
- val ractive3 = actv(2).wactive | ractive2
- val ractive = VecInit(ractive0, ractive1, ractive2, ractive3)
+ val ractive = VecInit((0 until p.instructionLanes).map(x => actv(x).wactive).scan(0.U(64.W))(_ | _))
- for (i <- 0 until 4) {
- depends(i) := (wactive(i) & actv(i).wactive) =/= 0.U ||
- (ractive(i) & actv(i).ractive) =/= 0.U
- }
+ // Dependencies.
+ val depends = VecInit((0 until p.instructionLanes).map(i =>
+ (wactive(i) & actv(i).wactive) =/= 0.U ||
+ (ractive(i) & actv(i).ractive) =/= 0.U
+ ))
// ---------------------------------------------------------------------------
// Data.
- val fvalid = VecInit(f.io.out(0).valid, f.io.out(1).valid,
- f.io.out(2).valid, f.io.out(3).valid).asUInt
- assert(!(fvalid(1) && fvalid(0,0) =/= 1.U))
- assert(!(fvalid(2) && fvalid(1,0) =/= 3.U))
- assert(!(fvalid(3) && fvalid(2,0) =/= 7.U))
+ val fvalid = VecInit(f.io.out.map(_.valid)).asUInt
+ for (i <- 0 until p.instructionLanes) {
+ assert(!(fvalid(i) && PopCount(fvalid(i,0)) =/= (i + 1).U))
+ }
// Register is updated when fifo has state or contents are active.
val dataEn = fvalid(0) || valid.asUInt =/= 0.U
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
when (dataEn) {
data(i) := dataNxt(i)
cmdq(i) := cmdqNxt(i)
@@ -201,14 +167,14 @@
}
}
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
actv(i).ractive := actv2(i).ractive
actv(i).wactive := actv2(i).wactive(63, 0) | actv2(i).wactive(127, 64)
}
// Tag the decode wactive.
- val dactv = Wire(Vec(4, new VDecodeActive2))
- for (i <- 0 until 4) {
+ val dactv = Wire(Vec(p.instructionLanes, new VDecodeActive2))
+ for (i <- 0 until p.instructionLanes) {
val w0 = d(i).io.actv.wactive & ~tags(i + 1)
val w1 = d(i).io.actv.wactive & tags(i + 1)
dactv(i).ractive := d(i).io.actv.ractive
@@ -216,155 +182,51 @@
}
// Data multiplexor of current values and fifo+decode output.
- val dataMux = VecInit(data(0), data(1), data(2), data(3),
- e(0), e(1), e(2), e(3))
+ val dataMux = VecInit(data ++ e)
+ val cmdqMux = VecInit(cmdq ++ d.map(x => x.io.cmdq))
+ val actvMux = VecInit(actv2 ++ dactv)
- val cmdqMux = VecInit(cmdq(0), cmdq(1), cmdq(2), cmdq(3),
- d(0).io.cmdq, d(1).io.cmdq, d(2).io.cmdq, d(3).io.cmdq)
-
- val actvMux = VecInit(actv2(0), actv2(1), actv2(2), actv2(3),
- dactv(0), dactv(1), dactv(2), dactv(3))
-
+ def GenerateMarked(start: Int, count: Int): Seq[UInt] = {
+ (0 until count).map(x => Wire(UInt((start + x).W)))
+ }
// Mark the multiplexor entries that need to be kept.
- val marked0 = Wire(UInt(5.W))
- val marked1 = Wire(UInt(6.W))
- val marked2 = Wire(UInt(7.W))
+ val marked = GenerateMarked((p.instructionLanes + 1), p.instructionLanes - 1)
+ val output = Cat((0 until p.instructionLanes).reverse.map(x => io.out(x).valid && io.out(x).ready))
+ val validNotOutput = (0 until (p.instructionLanes * 2) - 1).map(x =>
+ if (x < valid.length) { valid(x) && !output(x) } else { true.B })
+ val prevMarked = (0 until p.instructionLanes).map(x =>
+ if (x == 0) { None } else { Some(marked(x - 1)) }
+ )
- assert((marked1 & marked0) === marked0)
- assert((marked2 & marked0) === marked0)
- assert((marked2 & marked1) === marked1)
-
- val output = Cat(io.out(3).valid && io.out(3).ready,
- io.out(2).valid && io.out(2).ready,
- io.out(1).valid && io.out(1).ready,
- io.out(0).valid && io.out(0).ready)
-
- when (valid(0) && !output(0)) {
- dataNxt(0) := dataMux(0)
- cmdqNxt(0) := cmdqMux(0)
- actvNxt(0) := actvMux(0)
- marked0 := 0x01.U
- } .elsewhen (valid(1) && !output(1)) {
- dataNxt(0) := dataMux(1)
- cmdqNxt(0) := cmdqMux(1)
- actvNxt(0) := actvMux(1)
- marked0 := 0x03.U
- } .elsewhen (valid(2) && !output(2)) {
- dataNxt(0) := dataMux(2)
- cmdqNxt(0) := cmdqMux(2)
- actvNxt(0) := actvMux(2)
- marked0 := 0x07.U
- } .elsewhen (valid(3) && !output(3)) {
- dataNxt(0) := dataMux(3)
- cmdqNxt(0) := cmdqMux(3)
- actvNxt(0) := actvMux(3)
- marked0 := 0x0f.U
- } .otherwise {
- dataNxt(0) := dataMux(4)
- cmdqNxt(0) := cmdqMux(4)
- actvNxt(0) := actvMux(4)
- marked0 := 0x1f.U
- }
-
- when (!marked0(1) && valid(1) && !output(1)) {
- dataNxt(1) := dataMux(1)
- cmdqNxt(1) := cmdqMux(1)
- actvNxt(1) := actvMux(1)
- marked1 := 0x03.U
- } .elsewhen (!marked0(2) && valid(2) && !output(2)) {
- dataNxt(1) := dataMux(2)
- cmdqNxt(1) := cmdqMux(2)
- actvNxt(1) := actvMux(2)
- marked1 := 0x07.U
- } .elsewhen (!marked0(3) && valid(3) && !output(3)) {
- dataNxt(1) := dataMux(3)
- cmdqNxt(1) := cmdqMux(3)
- actvNxt(1) := actvMux(3)
- marked1 := 0x0f.U
- } .elsewhen (!marked0(4)) {
- dataNxt(1) := dataMux(4)
- cmdqNxt(1) := cmdqMux(4)
- actvNxt(1) := actvMux(4)
- marked1 := 0x1f.U
- } .otherwise {
- dataNxt(1) := dataMux(5)
- cmdqNxt(1) := cmdqMux(5)
- actvNxt(1) := actvMux(5)
- marked1 := 0x3f.U
- }
-
- when (!marked1(2) && valid(2) && !output(2)) {
- dataNxt(2) := dataMux(2)
- cmdqNxt(2) := cmdqMux(2)
- actvNxt(2) := actvMux(2)
- marked2 := 0x07.U
- } .elsewhen (!marked1(3) && valid(3) && !output(3)) {
- dataNxt(2) := dataMux(3)
- cmdqNxt(2) := cmdqMux(3)
- actvNxt(2) := actvMux(3)
- marked2 := 0x0f.U
- } .elsewhen (!marked1(4)) {
- dataNxt(2) := dataMux(4)
- cmdqNxt(2) := cmdqMux(4)
- actvNxt(2) := actvMux(4)
- marked2 := 0x1f.U
- } .elsewhen (!marked1(5)) {
- dataNxt(2) := dataMux(5)
- cmdqNxt(2) := cmdqMux(5)
- actvNxt(2) := actvMux(5)
- marked2 := 0x3f.U
- } .otherwise {
- dataNxt(2) := dataMux(6)
- cmdqNxt(2) := cmdqMux(6)
- actvNxt(2) := actvMux(6)
- marked2 := 0x7f.U
- }
-
- when (!marked2(3) && valid(3) && !output(3)) {
- dataNxt(3) := dataMux(3)
- cmdqNxt(3) := cmdqMux(3)
- actvNxt(3) := actvMux(3)
- } .elsewhen (!marked2(4)) {
- dataNxt(3) := dataMux(4)
- cmdqNxt(3) := cmdqMux(4)
- actvNxt(3) := actvMux(4)
- } .elsewhen (!marked2(5)) {
- dataNxt(3) := dataMux(5)
- cmdqNxt(3) := cmdqMux(5)
- actvNxt(3) := actvMux(5)
- } .elsewhen (!marked2(6)) {
- dataNxt(3) := dataMux(6)
- cmdqNxt(3) := cmdqMux(6)
- actvNxt(3) := actvMux(6)
- } .otherwise {
- dataNxt(3) := dataMux(7)
- cmdqNxt(3) := cmdqMux(7)
- actvNxt(3) := actvMux(7)
+ for (i <- 0 until p.instructionLanes) {
+ val idx = MuxCase((i + p.instructionLanes).U, (i until p.instructionLanes + i).map(x =>
+ (!prevMarked(i).getOrElse(false.B)(x) && validNotOutput(x)) -> (x).U
+ ))
+ dataNxt(i) := dataMux(idx)
+ cmdqNxt(i) := cmdqMux(idx)
+ actvNxt(i) := actvMux(idx)
+ if (i < marked.length) {
+ val width = marked(i).getWidth
+ marked(i) := ~0.U(width.W) >> ((width - 1).U - idx)
+ }
}
// ---------------------------------------------------------------------------
// Scoreboard.
- io.vrfsb.set.valid := output(0) || output(1) || output(2) || output(3)
+ // Set the scoreboard whenever any lane's io.out handshake (valid && ready) fires this cycle.
+ io.vrfsb.set.valid := output =/= 0.U
- io.vrfsb.set.bits := (MuxOR(output(0), actv2(0).wactive) |
- MuxOR(output(1), actv2(1).wactive) |
- MuxOR(output(2), actv2(2).wactive) |
- MuxOR(output(3), actv2(3).wactive))
+ io.vrfsb.set.bits := (0 until p.instructionLanes).map(x => MuxOR(output(x), actv2(x).wactive)).reduce(_ | _)
assert((io.vrfsb.set.bits(63, 0) & io.vrfsb.set.bits(127, 64)) === 0.U)
assert(((io.vrfsb.data(63, 0) | io.vrfsb.data(127, 64)) & (io.vrfsb.set.bits(63, 0) | io.vrfsb.set.bits(127, 64))) === 0.U)
// ---------------------------------------------------------------------------
// Outputs.
- val outvalid = Wire(Vec(4, Bool()))
- val cmdsync = Wire(Vec(4, Bool()))
+ val outvalid = VecInit((0 until p.instructionLanes).map(i => valid(i) && !depends(i)))
+ val cmdsync = VecInit((0 until p.instructionLanes).map(i => data(i).cmdsync))
- for (i <- 0 until 4) {
- outvalid(i) := valid(i) && !depends(i)
- cmdsync(i) := data(i).cmdsync
- }
-
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
// Synchronize commands at cmdsync instance or if found in history.
// Note: {vdwinit, vdwconv, vdmulh}, vdmulh must not issue before vdwconv.
val synchronize = cmdsync.asUInt(i,0) =/= 0.U
diff --git a/hdl/chisel/src/kelvin/vector/VInst.scala b/hdl/chisel/src/kelvin/vector/VInst.scala
index 8a1b42e..8757cea 100644
--- a/hdl/chisel/src/kelvin/vector/VInst.scala
+++ b/hdl/chisel/src/kelvin/vector/VInst.scala
@@ -44,11 +44,11 @@
val op = Input(UInt(new VInstOp().Entries.W))
}
-class VectorInstructionIO extends Bundle {
+class VectorInstructionIO(p: Parameters) extends Bundle {
val valid = Output(Bool())
val ready = Input(Bool())
val stall = Input(Bool())
- val lane = Vec(4, Valid(new VectorInstructionLane))
+ val lane = Vec(p.instructionLanes, Valid(new VectorInstructionLane))
}
class VectorInstructionLane extends Bundle {
@@ -68,14 +68,14 @@
class VInst(p: Parameters) extends Module {
val io = IO(new Bundle {
// Decode cycle.
- val in = Vec(4, new VInstIO)
+ val in = Vec(p.instructionLanes, new VInstIO)
// Execute cycle.
- val rs = Vec(8, Flipped(new RegfileReadDataIO))
- val rd = Vec(4, Flipped(new RegfileWriteDataIO))
+ val rs = Vec(p.instructionLanes * 2, Flipped(new RegfileReadDataIO))
+ val rd = Vec(p.instructionLanes, Flipped(new RegfileWriteDataIO))
// Vector interface.
- val out = new VectorInstructionIO
+ val out = new VectorInstructionIO(p)
// Status.
val nempty = Output(Bool())
@@ -91,41 +91,34 @@
val maxvlwm = (p.vectorBits * 4 / 32).U(p.vectorCountBits.W)
assert(maxvlw >= 4.U)
- val slice = Slice(Vec(4, new Bundle {
+ val slice = Slice(Vec(p.instructionLanes, new Bundle {
val vld = Output(Bool())
val vst = Output(Bool())
val lane = Valid(new VectorInstructionLane)
}), true)
- val reqvalid = VecInit(io.in(0).valid && io.in(0).ready,
- io.in(1).valid && io.in(1).ready,
- io.in(2).valid && io.in(2).ready,
- io.in(3).valid && io.in(3).ready)
-
- val reqaddr = VecInit(io.in(0).inst(19,15),
- io.in(1).inst(19,15),
- io.in(2).inst(19,15),
- io.in(3).inst(19,15))
+ val reqvalid = VecInit(io.in.map(x => x.valid && x.ready))
+ val reqaddr = VecInit(io.in.map(x => x.inst(19,15)))
// ---------------------------------------------------------------------------
// Response to Decode.
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
io.in(i).ready := !io.out.stall
}
// ---------------------------------------------------------------------------
// Controls.
- val vld_o = RegInit(VecInit(Seq.fill(4)(false.B)))
- val vld_u = RegInit(VecInit(Seq.fill(4)(false.B)))
- val vst_o = RegInit(VecInit(Seq.fill(4)(false.B)))
- val vst_u = RegInit(VecInit(Seq.fill(4)(false.B)))
- val vst_q = RegInit(VecInit(Seq.fill(4)(false.B)))
- val getvl = RegInit(VecInit(Seq.fill(4)(false.B)))
- val getmaxvl = RegInit(VecInit(Seq.fill(4)(false.B)))
+ val vld_o = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+ val vld_u = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+ val vst_o = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+ val vst_u = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+ val vst_q = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+ val getvl = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+ val getmaxvl = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
- val rdAddr = Reg(Vec(4, UInt(5.W)))
+ val rdAddr = Reg(Vec(p.instructionLanes, UInt(5.W)))
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
when (reqvalid(i)) {
rdAddr(i) := io.in(i).addr
}
@@ -134,13 +127,13 @@
// ---------------------------------------------------------------------------
// Vector Interface.
val vvalid = RegInit(false.B)
- val vinstValid = RegInit(VecInit(Seq.fill(4)(false.B)))
- val vinstInst = Reg(Vec(4, UInt(32.W)))
- val nxtVinstValid = Wire(Vec(4, Bool()))
+ val vinstValid = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+ val vinstInst = Reg(Vec(p.instructionLanes, UInt(32.W)))
+ val nxtVinstValid = Wire(Vec(p.instructionLanes, Bool()))
vvalid := nxtVinstValid.asUInt =/= 0.U
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
nxtVinstValid(i) := reqvalid(i) && (io.in(i).op(vinst.VLD) ||
io.in(i).op(vinst.VST) ||
io.in(i).op(vinst.VIOP))
@@ -148,7 +141,7 @@
vinstInst(i) := io.in(i).inst
}
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
val p = io.in(i).inst(28) // func2
val q = io.in(i).inst(30) // func2
vld_o(i) := reqvalid(i) && io.in(i).op(vinst.VLD) && !p
@@ -162,11 +155,11 @@
// ---------------------------------------------------------------------------
// Register write port.
- val lsuAdder = Wire(Vec(4, UInt(32.W)))
- val getvlValue = Wire(Vec(4, UInt(p.vectorCountBits.W))) // bytes
- val getmaxvlValue = Wire(Vec(4, UInt(p.vectorCountBits.W))) // bytes
+ val lsuAdder = Wire(Vec(p.instructionLanes, UInt(32.W)))
+ val getvlValue = Wire(Vec(p.instructionLanes, UInt(p.vectorCountBits.W))) // bytes
+ val getmaxvlValue = Wire(Vec(p.instructionLanes, UInt(p.vectorCountBits.W))) // bytes
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
val rs1 = io.rs(2 * i + 0).data
val rs2 = io.rs(2 * i + 1).data
val m = vinstInst(i)(5)
@@ -220,7 +213,7 @@
lsuAdder(i) := rs1 + offset
}
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
val len = Wire(UInt(p.vectorCountBits.W)) // bytes
val rs1 = io.rs(2 * i + 0).data
val rs2 = io.rs(2 * i + 1).data
@@ -247,7 +240,7 @@
getmaxvlValue(i) := maxvl
}
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
io.rd(i).valid := getvl(i) || getmaxvl(i) || vld_u(i) || vst_u(i) || vst_q(i)
io.rd(i).addr := rdAddr(i)
@@ -267,7 +260,7 @@
// Resolve back-pressure with stall to io.in in decode.
assert(!(slice.io.in.valid && !slice.io.in.ready))
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
slice.io.in.bits(i).vld := vld_o(i) || vld_u(i)
slice.io.in.bits(i).vst := vst_o(i) || vst_u(i) || vst_q(i)
slice.io.in.bits(i).lane.valid := vinstValid(i)
@@ -276,7 +269,7 @@
slice.io.in.bits(i).lane.bits.data := io.rs(2 * i + 1).data
}
- for (i <- 0 until 4) {
+ for (i <- 0 until p.instructionLanes) {
io.out.lane(i) := slice.io.out.bits(i).lane
}
@@ -290,8 +283,7 @@
val nempty = RegInit(false.B)
// Simple implementation, will overlap downstream units redundantly.
- nempty := io.in(0).valid || io.in(1).valid || io.in(2).valid ||
- io.in(3).valid || vvalid || io.out.valid
+ nempty := io.in.map(x => x.valid).reduce(_ || _) || vvalid || io.out.valid
io.nempty := nempty
}
diff --git a/hdl/chisel/src/kelvin/vector/VLd.scala b/hdl/chisel/src/kelvin/vector/VLd.scala
index 88b4d8d..bfbda33 100644
--- a/hdl/chisel/src/kelvin/vector/VLd.scala
+++ b/hdl/chisel/src/kelvin/vector/VLd.scala
@@ -30,7 +30,7 @@
class VLd(p: Parameters) extends Module {
val io = IO(new Bundle {
// Instructions.
- val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+ val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits))))
// VRegfile.
val write = new VRegfileWriteIO(p)
@@ -131,7 +131,7 @@
0.U
}
- val q = VCmdq(cmdqDepth, new VLdCmdq, Fin, Fout, Factive)
+ val q = VCmdq(p, cmdqDepth, new VLdCmdq, Fin, Fout, Factive)
q.io.in <> io.in
diff --git a/hdl/chisel/src/kelvin/vector/VLdSt.scala b/hdl/chisel/src/kelvin/vector/VLdSt.scala
index d2d9853..1aa3ee2 100644
--- a/hdl/chisel/src/kelvin/vector/VLdSt.scala
+++ b/hdl/chisel/src/kelvin/vector/VLdSt.scala
@@ -30,7 +30,7 @@
class VLdSt(p: Parameters) extends Module {
val io = IO(new Bundle {
// Instructions.
- val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+ val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits))))
val active = Output(UInt(64.W))
// VRegfile.
@@ -180,7 +180,7 @@
active
}
- val q = VCmdq(cmdqDepth, new VLdStCmdq, Fin, Fout, Factive)
+ val q = VCmdq(p, cmdqDepth, new VLdStCmdq, Fin, Fout, Factive)
q.io.in <> io.in
diff --git a/hdl/chisel/src/kelvin/vector/VRegfile.scala b/hdl/chisel/src/kelvin/vector/VRegfile.scala
index ac67ff0..fa75b05 100644
--- a/hdl/chisel/src/kelvin/vector/VRegfile.scala
+++ b/hdl/chisel/src/kelvin/vector/VRegfile.scala
@@ -113,13 +113,14 @@
}
class VRegfile(p: Parameters) extends Module {
- val readPorts = 7
- val writePorts = 6
- val whintPorts = 4
+ val readPorts = p.vectorReadPorts
+ val scalarPorts = p.vectorScalarPorts
+ val writePorts = p.vectorWritePorts
+ val whintPorts = p.vectorWhintPorts
val io = IO(new Bundle {
val read = Vec(readPorts, Flipped(new VRegfileReadIO(p)))
- val scalar = Vec(readPorts / 3, Flipped(new VRegfileScalarIO(p)))
+ val scalar = Vec(scalarPorts, Flipped(new VRegfileScalarIO(p)))
val write = Vec(writePorts, Flipped(new VRegfileWrintIO(p)))
val whint = Vec(whintPorts, Flipped(new VRegfileWhintIO(p)))
val conv = Flipped(new VRegfileConvIO(p))
diff --git a/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala b/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala
index 90a4935..38451d7 100644
--- a/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala
+++ b/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala
@@ -21,8 +21,8 @@
import _root_.circt.stage.ChiselStage
class VRegfileSegment(p: Parameters) extends Module {
- val readPorts = 7
- val writePorts = 6
+ val readPorts = p.vectorReadPorts
+ val writePorts = p.vectorWritePorts
val tcnt = 16.min(p.vectorBits / 32)
val io = IO(new Bundle {
diff --git a/hdl/chisel/src/kelvin/vector/VSt.scala b/hdl/chisel/src/kelvin/vector/VSt.scala
index f730fec..638f709 100644
--- a/hdl/chisel/src/kelvin/vector/VSt.scala
+++ b/hdl/chisel/src/kelvin/vector/VSt.scala
@@ -30,7 +30,7 @@
class VSt(p: Parameters) extends Module {
val io = IO(new Bundle {
// Instructions.
- val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+ val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits))))
val active = Output(UInt(64.W))
// VRegfile.
@@ -182,7 +182,7 @@
val strb = UInt((p.lsuDataBits / 8).W)
}
- val q = VCmdq(cmdqDepth, new VStCmdq, Fin, Fout, Factive)
+ val q = VCmdq(p, cmdqDepth, new VStCmdq, Fin, Fout, Factive)
val ctrl = Slice(new Ctrl, false, true)
val data = Slice(new Data, false, true, true)
diff --git a/tests/verilator_sim/kelvin/core_tb.cc b/tests/verilator_sim/kelvin/core_tb.cc
index 969e361..73396ab 100644
--- a/tests/verilator_sim/kelvin/core_tb.cc
+++ b/tests/verilator_sim/kelvin/core_tb.cc
@@ -195,14 +195,14 @@
core.io_slog_addr(io_slog_addr);
core.io_slog_data(io_slog_data);
core.io_debug_en(io_debug_en);
- core.io_debug_addr0(io_debug_addr0);
- core.io_debug_addr1(io_debug_addr1);
- core.io_debug_addr2(io_debug_addr2);
- core.io_debug_addr3(io_debug_addr3);
- core.io_debug_inst0(io_debug_inst0);
- core.io_debug_inst1(io_debug_inst1);
- core.io_debug_inst2(io_debug_inst2);
- core.io_debug_inst3(io_debug_inst3);
+ core.io_debug_addr_0(io_debug_addr0);
+ core.io_debug_addr_1(io_debug_addr1);
+ core.io_debug_addr_2(io_debug_addr2);
+ core.io_debug_addr_3(io_debug_addr3);
+ core.io_debug_inst_0(io_debug_inst0);
+ core.io_debug_inst_1(io_debug_inst1);
+ core.io_debug_inst_2(io_debug_inst2);
+ core.io_debug_inst_3(io_debug_inst3);
core.io_debug_cycles(io_debug_cycles);
mif.clock(tb.clock);