Parameterize Kelvin over instructionLanes

Many parts of Kelvin were hard-coded to operate on 4 instruction lanes.
Refactor them so that they are sized by the instructionLanes value in
Parameters.

Change-Id: I1957d87b6f355d815380a88c28d210c1c8eec737
diff --git a/hdl/chisel/src/common/BUILD b/hdl/chisel/src/common/BUILD index 5bab0ba..2805f14 100644 --- a/hdl/chisel/src/common/BUILD +++ b/hdl/chisel/src/common/BUILD
@@ -18,9 +18,9 @@ chisel_library( name = "common", srcs = [ - "Fifo4e.scala", - "Fifo4.scala", - "Fifo4x4.scala", + "FifoXe.scala", + "FifoX.scala", + "FifoIxO.scala", "Fifo.scala", "IDiv.scala", "Library.scala",
diff --git a/hdl/chisel/src/common/Fifo4.scala b/hdl/chisel/src/common/Fifo4.scala deleted file mode 100644 index a01963f..0000000 --- a/hdl/chisel/src/common/Fifo4.scala +++ /dev/null
@@ -1,192 +0,0 @@ -// Copyright 2023 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package common - -import chisel3._ -import chisel3.util._ -import _root_.circt.stage.ChiselStage - -object Fifo4 { - def apply[T <: Data](t: T, n: Int) = { - Module(new Fifo4(t, n)) - } -} - -// 4way decode, used for Fifo4 style input controls. -object Fifo4Valid { - def apply(in: UInt): (UInt, UInt, UInt, UInt) = { - assert(in.getWidth == 4) - - val in0 = Cat(in(3,0) === 8.U, // 8 - in(2,0) === 4.U, // 4, 12 - in(1,0) === 2.U, // 2, 6, 10, 14 - in(0)) // 1, 3, 5, 7, 9, 11, 13, 15 - - val in1 = Cat(in(3,0) === 12.U || - in(3,0) === 10.U || - in(3,0) === 9.U, // 9, 10, 12 - in(2,0) === 6.U || - in(2,0) === 5.U, // 5, 6, 13, 14 - in(1,0) === 3.U, // 3, 7, 11, 15 - false.B) - - val in2 = Cat(in(3,0) === 14.U || - in(3,0) === 13.U || - in(3,0) === 11.U, // 11, 13, 14 - in(2,0) === 15.U || - in(2,0) === 7.U, // 7, 15 - false.B, false.B) - - val in3 = Cat(in(3,0) === 15.U, // 15 - false.B, false.B, false.B) - - (in0.asUInt, in1.asUInt, in2.asUInt, in3.asUInt) - } -} - -class Fifo4[T <: Data](t: T, n: Int) extends Module { - val io = IO(new Bundle { - val in = Flipped(Decoupled(Vec(4, Valid(t)))) - val out = Decoupled(t) - val count = Output(UInt(log2Ceil(n+1).W)) - }) - - val m = n - 1 // n = Mem(n-1) + Slice - - def Increment(a: UInt, b: UInt): UInt = { - val c = a +& b - val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0) - d - } - - val mem = Mem(m, t) - val 
mslice = Slice(t, false, true) - - val in0pos = RegInit(0.U(log2Ceil(m).W)) - val in1pos = RegInit(1.U(log2Ceil(m).W)) - val in2pos = RegInit(2.U(log2Ceil(m).W)) - val in3pos = RegInit(3.U(log2Ceil(m).W)) - val outpos = RegInit(0.U(log2Ceil(m).W)) - val mcount = RegInit(0.U(log2Ceil(n+1).W)) - - io.count := mcount + io.out.valid - - val ivalid = io.in.valid && io.in.ready - val ovalid = mslice.io.in.valid && mslice.io.in.ready - - val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid, - io.in.bits(1).valid, io.in.bits(0).valid).asUInt - - val icount = PopCount(iactive) - - // --------------------------------------------------------------------------- - // Fifo Control. - when (ivalid) { - in0pos := Increment(in0pos, icount) - in1pos := Increment(in1pos, icount) - in2pos := Increment(in2pos, icount) - in3pos := Increment(in3pos, icount) - } - - when (ovalid) { - outpos := Increment(outpos, 1.U) - } - - val inc = MuxOR(ivalid, icount) - val dec = mslice.io.in.valid && mslice.io.in.ready - - when (ivalid || ovalid) { - mcount := mcount + inc - dec - } - - // --------------------------------------------------------------------------- - // Fifo Input. 
- val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive) - - for (i <- 0 until m) { - val valid = Cat(in0pos === i.U && in0valid(3) || - in1pos === i.U && in1valid(3) || - in2pos === i.U && in2valid(3) || - in3pos === i.U && in3valid(3), - in0pos === i.U && in0valid(2) || - in1pos === i.U && in1valid(2) || - in2pos === i.U && in2valid(2), - in0pos === i.U && in0valid(1) || - in1pos === i.U && in1valid(1), - in0pos === i.U && in0valid(0)) - - when (ivalid) { - when (valid(0)) { - mem(i) := io.in.bits(0).bits - } .elsewhen (valid(1)) { - mem(i) := io.in.bits(1).bits - } .elsewhen (valid(2)) { - mem(i) := io.in.bits(2).bits - } .elsewhen (valid(3)) { - mem(i) := io.in.bits(3).bits - } - } - } - - mslice.io.in.valid := false.B - mslice.io.in.bits := io.in.bits(0).bits // defaults - - when (mcount > 0.U) { - when (io.out.ready) { - mslice.io.in.valid := true.B - } - } .otherwise { - when (ivalid && iactive =/= 0.U) { - mslice.io.in.valid := true.B - } - } - - when (mcount > 0.U) { - mslice.io.in.bits := mem(outpos) - } .elsewhen (ivalid) { - when (iactive(0)) { - mslice.io.in.bits := io.in.bits(0).bits - } .elsewhen (iactive(1)) { - mslice.io.in.bits := io.in.bits(1).bits - } .elsewhen (iactive(2)) { - mslice.io.in.bits := io.in.bits(2).bits - } .elsewhen (iactive(3)) { - mslice.io.in.bits := io.in.bits(3).bits - } - } - - // --------------------------------------------------------------------------- - // Valid Entries. - val active = RegInit(0.U(m.W)) - - val activeSet = MuxOR(ivalid, - ((icount >= 1.U) << in0pos) | ((icount >= 2.U) << in1pos) | - ((icount >= 3.U) << in2pos) | ((icount >= 4.U) << in3pos)) - - val activeClr = MuxOR(mslice.io.in.valid && mslice.io.in.ready, 1.U << outpos) - - active := (active | activeSet) & ~activeClr - - // --------------------------------------------------------------------------- - // Interface. 
- io.in.ready := mcount <= (m.U - icount) - io.out <> mslice.io.out - - assert(mcount <= m.U) -} - -object EmitFifo4 extends App { - ChiselStage.emitSystemVerilogFile(new Fifo4(UInt(8.W), 11), args) -}
diff --git a/hdl/chisel/src/common/Fifo4e.scala b/hdl/chisel/src/common/Fifo4e.scala deleted file mode 100644 index 392e7ee..0000000 --- a/hdl/chisel/src/common/Fifo4e.scala +++ /dev/null
@@ -1,147 +0,0 @@ -// Copyright 2023 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package common - -import chisel3._ -import chisel3.util._ -import _root_.circt.stage.ChiselStage - -// Fifo4 with entry output and no output registration stage. - -object Fifo4e { - def apply[T <: Data](t: T, n: Int) = { - Module(new Fifo4e(t, n)) - } -} - -class Fifo4e[T <: Data](t: T, n: Int) extends Module { - val io = IO(new Bundle { - val in = Flipped(Decoupled(Vec(4, Valid(t)))) - val out = Decoupled(t) - val count = Output(UInt(log2Ceil(n+1).W)) - val entry = Output(Vec(n, Valid(t))) - val nempty = Output(Bool()) - }) - - def Increment(a: UInt, b: UInt): UInt = { - val c = a +& b - val d = Mux(c < n.U, c, c - n.U)(a.getWidth - 1, 0) - d - } - - val mem = Mem(n, t) - - val in0pos = RegInit(0.U(log2Ceil(n).W)) - val in1pos = RegInit(1.U(log2Ceil(n).W)) - val in2pos = RegInit(2.U(log2Ceil(n).W)) - val in3pos = RegInit(3.U(log2Ceil(n).W)) - val outpos = RegInit(0.U(log2Ceil(n).W)) - val mcount = RegInit(0.U(log2Ceil(n+1).W)) - val nempty = RegInit(false.B) - - io.count := mcount - io.nempty := nempty - - val ivalid = io.in.valid && io.in.ready - val ovalid = io.out.valid && io.out.ready - - val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid, - io.in.bits(1).valid, io.in.bits(0).valid).asUInt - - val icount = PopCount(iactive) - - // --------------------------------------------------------------------------- - // Fifo Control. 
- when (ivalid) { - in0pos := Increment(in0pos, icount) - in1pos := Increment(in1pos, icount) - in2pos := Increment(in2pos, icount) - in3pos := Increment(in3pos, icount) - } - - when (ovalid) { - outpos := Increment(outpos, 1.U) - } - - val inc = MuxOR(ivalid, icount) - val dec = ovalid - - when (ivalid || ovalid) { - val nxtcount = mcount + inc - dec - mcount := nxtcount - nempty := nxtcount =/= 0.U - } - - // --------------------------------------------------------------------------- - // Fifo Input. - val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive) - - for (i <- 0 until n) { - val valid = Cat(in0pos === i.U && in0valid(3) || - in1pos === i.U && in1valid(3) || - in2pos === i.U && in2valid(3) || - in3pos === i.U && in3valid(3), - in0pos === i.U && in0valid(2) || - in1pos === i.U && in1valid(2) || - in2pos === i.U && in2valid(2), - in0pos === i.U && in0valid(1) || - in1pos === i.U && in1valid(1), - in0pos === i.U && in0valid(0)) - - when (ivalid) { - when (valid(0)) { - mem(i) := io.in.bits(0).bits - } .elsewhen (valid(1)) { - mem(i) := io.in.bits(1).bits - } .elsewhen (valid(2)) { - mem(i) := io.in.bits(2).bits - } .elsewhen (valid(3)) { - mem(i) := io.in.bits(3).bits - } - } - } - - // --------------------------------------------------------------------------- - // Valid Entries. - val active = RegInit(0.U(n.W)) - - val activeSet = MuxOR(ivalid, - ((icount >= 1.U) << in0pos) | ((icount >= 2.U) << in1pos) | - ((icount >= 3.U) << in2pos) | ((icount >= 4.U) << in3pos)) - - val activeClr = MuxOR(io.out.valid && io.out.ready, 1.U << outpos) - - when (io.in.valid && io.in.ready || io.out.valid && io.out.ready) { - active := (active | activeSet) & ~activeClr - } - - // --------------------------------------------------------------------------- - // Interface. 
- io.in.ready := mcount <= (n.U - icount) - - io.out.valid := mcount =/= 0.U - io.out.bits := mem(outpos) - - assert(mcount <= n.U) - - for (i <- 0 until n) { - io.entry(i).valid := active(i) - io.entry(i).bits := mem(i) - } -} - -object EmitFifo4e extends App { - ChiselStage.emitSystemVerilogFile(new Fifo4e(UInt(8.W), 10), args) -}
diff --git a/hdl/chisel/src/common/Fifo4x4.scala b/hdl/chisel/src/common/Fifo4x4.scala deleted file mode 100644 index 064af4b..0000000 --- a/hdl/chisel/src/common/Fifo4x4.scala +++ /dev/null
@@ -1,198 +0,0 @@ -// Copyright 2023 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package common - -import chisel3._ -import chisel3.util._ -import _root_.circt.stage.ChiselStage - -object Fifo4x4 { - def apply[T <: Data](t: T, n: Int) = { - Module(new Fifo4x4(t, n)) - } -} - -// Input accepted with a common handshake and per lane select. -// Outputs are transacted independently, and ordered {[0], [0,1], [0,1,2], [0,1,2,3]}. -// Outputs are not registered, assumes passes directly into shallow combinatorial. 
-class Fifo4x4[T <: Data](t: T, n: Int) extends Module { - val io = IO(new Bundle { - val in = Flipped(Decoupled(Vec(4, Valid(t)))) - val out = Vec(4, Decoupled(t)) - val count = Output(UInt(log2Ceil(n+1).W)) - val nempty = Output(Bool()) - }) - - val m = n - - val mb = log2Ceil(m) - val n1b = log2Ceil(n + 1) - - def Increment(a: UInt, b: UInt): UInt = { - val c = a +& b - val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0) - d - } - - val mem = Reg(Vec(n, t)) - - val inpos = Reg(Vec(4, UInt(mb.W))) // reset below - val outpos = Reg(Vec(4, UInt(mb.W))) // reset below - - val mcount = RegInit(0.U(n1b.W)) - val nempty = RegInit(false.B) - val inready = RegInit(false.B) - val outvalid = RegInit(0.U(4.W)) - - val ivalid = io.in.valid && io.in.ready - - val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid, - io.in.bits(1).valid, io.in.bits(0).valid).asUInt - - val icount = (io.in.bits(0).valid +& io.in.bits(1).valid +& - io.in.bits(2).valid +& io.in.bits(3).valid)(2,0) - - val oactiveBits = Cat(io.out(3).valid && io.out(3).ready, - io.out(2).valid && io.out(2).ready, - io.out(1).valid && io.out(1).ready, - io.out(0).valid && io.out(0).ready) - - val ovalid = oactiveBits =/= 0.U - - val ocount = (oactiveBits(0) +& oactiveBits(1) +& - oactiveBits(2) +& oactiveBits(3))(2,0) - - assert(!(oactiveBits(1) === 1.U && oactiveBits(0,0) =/= 1.U)) - assert(!(oactiveBits(2) === 1.U && oactiveBits(1,0) =/= 3.U)) - assert(!(oactiveBits(3) === 1.U && oactiveBits(2,0) =/= 7.U)) - - val ovalidBits = Cat(io.out(3).valid, io.out(2).valid, - io.out(1).valid, io.out(0).valid) - - assert(!(ovalidBits(1) === 1.U && ovalidBits(0,0) =/= 1.U)) - assert(!(ovalidBits(2) === 1.U && ovalidBits(1,0) =/= 3.U)) - assert(!(ovalidBits(3) === 1.U && ovalidBits(2,0) =/= 7.U)) - - val oreadyBits = Cat(io.out(3).ready, io.out(2).ready, - io.out(1).ready, io.out(0).ready) - - assert(!(oreadyBits(1) === 1.U && oreadyBits(0,0) =/= 1.U)) - assert(!(oreadyBits(2) === 1.U && oreadyBits(1,0) =/= 3.U)) - 
assert(!(oreadyBits(3) === 1.U && oreadyBits(2,0) =/= 7.U)) - - // --------------------------------------------------------------------------- - // Fifo Control. - when (reset.asBool) { - for (i <- 0 until 4) { - inpos(i) := i.U - } - } .elsewhen (ivalid) { - for (i <- 0 until 4) { - inpos(i) := Increment(inpos(i), icount) - } - } - - when (reset.asBool) { - for (i <- 0 until 4) { - outpos(i) := i.U - } - } .elsewhen (ovalid) { - for (i <- 0 until 4) { - outpos(i) := Increment(outpos(i), ocount) - } - } - - val inc = MuxOR(ivalid, icount) - val dec = MuxOR(ovalid, ocount) - - when (ivalid || ovalid) { - val nxtmcount = mcount + inc - dec - inready := nxtmcount <= (m.U - 4.U) - mcount := nxtmcount - nempty := nxtmcount =/= 0.U - outvalid := Cat(nxtmcount >= 4.U, - nxtmcount >= 3.U, - nxtmcount >= 2.U, - nxtmcount >= 1.U) - } .otherwise { - inready := mcount <= (m.U - 4.U) - outvalid := Cat(mcount >= 4.U, - mcount >= 3.U, - mcount >= 2.U, - mcount >= 1.U) - } - - // --------------------------------------------------------------------------- - // Fifo Input. 
- val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive) - - for (i <- 0 until m) { - val valid = Cat(inpos(0) === i.U && in0valid(3) || - inpos(1) === i.U && in1valid(3) || - inpos(2) === i.U && in2valid(3) || - inpos(3) === i.U && in3valid(3), - - inpos(0) === i.U && in0valid(2) || - inpos(1) === i.U && in1valid(2) || - inpos(2) === i.U && in2valid(2), - - inpos(0) === i.U && in0valid(1) || - inpos(1) === i.U && in1valid(1), - - inpos(0) === i.U && in0valid(0)) - - if (true) { - val data = MuxOR(valid(0), io.in.bits(0).bits.asUInt) | - MuxOR(valid(1), io.in.bits(1).bits.asUInt) | - MuxOR(valid(2), io.in.bits(2).bits.asUInt) | - MuxOR(valid(3), io.in.bits(3).bits.asUInt) - - when (ivalid && valid =/= 0.U) { - mem(i) := data.asTypeOf(t) - } - } else { - when (ivalid) { - when (valid(0)) { - mem(i) := io.in.bits(0).bits - } .elsewhen (valid(1)) { - mem(i) := io.in.bits(1).bits - } .elsewhen (valid(2)) { - mem(i) := io.in.bits(2).bits - } .elsewhen (valid(3)) { - mem(i) := io.in.bits(3).bits - } - } - } - } - - // --------------------------------------------------------------------------- - // Interface. - io.in.ready := inready - - for (i <- 0 until 4) { - io.out(i).valid := outvalid(i) - io.out(i).bits := mem(outpos(i)) // TODO: VecAt() - } - - io.count := mcount - - io.nempty := nempty - - assert(io.count <= m.U) -} - -object EmitFifo4x4 extends App { - ChiselStage.emitSystemVerilogFile(new Fifo4x4(UInt(32.W), 24), args) -}
diff --git a/hdl/chisel/src/common/FifoIxO.scala b/hdl/chisel/src/common/FifoIxO.scala new file mode 100644 index 0000000..5be7dd6 --- /dev/null +++ b/hdl/chisel/src/common/FifoIxO.scala
@@ -0,0 +1,188 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package common
+
+import chisel3._
+import chisel3.util._
+import _root_.circt.stage.ChiselStage
+
+// Factory that instantiates a FifoIxO inside the module currently being built.
+object FifoIxO {
+  def apply[T <: Data](t: T, i: Int, o: Int, n: Int) = {
+    Module(new FifoIxO(t, i, o, n))
+  }
+}
+
+// Fifo with `i` input lanes, `o` output lanes and `n` entries of type `t`.
+//
+// Input accepted with a common handshake and per lane select.
+// Outputs are transacted independently, and ordered
+// {[0], [0,1], ..., [0,1,...,o-1]} (enforced by the assertions below).
+// Outputs are not registered, assumes passes directly into shallow combinatorial.
+class FifoIxO[T <: Data](t: T, i: Int, o: Int, n: Int /* depth */) extends Module {
+  require(i >= 1, "FifoIxO needs at least one input lane")
+  require(o >= 1, "FifoIxO needs at least one output lane")
+  // inready compares against (m - i), which would wrap for a depth below i.
+  require(n >= i, "FifoIxO depth must be at least the number of input lanes")
+
+  val io = IO(new Bundle {
+    val in  = Flipped(Decoupled(Vec(i, Valid(t))))
+    val out = Vec(o, Decoupled(t))
+    val count = Output(UInt(log2Ceil(n+1).W))
+    val nempty = Output(Bool())
+  })
+
+  val m = n
+
+  val mb  = log2Ceil(m)
+  val n1b = log2Ceil(n + 1)
+
+  // Advances position `a` by `b`, wrapping modulo the depth `m`.
+  def Increment(a: UInt, b: UInt): UInt = {
+    val c = a +& b
+    val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
+    d
+  }
+
+  val mem = Reg(Vec(n, t))
+
+  val inpos  = Reg(Vec(i, UInt(mb.W)))  // reset below
+  val outpos = Reg(Vec(o, UInt(mb.W)))  // reset below
+
+  val mcount = RegInit(0.U(n1b.W))
+  val nempty = RegInit(false.B)
+  val inready = RegInit(false.B)
+  val outvalid = RegInit(0.U(o.W))
+
+  val ivalid = io.in.valid && io.in.ready
+
+  // Per-lane input valid bits, lane 0 in bit 0.
+  val iactive = Cat((0 until i).reverse.map(lane => io.in.bits(lane).valid)).asUInt
+
+  val icount = (io.in.bits.map(b => b.valid.asUInt).reduce(_ +& _))(log2Ceil(i),0)
+
+  // Per-lane output transfer (valid && ready) bits, lane 0 in bit 0.
+  val oactiveBits = Cat((0 until o).reverse.map(lane => io.out(lane).valid && io.out(lane).ready))
+
+  val ovalid = oactiveBits =/= 0.U
+
+  val ocount = (0 until o).map(lane => oactiveBits(lane).asUInt).reduce(_ +& _)(log2Ceil(o),0)
+
+  // Lane k may only be transacted/valid/ready when lanes [k-1:0] all are.
+  for (k <- 1 until o) {
+    assert(!(oactiveBits(k) === 1.U && oactiveBits(k - 1,0) =/= ((1 << k) - 1).U))
+  }
+
+  val ovalidBits = Cat((0 until o).reverse.map(lane => io.out(lane).valid))
+
+  for (k <- 1 until o) {
+    assert(!(ovalidBits(k) === 1.U && ovalidBits(k - 1, 0) =/= ((1 << k) - 1).U))
+  }
+
+  val oreadyBits = Cat((0 until o).reverse.map(lane => io.out(lane).ready))
+
+  for (k <- 1 until o) {
+    assert(!(oreadyBits(k) === 1.U && oreadyBits(k - 1, 0) =/= ((1 << k) - 1).U))
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Control.
+  when (reset.asBool) {
+    for (lane <- 0 until i) {
+      inpos(lane) := lane.U
+    }
+  } .elsewhen (ivalid) {
+    for (lane <- 0 until i) {
+      inpos(lane) := Increment(inpos(lane), icount)
+    }
+  }
+
+  when (reset.asBool) {
+    for (lane <- 0 until o) {
+      outpos(lane) := lane.U
+    }
+  } .elsewhen (ovalid) {
+    for (lane <- 0 until o) {
+      outpos(lane) := Increment(outpos(lane), ocount)
+    }
+  }
+
+  val inc = MuxOR(ivalid, icount)
+  val dec = MuxOR(ovalid, ocount)
+
+  when (ivalid || ovalid) {
+    val nxtmcount = mcount + inc - dec
+    inready := nxtmcount <= (m.U - i.U)
+    mcount := nxtmcount
+    nempty := nxtmcount =/= 0.U
+    outvalid := Cat((0 until o).reverse.map(lane => nxtmcount >= (lane + 1).U))
+  } .otherwise {
+    inready := mcount <= (m.U - i.U)
+    outvalid := Cat((0 until o).reverse.map(lane => mcount >= (lane + 1).U))
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Input.
+  // inxvalid(y)(x) is set when input lane x carries the (y+1)-th valid entry.
+  val inxvalid = FifoXValid(iactive)
+
+  for (q <- 0 until m) {
+    // valid(x): input lane x is written into entry q this cycle.
+    val valid = Cat(
+      (0 until i).reverse.map(x =>
+        if (x == 0) { inpos(0) === q.U && inxvalid(0)(0) } else {
+          (0 to x).map(y =>
+            inpos(y) === q.U && inxvalid(y)(x)
+          ).reduce(_ || _)
+        }
+      )
+    )
+
+    // Scala-level constant: only the first branch is ever elaborated.
+    if (true) {
+      val data = (0 until i).map(x => MuxOR(valid(x), io.in.bits(x).bits.asUInt)).reduce(_ | _)
+
+      when (ivalid && valid =/= 0.U) {
+        mem(q) := data.asTypeOf(t)
+      }
+    } else {
+      when (ivalid) {
+        when(PopCount(valid) >= 1.U) {
+          val idx = PriorityEncoder(valid)
+          mem(q) := io.in.bits(idx).bits
+        }
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Interface.
+  io.in.ready := inready
+
+  for (lane <- 0 until o) {
+    io.out(lane).valid := outvalid(lane)
+    io.out(lane).bits  := mem(outpos(lane))  // TODO: VecAt()
+  }
+
+  io.count := mcount
+
+  io.nempty := nempty
+
+  assert(io.count <= m.U)
+}
+
+object EmitFifoIxO extends App {
+  ChiselStage.emitSystemVerilogFile(new FifoIxO(UInt(32.W), 4, 4, 24), args)
+}
diff --git a/hdl/chisel/src/common/FifoX.scala b/hdl/chisel/src/common/FifoX.scala new file mode 100644 index 0000000..ee3f041 --- /dev/null +++ b/hdl/chisel/src/common/FifoX.scala
@@ -0,0 +1,178 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package common
+
+import chisel3._
+import chisel3.util._
+import _root_.circt.stage.ChiselStage
+
+// Factory that instantiates a FifoX inside the module currently being built.
+object FifoX {
+  def apply[T <: Data](t: T, x: Int, n: Int) = {
+    Module(new FifoX(t, x, n))
+  }
+}
+
+// Xway decode, used for FifoX style input controls.
+//
+// Returns one mask per compacted slot: bit x of result(i) is set when input
+// lane x is valid and is the (i+1)-th valid lane counting up from lane 0.
+object FifoXValid {
+  def apply(in: UInt): Seq[UInt] = {
+    val lanes = in.getWidth
+
+    Seq.tabulate(lanes) { slot =>
+      Cat(
+        (0 until lanes).reverse.map(lane =>
+          if (lane < slot) { false.B } else {
+            (PopCount(in(lane,0)) === (slot + 1).U) && in(lane)
+          }
+        )
+      )
+    }
+  }
+}
+
+// Fifo with `x` input lanes (common handshake, per lane valid), a single
+// output, and a depth of `n`: a Mem of n-1 entries followed by a Slice.
+class FifoX[T <: Data](t: T, x: Int, n: Int) extends Module {
+  require(x >= 1, "FifoX needs at least one input lane")
+
+  val io = IO(new Bundle {
+    val in  = Flipped(Decoupled(Vec(x, Valid(t))))
+    val out = Decoupled(t)
+    val count = Output(UInt(log2Ceil(n+1).W))
+  })
+
+  val m = n - 1  // n = Mem(n-1) + Slice
+
+  // Advances position `a` by `b`, wrapping modulo the memory depth `m`.
+  def Increment(a: UInt, b: UInt): UInt = {
+    val c = a +& b
+    val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
+    d
+  }
+
+  val mem = Mem(m, t)
+  val mslice = Slice(t, false, true)
+
+  // Write position of each compacted slot; slot k starts at entry k.
+  val inxpos = RegInit(VecInit((0 until x).map(k => k.U(log2Ceil(m).W))))
+  val outpos = RegInit(0.U(log2Ceil(m).W))
+  val mcount = RegInit(0.U(log2Ceil(n+1).W))
+
+  io.count := mcount + io.out.valid
+
+  val ivalid = io.in.valid && io.in.ready
+  val ovalid = mslice.io.in.valid && mslice.io.in.ready
+
+  // Per-lane input valid bits, lane 0 in bit 0.
+  val iactive = Cat((0 until x).reverse.map(lane => io.in.bits(lane).valid))
+
+  val icount = PopCount(iactive)
+
+  // ---------------------------------------------------------------------------
+  // Fifo Control.
+  when (ivalid) {
+    for (i <- 0 until x) {
+      inxpos(i) := Increment(inxpos(i), icount)
+    }
+  }
+
+  when (ovalid) {
+    outpos := Increment(outpos, 1.U)
+  }
+
+  val inc = MuxOR(ivalid, icount)
+  val dec = ovalid
+
+  when (ivalid || ovalid) {
+    mcount := mcount + inc - dec
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Input.
+  val inxvalid = FifoXValid(iactive)
+
+  for (i <- 0 until m) {
+    // valid(q): input lane q is written into entry i this cycle.
+    val valid = Cat(
+      (0 until x).reverse.map(q =>
+        if (q == 0) { inxpos(0) === i.U && inxvalid(0)(0) } else {
+          (0 to q).map(y =>
+            inxpos(y) === i.U && inxvalid(y)(q)
+          ).reduce(_ || _)
+        }
+      )
+    )
+
+    when (ivalid) {
+      when (valid =/= 0.U) {
+        val idx = PriorityEncoder(valid)
+        mem(i) := io.in.bits(idx).bits
+      }
+    }
+  }
+
+  mslice.io.in.valid := false.B
+  mslice.io.in.bits := io.in.bits(0).bits  // defaults
+
+  // The slice is fed from the memory while it holds entries, otherwise
+  // directly from the first valid input lane.
+  when (mcount > 0.U) {
+    when (io.out.ready) {
+      mslice.io.in.valid := true.B
+    }
+  } .otherwise {
+    when (ivalid && iactive =/= 0.U) {
+      mslice.io.in.valid := true.B
+    }
+  }
+
+  when (mcount > 0.U) {
+    mslice.io.in.bits := mem(outpos)
+  } .elsewhen (ivalid) {
+    // NOTE(review): this fires on an accepted input with no valid lane while
+    // empty, a case the guards around it otherwise tolerate - confirm intent.
+    assert(PopCount(iactive) >= 1.U)
+    when (iactive =/= 0.U) {
+      val idx = PriorityEncoder(iactive)
+      mslice.io.in.bits := io.in.bits(idx).bits
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Valid Entries.
+  val active = RegInit(0.U(m.W))
+
+  val activeSet = MuxOR(ivalid,
+      (0 until x).map(i => (icount >= (i + 1).U) << inxpos(i)).reduce(_ | _)
+  )
+
+  val activeClr = MuxOR(ovalid, 1.U << outpos)
+
+  active := (active | activeSet) & ~activeClr
+
+  // ---------------------------------------------------------------------------
+  // Interface.
+  io.in.ready := mcount <= (m.U - icount)
+  io.out <> mslice.io.out
+
+  assert(mcount <= m.U)
+}
+
+object EmitFifoX extends App {
+  ChiselStage.emitSystemVerilogFile(new FifoX(UInt(8.W), 4, 11), args)
+}
diff --git a/hdl/chisel/src/common/FifoXe.scala b/hdl/chisel/src/common/FifoXe.scala new file mode 100644 index 0000000..587be62 --- /dev/null +++ b/hdl/chisel/src/common/FifoXe.scala
@@ -0,0 +1,147 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package common
+
+import chisel3._
+import chisel3.util._
+import _root_.circt.stage.ChiselStage
+
+// FifoX with entry output and no output registration stage.
+
+// Factory that instantiates a FifoXe inside the module currently being built.
+object FifoXe {
+  def apply[T <: Data](t: T, x: Int, n: Int) = {
+    Module(new FifoXe(t, x, n))
+  }
+}
+
+// Fifo with `x` input lanes (common handshake, per lane valid), a single
+// unregistered output, and `n` entries whose contents and valid flags are
+// all exposed on `entry`.
+class FifoXe[T <: Data](t: T, x: Int, n: Int) extends Module {
+  require(x >= 1, "FifoXe needs at least one input lane")
+
+  val io = IO(new Bundle {
+    val in  = Flipped(Decoupled(Vec(x, Valid(t))))
+    val out = Decoupled(t)
+    val count = Output(UInt(log2Ceil(n+1).W))
+    val entry = Output(Vec(n, Valid(t)))
+    val nempty = Output(Bool())
+  })
+
+  // Advances position `a` by `b`, wrapping modulo the depth `n`.
+  def Increment(a: UInt, b: UInt): UInt = {
+    val c = a +& b
+    val d = Mux(c < n.U, c, c - n.U)(a.getWidth - 1, 0)
+    d
+  }
+
+  val mem = Mem(n, t)
+
+  // Write position of each compacted slot; slot k starts at entry k.
+  // NOTE(review): one bit wider than outpos - confirm this is intended.
+  val inxpos = RegInit(VecInit((0 until x).map(k => k.U((log2Ceil(n) + 1).W))))
+  val outpos = RegInit(0.U(log2Ceil(n).W))
+  val mcount = RegInit(0.U(log2Ceil(n+1).W))
+  val nempty = RegInit(false.B)
+
+  io.count := mcount
+  io.nempty := nempty
+
+  val ivalid = io.in.valid && io.in.ready
+  val ovalid = io.out.valid && io.out.ready
+
+  // Per-lane input valid bits, lane 0 in bit 0.
+  val iactive = Cat((0 until x).reverse.map(lane => io.in.bits(lane).valid))
+
+  val icount = PopCount(iactive)
+
+  // ---------------------------------------------------------------------------
+  // Fifo Control.
+  when (ivalid) {
+    for (i <- 0 until x) {
+      inxpos(i) := Increment(inxpos(i), icount)
+    }
+  }
+
+  when (ovalid) {
+    outpos := Increment(outpos, 1.U)
+  }
+
+  val inc = MuxOR(ivalid, icount)
+  val dec = ovalid
+
+  when (ivalid || ovalid) {
+    val nxtcount = mcount + inc - dec
+    mcount := nxtcount
+    nempty := nxtcount =/= 0.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Input.
+  val inxvalid = FifoXValid(iactive)
+
+  for (i <- 0 until n) {
+    // valid(q): input lane q is written into entry i this cycle.
+    val valid = Cat(
+      (0 until x).reverse.map(q =>
+        if (q == 0) { inxpos(0) === i.U && inxvalid(0)(0) } else {
+          (0 to q).map(y =>
+            inxpos(y) === i.U && inxvalid(y)(q)
+          ).reduce(_ || _)
+        }
+      )
+    )
+
+    when (ivalid) {
+      when (valid =/= 0.U) {
+        val idx = PriorityEncoder(valid)
+        mem(i) := io.in.bits(idx).bits
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Valid Entries.
+  val active = RegInit(0.U(n.W))
+
+  val activeSet = MuxOR(ivalid,
+      (0 until x).map(i => (icount >= (i + 1).U) << inxpos(i)).reduce(_ | _)
+  )
+
+  val activeClr = MuxOR(ovalid, 1.U << outpos)
+
+  when (ivalid || ovalid) {
+    active := (active | activeSet) & ~activeClr
+  }
+
+  // ---------------------------------------------------------------------------
+  // Interface.
+  io.in.ready := mcount <= (n.U - icount)
+
+  io.out.valid := mcount =/= 0.U
+  io.out.bits  := mem(outpos)
+
+  assert(mcount <= n.U)
+
+  for (i <- 0 until n) {
+    io.entry(i).valid := active(i)
+    io.entry(i).bits  := mem(i)
+  }
+}
+
+object EmitFifoXe extends App {
+  ChiselStage.emitSystemVerilogFile(new FifoXe(UInt(8.W), 4, 10), args)
+}
diff --git a/hdl/chisel/src/kelvin/Parameters.scala b/hdl/chisel/src/kelvin/Parameters.scala index 5b4b14e..35b4929 100644 --- a/hdl/chisel/src/kelvin/Parameters.scala +++ b/hdl/chisel/src/kelvin/Parameters.scala
@@ -56,6 +56,12 @@ val vectorCountBits = log2Ceil(vectorBits / 8) + 1 + 2 // +2 stripmine + val vectorAluCount = 2 + val vectorReadPorts = (vectorAluCount * 3) + 1 + val vectorWritePorts = 6 + val vectorWhintPorts = 4 + val vectorScalarPorts = 2 + // Vector queue. val vectorFifoDepth = 16
diff --git a/hdl/chisel/src/kelvin/scalar/Debug.scala b/hdl/chisel/src/kelvin/scalar/Debug.scala index 4181680..d2123c6 100644 --- a/hdl/chisel/src/kelvin/scalar/Debug.scala +++ b/hdl/chisel/src/kelvin/scalar/Debug.scala
@@ -21,13 +21,9 @@
 // Debug signals for HDL development.
+// `en`, `addr` and `inst` are all sized by p.instructionLanes.
 class DebugIO(p: Parameters) extends Bundle {
-  val en = Output(UInt(4.W))
-  val addr0 = Output(UInt(32.W))
-  val addr1 = Output(UInt(32.W))
-  val addr2 = Output(UInt(32.W))
-  val addr3 = Output(UInt(32.W))
-  val inst0 = Output(UInt(32.W))
-  val inst1 = Output(UInt(32.W))
-  val inst2 = Output(UInt(32.W))
-  val inst3 = Output(UInt(32.W))
+  // NOTE(review): assumes `en` carries one bit per instruction lane - confirm.
+  val en = Output(UInt(p.instructionLanes.W))
+  val addr = Output(Vec(p.instructionLanes, UInt(32.W)))
+  val inst = Output(Vec(p.instructionLanes, UInt(32.W)))
   val cycles = Output(UInt(32.W))
 }
diff --git a/hdl/chisel/src/kelvin/scalar/Fetch.scala b/hdl/chisel/src/kelvin/scalar/Fetch.scala index cb21c9d..d6de873 100644 --- a/hdl/chisel/src/kelvin/scalar/Fetch.scala +++ b/hdl/chisel/src/kelvin/scalar/Fetch.scala
@@ -56,7 +56,7 @@ val csr = new CsrInIO(p) val ibus = new IBusIO(p) val inst = new FetchIO(p) - val branch = Flipped(Vec(4, new BranchTakenIO(p))) + val branch = Flipped(Vec(p.instructionLanes, new BranchTakenIO(p))) val linkPort = Flipped(new RegfileLinkPortIO) val iflush = Flipped(new IFlushIO(p)) }) @@ -103,9 +103,9 @@ val l0data = Reg(Vec(indices, UInt(p.fetchDataBits.W))) // Instruction outputs. - val instValid = RegInit(VecInit(Seq.fill(4)(false.B))) - val instAddr = Reg(Vec(4, UInt(p.instructionBits.W))) - val instBits = Reg(Vec(4, UInt(p.instructionBits.W))) + val instValid = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B))) + val instAddr = Reg(Vec(p.instructionLanes, UInt(p.instructionBits.W))) + val instBits = Reg(Vec(p.instructionLanes, UInt(p.instructionBits.W))) val instAligned0 = Cat(instAddr(0)(31, indexLsb), 0.U(indexLsb.W)) val instAligned1 = instAligned0 + Cat(1.U, 0.U(indexLsb.W)) @@ -135,71 +135,48 @@ (jal, target) } - val (preBranchTaken0, preBranchTarget0) = - Predecode(instAddr(0), instBits(0)) - val (preBranchTaken1, preBranchTarget1) = - Predecode(instAddr(1), instBits(1)) - val (preBranchTaken2, preBranchTarget2) = - Predecode(instAddr(2), instBits(2)) - val (preBranchTaken3, preBranchTarget3) = - Predecode(instAddr(3), instBits(3)) + val preBranch = (0 until p.instructionLanes).map(x => Predecode(instAddr(x), instBits(x))) + val preBranchTakens = preBranch.map { case (taken, target) => taken } + val preBranchTargets = preBranch.map { case (taken, target) => target } - val preBranchTaken = io.inst.lanes(0).valid && preBranchTaken0 || - io.inst.lanes(1).valid && preBranchTaken1 || - io.inst.lanes(2).valid && preBranchTaken2 || - io.inst.lanes(3).valid && preBranchTaken3 + val preBranchTaken = (0 until p.instructionLanes).map(i => + io.inst.lanes(i).valid && preBranchTakens(i)).reduce(_ || _) - val preBranchTarget = Mux(preBranchTaken0, preBranchTarget0, - Mux(preBranchTaken1, preBranchTarget1, - Mux(preBranchTaken2, preBranchTarget2, 
- preBranchTarget3))) + val preBranchTarget = MuxCase( + preBranchTargets(p.instructionLanes - 1), + (0 until p.instructionLanes - 1).map(i => preBranchTakens(i) -> preBranchTargets(i)) + ) val preBranchTag = preBranchTarget(tagMsb, tagLsb) val preBranchIndex = preBranchTarget(indexMsb, indexLsb) - val branchTag0 = io.branch(0).value(tagMsb, tagLsb) - val branchTag1 = io.branch(1).value(tagMsb, tagLsb) - val branchTag2 = io.branch(2).value(tagMsb, tagLsb) - val branchTag3 = io.branch(3).value(tagMsb, tagLsb) - val branchIndex0 = io.branch(0).value(indexMsb, indexLsb) - val branchIndex1 = io.branch(1).value(indexMsb, indexLsb) - val branchIndex2 = io.branch(2).value(indexMsb, indexLsb) - val branchIndex3 = io.branch(3).value(indexMsb, indexLsb) + val branchTags = io.branch.map(x => x.value(tagMsb, tagLsb)) + val branchIndices = io.branch.map(x => x.value(indexMsb, indexLsb)) - val l0validB0 = l0valid(branchIndex0) - val l0validB1 = l0valid(branchIndex1) - val l0validB2 = l0valid(branchIndex2) - val l0validB3 = l0valid(branchIndex3) + val l0valids = (0 until p.instructionLanes).map(x => l0valid(branchIndices(x))) val l0validP = l0valid(preBranchIndex) - val l0tagB0 = VecAt(l0tag, branchIndex0) - val l0tagB1 = VecAt(l0tag, branchIndex1) - val l0tagB2 = VecAt(l0tag, branchIndex2) - val l0tagB3 = VecAt(l0tag, branchIndex3) + val l0tags = (0 until p.instructionLanes).map(x => VecAt(l0tag, branchIndices(x))) val l0tagP = VecAt(l0tag, preBranchIndex) - val reqB0 = io.branch(0).valid && !l0req(branchIndex0) && - (branchTag0 =/= l0tagB0 || !l0validB0) - val reqB1 = io.branch(1).valid && !l0req(branchIndex1) && - (branchTag1 =/= l0tagB1 || !l0validB1) && - !io.branch(0).valid - val reqB2 = io.branch(2).valid && !l0req(branchIndex2) && - (branchTag2 =/= l0tagB2 || !l0validB2) && - !io.branch(0).valid && !io.branch(1).valid - val reqB3 = io.branch(3).valid && !l0req(branchIndex3) && - (branchTag3 =/= l0tagB3 || !l0validB3) && - !io.branch(0).valid && !io.branch(1).valid && 
!io.branch(2).valid + val reqBValid = (0 until p.instructionLanes).map(x => + io.branch(x).valid && !l0req(branchIndices(x)) && + (branchTags(x) =/= l0tags(x) || !l0valids(x))) + val prevValid = io.branch.map(_.valid).scan(false.B)(_||_) + val reqs = (0 until p.instructionLanes).map(x => reqBValid(x) && !prevValid(x)) + val reqP = preBranchTaken && !l0req(preBranchIndex) && (preBranchTag =/= l0tagP || !l0validP) val req0 = !match0 && !l0req(instIndex0) val req1 = !match1 && !l0req(instIndex1) - aslice.io.in.valid := (reqB0 || reqB1 || reqB2 || reqB3 || reqP || req0 || req1) && !io.iflush.valid - aslice.io.in.bits := Mux(reqB0, Cat(io.branch(0).value(31,indexLsb), 0.U(indexLsb.W)), - Mux(reqB1, Cat(io.branch(1).value(31,indexLsb), 0.U(indexLsb.W)), - Mux(reqB2, Cat(io.branch(2).value(31,indexLsb), 0.U(indexLsb.W)), - Mux(reqB3, Cat(io.branch(3).value(31,indexLsb), 0.U(indexLsb.W)), - Mux(reqP, Cat(preBranchTarget(31,indexLsb), 0.U(indexLsb.W)), - Mux(req0, instAligned0, instAligned1)))))) + aslice.io.in.valid := (reqs ++ Seq(reqP, req0, req1)).reduce(_ || _) && !io.iflush.valid + aslice.io.in.bits := MuxCase(instAligned1, + (0 until p.instructionLanes).map(x => reqs(x) -> Cat(io.branch(x).value(31,indexLsb), 0.U(indexLsb.W))) ++ + Array( + reqP -> Cat(preBranchTarget(31,indexLsb), 0.U(indexLsb.W)), + req0 -> instAligned0, + ) + ) when (readAddrEn) { readAddr := io.ibus.addr @@ -253,25 +230,27 @@ // creates excessive timing pressure. We know that the match is either on // the old line or the next line, so can late mux on lookups of prior. // Widen the arithmetic paths and select from results. 
- val fetchEn = Wire(Vec(4, Bool())) + val fetchEn = Wire(Vec(p.instructionLanes, Bool())) - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { fetchEn(i) := io.inst.lanes(i).valid && io.inst.lanes(i).ready } - val fsel = Cat(fetchEn(3), - fetchEn(2) && !fetchEn(3), - fetchEn(1) && !fetchEn(2) && !fetchEn(3), - fetchEn(0) && !fetchEn(1) && !fetchEn(2) && !fetchEn(3), - !fetchEn(0) && !fetchEn(1) && !fetchEn(2) && !fetchEn(3)) + val fsela = Cat((0 until p.instructionLanes).reverse.map(x => + (x until p.instructionLanes).map(y => + (if (y == x) { fetchEn(y) } else { !fetchEn(y) }) + ).reduce(_ && _) + )) + val fselb = (0 until p.instructionLanes).map(x => !fetchEn(x)).reduce(_ && _) + val fsel = Cat(fsela, fselb) - val nxtInstAddrOffset = instAddr.map(x => x) ++ instAddr.map(x => x + 16.U) - val nxtInstAddr = (0 until 4).map(i => - (0 until 5).map( + val nxtInstAddrOffset = instAddr.map(x => x) ++ instAddr.map(x => x + (p.instructionLanes * 4).U) + val nxtInstAddr = (0 until p.instructionLanes).map(i => + (0 until (p.instructionLanes + 1)).map( j => MuxOR(fsel(j), nxtInstAddrOffset(j + i))).reduce(_|_)) val nxtInstIndex0 = nxtInstAddr(0)(indexMsb, indexLsb) - val nxtInstIndex1 = nxtInstAddr(3)(indexMsb, indexLsb) + val nxtInstIndex1 = nxtInstAddr(p.instructionLanes - 1)(indexMsb, indexLsb) val readFwd0 = readDataEn && readAddr(31,indexLsb) === instAligned0(31,indexLsb) @@ -286,7 +265,7 @@ val nxtMatch1 = Mux(instIndex0(0) === nxtInstIndex1(0), nxtMatch0Fwd, nxtMatch1Fwd) - val nxtInstValid = Wire(Vec(4, Bool())) + val nxtInstValid = Wire(Vec(p.instructionLanes, Bool())) val nxtInstBits0 = Mux(readFwd0, readData, VecAt(l0data, instIndex0)) val nxtInstBits1 = Mux(readFwd1, readData, VecAt(l0data, instIndex1)) @@ -301,23 +280,18 @@ def BranchMatchDe(valid: Bool, value: UInt): (Bool, UInt, Vec[UInt], Vec[UInt]) = { - val addr = VecInit(value, - value + 4.U, - value + 8.U, - value + 12.U) + val addr = VecInit((0 until p.instructionLanes).map(x => value + (x 
* 4).U)) val match0 = l0valid(addr(0)(indexMsb,indexLsb)) && addr(0)(tagMsb,tagLsb) === VecAt(l0tag, addr(0)(indexMsb,indexLsb)) - val match1 = l0valid(addr(3)(indexMsb,indexLsb)) && - addr(3)(tagMsb,tagLsb) === VecAt(l0tag, addr(3)(indexMsb,indexLsb)) + val match1 = l0valid(addr(p.instructionLanes - 1)(indexMsb,indexLsb)) && + addr(p.instructionLanes - 1)(tagMsb,tagLsb) === VecAt(l0tag, addr(p.instructionLanes - 1)(indexMsb,indexLsb)) - val vvalid = VecInit(Mux(addr(0)(4,2) <= 7.U, match0, match1), - Mux(addr(0)(4,2) <= 6.U, match0, match1), - Mux(addr(0)(4,2) <= 5.U, match0, match1), - Mux(addr(0)(4,2) <= 4.U, match0, match1)) + val vvalid = VecInit((0 until p.instructionLanes).map(x => + Mux(addr(0)(4,2) <= (7 - x).U, match0, match1))) // addr(0)(4,2) indexes the 8 words of a line; the bound does not depend on the lane count val muxbits0 = VecAt(l0data, addr(0)(indexMsb,indexLsb)) - val muxbits1 = VecAt(l0data, addr(3)(indexMsb,indexLsb)) + val muxbits1 = VecAt(l0data, addr(p.instructionLanes - 1)(indexMsb,indexLsb)) val muxbits = Wire(Vec(16, UInt(p.instructionBits.W))) for (i <- 0 until 8) { @@ -326,8 +300,8 @@ muxbits(i + 8) := muxbits1(31 + offset, offset) } - val bits = Wire(Vec(4, UInt(p.instructionBits.W))) - for (i <- 0 until 4) { + val bits = Wire(Vec(p.instructionLanes, UInt(p.instructionBits.W))) + for (i <- 0 until p.instructionLanes) { val idx = Cat(addr(0)(5) =/= addr(i)(5), addr(i)(4,2)) bits(i) := VecAt(muxbits, idx) } @@ -337,38 +311,26 @@ def BranchMatchEx(branch: Vec[BranchTakenIO]): (Bool, UInt, Vec[UInt], Vec[UInt]) = { - val valid = branch(0).valid || branch(1).valid || - branch(2).valid || branch(3).valid + val valid = branch.map(x => x.valid).reduce(_ || _) - val addr = VecInit(Mux(branch(0).valid, branch(0).value, - Mux(branch(1).valid, branch(1).value, - Mux(branch(2).valid, branch(2).value, - branch(3).value))), - Mux(branch(0).valid, branch(0).value + 4.U, - Mux(branch(1).valid, branch(1).value + 4.U, - Mux(branch(2).valid, branch(2).value + 4.U, - branch(3).value + 4.U))), -
Mux(branch(0).valid, branch(0).value + 8.U, - Mux(branch(1).valid, branch(1).value + 8.U, - Mux(branch(2).valid, branch(2).value + 8.U, - branch(3).value + 8.U))), - Mux(branch(0).valid, branch(0).value + 12.U, - Mux(branch(1).valid, branch(1).value + 12.U, - Mux(branch(2).valid, branch(2).value + 12.U, - branch(3).value + 12.U)))) + + val addr = VecInit((0 until branch.length).map(x => + MuxCase(branch(branch.length - 1).value + (x * 4).U, ( + (0 until branch.length - 1).map(y => + branch(y).valid -> (branch(y).value + (x * 4).U) + ) + )))) val match0 = l0valid(addr(0)(indexMsb,indexLsb)) && addr(0)(tagMsb,tagLsb) === VecAt(l0tag, addr(0)(indexMsb,indexLsb)) - val match1 = l0valid(addr(3)(indexMsb,indexLsb)) && - addr(3)(tagMsb,tagLsb) === VecAt(l0tag, addr(3)(indexMsb,indexLsb)) + val match1 = l0valid(addr(branch.length - 1)(indexMsb,indexLsb)) && + addr(branch.length - 1)(tagMsb,tagLsb) === VecAt(l0tag, addr(branch.length - 1)(indexMsb,indexLsb)) - val vvalid = VecInit(Mux(addr(0)(4,2) <= 7.U, match0, match1), - Mux(addr(0)(4,2) <= 6.U, match0, match1), - Mux(addr(0)(4,2) <= 5.U, match0, match1), - Mux(addr(0)(4,2) <= 4.U, match0, match1)) + val vvalid = VecInit((0 until branch.length).map(x => + Mux(addr(0)(4,2) <= (7 - x).U, match0, match1))) // addr(0)(4,2) indexes the 8 words of a line; the bound does not depend on the lane count val muxbits0 = VecAt(l0data, addr(0)(indexMsb,indexLsb)) - val muxbits1 = VecAt(l0data, addr(3)(indexMsb,indexLsb)) + val muxbits1 = VecAt(l0data, addr(branch.length - 1)(indexMsb,indexLsb)) val muxbits = Wire(Vec(16, UInt(p.instructionBits.W))) for (i <- 0 until 8) { @@ -377,8 +339,8 @@ muxbits(i + 8) := muxbits1(31 + offset, offset) } - val bits = Wire(Vec(4, UInt(p.instructionBits.W))) - for (i <- 0 until 4) { + val bits = Wire(Vec(branch.length, UInt(p.instructionBits.W))) + for (i <- 0 until branch.length) { val idx = Cat(addr(0)(5) =/= addr(i)(5), addr(i)(4,2)) bits(i) := VecAt(muxbits, idx) } @@ -399,21 +361,17 @@ (jal || ret || bxx, target) } - val (brchTakenDe0, brchTargetDe0) =
PredecodeDe(instAddr(0), instBits(0)) - val (brchTakenDe1, brchTargetDe1) = PredecodeDe(instAddr(1), instBits(1)) - val (brchTakenDe2, brchTargetDe2) = PredecodeDe(instAddr(2), instBits(2)) - val (brchTakenDe3, brchTargetDe3) = PredecodeDe(instAddr(3), instBits(3)) + val brchDe = (0 until p.instructionLanes).map(x => PredecodeDe(instAddr(x), instBits(x))) + val brchTakensDe = brchDe.map { case (taken, target) => taken } + val brchTargetsDe = brchDe.map { case (taken, target) => target } - val brchTakenDeOr = - io.inst.lanes(0).valid && io.inst.lanes(0).ready && brchTakenDe0 || - io.inst.lanes(1).valid && io.inst.lanes(1).ready && brchTakenDe1 || - io.inst.lanes(2).valid && io.inst.lanes(2).ready && brchTakenDe2 || - io.inst.lanes(3).valid && io.inst.lanes(3).ready && brchTakenDe3 + val brchTakenDeOr = (0 until p.instructionLanes).map(x => + io.inst.lanes(x).ready && io.inst.lanes(x).valid && brchTakensDe(x) + ).reduce(_ || _) - val brchTargetDe = Mux(brchTakenDe0, brchTargetDe0, - Mux(brchTakenDe1, brchTargetDe1, - Mux(brchTakenDe2, brchTargetDe2, - brchTargetDe3))) + val brchTargetDe = MuxCase(brchTargetsDe(p.instructionLanes - 1), + (0 until p.instructionLanes - 1).map(x => brchTakensDe(x) -> brchTargetsDe(x)) + ) val (brchTakenDe, brchValidDe, brchAddrDe, brchBitsDe) = BranchMatchDe(brchTakenDeOr, brchTargetDe) @@ -421,21 +379,27 @@ val (brchTakenEx, brchValidEx, brchAddrEx, brchBitsEx) = BranchMatchEx(io.branch) + val brchValidDeMask = - Cat(!brchTakenDe0 && !brchTakenDe1 && !brchTakenDe2, - !brchTakenDe0 && !brchTakenDe1, - !brchTakenDe0, - true.B) + Cat((0 until p.instructionLanes).reverse.map(x => + if (x == 0) { true.B } else { + (0 until x).map(y => + !brchTakensDe(y) + ).reduce(_ && _) + } + )) - val brchFwd = Cat( - brchTakenDe3 && !brchTakenDe0 && !brchTakenDe1 && !brchTakenDe2, - brchTakenDe2 && !brchTakenDe0 && !brchTakenDe1, - brchTakenDe1 && !brchTakenDe0, - brchTakenDe0) + val brchFwd = + Cat((0 until p.instructionLanes).reverse.map(x => + 
brchTakensDe(x) && (if (x == 0) { true.B } else { (0 until x).map(y => !brchTakensDe(y)).reduce(_ && _) }) + )) - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { // 1, 11, 111, ... - nxtInstValid(i) := Mux(nxtInstAddr(0)(4,2) <= (7 - i).U, nxtMatch0, nxtMatch1) + nxtInstValid(i) := Mux( + nxtInstAddr(0)(4,2) <= (7 - i).U, + nxtMatch0, + nxtMatch1) val nxtInstValidUInt = nxtInstValid.asUInt instValid(i) := Mux(brchTakenEx, brchValidEx(i,0) === ~0.U((i+1).W), @@ -457,14 +421,11 @@ // This pattern of separate when() blocks requires resets after the data. when (reset.asBool) { val addr = Cat(io.csr.value(0)(31,2), 0.U(2.W)) - instAddr(0) := addr - instAddr(1) := addr + 4.U - instAddr(2) := addr + 8.U - instAddr(3) := addr + 12.U + instAddr := (0 until p.instructionLanes).map(i => addr + (4 * i).U) } // Outputs - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { io.inst.lanes(i).valid := instValid(i) & brchValidDeMask(i) io.inst.lanes(i).addr := instAddr(i) io.inst.lanes(i).inst := instBits(i) @@ -472,23 +433,19 @@ } // Assertions. 
- assert(instAddr(0) + 4.U === instAddr(1)) - assert(instAddr(0) + 8.U === instAddr(2)) - assert(instAddr(0) + 12.U === instAddr(3)) + for (i <- 1 until p.instructionLanes) { + assert(instAddr(0) + (4 * i).U === instAddr(i)) + } - assert(fsel.getWidth == 5) + assert(fsel.getWidth == (p.instructionLanes + 1)) assert(PopCount(fsel) <= 1.U) val instValidUInt = instValid.asUInt - assert(!(!instValidUInt(0) && (instValidUInt(3,1) =/= 0.U))) - assert(!(!instValidUInt(1) && (instValidUInt(3,2) =/= 0.U))) - assert(!(!instValidUInt(2) && (instValidUInt(3,3) =/= 0.U))) - - val instLanesReady = Cat(io.inst.lanes(3).ready, io.inst.lanes(2).ready, - io.inst.lanes(1).ready, io.inst.lanes(0).ready) - assert(!(!instLanesReady(0) && (instLanesReady(3,1) =/= 0.U))) - assert(!(!instLanesReady(1) && (instLanesReady(3,2) =/= 0.U))) - assert(!(!instLanesReady(2) && (instLanesReady(3,3) =/= 0.U))) + val instLanesReady = Cat((0 until p.instructionLanes).reverse.map(x => io.inst.lanes(x).ready)) + for (i <- 0 until p.instructionLanes - 1) { + assert(!(!instValidUInt(i) && (instValidUInt(p.instructionLanes - 1, i + 1) =/= 0.U))) + assert(!(!instLanesReady(i) && (instLanesReady(p.instructionLanes - 1, i + 1) =/= 0.U))) + } } object EmitFetch extends App {
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala index 60aa158..b13364d 100644 --- a/hdl/chisel/src/kelvin/scalar/Lsu.scala +++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -92,8 +92,8 @@ class Lsu(p: Parameters) extends Module { val io = IO(new Bundle { // Decode cycle. - val req = Vec(4, new LsuIO(p)) - val busPort = Flipped(new RegfileBusPortIO) + val req = Vec(p.instructionLanes, new LsuIO(p)) + val busPort = Flipped(new RegfileBusPortIO(p)) // Execute cycle(s). val rd = Flipped(new RegfileWriteDataIO) @@ -115,16 +115,18 @@ // AXI Queues. val n = 8 - val ctrl = Fifo4(new LsuCtrl(p), n) + val ctrl = FifoX(new LsuCtrl(p), p.instructionLanes, n) val data = Slice(new LsuReadData(p), true, true) // Match and mask. - val ctrlready = Cat(ctrl.io.count <= (n - 4).U, - ctrl.io.count <= (n - 3).U, - ctrl.io.count <= (n - 2).U, - ctrl.io.count <= (n - 1).U) + // ctrlready(i) is true while the control queue holds at most n-(i+1) + // entries, which leaves room for lanes 0..i. The thresholds are listed in + // ascending lane order so that indexing this Seq by lane matches the bit + // order of the Cat() expression it replaces (lane 0 compares against n-1). + val ctrlready = (1 to p.instructionLanes).map( + x => ctrl.io.count <= (n - x).U) - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { io.req(i).ready := ctrlready(i) && data.io.in.ready } @@ -137,7 +139,7 @@ ctrl.io.in.valid := io.req.map(_.valid).reduce(_||_) val uncacheable = p.m.filter(x => !x.cacheable) - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { val uncached = io.busPort.addr(i)(31) || (if (uncacheable.length > 0) uncacheable.map(x => (io.busPort.addr(i) >= x.memStart.U) && (io.busPort.addr(i) < (x.memStart + x.memSize).U)).reduce(_||_) else false.B)
diff --git a/hdl/chisel/src/kelvin/scalar/Mlu.scala b/hdl/chisel/src/kelvin/scalar/Mlu.scala index 349104d..b7ad953 100644 --- a/hdl/chisel/src/kelvin/scalar/Mlu.scala +++ b/hdl/chisel/src/kelvin/scalar/Mlu.scala
@@ -47,11 +47,11 @@ class Mlu(p: Parameters) extends Module { val io = IO(new Bundle { // Decode cycle. - val req = Vec(4, new MluIO(p)) + val req = Vec(p.instructionLanes, new MluIO(p)) // Execute cycle. - val rs1 = Vec(4, Flipped(new RegfileReadDataIO)) - val rs2 = Vec(4, Flipped(new RegfileReadDataIO)) + val rs1 = Vec(p.instructionLanes, Flipped(new RegfileReadDataIO)) + val rs2 = Vec(p.instructionLanes, Flipped(new RegfileReadDataIO)) val rd = Flipped(new RegfileWriteDataIO) }) @@ -62,41 +62,25 @@ val valid2 = RegInit(false.B) val addr1 = Reg(UInt(5.W)) val addr2 = Reg(UInt(5.W)) - val sel = Reg(UInt(4.W)) + val sel = Reg(UInt(p.instructionLanes.W)) + val valids = io.req.map(_.valid) + assert(valids.length == p.instructionLanes) valid1 := io.req.map(_.valid).reduce(_||_) valid2 := valid1 - when (io.req(0).valid) { - op := io.req(0).op - addr1 := io.req(0).addr - sel := 1.U - } .elsewhen (io.req(1).valid) { - op := io.req(1).op - addr1 := io.req(1).addr - sel := 2.U - } .elsewhen (io.req(2).valid) { - op := io.req(2).op - addr1 := io.req(2).addr - sel := 4.U - } .elsewhen (io.req(3).valid) { - op := io.req(3).op - addr1 := io.req(3).addr - sel := 8.U + when (valids.reduce(_||_)) { + val idx = PriorityEncoder(valids) + op := io.req(idx).op + addr1 := io.req(idx).addr + sel := (1.U << idx) } .otherwise { op := 0.U sel := 0.U } - val rs1 = MuxOR(valid1 & sel(0), io.rs1(0).data) | - MuxOR(valid1 & sel(1), io.rs1(1).data) | - MuxOR(valid1 & sel(2), io.rs1(2).data) | - MuxOR(valid1 & sel(3), io.rs1(3).data) - - val rs2 = MuxOR(valid1 & sel(0), io.rs2(0).data) | - MuxOR(valid1 & sel(1), io.rs2(1).data) | - MuxOR(valid1 & sel(2), io.rs2(2).data) | - MuxOR(valid1 & sel(3), io.rs2(3).data) + val rs1 = (0 until p.instructionLanes).map(x => MuxOR(valid1 & sel(x), io.rs1(x).data)).reduce(_ | _) + val rs2 = (0 until p.instructionLanes).map(x => MuxOR(valid1 & sel(x), io.rs2(x).data)).reduce(_ | _) // Multiplier has a registered output. 
val mul2 = Reg(UInt(32.W)) @@ -142,7 +126,7 @@ io.rd.data := mul2 + round2 // Assertions. - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { assert(!(valid1 && sel(i) && !io.rs1(i).valid)) assert(!(valid1 && sel(i) && !io.rs2(i).valid)) }
diff --git a/hdl/chisel/src/kelvin/scalar/Regfile.scala b/hdl/chisel/src/kelvin/scalar/Regfile.scala index 6dfdd00..7397bb7 100644 --- a/hdl/chisel/src/kelvin/scalar/Regfile.scala +++ b/hdl/chisel/src/kelvin/scalar/Regfile.scala
@@ -62,9 +62,9 @@ val immed = Input(UInt(32.W)) } -class RegfileBusPortIO extends Bundle { - val addr = Output(Vec(4, UInt(32.W))) - val data = Output(Vec(4, UInt(32.W))) +class RegfileBusPortIO(p: Parameters) extends Bundle { + val addr = Output(Vec(p.instructionLanes, UInt(32.W))) + val data = Output(Vec(p.instructionLanes, UInt(32.W))) } class RegfileLinkPortIO extends Bundle { @@ -79,18 +79,18 @@ class Regfile(p: Parameters) extends Module { val io = IO(new Bundle { // Decode cycle. - val readAddr = Vec(8, new RegfileReadAddrIO) - val readSet = Vec(8, new RegfileReadSetIO) - val writeAddr = Vec(4, new RegfileWriteAddrIO) - val busAddr = Vec(4, new RegfileBusAddrIO) - val target = Vec(4, new RegfileBranchTargetIO) + val readAddr = Vec(p.instructionLanes * 2, new RegfileReadAddrIO) + val readSet = Vec(p.instructionLanes * 2, new RegfileReadSetIO) + val writeAddr = Vec(p.instructionLanes, new RegfileWriteAddrIO) + val busAddr = Vec(p.instructionLanes, new RegfileBusAddrIO) + val target = Vec(p.instructionLanes, new RegfileBranchTargetIO) val linkPort = new RegfileLinkPortIO - val busPort = new RegfileBusPortIO + val busPort = new RegfileBusPortIO(p) // Execute cycle. - val readData = Vec(8, new RegfileReadDataIO) - val writeData = Vec(6, new RegfileWriteDataIO) - val writeMask = Vec(4, new Bundle {val valid = Input(Bool())}) + val readData = Vec(p.instructionLanes * 2, new RegfileReadDataIO) + val writeData = Vec(p.instructionLanes + 2, new RegfileWriteDataIO) + val writeMask = Vec(p.instructionLanes, new Bundle {val valid = Input(Bool())}) val scoreboard = new Bundle { val regd = Output(UInt(32.W)) val comb = Output(UInt(32.W)) @@ -130,11 +130,11 @@ // *************************************************************************** // The read port response. 
// *************************************************************************** - val readDataReady = RegInit(VecInit(Seq.fill(8){false.B})) - val readDataBits = Reg(Vec(8, UInt(32.W))) - val nxtReadDataBits = Wire(Vec(8, UInt(32.W))) + val readDataReady = RegInit(VecInit(Seq.fill(p.instructionLanes * 2){false.B})) + val readDataBits = Reg(Vec(p.instructionLanes * 2, UInt(32.W))) + val nxtReadDataBits = Wire(Vec(p.instructionLanes * 2, UInt(32.W))) - for (i <- 0 until 8) { + for (i <- 0 until (p.instructionLanes * 2)) { io.readData(i).valid := readDataReady(i) io.readData(i).data := readDataBits(i) } @@ -149,18 +149,13 @@ writeData(0) := 0.U // regfile(0) is optimized away for (i <- 1 until 32) { - val valid = Cat(io.writeData(5).valid && io.writeData(5).addr === i.U, - io.writeData(4).valid && io.writeData(4).addr === i.U, - io.writeData(3).valid && io.writeData(3).addr === i.U && - !io.writeMask(3).valid, - io.writeData(2).valid && io.writeData(2).addr === i.U && - !io.writeMask(2).valid, - io.writeData(1).valid && io.writeData(1).addr === i.U && - !io.writeMask(1).valid, - io.writeData(0).valid && io.writeData(0).addr === i.U && - !io.writeMask(0).valid) + val valid = Cat( + Array(io.writeData(p.instructionLanes + 1).valid && io.writeData(p.instructionLanes + 1).addr === i.U, + io.writeData(p.instructionLanes).valid && io.writeData(p.instructionLanes).addr === i.U) ++ + (0 until p.instructionLanes).reverse.map(x => io.writeData(x).valid && io.writeData(x).addr === i.U && !io.writeMask(x).valid) + ) - val data = (0 until 6).map(x => MuxOR(valid(x), io.writeData(x).data)).reduce(_|_) + val data = (0 until p.instructionLanes + 2).map(x => MuxOR(valid(x), io.writeData(x).data)).reduce(_|_) writeValid(i) := valid =/= 0.U writeData(i) := data @@ -177,21 +172,21 @@ // We care if someone tried to write x0 (e.g. nop is encoded this way), but want // it separate for above mentioned optimization. 
val x0 = - (0 until 4).map(x => + (0 until p.instructionLanes).map(x => io.writeData(x).valid && io.writeData(x).addr === 0.U && !io.writeMask(x).valid) ++ - (4 until 6).map(x => io.writeData(x).valid && io.writeData(x).addr === 0.U) + (p.instructionLanes until p.instructionLanes + 2).map(x => io.writeData(x).valid && io.writeData(x).addr === 0.U) io.rfwriteCount := PopCount(writeValid) - writeValid(0) + PopCount(x0) // *************************************************************************** // Read ports with write forwarding. // *************************************************************************** - val rdata = Wire(Vec(8, UInt(32.W))) - val wdata = Wire(Vec(8, UInt(32.W))) - val rwdata = Wire(Vec(8, UInt(32.W))) - for (i <- 0 until 8) { + val rdata = Wire(Vec((p.instructionLanes * 2), UInt(32.W))) + val wdata = Wire(Vec((p.instructionLanes * 2), UInt(32.W))) + val rwdata = Wire(Vec((p.instructionLanes * 2), UInt(32.W))) + for (i <- 0 until (p.instructionLanes * 2)) { val idx = io.readAddr(i).addr val write = VecAt(writeValid, idx) rdata(i) := VecAt(regfile, idx) @@ -199,7 +194,7 @@ rwdata(i) := Mux(write, wdata(i), rdata(i)) } - for (i <- 0 until 8) { + for (i <- 0 until (p.instructionLanes * 2)) { val setValid = io.readSet(i).valid val setValue = io.readSet(i).value @@ -215,23 +210,22 @@ } // Bus port priority encoded address. 
- val busAddr = Wire(Vec(4, UInt(32.W))) - val busValid = Cat(io.busAddr(3).valid, io.busAddr(2).valid, - io.busAddr(1).valid, io.busAddr(0).valid) + val busAddr = Wire(Vec(p.instructionLanes, UInt(32.W))) + val busValid = Cat((0 until p.instructionLanes).reverse.map(x => io.busAddr(x).valid)) - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { busAddr(i) := Mux(io.busAddr(i).bypass, rwdata(2 * i), Mux(io.busAddr(i).immen, rdata(2 * i) + io.busAddr(i).immed, rdata(2 * i))) } - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { io.busPort.addr(i) := busAddr(i) io.busPort.data(i) := nxtReadDataBits(2 * i + 1) } // Branch target address combinatorial. - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { io.target(i).data := busAddr(i) } @@ -244,12 +238,12 @@ // *************************************************************************** // Assertions. // *************************************************************************** - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { assert(busAddr(i).getWidth == p.lsuAddrBits) } - for (i <- 0 until 6) { - for (j <- (i+1) until 6) { + for (i <- 0 until p.instructionLanes + 2) { + for (j <- (i + 1) until p.instructionLanes + 2) { // Delay the failure a cycle for debugging purposes. val write_fail = RegInit(false.B) write_fail := io.writeData(i).valid && io.writeData(j).valid &&
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala index 3f0f678..d9e2c32 100644 --- a/hdl/chisel/src/kelvin/scalar/SCore.scala +++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -50,9 +50,9 @@ // The functional units that make up the core. val regfile = Regfile(p) val fetch = Fetch(p) - val decode = Seq(Decode(p, 0), Decode(p, 1), Decode(p, 2), Decode(p, 3)) - val alu = Seq.fill(4)(Alu(p)) - val bru = Seq.fill(4)(Bru(p)) + val decode = (0 until p.instructionLanes).map(x => Seq(Decode(p, x))).reduce(_ ++ _) + val alu = Seq.fill(p.instructionLanes)(Alu(p)) + val bru = Seq.fill(p.instructionLanes)(Bru(p)) val csr = Csr(p) val lsu = Lsu(p) val mlu = Mlu(p) @@ -77,15 +77,15 @@ io.dflush.clean := lsu.io.flush.clean lsu.io.flush.ready := io.dflush.ready - assert(!bru(1).io.iflush) - assert(!bru(2).io.iflush) - assert(!bru(3).io.iflush) + for (i <- 1 until p.instructionLanes) { + assert(!bru(i).io.iflush) + } // --------------------------------------------------------------------------- // Fetch fetch.io.csr := io.csr.in - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { fetch.io.branch(i) := bru(i).io.taken } @@ -97,7 +97,7 @@ // Decode val mask = VecInit(decode.map(_.io.inst.ready).scan(true.B)(_ && _)) - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { decode(i).io.inst.valid := fetch.io.inst.lanes(i).valid && mask(i) fetch.io.inst.lanes(i).ready := decode(i).io.inst.ready && mask(i) decode(i).io.inst.addr := fetch.io.inst.lanes(i).addr @@ -110,31 +110,31 @@ // Interlock based on regfile write port dependencies. decode(0).io.interlock := bru(0).io.interlock - decode(1).io.interlock := decode(0).io.interlock - decode(2).io.interlock := decode(1).io.interlock - decode(3).io.interlock := decode(2).io.interlock + for (i <- 1 until p.instructionLanes) { + decode(i).io.interlock := decode(i - 1).io.interlock + } // Serialize opcodes with only one pipeline. 
decode(0).io.serializeIn.defaults() - decode(1).io.serializeIn := decode(0).io.serializeOut - decode(2).io.serializeIn := decode(1).io.serializeOut - decode(3).io.serializeIn := decode(2).io.serializeOut + for (i <- 1 until p.instructionLanes) { + decode(i).io.serializeIn := decode(i - 1).io.serializeOut + } // In decode update multi-issue scoreboard state. val scoreboard_spec = decode.map(_.io.scoreboard.spec).scan(0.U)(_|_) - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { decode(i).io.scoreboard.comb := regfile.io.scoreboard.comb | scoreboard_spec(i) decode(i).io.scoreboard.regd := regfile.io.scoreboard.regd | scoreboard_spec(i) } decode(0).io.mactive := io.vcore.mactive - decode(1).io.mactive := false.B - decode(2).io.mactive := false.B - decode(3).io.mactive := false.B + for (i <- 1 until p.instructionLanes) { + decode(i).io.mactive := false.B + } // --------------------------------------------------------------------------- // ALU - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { alu(i).io.req := decode(i).io.alu alu(i).io.rs1 := regfile.io.readData(2 * i + 0) alu(i).io.rs2 := regfile.io.readData(2 * i + 1) @@ -142,7 +142,7 @@ // --------------------------------------------------------------------------- // Branch Unit - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { bru(i).io.req := decode(i).io.bru bru(i).io.rs1 := regfile.io.readData(2 * i + 0) bru(i).io.rs2 := regfile.io.readData(2 * i + 1) @@ -150,9 +150,9 @@ } bru(0).io.csr <> csr.io.bru - bru(1).io.csr.defaults() - bru(2).io.csr.defaults() - bru(3).io.csr.defaults() + for (i <- 1 until p.instructionLanes) { + bru(i).io.csr.defaults() + } io.iflush.valid := iflush @@ -181,7 +181,7 @@ // Load/Store Unit lsu.io.busPort := regfile.io.busPort - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { lsu.io.req(i).valid := decode(i).io.lsu.valid lsu.io.req(i).store := decode(i).io.lsu.store lsu.io.req(i).addr := decode(i).io.lsu.addr @@ -191,7 
+191,7 @@ // --------------------------------------------------------------------------- // Multiplier Unit - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { mlu.io.req(i) := decode(i).io.mlu mlu.io.rs1(i) := regfile.io.readData(2 * i) mlu.io.rs2(i) := regfile.io.readData((2 * i) + 1) @@ -205,13 +205,13 @@ dvu.io.rd.ready := !mlu.io.rd.valid // TODO: make port conditional on pipeline index. - for (i <- 1 until 4) { + for (i <- 1 until p.instructionLanes) { decode(i).io.dvu.ready := false.B } // --------------------------------------------------------------------------- // Register File - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { regfile.io.readAddr(2 * i + 0) := decode(i).io.rs1Read regfile.io.readAddr(2 * i + 1) := decode(i).io.rs2Read regfile.io.readSet(2 * i + 0) := decode(i).io.rs1Set @@ -245,27 +245,29 @@ io.vcore.rd(i).valid) <= 1.U) } - regfile.io.writeData(4).valid := mlu.io.rd.valid || dvu.io.rd.valid - regfile.io.writeData(4).addr := Mux(mlu.io.rd.valid, mlu.io.rd.addr, dvu.io.rd.addr) - regfile.io.writeData(4).data := Mux(mlu.io.rd.valid, mlu.io.rd.data, dvu.io.rd.data) + val mluDvuOffset = p.instructionLanes + regfile.io.writeData(mluDvuOffset).valid := mlu.io.rd.valid || dvu.io.rd.valid + regfile.io.writeData(mluDvuOffset).addr := Mux(mlu.io.rd.valid, mlu.io.rd.addr, dvu.io.rd.addr) + regfile.io.writeData(mluDvuOffset).data := Mux(mlu.io.rd.valid, mlu.io.rd.data, dvu.io.rd.data) assert(!(mlu.io.rd.valid && (dvu.io.rd.valid && dvu.io.rd.ready))) // TODO: stall dvu on mlu write - regfile.io.writeData(5).valid := lsu.io.rd.valid - regfile.io.writeData(5).addr := lsu.io.rd.addr - regfile.io.writeData(5).data := lsu.io.rd.data + val lsuOffset = p.instructionLanes + 1 + regfile.io.writeData(lsuOffset).valid := lsu.io.rd.valid + regfile.io.writeData(lsuOffset).addr := lsu.io.rd.addr + regfile.io.writeData(lsuOffset).data := lsu.io.rd.data val writeMask = bru.map(_.io.taken.valid).scan(false.B)(_||_) - for (i <- 0 until 
4) { + for (i <- 0 until p.instructionLanes) { regfile.io.writeMask(i).valid := writeMask(i) } // --------------------------------------------------------------------------- // Vector Extension - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { io.vcore.vinst(i) <> decode(i).io.vinst } - for (i <- 0 until 8) { + for (i <- 0 until p.instructionLanes * 2) { io.vcore.rs(i) := regfile.io.readData(i) } @@ -301,36 +303,26 @@ cycles := cycles + 1.U io.debug.cycles := cycles - val debugEn = RegInit(0.U(4.W)) - val debugAddr = Reg(Vec(4, UInt(32.W))) - val debugInst = Reg(Vec(4, UInt(32.W))) + val debugEn = RegInit(0.U(p.instructionLanes.W)) + val debugAddr = Reg(Vec(p.instructionLanes, UInt(32.W))) + val debugInst = Reg(Vec(p.instructionLanes, UInt(32.W))) - val debugBrch = - Cat(bru(0).io.taken.valid || bru(1).io.taken.valid || bru(2).io.taken.valid, - bru(0).io.taken.valid || bru(1).io.taken.valid, - bru(0).io.taken.valid, - false.B) + // Bit i is set when any lane below i took a branch (bit 0 is always clear). + // Cat() puts its first argument in the MSB, so the prefix-OR is reversed. + val debugBrch = Cat(bru.map(_.io.taken.valid).scan(false.B)(_ || _).take(p.instructionLanes).reverse) - debugEn := Cat(fetch.io.inst.lanes(3).valid && fetch.io.inst.lanes(3).ready && !branchTaken, - fetch.io.inst.lanes(2).valid && fetch.io.inst.lanes(2).ready && !branchTaken, - fetch.io.inst.lanes(1).valid && fetch.io.inst.lanes(1).ready && !branchTaken, - fetch.io.inst.lanes(0).valid && fetch.io.inst.lanes(0).ready && !branchTaken) + // Lanes are reversed before packing so that lane 0 lands in bit 0. + debugEn := Cat(fetch.io.inst.lanes.reverse.map(x => x.valid && x.ready && !branchTaken)) - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { debugAddr(i) := fetch.io.inst.lanes(i).addr debugInst(i) := fetch.io.inst.lanes(i).inst } io.debug.en := debugEn & ~debugBrch - io.debug.addr0 := debugAddr(0) - io.debug.addr1 := debugAddr(1) - io.debug.addr2 := debugAddr(2) - io.debug.addr3 := debugAddr(3) - io.debug.inst0 := debugInst(0) - io.debug.inst1 := debugInst(1) - io.debug.inst2 := debugInst(2) - io.debug.inst3 := debugInst(3) + io.debug.addr <> debugAddr + io.debug.inst <> debugInst } object EmitSCore extends App {
diff --git a/hdl/chisel/src/kelvin/vector/VAlu.scala b/hdl/chisel/src/kelvin/vector/VAlu.scala index 03eae95..03f6f36 100644 --- a/hdl/chisel/src/kelvin/vector/VAlu.scala +++ b/hdl/chisel/src/kelvin/vector/VAlu.scala
@@ -30,15 +30,15 @@ class VAlu(p: Parameters) extends Module { val io = IO(new Bundle { // Instructions. - val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits)))) + val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits)))) val active = Output(UInt(64.W)) // VRegfile. val vrfsb = Input(UInt(128.W)) - val read = Vec(7, new VRegfileReadIO(p)) - val write = Vec(4, new VRegfileWriteIO(p)) - val whint = Vec(4, new VRegfileWhintIO(p)) - val scalar = Vec(2, new VRegfileScalarIO(p)) + val read = Vec(p.vectorReadPorts, new VRegfileReadIO(p)) + val write = Vec(p.vectorWritePorts - 2, new VRegfileWriteIO(p)) + val whint = Vec(p.vectorWhintPorts, new VRegfileWhintIO(p)) + val scalar = Vec(p.vectorScalarPorts, new VRegfileScalarIO(p)) // Testbench signals. val read_0_ready = Output(Bool()) @@ -56,26 +56,26 @@ // --------------------------------------------------------------------------- // Tie-offs. - for (i <- 0 until 7) { + for (i <- 0 until io.read.length) { io.read(i).valid := false.B io.read(i).addr := 0.U io.read(i).tag := 0.U } - for (i <- 0 until 4) { + for (i <- 0 until io.write.length) { io.write(i).valid := false.B io.write(i).addr := 0.U io.write(i).data := 0.U } - for (i <- 0 until 4) { + for (i <- 0 until io.whint.length) { io.whint(i).valid := false.B io.whint(i).addr := 0.U } // --------------------------------------------------------------------------- // Opcode checks. 
- for (i <- 0 until 4) { + for (i <- 0 until io.in.bits.length) { when (io.in.valid && io.in.ready) { when (io.in.bits(i).valid) { val op = io.in.bits(i).bits.op @@ -254,8 +254,8 @@ active } - val q0 = VCmdq(cmdqDepth, new VAluCmdq, Fin0, Fout, Factive) - val q1 = VCmdq(cmdqDepth, new VAluCmdq, Fin1, Fout, Factive) + val q0 = VCmdq(p, cmdqDepth, new VAluCmdq, Fin0, Fout, Factive) + val q1 = VCmdq(p, cmdqDepth, new VAluCmdq, Fin1, Fout, Factive) q0.io.in.valid := io.in.valid && q1.io.in.ready q1.io.in.valid := io.in.valid && q0.io.in.ready @@ -278,20 +278,19 @@ // --------------------------------------------------------------------------- // ALU Selection interleaving. val alureg = RegInit(false.B) - val alusel = Wire(Vec(5, Bool())) + val alusel = Wire(Vec(p.instructionLanes + 1, Bool())) // Toggle if previous was valid and was not a synchronized dual command. alusel(0) := alureg - alusel(1) := Mux(io.in.bits(0).valid && !io.in.bits(0).bits.cmdsync, !alusel(0), alusel(0)) - alusel(2) := Mux(io.in.bits(1).valid && !io.in.bits(1).bits.cmdsync, !alusel(1), alusel(1)) - alusel(3) := Mux(io.in.bits(2).valid && !io.in.bits(2).bits.cmdsync, !alusel(2), alusel(2)) - alusel(4) := Mux(io.in.bits(3).valid && !io.in.bits(3).bits.cmdsync, !alusel(3), alusel(3)) - - when (io.in.valid && io.in.ready) { - alureg := alusel(4) + for (i <- 0 until p.instructionLanes) { + alusel(i + 1) := Mux(io.in.bits(i).valid && !io.in.bits(i).bits.cmdsync, !alusel(i), alusel(i)) } - for (i <- 0 until 4) { + when (io.in.valid && io.in.ready) { + alureg := alusel(alusel.length - 1) + } + + for (i <- 0 until p.instructionLanes) { q0.io.in.bits(i).valid := io.in.bits(i).valid && (alusel(i) === 0.U || io.in.bits(i).bits.cmdsync) q1.io.in.bits(i).valid := io.in.bits(i).valid && (alusel(i) === 1.U || io.in.bits(i).bits.cmdsync) }
diff --git a/hdl/chisel/src/kelvin/vector/VCmdq.scala b/hdl/chisel/src/kelvin/vector/VCmdq.scala index 20e29b3..261ba63 100644 --- a/hdl/chisel/src/kelvin/vector/VCmdq.scala +++ b/hdl/chisel/src/kelvin/vector/VCmdq.scala
@@ -27,14 +27,14 @@ // <factive> returns the activation status for decode dependencies. object VCmdq { - def apply[T <: Data](n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) = { - Module(new VCmdq(n, t, fin, fout, factive)) + def apply[T <: Data](p: Parameters, n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) = { + Module(new VCmdq(p, n, t, fin, fout, factive)) } } -class VCmdq[T <: Data](n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) extends Module { +class VCmdq[T <: Data](p: Parameters, n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) extends Module { val io = IO(new Bundle { - val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits)))) + val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits)))) val out = Decoupled(t) val active = Output(UInt(64.W)) val nempty = Output(Bool()) @@ -45,7 +45,7 @@ val m = Output(Bool()) // stripmine } - val f = Fifo4e(new VCmdqWrapper, n) + val f = FifoXe(new VCmdqWrapper, p.instructionLanes, n) val active = RegInit(0.U(64.W)) @@ -65,7 +65,7 @@ f.io.in.valid := io.in.valid io.in.ready := f.io.in.ready - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { f.io.in.bits(i).valid := io.in.bits(i).valid f.io.in.bits(i).bits.tin := fin(io.in.bits(i).bits) f.io.in.bits(i).bits.m := io.in.bits(i).bits.m @@ -118,14 +118,10 @@ when (io.in.valid && io.in.ready || io.out.valid && io.out.ready) { val fvalid = MuxOR(f.io.in.valid && f.io.in.ready, - Cat(f.io.in.bits(3).valid, f.io.in.bits(2).valid, - f.io.in.bits(1).valid, f.io.in.bits(0).valid)) + Cat((0 until p.instructionLanes).reverse.map(x => f.io.in.bits(x).valid))) - active := - MuxOR(fvalid(0), factive(f.io.in.bits(0).bits.tin, f.io.in.bits(0).bits.m, step0)) | - MuxOR(fvalid(1), 
factive(f.io.in.bits(1).bits.tin, f.io.in.bits(1).bits.m, step0)) | - MuxOR(fvalid(2), factive(f.io.in.bits(2).bits.tin, f.io.in.bits(2).bits.m, step0)) | - MuxOR(fvalid(3), factive(f.io.in.bits(3).bits.tin, f.io.in.bits(3).bits.m, step0)) | + active := (0 until p.instructionLanes).map(x => + MuxOR(fvalid(x), factive(f.io.in.bits(x).bits.tin, f.io.in.bits(x).bits.m, step0))).reduce(_|_) | ValueActive() } @@ -180,5 +176,6 @@ active } - ChiselStage.emitSystemVerilogFile(new VCmdq(8, new VCmdqTestBundle, VCmdqTestFin, VCmdqTestFout, VCmdqTestFactive), args) + val p = kelvin.Parameters() + ChiselStage.emitSystemVerilogFile(new VCmdq(p, 8, new VCmdqTestBundle, VCmdqTestFin, VCmdqTestFout, VCmdqTestFactive), args) }
diff --git a/hdl/chisel/src/kelvin/vector/VConvCtrl.scala b/hdl/chisel/src/kelvin/vector/VConvCtrl.scala index 1e017a4..ebea853 100644 --- a/hdl/chisel/src/kelvin/vector/VConvCtrl.scala +++ b/hdl/chisel/src/kelvin/vector/VConvCtrl.scala
@@ -30,7 +30,7 @@ class VConvCtrl(p: Parameters) extends Module { val io = IO(new Bundle { // Instructions. - val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits)))) + val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits)))) val active = Output(UInt(64.W)) // RegisterFile. @@ -160,7 +160,7 @@ active } - val q = VCmdq(cmdqDepth, new VConvCtrlCmdq, Fin, Fout, Factive) + val q = VCmdq(p, cmdqDepth, new VConvCtrlCmdq, Fin, Fout, Factive) q.io.in <> io.in
diff --git a/hdl/chisel/src/kelvin/vector/VCore.scala b/hdl/chisel/src/kelvin/vector/VCore.scala index 58bbab6..919cb27 100644 --- a/hdl/chisel/src/kelvin/vector/VCore.scala +++ b/hdl/chisel/src/kelvin/vector/VCore.scala
@@ -28,11 +28,11 @@ class VCoreIO(p: Parameters) extends Bundle { // Decode cycle. - val vinst = Vec(4, new VInstIO) + val vinst = Vec(p.instructionLanes, new VInstIO) // Execute cycle. - val rs = Vec(8, Flipped(new RegfileReadDataIO)) - val rd = Vec(4, Flipped(new RegfileWriteDataIO)) + val rs = Vec(p.instructionLanes * 2, Flipped(new RegfileReadDataIO)) + val rd = Vec(p.instructionLanes, Flipped(new RegfileWriteDataIO)) // Status. val mactive = Output(Bool()) @@ -97,7 +97,7 @@ vinst.io.out.stall := vdec.io.stall // decode backpressure - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { vdec.io.in.bits(i) := vinst.io.out.lane(i) } @@ -105,24 +105,24 @@ // --------------------------------------------------------------------------- // VRegfile. - for (i <- 0 until 7) { + for (i <- 0 until vrf.readPorts) { vrf.io.read(i).valid := false.B vrf.io.read(i).addr := 0.U vrf.io.read(i).tag := 0.U } - for (i <- 0 until 6) { + for (i <- 0 until vrf.writePorts) { vrf.io.write(i).valid := false.B vrf.io.write(i).addr := 0.U vrf.io.write(i).data := 0.U } - for (i <- 0 until 4) { + for (i <- 0 until vrf.whintPorts) { vrf.io.whint(i).valid := false.B vrf.io.whint(i).addr := 0.U } - for (i <- 0 until 2) { + for (i <- 0 until vrf.scalarPorts) { vrf.io.scalar(i).valid := false.B vrf.io.scalar(i).data := 0.U } @@ -133,43 +133,38 @@ // --------------------------------------------------------------------------- // VALU. 
- val aluvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).alu, - vdec.io.out(2).valid && vdec.io.cmdq(2).alu, - vdec.io.out(1).valid && vdec.io.cmdq(1).alu, - vdec.io.out(0).valid && vdec.io.cmdq(0).alu) + val aluvalid = (0 until p.instructionLanes).map(x => vdec.io.out(x).valid && vdec.io.cmdq(x).alu) + val aluready = (0 until p.instructionLanes).map(x => valu.io.in.ready && vdec.io.cmdq(x).alu) - val aluready = Cat(valu.io.in.ready && vdec.io.cmdq(3).alu, - valu.io.in.ready && vdec.io.cmdq(2).alu, - valu.io.in.ready && vdec.io.cmdq(1).alu, - valu.io.in.ready && vdec.io.cmdq(0).alu) + valu.io.in.valid := aluvalid.reduce(_ || _) - valu.io.in.valid := aluvalid =/= 0.U - - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { valu.io.in.bits(i).valid := aluvalid(i) valu.io.in.bits(i).bits := vdec.io.out(i).bits } - for (i <- 0 until 7) { + for (i <- 0 until vrf.readPorts) { vrf.io.read(i).valid := valu.io.read(i).valid vrf.io.read(i).addr := valu.io.read(i).addr vrf.io.read(i).tag := valu.io.read(i).tag } - for (i <- 0 until 7) { + for (i <- 0 until vrf.readPorts) { valu.io.read(i).data := vrf.io.read(i).data } - for (i <- 0 until 4) { + for (i <- 0 until vrf.writePorts - 2) { vrf.io.write(i).valid := valu.io.write(i).valid vrf.io.write(i).addr := valu.io.write(i).addr vrf.io.write(i).data := valu.io.write(i).data + } + for (i <- 0 until vrf.whintPorts) { vrf.io.whint(i).valid := valu.io.whint(i).valid vrf.io.whint(i).addr := valu.io.whint(i).addr } - for (i <- 0 until 2) { + for (i <- 0 until vrf.scalarPorts) { vrf.io.scalar(i).valid := valu.io.scalar(i).valid vrf.io.scalar(i).data := valu.io.scalar(i).data } @@ -178,19 +173,12 @@ // --------------------------------------------------------------------------- // VCONV. 
- val convvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).conv, - vdec.io.out(2).valid && vdec.io.cmdq(2).conv, - vdec.io.out(1).valid && vdec.io.cmdq(1).conv, - vdec.io.out(0).valid && vdec.io.cmdq(0).conv) + val convvalid = (0 until p.instructionLanes).map(x => vdec.io.out(x).valid && vdec.io.cmdq(x).conv) + val convready = (0 until p.instructionLanes).map(x => vconv.io.in.ready && vdec.io.cmdq(x).conv) - val convready = Cat(vconv.io.in.ready && vdec.io.cmdq(3).conv, - vconv.io.in.ready && vdec.io.cmdq(2).conv, - vconv.io.in.ready && vdec.io.cmdq(1).conv, - vconv.io.in.ready && vdec.io.cmdq(0).conv) + vconv.io.in.valid := convvalid.reduce(_ || _) - vconv.io.in.valid := convvalid =/= 0.U - - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { vconv.io.in.bits(i).valid := convvalid(i) vconv.io.in.bits(i).bits := vdec.io.out(i).bits } @@ -201,25 +189,18 @@ // --------------------------------------------------------------------------- // VLdSt. - val ldstvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).ldst, - vdec.io.out(2).valid && vdec.io.cmdq(2).ldst, - vdec.io.out(1).valid && vdec.io.cmdq(1).ldst, - vdec.io.out(0).valid && vdec.io.cmdq(0).ldst) + val ldstvalid = (0 until p.instructionLanes).map(x => vdec.io.out(x).valid && vdec.io.cmdq(x).ldst) + val ldstready = (0 until p.instructionLanes).map(x => vldst.io.in.ready && vdec.io.cmdq(x).ldst) - val ldstready = Cat(vldst.io.in.ready && vdec.io.cmdq(3).ldst, - vldst.io.in.ready && vdec.io.cmdq(2).ldst, - vldst.io.in.ready && vdec.io.cmdq(1).ldst, - vldst.io.in.ready && vdec.io.cmdq(0).ldst) + vldst.io.in.valid := ldstvalid.reduce(_ || _) - vldst.io.in.valid := ldstvalid =/= 0.U - - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { vldst.io.in.bits(i).valid := ldstvalid(i) vldst.io.in.bits(i).bits := vdec.io.out(i).bits } vldst.io.read.ready := !vst.io.read.valid - vldst.io.read.data := vrf.io.read(6).data + vldst.io.read.data := vrf.io.read(vrf.readPorts - 1).data vldst.io.vrfsb 
:= vrf.io.vrfsb.data @@ -228,22 +209,12 @@ // --------------------------------------------------------------------------- // VLd. - val ldvalid = Wire(UInt(4.W)) - val ldready = Wire(UInt(4.W)) + val ldvalid = (0 until p.instructionLanes).map(x => vdec.io.cmdq(x).ld && vdec.io.out(x).valid) + val ldready = (0 until p.instructionLanes).map(x => vdec.io.cmdq(x).ld && vld.io.in.ready) - ldvalid := Cat(vdec.io.cmdq(3).ld && vdec.io.out(3).valid, - vdec.io.cmdq(2).ld && vdec.io.out(2).valid, - vdec.io.cmdq(1).ld && vdec.io.out(1).valid, - vdec.io.cmdq(0).ld && vdec.io.out(0).valid) + vld.io.in.valid := ldvalid.reduce(_ || _) - ldready := Cat(vdec.io.cmdq(3).ld && vld.io.in.ready, - vdec.io.cmdq(2).ld && vld.io.in.ready, - vdec.io.cmdq(1).ld && vld.io.in.ready, - vdec.io.cmdq(0).ld && vld.io.in.ready) - - vld.io.in.valid := ldvalid =/= 0.U - - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { vld.io.in.bits(i).valid := ldvalid(i) vld.io.in.bits(i).bits := vdec.io.out(i).bits } @@ -252,22 +223,12 @@ // --------------------------------------------------------------------------- // VSt. 
- val stvalid = Wire(UInt(4.W)) - val stready = Wire(UInt(4.W)) + val stvalid = (0 until p.instructionLanes).map(x => vdec.io.out(x).valid && vdec.io.cmdq(x).st) + val stready = (0 until p.instructionLanes).map(x => vst.io.in.ready && vdec.io.cmdq(x).st) - stvalid := Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).st, - vdec.io.out(2).valid && vdec.io.cmdq(2).st, - vdec.io.out(1).valid && vdec.io.cmdq(1).st, - vdec.io.out(0).valid && vdec.io.cmdq(0).st) + vst.io.in.valid := stvalid.reduce(_ || _) - stready := Cat(vst.io.in.ready && vdec.io.cmdq(3).st, - vst.io.in.ready && vdec.io.cmdq(2).st, - vst.io.in.ready && vdec.io.cmdq(1).st, - vst.io.in.ready && vdec.io.cmdq(0).st) - - vst.io.in.valid := stvalid =/= 0.U - - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { vst.io.in.bits(i).valid := stvalid(i) vst.io.in.bits(i).bits := vdec.io.out(i).bits } @@ -277,29 +238,29 @@ vst.io.vrfsb := vrf.io.vrfsb.data vst.io.read.ready := true.B - vst.io.read.data := vrf.io.read(6).data + vst.io.read.data := vrf.io.read(vrf.readPorts - 1).data // --------------------------------------------------------------------------- // Load write. - vrf.io.write(4).valid := vldst.io.write.valid - vrf.io.write(4).addr := vldst.io.write.addr - vrf.io.write(4).data := vldst.io.write.data + vrf.io.write(vrf.writePorts - 2).valid := vldst.io.write.valid // vldst takes the first write port after the VALU's (0 until writePorts - 2) + vrf.io.write(vrf.writePorts - 2).addr := vldst.io.write.addr + vrf.io.write(vrf.writePorts - 2).data := vldst.io.write.data - vrf.io.write(5).valid := vld.io.write.valid - vrf.io.write(5).addr := vld.io.write.addr - vrf.io.write(5).data := vld.io.write.data + vrf.io.write(vrf.writePorts - 1).valid := vld.io.write.valid // vld takes the last write port + vrf.io.write(vrf.writePorts - 1).addr := vld.io.write.addr + vrf.io.write(vrf.writePorts - 1).data := vld.io.write.data // --------------------------------------------------------------------------- // Store read. 
- vrf.io.read(6).valid := vst.io.read.valid || vldst.io.read.valid - vrf.io.read(6).addr := Mux(vst.io.read.valid, vst.io.read.addr, + vrf.io.read(vrf.readPorts - 1).valid := vst.io.read.valid || vldst.io.read.valid + vrf.io.read(vrf.readPorts - 1).addr := Mux(vst.io.read.valid, vst.io.read.addr, vldst.io.read.addr) - vrf.io.read(6).tag := Mux(vst.io.read.valid, vst.io.read.tag, + vrf.io.read(vrf.readPorts - 1).tag := Mux(vst.io.read.valid, vst.io.read.tag, vldst.io.read.tag) // --------------------------------------------------------------------------- // VDecode. - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { vdec.io.out(i).ready := aluready(i) || convready(i) || ldstready(i) || ldready(i) || stready(i) }
diff --git a/hdl/chisel/src/kelvin/vector/VDecode.scala b/hdl/chisel/src/kelvin/vector/VDecode.scala index fa48723..44d6afc 100644 --- a/hdl/chisel/src/kelvin/vector/VDecode.scala +++ b/hdl/chisel/src/kelvin/vector/VDecode.scala
@@ -18,7 +18,7 @@ import chisel3._ import chisel3.util._ -import common.Fifo4x4 +import common.FifoIxO import _root_.circt.stage.ChiselStage object VDecode { @@ -29,10 +29,10 @@ class VDecode(p: Parameters) extends Module { val io = IO(new Bundle { - val in = Flipped(Decoupled(Vec(4, Valid(new VectorInstructionLane)))) - val out = Vec(4, Decoupled(new VDecodeBits)) - val cmdq = Vec(4, Output(new VDecodeCmdq)) - val actv = Vec(4, Output(new VDecodeActive)) // used in testbench + val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VectorInstructionLane)))) + val out = Vec(p.instructionLanes, Decoupled(new VDecodeBits)) + val cmdq = Vec(p.instructionLanes, Output(new VDecodeCmdq)) + val actv = Vec(p.instructionLanes, Output(new VDecodeActive)) // used in testbench val stall = Output(Bool()) val active = Input(UInt(64.W)) val vrfsb = new VRegfileScoreboardIO @@ -45,27 +45,24 @@ val enc = new VEncodeOp() - val f = Fifo4x4(new VectorInstructionLane, depth) + val f = FifoIxO(new VectorInstructionLane, p.instructionLanes, p.instructionLanes, depth) - val d = Seq(Module(new VDecodeInstruction(p)), - Module(new VDecodeInstruction(p)), - Module(new VDecodeInstruction(p)), - Module(new VDecodeInstruction(p))) + val d = Seq.fill(p.instructionLanes)(Module(new VDecodeInstruction(p))) - val e = Wire(Vec(4, new VDecodeBits)) + val e = Wire(Vec(p.instructionLanes, new VDecodeBits)) - val valid = RegInit(VecInit(Seq.fill(4)(false.B))) - val data = Reg(Vec(4, new VDecodeBits)) - val cmdq = Reg(Vec(4, new VDecodeCmdq)) - val actv = Wire(Vec(4, new VDecodeActive)) - val actv2 = Reg(Vec(4, new VDecodeActive2)) - val dataNxt = Wire(Vec(4, new VDecodeBits)) - val cmdqNxt = Wire(Vec(4, new VDecodeCmdq)) - val actvNxt = Wire(Vec(4, new VDecodeActive2)) + val valid = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B))) + val data = Reg(Vec(p.instructionLanes, new VDecodeBits)) + val cmdq = Reg(Vec(p.instructionLanes, new VDecodeCmdq)) + val actv = Wire(Vec(p.instructionLanes, new 
VDecodeActive)) + val actv2 = Reg(Vec(p.instructionLanes, new VDecodeActive2)) + val dataNxt = Wire(Vec(p.instructionLanes, new VDecodeBits)) + val cmdqNxt = Wire(Vec(p.instructionLanes, new VDecodeCmdq)) + val actvNxt = Wire(Vec(p.instructionLanes, new VDecodeActive2)) // --------------------------------------------------------------------------- // Decode. - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { d(i).io.in := f.io.out(i).bits } @@ -75,24 +72,11 @@ // write the read usage is occurring on. val tagReg = RegInit(0.U(64.W)) - val tag0 = tagReg - val tag1 = tag0 ^ d(0).io.actv.wactive - val tag2 = tag1 ^ d(1).io.actv.wactive - val tag3 = tag2 ^ d(2).io.actv.wactive - val tag4 = tag3 ^ d(3).io.actv.wactive - - val tags = Seq(tag0, tag1, tag2, tag3, tag4) + val tags = (0 until p.instructionLanes).map(x => d(x).io.actv.wactive).scan(tagReg)(_ ^ _) + assert(tags.length == p.instructionLanes + 1) // f.io.out is ordered, so can use a priority tree. - when(f.io.out(3).valid && f.io.out(3).ready) { - tagReg := tag4 - } .elsewhen(f.io.out(2).valid && f.io.out(2).ready) { - tagReg := tag3 - } .elsewhen(f.io.out(1).valid && f.io.out(1).ready) { - tagReg := tag2 - } .elsewhen(f.io.out(0).valid && f.io.out(0).ready) { - tagReg := tag1 - } + tagReg := MuxCase(tags(0), (0 until p.instructionLanes).reverse.map(x => (f.io.out(x).valid && f.io.out(x).ready) -> tags(x + 1))) def TagAddr(tag: UInt, v: VAddrTag): VAddrTag = { assert(tag.getWidth == 64) @@ -111,7 +95,7 @@ r } - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { e(i) := d(i).io.out e(i).vs := TagAddr(tags(i), d(i).io.out.vs) e(i).vt := TagAddr(tags(i), d(i).io.out.vt) @@ -123,34 +107,27 @@ // --------------------------------------------------------------------------- // Undef. 
(io.in.ready ignored to signal as early as possible) - io.undef := io.in.valid && (d(0).io.undef || d(1).io.undef || d(2).io.undef || d(3).io.undef) + io.undef := io.in.valid && d.map(x => x.io.undef).reduce(_ || _) // --------------------------------------------------------------------------- // Fifo. f.io.in <> io.in - val icount = MuxOR(io.in.valid, PopCount(Cat(io.in.bits(0).valid, io.in.bits(1).valid, io.in.bits(2).valid, io.in.bits(3).valid))) - assert(icount.getWidth == 3) + val icount = MuxOR(io.in.valid, + PopCount(io.in.bits.map(_.valid)) + ) - val ocount = PopCount(Cat(valid(0) && !(io.out(0).valid && io.out(0).ready), - valid(1) && !(io.out(1).valid && io.out(1).ready), - valid(2) && !(io.out(2).valid && io.out(2).ready), - valid(3) && !(io.out(3).valid && io.out(3).ready))) - assert(ocount.getWidth == 3) + val ocount = PopCount((0 until p.instructionLanes).map(x => valid(x) && !(io.out(x).valid && io.out(x).ready))) - for (i <- 0 until 4) { - f.io.out(i).ready := (i.U + ocount) < 4.U + for (i <- 0 until p.instructionLanes) { + f.io.out(i).ready := (i.U + ocount) < p.instructionLanes.U } // --------------------------------------------------------------------------- // Valid. - val fcount = PopCount(Cat(f.io.out(0).valid && f.io.out(0).ready, - f.io.out(1).valid && f.io.out(1).ready, - f.io.out(2).valid && f.io.out(2).ready, - f.io.out(3).valid && f.io.out(3).ready)) - assert(fcount.getWidth == 3) + val fcount = PopCount(f.io.out.map(x => x.valid && x.ready)) - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { valid(i) := (ocount + fcount) > i.U } @@ -159,41 +136,30 @@ io.stall := (f.io.count + icount) > (depth - guard).U // --------------------------------------------------------------------------- - // Dependencies. - val depends = Wire(Vec(4, Bool())) - // Writes must not proceed past any outstanding reads or writes, // or past any dispatching writes. 
- val wactive0 = io.vrfsb.data(63, 0) | io.vrfsb.data(127, 64) | io.active - val wactive1 = actv(0).ractive | actv(0).wactive | wactive0 - val wactive2 = actv(1).ractive | actv(1).wactive | wactive1 - val wactive3 = actv(2).ractive | actv(2).wactive | wactive2 - val wactive = VecInit(wactive0, wactive1, wactive2, wactive3) + val wactive = VecInit((0 until p.instructionLanes).map(x => actv(x).ractive | actv(x).wactive).scan(io.vrfsb.data(63,0) | io.vrfsb.data(127,64) | io.active)(_ | _)) // Reads must not proceed past any dispatching writes. - val ractive0 = 0.U(64.W) - val ractive1 = actv(0).wactive | ractive0 - val ractive2 = actv(1).wactive | ractive1 - val ractive3 = actv(2).wactive | ractive2 - val ractive = VecInit(ractive0, ractive1, ractive2, ractive3) + val ractive = VecInit((0 until p.instructionLanes).map(x => actv(x).wactive).scan(0.U(64.W))(_ | _)) - for (i <- 0 until 4) { - depends(i) := (wactive(i) & actv(i).wactive) =/= 0.U || - (ractive(i) & actv(i).ractive) =/= 0.U - } + // Dependencies. + val depends = VecInit((0 until p.instructionLanes).map(i => + (wactive(i) & actv(i).wactive) =/= 0.U || + (ractive(i) & actv(i).ractive) =/= 0.U + )) // --------------------------------------------------------------------------- // Data. - val fvalid = VecInit(f.io.out(0).valid, f.io.out(1).valid, - f.io.out(2).valid, f.io.out(3).valid).asUInt - assert(!(fvalid(1) && fvalid(0,0) =/= 1.U)) - assert(!(fvalid(2) && fvalid(1,0) =/= 3.U)) - assert(!(fvalid(3) && fvalid(2,0) =/= 7.U)) + val fvalid = VecInit(f.io.out.map(_.valid)).asUInt + for (i <- 0 until p.instructionLanes) { + assert(!(fvalid(i) && PopCount(fvalid(i,0)) =/= (i + 1).U)) + } // Register is updated when fifo has state or contents are active. 
val dataEn = fvalid(0) || valid.asUInt =/= 0.U - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { when (dataEn) { data(i) := dataNxt(i) cmdq(i) := cmdqNxt(i) @@ -201,14 +167,14 @@ } } - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { actv(i).ractive := actv2(i).ractive actv(i).wactive := actv2(i).wactive(63, 0) | actv2(i).wactive(127, 64) } // Tag the decode wactive. - val dactv = Wire(Vec(4, new VDecodeActive2)) - for (i <- 0 until 4) { + val dactv = Wire(Vec(p.instructionLanes, new VDecodeActive2)) + for (i <- 0 until p.instructionLanes) { val w0 = d(i).io.actv.wactive & ~tags(i + 1) val w1 = d(i).io.actv.wactive & tags(i + 1) dactv(i).ractive := d(i).io.actv.ractive @@ -216,155 +182,51 @@ } // Data multiplexor of current values and fifo+decode output. - val dataMux = VecInit(data(0), data(1), data(2), data(3), - e(0), e(1), e(2), e(3)) + val dataMux = VecInit(data ++ e) + val cmdqMux = VecInit(cmdq ++ d.map(x => x.io.cmdq)) + val actvMux = VecInit(actv2 ++ dactv) - val cmdqMux = VecInit(cmdq(0), cmdq(1), cmdq(2), cmdq(3), - d(0).io.cmdq, d(1).io.cmdq, d(2).io.cmdq, d(3).io.cmdq) - - val actvMux = VecInit(actv2(0), actv2(1), actv2(2), actv2(3), - dactv(0), dactv(1), dactv(2), dactv(3)) - + def GenerateMarked(start: Int, count: Int): Seq[UInt] = { + (0 until count).map(x => Wire(UInt((start + x).W))) + } // Mark the multiplexor entries that need to be kept. 
- val marked0 = Wire(UInt(5.W)) - val marked1 = Wire(UInt(6.W)) - val marked2 = Wire(UInt(7.W)) + val marked = GenerateMarked((p.instructionLanes + 1), p.instructionLanes - 1) + val output = Cat((0 until p.instructionLanes).reverse.map(x => io.out(x).valid && io.out(x).ready)) + val validNotOutput = (0 until (p.instructionLanes * 2) - 1).map(x => + if (x < valid.length) { valid(x) && !output(x) } else { true.B }) + val prevMarked = (0 until p.instructionLanes).map(x => + if (x == 0) { None } else { Some(marked(x - 1)) } + ) - assert((marked1 & marked0) === marked0) - assert((marked2 & marked0) === marked0) - assert((marked2 & marked1) === marked1) - - val output = Cat(io.out(3).valid && io.out(3).ready, - io.out(2).valid && io.out(2).ready, - io.out(1).valid && io.out(1).ready, - io.out(0).valid && io.out(0).ready) - - when (valid(0) && !output(0)) { - dataNxt(0) := dataMux(0) - cmdqNxt(0) := cmdqMux(0) - actvNxt(0) := actvMux(0) - marked0 := 0x01.U - } .elsewhen (valid(1) && !output(1)) { - dataNxt(0) := dataMux(1) - cmdqNxt(0) := cmdqMux(1) - actvNxt(0) := actvMux(1) - marked0 := 0x03.U - } .elsewhen (valid(2) && !output(2)) { - dataNxt(0) := dataMux(2) - cmdqNxt(0) := cmdqMux(2) - actvNxt(0) := actvMux(2) - marked0 := 0x07.U - } .elsewhen (valid(3) && !output(3)) { - dataNxt(0) := dataMux(3) - cmdqNxt(0) := cmdqMux(3) - actvNxt(0) := actvMux(3) - marked0 := 0x0f.U - } .otherwise { - dataNxt(0) := dataMux(4) - cmdqNxt(0) := cmdqMux(4) - actvNxt(0) := actvMux(4) - marked0 := 0x1f.U - } - - when (!marked0(1) && valid(1) && !output(1)) { - dataNxt(1) := dataMux(1) - cmdqNxt(1) := cmdqMux(1) - actvNxt(1) := actvMux(1) - marked1 := 0x03.U - } .elsewhen (!marked0(2) && valid(2) && !output(2)) { - dataNxt(1) := dataMux(2) - cmdqNxt(1) := cmdqMux(2) - actvNxt(1) := actvMux(2) - marked1 := 0x07.U - } .elsewhen (!marked0(3) && valid(3) && !output(3)) { - dataNxt(1) := dataMux(3) - cmdqNxt(1) := cmdqMux(3) - actvNxt(1) := actvMux(3) - marked1 := 0x0f.U - } .elsewhen 
(!marked0(4)) { - dataNxt(1) := dataMux(4) - cmdqNxt(1) := cmdqMux(4) - actvNxt(1) := actvMux(4) - marked1 := 0x1f.U - } .otherwise { - dataNxt(1) := dataMux(5) - cmdqNxt(1) := cmdqMux(5) - actvNxt(1) := actvMux(5) - marked1 := 0x3f.U - } - - when (!marked1(2) && valid(2) && !output(2)) { - dataNxt(2) := dataMux(2) - cmdqNxt(2) := cmdqMux(2) - actvNxt(2) := actvMux(2) - marked2 := 0x07.U - } .elsewhen (!marked1(3) && valid(3) && !output(3)) { - dataNxt(2) := dataMux(3) - cmdqNxt(2) := cmdqMux(3) - actvNxt(2) := actvMux(3) - marked2 := 0x0f.U - } .elsewhen (!marked1(4)) { - dataNxt(2) := dataMux(4) - cmdqNxt(2) := cmdqMux(4) - actvNxt(2) := actvMux(4) - marked2 := 0x1f.U - } .elsewhen (!marked1(5)) { - dataNxt(2) := dataMux(5) - cmdqNxt(2) := cmdqMux(5) - actvNxt(2) := actvMux(5) - marked2 := 0x3f.U - } .otherwise { - dataNxt(2) := dataMux(6) - cmdqNxt(2) := cmdqMux(6) - actvNxt(2) := actvMux(6) - marked2 := 0x7f.U - } - - when (!marked2(3) && valid(3) && !output(3)) { - dataNxt(3) := dataMux(3) - cmdqNxt(3) := cmdqMux(3) - actvNxt(3) := actvMux(3) - } .elsewhen (!marked2(4)) { - dataNxt(3) := dataMux(4) - cmdqNxt(3) := cmdqMux(4) - actvNxt(3) := actvMux(4) - } .elsewhen (!marked2(5)) { - dataNxt(3) := dataMux(5) - cmdqNxt(3) := cmdqMux(5) - actvNxt(3) := actvMux(5) - } .elsewhen (!marked2(6)) { - dataNxt(3) := dataMux(6) - cmdqNxt(3) := cmdqMux(6) - actvNxt(3) := actvMux(6) - } .otherwise { - dataNxt(3) := dataMux(7) - cmdqNxt(3) := cmdqMux(7) - actvNxt(3) := actvMux(7) + for (i <- 0 until p.instructionLanes) { + val idx = MuxCase((i + p.instructionLanes).U, (i until p.instructionLanes + i).map(x => + (!prevMarked(i).getOrElse(false.B)(x) && validNotOutput(x)) -> (x).U + )) + dataNxt(i) := dataMux(idx) + cmdqNxt(i) := cmdqMux(idx) + actvNxt(i) := actvMux(idx) + if (i < marked.length) { + val width = marked(i).getWidth + marked(i) := ~0.U(width.W) >> ((width - 1).U - idx) + } } // --------------------------------------------------------------------------- // 
Scoreboard. - io.vrfsb.set.valid := output(0) || output(1) || output(2) || output(3) + // Set the scoreboard when any lane's output handshake (valid && ready) fires. + io.vrfsb.set.valid := output =/= 0.U - io.vrfsb.set.bits := (MuxOR(output(0), actv2(0).wactive) | - MuxOR(output(1), actv2(1).wactive) | - MuxOR(output(2), actv2(2).wactive) | - MuxOR(output(3), actv2(3).wactive)) + io.vrfsb.set.bits := (0 until p.instructionLanes).map(x => MuxOR(output(x), actv2(x).wactive)).reduce(_ | _) assert((io.vrfsb.set.bits(63, 0) & io.vrfsb.set.bits(127, 64)) === 0.U) assert(((io.vrfsb.data(63, 0) | io.vrfsb.data(127, 64)) & (io.vrfsb.set.bits(63, 0) | io.vrfsb.set.bits(127, 64))) === 0.U) // --------------------------------------------------------------------------- // Outputs. - val outvalid = Wire(Vec(4, Bool())) - val cmdsync = Wire(Vec(4, Bool())) + val outvalid = VecInit((0 until p.instructionLanes).map(i => valid(i) && !depends(i))) + val cmdsync = VecInit((0 until p.instructionLanes).map(i => data(i).cmdsync)) - for (i <- 0 until 4) { - outvalid(i) := valid(i) && !depends(i) - cmdsync(i) := data(i).cmdsync - } - - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { // Synchronize commands at cmdsync instance or if found in history. // Note: {vdwinit, vdwconv, vdmulh}, vdmulh must not issue before vdwconv. val synchronize = cmdsync.asUInt(i,0) =/= 0.U
diff --git a/hdl/chisel/src/kelvin/vector/VInst.scala b/hdl/chisel/src/kelvin/vector/VInst.scala index 8a1b42e..8757cea 100644 --- a/hdl/chisel/src/kelvin/vector/VInst.scala +++ b/hdl/chisel/src/kelvin/vector/VInst.scala
@@ -44,11 +44,11 @@ val op = Input(UInt(new VInstOp().Entries.W)) } -class VectorInstructionIO extends Bundle { +class VectorInstructionIO(p: Parameters) extends Bundle { val valid = Output(Bool()) val ready = Input(Bool()) val stall = Input(Bool()) - val lane = Vec(4, Valid(new VectorInstructionLane)) + val lane = Vec(p.instructionLanes, Valid(new VectorInstructionLane)) } class VectorInstructionLane extends Bundle { @@ -68,14 +68,14 @@ class VInst(p: Parameters) extends Module { val io = IO(new Bundle { // Decode cycle. - val in = Vec(4, new VInstIO) + val in = Vec(p.instructionLanes, new VInstIO) // Execute cycle. - val rs = Vec(8, Flipped(new RegfileReadDataIO)) - val rd = Vec(4, Flipped(new RegfileWriteDataIO)) + val rs = Vec(p.instructionLanes * 2, Flipped(new RegfileReadDataIO)) + val rd = Vec(p.instructionLanes, Flipped(new RegfileWriteDataIO)) // Vector interface. - val out = new VectorInstructionIO + val out = new VectorInstructionIO(p) // Status. val nempty = Output(Bool()) @@ -91,41 +91,34 @@ val maxvlwm = (p.vectorBits * 4 / 32).U(p.vectorCountBits.W) assert(maxvlw >= 4.U) - val slice = Slice(Vec(4, new Bundle { + val slice = Slice(Vec(p.instructionLanes, new Bundle { val vld = Output(Bool()) val vst = Output(Bool()) val lane = Valid(new VectorInstructionLane) }), true) - val reqvalid = VecInit(io.in(0).valid && io.in(0).ready, - io.in(1).valid && io.in(1).ready, - io.in(2).valid && io.in(2).ready, - io.in(3).valid && io.in(3).ready) - - val reqaddr = VecInit(io.in(0).inst(19,15), - io.in(1).inst(19,15), - io.in(2).inst(19,15), - io.in(3).inst(19,15)) + val reqvalid = VecInit(io.in.map(x => x.valid && x.ready)) + val reqaddr = VecInit(io.in.map(x => x.inst(19,15))) // --------------------------------------------------------------------------- // Response to Decode. 
- for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { io.in(i).ready := !io.out.stall } // --------------------------------------------------------------------------- // Controls. - val vld_o = RegInit(VecInit(Seq.fill(4)(false.B))) - val vld_u = RegInit(VecInit(Seq.fill(4)(false.B))) - val vst_o = RegInit(VecInit(Seq.fill(4)(false.B))) - val vst_u = RegInit(VecInit(Seq.fill(4)(false.B))) - val vst_q = RegInit(VecInit(Seq.fill(4)(false.B))) - val getvl = RegInit(VecInit(Seq.fill(4)(false.B))) - val getmaxvl = RegInit(VecInit(Seq.fill(4)(false.B))) + val vld_o = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B))) + val vld_u = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B))) + val vst_o = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B))) + val vst_u = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B))) + val vst_q = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B))) + val getvl = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B))) + val getmaxvl = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B))) - val rdAddr = Reg(Vec(4, UInt(5.W))) + val rdAddr = Reg(Vec(p.instructionLanes, UInt(5.W))) - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { when (reqvalid(i)) { rdAddr(i) := io.in(i).addr } @@ -134,13 +127,13 @@ // --------------------------------------------------------------------------- // Vector Interface. 
val vvalid = RegInit(false.B) - val vinstValid = RegInit(VecInit(Seq.fill(4)(false.B))) - val vinstInst = Reg(Vec(4, UInt(32.W))) - val nxtVinstValid = Wire(Vec(4, Bool())) + val vinstValid = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B))) + val vinstInst = Reg(Vec(p.instructionLanes, UInt(32.W))) + val nxtVinstValid = Wire(Vec(p.instructionLanes, Bool())) vvalid := nxtVinstValid.asUInt =/= 0.U - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { nxtVinstValid(i) := reqvalid(i) && (io.in(i).op(vinst.VLD) || io.in(i).op(vinst.VST) || io.in(i).op(vinst.VIOP)) @@ -148,7 +141,7 @@ vinstInst(i) := io.in(i).inst } - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { val p = io.in(i).inst(28) // func2 val q = io.in(i).inst(30) // func2 vld_o(i) := reqvalid(i) && io.in(i).op(vinst.VLD) && !p @@ -162,11 +155,11 @@ // --------------------------------------------------------------------------- // Register write port. - val lsuAdder = Wire(Vec(4, UInt(32.W))) - val getvlValue = Wire(Vec(4, UInt(p.vectorCountBits.W))) // bytes - val getmaxvlValue = Wire(Vec(4, UInt(p.vectorCountBits.W))) // bytes + val lsuAdder = Wire(Vec(p.instructionLanes, UInt(32.W))) + val getvlValue = Wire(Vec(p.instructionLanes, UInt(p.vectorCountBits.W))) // bytes + val getmaxvlValue = Wire(Vec(p.instructionLanes, UInt(p.vectorCountBits.W))) // bytes - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { val rs1 = io.rs(2 * i + 0).data val rs2 = io.rs(2 * i + 1).data val m = vinstInst(i)(5) @@ -220,7 +213,7 @@ lsuAdder(i) := rs1 + offset } - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { val len = Wire(UInt(p.vectorCountBits.W)) // bytes val rs1 = io.rs(2 * i + 0).data val rs2 = io.rs(2 * i + 1).data @@ -247,7 +240,7 @@ getmaxvlValue(i) := maxvl } - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { io.rd(i).valid := getvl(i) || getmaxvl(i) || vld_u(i) || vst_u(i) || vst_q(i) io.rd(i).addr := rdAddr(i) @@ -267,7 +260,7 @@ // 
Resolve back-pressure with stall to io.in in decode. assert(!(slice.io.in.valid && !slice.io.in.ready)) - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { slice.io.in.bits(i).vld := vld_o(i) || vld_u(i) slice.io.in.bits(i).vst := vst_o(i) || vst_u(i) || vst_q(i) slice.io.in.bits(i).lane.valid := vinstValid(i) @@ -276,7 +269,7 @@ slice.io.in.bits(i).lane.bits.data := io.rs(2 * i + 1).data } - for (i <- 0 until 4) { + for (i <- 0 until p.instructionLanes) { io.out.lane(i) := slice.io.out.bits(i).lane } @@ -290,8 +283,7 @@ val nempty = RegInit(false.B) // Simple implementation, will overlap downstream units redundantly. - nempty := io.in(0).valid || io.in(1).valid || io.in(2).valid || - io.in(3).valid || vvalid || io.out.valid + nempty := io.in.map(x => x.valid).reduce(_ || _) || vvalid || io.out.valid io.nempty := nempty }
diff --git a/hdl/chisel/src/kelvin/vector/VLd.scala b/hdl/chisel/src/kelvin/vector/VLd.scala index 88b4d8d..bfbda33 100644 --- a/hdl/chisel/src/kelvin/vector/VLd.scala +++ b/hdl/chisel/src/kelvin/vector/VLd.scala
@@ -30,7 +30,7 @@ class VLd(p: Parameters) extends Module { val io = IO(new Bundle { // Instructions. - val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits)))) + val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits)))) // VRegfile. val write = new VRegfileWriteIO(p) @@ -131,7 +131,7 @@ 0.U } - val q = VCmdq(cmdqDepth, new VLdCmdq, Fin, Fout, Factive) + val q = VCmdq(p, cmdqDepth, new VLdCmdq, Fin, Fout, Factive) q.io.in <> io.in
diff --git a/hdl/chisel/src/kelvin/vector/VLdSt.scala b/hdl/chisel/src/kelvin/vector/VLdSt.scala index d2d9853..1aa3ee2 100644 --- a/hdl/chisel/src/kelvin/vector/VLdSt.scala +++ b/hdl/chisel/src/kelvin/vector/VLdSt.scala
@@ -30,7 +30,7 @@ class VLdSt(p: Parameters) extends Module { val io = IO(new Bundle { // Instructions. - val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits)))) + val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits)))) val active = Output(UInt(64.W)) // VRegfile. @@ -180,7 +180,7 @@ active } - val q = VCmdq(cmdqDepth, new VLdStCmdq, Fin, Fout, Factive) + val q = VCmdq(p, cmdqDepth, new VLdStCmdq, Fin, Fout, Factive) q.io.in <> io.in
diff --git a/hdl/chisel/src/kelvin/vector/VRegfile.scala b/hdl/chisel/src/kelvin/vector/VRegfile.scala index ac67ff0..fa75b05 100644 --- a/hdl/chisel/src/kelvin/vector/VRegfile.scala +++ b/hdl/chisel/src/kelvin/vector/VRegfile.scala
@@ -113,13 +113,14 @@ } class VRegfile(p: Parameters) extends Module { - val readPorts = 7 - val writePorts = 6 - val whintPorts = 4 + val readPorts = p.vectorReadPorts + val scalarPorts = p.vectorScalarPorts + val writePorts = p.vectorWritePorts + val whintPorts = p.vectorWhintPorts val io = IO(new Bundle { val read = Vec(readPorts, Flipped(new VRegfileReadIO(p))) - val scalar = Vec(readPorts / 3, Flipped(new VRegfileScalarIO(p))) + val scalar = Vec(scalarPorts, Flipped(new VRegfileScalarIO(p))) val write = Vec(writePorts, Flipped(new VRegfileWrintIO(p))) val whint = Vec(whintPorts, Flipped(new VRegfileWhintIO(p))) val conv = Flipped(new VRegfileConvIO(p))
diff --git a/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala b/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala index 90a4935..38451d7 100644 --- a/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala +++ b/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala
@@ -21,8 +21,8 @@ import _root_.circt.stage.ChiselStage class VRegfileSegment(p: Parameters) extends Module { - val readPorts = 7 - val writePorts = 6 + val readPorts = p.vectorReadPorts + val writePorts = p.vectorWritePorts val tcnt = 16.min(p.vectorBits / 32) val io = IO(new Bundle {
diff --git a/hdl/chisel/src/kelvin/vector/VSt.scala b/hdl/chisel/src/kelvin/vector/VSt.scala index f730fec..638f709 100644 --- a/hdl/chisel/src/kelvin/vector/VSt.scala +++ b/hdl/chisel/src/kelvin/vector/VSt.scala
@@ -30,7 +30,7 @@ class VSt(p: Parameters) extends Module { val io = IO(new Bundle { // Instructions. - val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits)))) + val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits)))) val active = Output(UInt(64.W)) // VRegfile. @@ -182,7 +182,7 @@ val strb = UInt((p.lsuDataBits / 8).W) } - val q = VCmdq(cmdqDepth, new VStCmdq, Fin, Fout, Factive) + val q = VCmdq(p, cmdqDepth, new VStCmdq, Fin, Fout, Factive) val ctrl = Slice(new Ctrl, false, true) val data = Slice(new Data, false, true, true)
diff --git a/tests/verilator_sim/kelvin/core_tb.cc b/tests/verilator_sim/kelvin/core_tb.cc index 969e361..73396ab 100644 --- a/tests/verilator_sim/kelvin/core_tb.cc +++ b/tests/verilator_sim/kelvin/core_tb.cc
@@ -195,14 +195,14 @@ core.io_slog_addr(io_slog_addr); core.io_slog_data(io_slog_data); core.io_debug_en(io_debug_en); - core.io_debug_addr0(io_debug_addr0); - core.io_debug_addr1(io_debug_addr1); - core.io_debug_addr2(io_debug_addr2); - core.io_debug_addr3(io_debug_addr3); - core.io_debug_inst0(io_debug_inst0); - core.io_debug_inst1(io_debug_inst1); - core.io_debug_inst2(io_debug_inst2); - core.io_debug_inst3(io_debug_inst3); + core.io_debug_addr_0(io_debug_addr0); + core.io_debug_addr_1(io_debug_addr1); + core.io_debug_addr_2(io_debug_addr2); + core.io_debug_addr_3(io_debug_addr3); + core.io_debug_inst_0(io_debug_inst0); + core.io_debug_inst_1(io_debug_inst1); + core.io_debug_inst_2(io_debug_inst2); + core.io_debug_inst_3(io_debug_inst3); core.io_debug_cycles(io_debug_cycles); mif.clock(tb.clock);