// blob: ac67ff0f2b8573cfc3f3f847486be9dffd88de78 [file] [log] [blame]
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package kelvin
import chisel3._
import chisel3.util._
import common._
import _root_.circt.stage.ChiselStage
// Factory for the register file: builds a VRegfile from the given parameters
// and wraps it in Module(...), so call sites can simply write VRegfile(p).
object VRegfile {
  def apply(p: Parameters): VRegfile = Module(new VRegfile(p))
}
// Read port bundle, declared from the requester's point of view: the request
// fields (valid, addr, tag) are outputs and the read data is an input.
class VRegfileReadIO(p: Parameters) extends Bundle {
  val valid = Output(Bool())
  val addr  = Output(UInt(6.W))            // 6-bit address
  val tag   = Output(UInt(1.W))            // single tag bit
  val data  = Input(UInt(p.vectorBits.W))  // one full vector, p.vectorBits wide
}
// Read port bundle with a ready input in addition to the fields carried by
// the plain read port, plus a stall output intended for the testbench.
class VRegfileReadHsIO(p: Parameters) extends Bundle {
  val valid = Output(Bool())
  val ready = Input(Bool())                // handshake
  val stall = Output(Bool())               // Testbench signal.
  val addr  = Output(UInt(6.W))            // 6-bit address
  val tag   = Output(UInt(1.W))            // single tag bit
  val data  = Input(UInt(p.vectorBits.W))  // one full vector, p.vectorBits wide
}
// Scalar operand bundle: a valid flag accompanied by a 32-bit value.
// The parameter object is accepted but not consulted by this bundle.
class VRegfileScalarIO(p: Parameters) extends Bundle {
  val valid = Output(Bool())
  val data  = Output(UInt(32.W))
}
// Transpose port bundle. The number of 32-bit words returned per access
// (tcnt) is the vector width in words, capped at 16; the index and data
// widths are derived from it.
class VRegfileTransposeIO(p: Parameters) extends Bundle {
  val valid = Output(Bool())
  // Elaboration-time constant, not a hardware field.
  val tcnt  = math.min(16, p.vectorBits / 32)
  val addr  = Output(UInt(6.W))
  val index = Output(UInt(log2Ceil(tcnt).W))
  val data  = Input(UInt((tcnt * 32).W))
}
// Direction-less write payload: a 6-bit address paired with one vector of
// data. Used where the payload is held in wires or registers.
class VRegfileWrite(p: Parameters) extends Bundle {
  val addr = UInt(6.W)
  val data = UInt(p.vectorBits.W)
}
// Write port bundle from the writer's point of view: valid, a 6-bit address
// and one vector of data, all driven as outputs.
class VRegfileWriteIO(p: Parameters) extends Bundle {
  val valid = Output(Bool())
  val addr  = Output(UInt(6.W))
  val data  = Output(UInt(p.vectorBits.W))
}
// Write port bundle with a ready input alongside the valid/addr/data
// outputs of the plain write port.
class VRegfileWriteHsIO(p: Parameters) extends Bundle {
  val valid = Output(Bool())
  val ready = Input(Bool())  // handshake used in arbitration logic
  val addr  = Output(UInt(6.W))
  val data  = Output(UInt(p.vectorBits.W))
}
// Write internal.
// Carries valid, a 6-bit address and one vector of data, all as outputs.
// This is the bundle type of the register file's own write ports.
class VRegfileWrintIO(p: Parameters) extends Bundle {
  val valid = Output(Bool())
  val addr  = Output(UInt(6.W))
  val data  = Output(UInt(p.vectorBits.W))
}
// Write internal.
// Address-only companion to the internal write port: a valid flag and a
// 6-bit address, with no data payload.
class VRegfileWhintIO(p: Parameters) extends Bundle {
  val valid = Output(Bool())
  val addr  = Output(UInt(6.W))
}
// Convolution control bundle: valid/ready framing, a group of operation
// strobes, two register addresses, and the per-operation parameters
// (mode, index, biases and sign flags). Every field is an output.
class VRegfileConvIO(p: Parameters) extends Bundle {
  val valid = Output(Bool())  // registered signal suitable for mux control
  val ready = Output(Bool())  // combinational from scheduling logic

  // Operation select strobes.
  val op = new Bundle {
    val conv = Output(Bool())  // convolution to accum
    val init = Output(Bool())  // set accum
    val tran = Output(Bool())  // transpose to accum
    val wclr = Output(Bool())  // write accum to vreg and clear accum
  }

  val addr1 = Output(UInt(6.W))  // narrow: transpose
  val addr2 = Output(UInt(6.W))  // wide: internal
  val mode  = Output(UInt(2.W))
  // Wide enough to select one 32-bit word of a vector.
  val index = Output(UInt(log2Ceil(p.vectorBits / 32).W))
  val abias = Output(UInt(9.W))
  val bbias = Output(UInt(9.W))
  val asign = Output(Bool())
  val bsign = Output(Bool())
}
// Scoreboard bundle: a valid-qualified 128-bit set mask going one way and
// the 128-bit scoreboard state coming back the other way.
class VRegfileScoreboardIO extends Bundle {
  // 64 registers sequenced from even/odd tags.
  val set  = Valid(UInt(128.W))
  val data = Input(UInt(128.W))
}
// Vector register file top level.
//
// Instantiates one VRegfileSegment per 32-bit word of a vector
// (p.vectorBits / 32 segments) together with a VConvAlu, and wires up:
//   - the write ports, which are registered for one cycle before they reach
//     the segments,
//   - the read ports, with forwarding from both write stages and a scalar
//     broadcast path on read ports 1 and 4,
//   - the convolution and transpose ports,
//   - a 128-bit scoreboard register.
class VRegfile(p: Parameters) extends Module {
  val readPorts = 7
  val writePorts = 6
  val whintPorts = 4

  val io = IO(new Bundle {
    val read = Vec(readPorts, Flipped(new VRegfileReadIO(p)))
    val scalar = Vec(readPorts / 3, Flipped(new VRegfileScalarIO(p)))
    val write = Vec(writePorts, Flipped(new VRegfileWrintIO(p)))
    val whint = Vec(whintPorts, Flipped(new VRegfileWhintIO(p)))
    val conv = Flipped(new VRegfileConvIO(p))
    val transpose = Flipped(new VRegfileTransposeIO(p))
    val vrfsb = Flipped(new VRegfileScoreboardIO)
    // Number of write ports with valid asserted in the current cycle.
    val vrfwriteCount = Output(UInt(3.W))
  })

  // Number of 32-bit words per vector, and the bits needed to index them.
  val segcnt = p.vectorBits / 32
  val segcntBits = log2Ceil(segcnt)

  // ---------------------------------------------------------------------------
  // Register file storage.
  // One segment module per 32-bit word of the vector.
  val vreg = for (i <- 0 until segcnt) yield {
    Module(new VRegfileSegment(p))
  }

  // ---------------------------------------------------------------------------
  // Convolution unit.
  val vconv = VConvAlu(p)

  // ---------------------------------------------------------------------------
  // Assert state.
  // writeCurr is a 64-bit mask with a bit set for every address written this
  // cycle; writePrev is the same mask delayed by one cycle. Both are only
  // consumed by the assertions further down.
  // NOTE(review): MuxOR/VecOR come from `common`; MuxOR presumably yields its
  // second argument when the first is true and zero otherwise - confirm.
  val writeCurr = Wire(UInt(64.W))
  val writePrev = RegInit(0.U(64.W))
  val writeSet = Wire(Vec(writePorts, UInt(64.W)))

  for (i <- 0 until writePorts) {
    writeSet(i) := MuxOR(io.write(i).valid, 1.U << io.write(i).addr)
  }

  writeCurr := VecOR(writeSet)
  writePrev := writeCurr

  // ---------------------------------------------------------------------------
  // Write port interface and registration.
  // writevalid/writebits mirror the incoming ports; the *Reg versions hold
  // them for one cycle before they are presented to the segments.
  val writevalidBool = Wire(Vec(writePorts, Bool()))
  val writevalid = writevalidBool.asUInt
  val writebits = Wire(Vec(writePorts, new VRegfileWrite(p)))
  val writevalidReg = RegInit(0.U(writePorts.W))
  val writebitsReg = Reg(Vec(writePorts, new VRegfileWrite(p)))

  for (i <- 0 until writePorts) {
    writevalidBool(i) := io.write(i).valid
    writebits(i).addr := io.write(i).addr
    writebits(i).data := io.write(i).data
  }

  writevalidReg := writevalid

  // The payload registers only update when their port is valid.
  for (i <- 0 until writePorts) {
    when (io.write(i).valid) {
      writebitsReg(i).addr := io.write(i).addr
      writebitsReg(i).data := io.write(i).data
    }
  }

  // Count every valid write port, not just port 0. PopCount of the 6-bit
  // valid vector is 3 bits wide, matching the output.
  io.vrfwriteCount := PopCount(writevalid)

  // ---------------------------------------------------------------------------
  // Write ports.
  // Segment j receives bits [32j+31 : 32j] of each registered write.
  for (i <- 0 until writePorts) {
    for (j <- 0 until segcnt) {
      vreg(j).io.write(i).valid := writevalidReg(i)
      vreg(j).io.write(i).addr := writebitsReg(i).addr
      vreg(j).io.write(i).data := writebitsReg(i).data(32 * j + 31, 32 * j)
    }
  }

  // ---------------------------------------------------------------------------
  // Read ports.
  val readData = Reg(Vec(readPorts, UInt(p.vectorBits.W)))

  // Returns (valid, data) for the scalar path of read port `i`.
  // Ports 1 and 4 take their scalar from io.scalar(i / 3), i.e. scalar ports
  // 0 and 1; every other port gets a constant invalid/zero. The 32-bit scalar
  // is replicated into every 32-bit lane of the vector.
  def ReadScalar(i: Int): (Bool, UInt) = {
    val valid = Wire(Bool())
    val scalar = Wire(UInt(32.W))

    if (i == 1 || i == 4) {
      valid := io.scalar(i / 3).valid
      scalar := io.scalar(i / 3).data
    } else {
      valid := false.B
      scalar := 0.U
    }

    val lanes = p.vectorBits / 32
    val values = Wire(Vec(lanes, UInt(32.W)))
    for (lane <- 0 until lanes) {
      values(lane) := scalar
    }

    val result = values.asUInt
    // Elaboration-time width check.
    assert(result.getWidth == p.vectorBits)
    (valid, result)
  }

  // Raw read data from the segments, one 32-bit word per segment.
  val rdata = Wire(Vec(readPorts, Vec(segcnt, UInt(32.W))))

  for (i <- 0 until readPorts) {
    for (j <- 0 until segcnt) {
      vreg(j).io.read(i).addr := io.read(i).addr
      rdata(i)(j) := vreg(j).io.read(i).data
    }
  }

  for (i <- 0 until readPorts) {
    // Forwarding of internal write-staging registers.
    // f1: address match against the writes arriving this cycle.
    val f1validBits = Wire(Vec(writePorts, Bool()))
    val f1valid = f1validBits.asUInt
    assert(PopCount(f1valid) <= 1.U)

    // f2: address match against the registered (previous-cycle) writes.
    val f2validBits = Wire(Vec(writePorts, Bool()))
    val f2valid = f2validBits.asUInt
    assert(PopCount(f2valid) <= 1.U)

    for (j <- 0 until writePorts) {
      f1validBits(j) := writevalid(j) &&
                        writebits(j).addr === io.read(i).addr
    }

    for (j <- 0 until writePorts) {
      f2validBits(j) := writevalidReg(j) &&
                        writebitsReg(j).addr === io.read(i).addr
    }

    val f1dataBits = Wire(Vec(writePorts, UInt(p.vectorBits.W)))
    val f1data = VecOR(f1dataBits, writePorts)

    for (j <- 0 until writePorts) {
      f1dataBits(j) := MuxOR(f1valid(j), writebits(j).data)
    }

    val f2dataBits = Wire(Vec(writePorts, UInt(p.vectorBits.W)))
    val f2data = VecOR(f2dataBits, writePorts)

    for (j <- 0 until writePorts) {
      f2dataBits(j) := MuxOR(f2valid(j), writebitsReg(j).data)
    }

    val (scalarValid, scalarData) = ReadScalar(i)

    // Source select, highest priority first:
    //   sel(3) scalar, sel(2) current-cycle write, sel(1) registered write,
    //   sel(0) register file array.
    val sel = Cat(scalarValid,
                  !scalarValid && f1valid =/= 0.U,
                  !scalarValid && f1valid === 0.U && f2valid =/= 0.U,
                  !scalarValid && f1valid === 0.U && f2valid === 0.U)
    assert(PopCount(sel) <= 1.U)

    val data = MuxOR(sel(3), scalarData) |
               MuxOR(sel(2), f1data) |
               MuxOR(sel(1), f2data) |
               MuxOR(sel(0), rdata(i).asUInt)

    // Ports 1 and 4 also capture on a scalar request; a vector read and a
    // scalar request on the same port in the same cycle is asserted against.
    val rvalid =
      if (i == 1 || i == 4) {
        assert(!(io.read(i).valid && io.scalar(i / 3).valid))
        io.read(i).valid || io.scalar(i / 3).valid
      } else {
        io.read(i).valid
      }

    when (rvalid) {
      readData(i) := data
    }
  }

  // Read data is returned from the capture registers.
  for (i <- 0 until readPorts) {
    io.read(i).data := readData(i)
  }

  // ---------------------------------------------------------------------------
  // Conv port.
  val convConv = RegInit(false.B)
  val convInit = RegInit(false.B)
  val convTran = RegInit(false.B)
  val convClear = RegInit(false.B)
  val convIndex = Reg(UInt(log2Ceil(p.vectorBits / 32).W))
  val convAbias = Reg(UInt(9.W))
  val convBbias = Reg(UInt(9.W))
  val convAsign = Reg(Bool())
  val convBsign = Reg(Bool())
  val internalData = Reg(UInt(p.vectorBits.W))

  // io.conv.valid controls read multiplexors
  // io.conv.ready frames data phase readiness
  // The op strobes are registered, so they are seen one cycle after the
  // valid && ready request.
  convConv := io.conv.valid && io.conv.ready && io.conv.op.conv
  convInit := io.conv.valid && io.conv.ready && io.conv.op.init
  convTran := io.conv.valid && io.conv.ready && io.conv.op.tran
  convClear := io.conv.valid && io.conv.ready && io.conv.op.wclr
  convIndex := io.conv.index

  // An accepted request must select exactly one operation.
  assert(!(io.conv.valid && io.conv.ready) ||
         PopCount(Cat(io.conv.op.conv, io.conv.op.wclr, io.conv.op.init, io.conv.op.tran)) === 1.U)

  // Wide internal read, addressed by addr2 and assembled from all segments.
  val idata = Wire(Vec(segcnt, UInt(32.W)))
  for (i <- 0 until segcnt) {
    idata(i) := vreg(i).io.internal.data
  }

  for (i <- 0 until segcnt) {
    vreg(i).io.internal.addr := io.conv.addr2
  }

  when (io.conv.valid) {
    internalData := idata.asUInt
  }

  when (io.conv.valid) {
    convAbias := io.conv.abias
    convBbias := io.conv.bbias
    convAsign := io.conv.asign
    convBsign := io.conv.bsign
  }

  for (i <- 0 until segcnt) {
    vreg(i).io.conv.valid := convClear
    for (j <- 0 until segcnt) {
      vreg(i).io.conv.data(j) := vconv.io.out(j)(32 * i + 31, 32 * i)  // note index are reversed
    }
  }

  // Note: do not assert if read touches any of the conv read/write registers.
  // Other scheduling mechanisms are used to not advance the opcode.
  val convRead0 = io.conv.valid && io.conv.ready && io.conv.op.conv
  val convClear0 = io.conv.valid && io.conv.ready && io.conv.op.wclr

  assert(!(convRead0 && io.conv.mode =/= 0.U))
  // assert(!(convRead0 && io.conv.addr1(5,4) === 3.U))
  // assert(!(convRead0 && io.conv.addr2(5,4) === 3.U))
  assert(!(convRead0 && io.conv.addr1(3,0) =/= 0.U))
  assert(!(convRead0 && io.conv.addr1(5,2) === io.conv.addr2(5,2) && (p.vectorBits == 128).B))
  assert(!(convRead0 && io.conv.addr1(5,3) === io.conv.addr2(5,3) && (p.vectorBits == 256).B))
  assert(!(convRead0 && io.conv.addr1(5,4) === io.conv.addr2(5,4) && (p.vectorBits == 512).B))

  // Convolution reads must not be under pipelined writes.
  assert(!(convRead0 && writeCurr(io.conv.addr2)))
  assert(!(convRead0 && writePrev(io.conv.addr2)))

  // Mask covering addresses 48..63.
  val convmaska = 0xffff.U << 48.U
  assert(!(convClear0 && (writeCurr & convmaska) =/= 0.U))
  assert(!(convClear0 && (writePrev & convmaska) =/= 0.U))
  // NOTE(review): the note below says the writePrev check is unnecessary, yet
  // the writePrev assertion above is active - confirm which is intended.
  // // Note: writePrev check not needed since accumulator is a cycle after reads.
  // // assert(!(convClear0 && (writePrev & convmaska) =/= 0.U))

  // No write to addresses >= 48 while a clear is requested or in flight.
  for (i <- 0 until writePorts) {
    assert(!((convClear0 || convClear) && io.write(i).valid && io.write(i).addr >= 48.U))
  }

  // ---------------------------------------------------------------------------
  // Convolution.
  vconv.io.op.conv := convConv
  vconv.io.op.init := convInit
  vconv.io.op.tran := convTran
  vconv.io.op.clear := convClear
  vconv.io.index := convIndex
  vconv.io.adata := io.transpose.data
  vconv.io.bdata := internalData
  vconv.io.abias := convAbias
  vconv.io.bbias := convBbias
  vconv.io.asign := convAsign
  vconv.io.bsign := convBsign

  // ---------------------------------------------------------------------------
  // Transpose port.
  // The transpose read path is shared with the convolution unit: when
  // io.conv.valid is set, its addr1/index override the transpose request.
  val transposeData = Reg(UInt(io.transpose.data.getWidth.W))
  val transposeDataMux = Wire(Vec(segcnt, UInt(io.transpose.data.getWidth.W)))

  for (i <- 0 until segcnt) {
    vreg(i).io.transpose.addr := Mux(io.conv.valid, io.conv.addr1, io.transpose.addr)
    transposeDataMux(i) := vreg(i).io.transpose.data
  }

  when (io.conv.valid || io.transpose.valid) {
    val index = Mux(io.conv.valid, io.conv.index, io.transpose.index)
    transposeData := VecAt(transposeDataMux, index)
  }

  io.transpose.data := transposeData

  // Transpose reads must not be under pipelined writes.
  for (i <- 0 until segcnt) {
    assert(!(io.transpose.valid && writeCurr(io.transpose.addr + i.U)))
    assert(!(io.transpose.valid && writePrev(io.transpose.addr + i.U)))
  }

  assert(!(io.transpose.valid && io.conv.valid))
  assert(!(io.transpose.valid && convConv))

  // ---------------------------------------------------------------------------
  // Scoreboard.
  // Builds the scoreboard clear request by recursing over the write ports.
  // Each step ORs in the one-hot address of write port `i` (and of whint port
  // `i` while i < whintPorts), duplicated into both 64-bit halves of the
  // 128-bit mask. The final step adds the convolution clear mask.
  // Returns (any clear requested, 128-bit clear mask).
  def SbClr(valid: Bool = false.B, data: UInt = 0.U(128.W), i: Int = 0): (Bool, UInt) = {
    if (i < writePorts) {
      val wvalid = io.write(i).valid
      val hvalid = if (i < whintPorts) io.whint(i).valid else false.B
      val woh = MuxOR(io.write(i).valid, UIntToOH(io.write(i).addr, 64))
      val hoh = if (i < whintPorts) MuxOR(io.whint(i).valid, UIntToOH(io.whint(i).addr, 64)) else 0.U
      val whoh = woh | hoh
      val whdata = Cat(whoh, whoh)
      assert(whdata.getWidth == 128)
      SbClr(valid || wvalid || hvalid, data | whdata, i + 1)
    } else {
      val cvalid = convClear  // delayed one cycle beyond io.conv.wclr, no forwarding to read ports
      val cdataH = Wire(UInt(16.W))
      val cdata = MuxOR(cvalid, Cat(cdataH, 0.U(48.W), cdataH, 0.U(48.W)))
      assert(cdata.getWidth == 128)
      // NOTE(review): cdataH is only driven for 128/256/512-bit vectors; any
      // other p.vectorBits leaves the wire unconnected - confirm that no
      // other widths are supported.
      if (p.vectorBits == 128) cdataH := 0x000f.U
      if (p.vectorBits == 256) cdataH := 0x00ff.U
      if (p.vectorBits == 512) cdataH := 0xffff.U
      (valid || cvalid, data | cdata)
    }
  }

  val vrfsb = RegInit(0.U(128.W))

  val vrfsbSetEn = io.vrfsb.set.valid
  val vrfsbSet = MuxOR(io.vrfsb.set.valid, io.vrfsb.set.bits)

  val (vrfsbClrEn, vrfsbClr) = SbClr()

  // Clear first, then set: a bit that is both set and cleared in the same
  // cycle ends up set.
  when (vrfsbSetEn || vrfsbClrEn) {
    vrfsb := (vrfsb & ~vrfsbClr) | vrfsbSet
  }

  io.vrfsb.data := vrfsb
}
// Standalone entry point: elaborates a VRegfile built from a
// default-constructed Parameters and emits SystemVerilog, passing the
// command-line arguments through to ChiselStage.
object EmitVRegfile extends App {
  val p = new Parameters
  ChiselStage.emitSystemVerilogFile(
    new VRegfile(p),
    args
  )
}