Initial commit of overview.md - Also clean up some chisel Change-Id: I7bb97991aa36f6cf127092a53c2f7ff661d0e9f0
diff --git a/README.md b/README.md index b697d08..79de965 100644 --- a/README.md +++ b/README.md
@@ -2,6 +2,11 @@ Kelvin is a RISC-V32IM core with a custom instruction set. + + +More information on the design can be found in the +[overview](doc/overview.md). + ## Building Kelvin uses [bazel](https://bazel.build/) as its build system. The Verilated
diff --git a/doc/images/arch.png b/doc/images/arch.png new file mode 100644 index 0000000..0483cf5 --- /dev/null +++ b/doc/images/arch.png Binary files differ
diff --git a/doc/images/mac.png b/doc/images/mac.png new file mode 100644 index 0000000..775b37e --- /dev/null +++ b/doc/images/mac.png Binary files differ
diff --git a/doc/images/simd.png b/doc/images/simd.png new file mode 100644 index 0000000..b3db5dd --- /dev/null +++ b/doc/images/simd.png Binary files differ
diff --git a/doc/overview.md b/doc/overview.md new file mode 100644 index 0000000..b741e3f --- /dev/null +++ b/doc/overview.md
@@ -0,0 +1,110 @@ +# Kelvin + +Kelvin is a RISC-V CPU with custom SIMD instructions and microarchitectural +decisions aligned with the dataplane properties of an ML accelerator. Kelvin +starts with domain and matrix capabilities and then adds vector and scalar +capabilities for a fused design. + +## Block Diagram + + + +## Scalar Core + +A simple RISC-V scalar frontend drives the command queues of the ML+SIMD +backend. + +Kelvin utilizes a custom RISC-V frontend (rv32im) that runs the minimal set of +instructions to support an executor run-to-completion model (eg. no OS, no +interrupts), with all control tasks onloaded to the SMC. The C extension +encoding is reclaimed (as per the risc-v specification) to provide the necessary +encoding space for the SIMD registers (6b indices), and to allow flexible type +encodings and instruction compression (stripmining) for the SIMD instruction +set. The scalar core is an in-order machine with no speculation. + +The branch policy in the fetch stage is backwards branches are taken and forward +branches are not-taken, incurring a penalty cycle if the execute result does not +match the decision in the fetch unit. + +## Vector Core + + + +We use SIMD and vector interchangeably, referring to a simple and practical SIMD +instruction definition devoid of variable length behaviors. The scalar frontend +is decoupled from the backend by a Fifo structure that buffers vector +instructions, posting only to the relevant command queues when dependencies are +resolved in the vector regfile. + +### MAC + +The central component of the design is a quantized outer product +multiply-accumulate engine. An outer-product engine provides two-dimensional +broadcast structures to maximize the amount of deliverable compute with respect +to memory accesses. On one axis is a parallel broadcast (“wide”, convolution +weights), and the other axis the transpose shifted inputs of a number of batches +(“narrow”, eg. MobileNet XY batching). 
+ + + +The outer-product construction is a vertical arrangement of multiple VDOT +opcodes which utilize 4x 8bit multiplies reduced into 32 bit accumulators. + +### Stripmining + +Strip mining is defined as folding array-based parallelism to fit the available +hardware parallelism. To reduce frontend instruction dispatch pressure becoming +a bottleneck, and to natively support instruction level tiling patterns through +the SIMD registers, the instruction encoding shall explicitly include a +stripmine mechanism that converts a single frontend dispatch event to the +command queue into four serialized issue events into the SIMD units. For +instance a “vadd v0” in Dispatch will produce “vadd v0 : vadd v1 : vadd v2 : +vadd v3” at Issue. These will be processed as four discrete events. + +## Registers + +There are 4 distinct register types. + +Registers | Names | Width +---------------- | ------------- | ----------------------- +Scalar (31) | zero, x1..x31 | 32 bits +Vector (64) | v0..v63 | 256 bits (eg. int32 x8) +Accumulator | acc<8><8> | 8x8x 32 bits +Control & Status | CSRx | Various + +## Cache + +Caches exist as a single layer between the core and the first level of shared +SRAM. The L1 cache and scalar core frontend are an overhead to the rest of the +backend compute pipeline and ideally are as small as possible. + +The L1Icache is 8KB (256b blocks * 256 slots) with 4-way set associativity. + +The L1Dcache sizing is towards the scalar core requirements to perform loop +management and address generation. The L1Dcache is 16KB (SIMD256b) with low set +associativity of 4-way. The L1Dcache is implemented with a dual bank +architecture where each bank is 8KB (similar to L1Icache). This property allows +for a degree of next line prefetch. The L1Dcache also serves as an alignment +buffer for the scalar and SIMD instructions to assist development and to +simplify software support. 
In an embedded setting, the L1Dcache provides half of +the memory bandwidth to the ML outer-product engine when only a single external +memory port is provided. Line and all-entry flushing is supported where the core +stalls until completion to simplify the contract. + +A shared VLdSt unit exists for cached accesses. + +## Uncached + +Note: It is not recommended to use intentional uncached accesses as +`mmap_uncached` has been seen to be buggy. + +Memory may be accessed as uncached through the setting of a high address bit. +This is for simple fine-grained control over how load/store units are to access +memory directly or through the L1 cache. We only allow aligned accesses of +native register size (eg. scalar=32b, simd=256b) via uncached accesses direct to +memory. This simplifies the hardware which is required to support a large window +of outstanding read operations, but does impose complications on the software. +The code must assume C `__restrict__` attributes for any memory accessed in this +way. + +Separate VLd and VSt units exist for uncached accesses.
diff --git a/hdl/chisel/src/common/Fifo4.scala b/hdl/chisel/src/common/Fifo4.scala index 0c277a1..952ff33 100644 --- a/hdl/chisel/src/common/Fifo4.scala +++ b/hdl/chisel/src/common/Fifo4.scala
@@ -127,16 +127,6 @@ in1pos === i.U && in1valid(1), in0pos === i.U && in0valid(0)) - // Couldn't get the following to work properly. - // - // val data = MuxOR(valid(0), io.in.bits(0).bits.asUInt) | - // MuxOR(valid(1), io.in.bits(1).bits.asUInt) | - // MuxOR(valid(2), io.in.bits(2).bits.asUInt) | - // MuxOR(valid(3), io.in.bits(3).bits.asUInt) - // - // when (ivalid && valid =/= 0.U) { - // mem(i) := data.asTypeOf(t) - // } when (ivalid) { when (valid(0)) { mem(i) := io.in.bits(0).bits @@ -166,7 +156,6 @@ when (mcount > 0.U) { mslice.io.in.bits := mem(outpos) } .elsewhen (ivalid) { - // As above, couldn't get MuxOR to work. when (iactive(0)) { mslice.io.in.bits := io.in.bits(0).bits } .elsewhen (iactive(1)) {
diff --git a/hdl/chisel/src/common/Fifo4e.scala b/hdl/chisel/src/common/Fifo4e.scala index ac2a484..a298552 100644 --- a/hdl/chisel/src/common/Fifo4e.scala +++ b/hdl/chisel/src/common/Fifo4e.scala
@@ -100,16 +100,6 @@ in1pos === i.U && in1valid(1), in0pos === i.U && in0valid(0)) - // Couldn't get the following to work properly. - // - // val data = MuxOR(valid(0), io.in.bits(0).bits.asUInt) | - // MuxOR(valid(1), io.in.bits(1).bits.asUInt) | - // MuxOR(valid(2), io.in.bits(2).bits.asUInt) | - // MuxOR(valid(3), io.in.bits(3).bits.asUInt) - // - // when (ivalid && valid =/= 0.U) { - // mem(i) := data.asTypeOf(t) - // } when (ivalid) { when (valid(0)) { mem(i) := io.in.bits(0).bits
diff --git a/hdl/chisel/src/kelvin/L1DCache.scala b/hdl/chisel/src/kelvin/L1DCache.scala index 13e61a1..4c0d8d0 100644 --- a/hdl/chisel/src/kelvin/L1DCache.scala +++ b/hdl/chisel/src/kelvin/L1DCache.scala
@@ -262,7 +262,7 @@ // 2^8 * 256 / 8 = 8KiB 4-way Tag[31,12] + Index[11,6] + Data[5,0] val slots = p.l1dslots val slotBits = log2Ceil(slots) - val assoc = 4 // 2, 4, 8, 16, slots + val assoc = 4 val sets = slots / assoc val setLsb = log2Ceil(p.lsuDataBits / 8) val setMsb = log2Ceil(sets) + setLsb - 1 @@ -342,7 +342,6 @@ val valid = RegInit(VecInit(Seq.fill(slots)(false.B))) val dirty = RegInit(VecInit(Seq.fill(slots)(false.B))) val camaddr = Reg(Vec(slots, UInt(32.W))) - // val mem = Mem1RWM(slots, p.lsuDataBits * 9 / 8, 9) val mem = Module(new Sram_1rwm_256x288()) val history = Reg(Vec(slots / assoc, Vec(assoc, UInt(log2Ceil(assoc).W))))
diff --git a/hdl/chisel/src/kelvin/L1ICache.scala b/hdl/chisel/src/kelvin/L1ICache.scala index 55cc19c..135bb6c 100644 --- a/hdl/chisel/src/kelvin/L1ICache.scala +++ b/hdl/chisel/src/kelvin/L1ICache.scala
@@ -26,13 +26,13 @@ class L1ICache(p: Parameters) extends Module { // A relatively simple cache block. Only one transaction may post at a time. - // 2^8 * 256 / 8 = 8KiB 4-way Tag[31,12] + Index[11,6] + Data[5,0] + // 2^8 * 256 / 8 = 8KiB 4-way Tag[31,11] + Index[10,5] + Data[4,0] assert(p.axi0IdBits == 4) assert(p.axi0DataBits == 256) val slots = p.l1islots val slotBits = log2Ceil(slots) - val assoc = 4 // 2, 4, 8, 16, slots + val assoc = 4 val sets = slots / assoc val setLsb = log2Ceil(p.fetchDataBits / 8) val setMsb = log2Ceil(sets) + setLsb - 1 @@ -71,7 +71,6 @@ // CAM state. val valid = RegInit(VecInit(Seq.fill(slots)(false.B))) val camaddr = Reg(Vec(slots, UInt(32.W))) - // val mem = Mem1RW(slots, UInt(p.axi0DataBits.W)) val mem = Module(new Sram_1rw_256x256()) val history = Reg(Vec(slots / assoc, Vec(assoc, UInt(log2Ceil(assoc).W))))
diff --git a/hdl/chisel/src/kelvin/Parameters.scala b/hdl/chisel/src/kelvin/Parameters.scala index b9e3d77..bb8a149 100644 --- a/hdl/chisel/src/kelvin/Parameters.scala +++ b/hdl/chisel/src/kelvin/Parameters.scala
@@ -25,7 +25,7 @@ } // Vector Length (register-file and compute). - // 128 = faster builds, but not production(?). + // 128 = faster builds, but not production. val vectorBits = sys.env.get("KELVIN_SIMD").getOrElse("256").toInt assert(vectorBits == 512 || vectorBits == 256 || vectorBits == 128) @@ -46,9 +46,7 @@ val vectorFifoDepth = 16 // L0ICache Fetch unit. - // val fetchCacheBytes = 2048 val fetchCacheBytes = 1024 - // val fetchCacheBytes = 128 // Scalar Core Fetch bus. val fetchAddrBits = 32 // do not change
diff --git a/hdl/chisel/src/kelvin/scalar/Alu.scala b/hdl/chisel/src/kelvin/scalar/Alu.scala index 445dacc..06818c1 100644 --- a/hdl/chisel/src/kelvin/scalar/Alu.scala +++ b/hdl/chisel/src/kelvin/scalar/Alu.scala
@@ -79,31 +79,10 @@ op := io.req.op } - // val rs1 = MuxOR(valid, io.rs1.data) - // val rs2 = MuxOR(valid, io.rs2.data) val rs1 = io.rs1.data val rs2 = io.rs2.data val shamt = rs2(4,0) - // TODO: should we be masking like this for energy? - // TODO: a single addsub for add/sub/slt/sltu - // val add = MuxOR(op(alu.ADD), rs1) + MuxOR(op(alu.ADD), rs2) - // val sub = MuxOR(op(alu.SUB), rs1) - MuxOR(op(alu.SUB), rs2) - // val sll = MuxOR(op(alu.SLL), rs1) << MuxOR(op(alu.SLL), shamt) - // val srl = MuxOR(op(alu.SRL), rs1) >> MuxOR(op(alu.SRL), shamt) - // val sra = (MuxOR(op(alu.SRA), rs1.asSInt, 0.S) >> MuxOR(op(alu.SRA), shamt)).asUInt - // val slt = MuxOR(op(alu.SLT), rs1.asSInt, 0.S) < MuxOR(op(alu.SLT), rs2.asSInt, 0.S) - // val sltu = MuxOR(op(alu.SLTU), rs1) < MuxOR(op(alu.SLTU), rs2) - // val and = MuxOR(op(alu.AND), rs1) & MuxOR(op(alu.AND), rs2) - // val or = MuxOR(op(alu.OR), rs1) | MuxOR(op(alu.OR), rs2) - // val xor = MuxOR(op(alu.XOR), rs1) ^ MuxOR(op(alu.XOR), rs2) - // val lui = MuxOR(op(alu.LUI), rs2) - // val clz = MuxOR(op(alu.CLZ), CLZ(rs1)) - // val ctz = MuxOR(op(alu.CTZ), CTZ(rs1)) - // val pcnt = MuxOR(op(alu.PCNT), PopCount(rs1)) - - // io.rd.data := add | sub | sll | srl | sra | slt | sltu | and | or | xor | lui - io.rd.valid := valid io.rd.addr := addr io.rd.data := MuxOR(op(alu.ADD), rs1 + rs2) |
diff --git a/hdl/chisel/src/kelvin/scalar/Decode.scala b/hdl/chisel/src/kelvin/scalar/Decode.scala index 48f3622..9c7abf4 100644 --- a/hdl/chisel/src/kelvin/scalar/Decode.scala +++ b/hdl/chisel/src/kelvin/scalar/Decode.scala
@@ -12,6 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. + +// Decode: Contains decode logic to be forwarded to the appropriate functional +// block. A serialization mechanism is introduced to stall a decoded instruction +// from being presented to the functional block until next cycle if the block has +// already been presented with an instruction from another decoder. + package kelvin import chisel3._
diff --git a/hdl/chisel/src/kelvin/scalar/Fetch.scala b/hdl/chisel/src/kelvin/scalar/Fetch.scala index 722c428..fb951e1 100644 --- a/hdl/chisel/src/kelvin/scalar/Fetch.scala +++ b/hdl/chisel/src/kelvin/scalar/Fetch.scala
@@ -12,6 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. + +// Fetch Unit: 4 way fetcher that directly feeds the 4 decoders. +// The fetcher itself has a partial decoder to identify branches, where backwards +// branches are assumed taken and forward branches assumed not taken. + package kelvin import chisel3._
diff --git a/hdl/chisel/src/kelvin/scalar/Regfile.scala b/hdl/chisel/src/kelvin/scalar/Regfile.scala index c594fcb..a3e9d84 100644 --- a/hdl/chisel/src/kelvin/scalar/Regfile.scala +++ b/hdl/chisel/src/kelvin/scalar/Regfile.scala
@@ -12,6 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Regfile: 32 entry scalar register file with 8 read ports and 6 +// write ports. Houses a global scoreboard that informs of interlock +// deps inside the decoders. + package kelvin import chisel3._ @@ -92,11 +96,8 @@ } }) - // 8R6W - // 8 read ports - // 6 write ports - // The scalar registers, integer (and float todo). + // The scalar registers. val regfile = Reg(Vec(32, UInt(32.W))) // ***************************************************************************
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala index 07ef6ad..b380584 100644 --- a/hdl/chisel/src/kelvin/scalar/SCore.scala +++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. + +// Scalar Core Frontend package kelvin import chisel3._
diff --git a/hdl/chisel/src/kelvin/vector/VCore.scala b/hdl/chisel/src/kelvin/vector/VCore.scala index 7e4cb00..029900a 100644 --- a/hdl/chisel/src/kelvin/vector/VCore.scala +++ b/hdl/chisel/src/kelvin/vector/VCore.scala
@@ -26,12 +26,6 @@ } } -// object VCore { -// def apply(p: Parameters): VCoreEmpty = { -// return Module(new VCoreEmpty(p)) -// } -// } - class VCoreIO(p: Parameters) extends Bundle { // Decode cycle. val vinst = Vec(4, new VInstIO) @@ -310,51 +304,3 @@ vld.io.nempty || vst.io.nempty } -class VCoreEmpty(p: Parameters) extends Module { - val io = IO(new Bundle { - // Score <> VCore - val score = new VCoreIO(p) - - // Data bus interface. - val dbus = new DBusIO(p) - val last = Output(Bool()) - - // AXI interface. - val ld = new AxiMasterReadIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits) - val st = new AxiMasterWriteIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits) - }) - - io.score.undef := io.score.vinst(0).valid || io.score.vinst(1).valid || - io.score.vinst(2).valid || io.score.vinst(3).valid - - io.score.mactive := false.B - - io.dbus.valid := false.B - io.dbus.write := false.B - io.dbus.size := 0.U - io.dbus.addr := 0.U - io.dbus.adrx := 0.U - io.dbus.wdata := 0.U - io.dbus.wmask := 0.U - io.last := false.B - - for (i <- 0 until 4) { - io.score.vinst(i).ready := true.B - io.score.rd(i).valid := false.B - io.score.rd(i).addr := 0.U - io.score.rd(i).data := 0.U - } - - io.ld.addr.valid := false.B - io.ld.addr.bits.addr := 0.U - io.ld.addr.bits.id := 0.U - io.ld.data.ready := false.B - - io.st.addr.valid := false.B - io.st.addr.bits.addr := 0.U - io.st.addr.bits.id := 0.U - io.st.data.valid := false.B - io.st.data.bits.data := 0.U - io.st.data.bits.strb := 0.U - io.st.resp.ready := false.B -}