Kelvin core, with bazel support.
Change-Id: I11ceb466009c1b2e01929327cb946a0f2ab80116
diff --git a/.bazelrc b/.bazelrc
new file mode 100644
index 0000000..5b3d13f
--- /dev/null
+++ b/.bazelrc
@@ -0,0 +1 @@
+build --cxxopt=-std=c++17
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ac51a05
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+bazel-*
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 0000000..fa5bbf9
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,20 @@
+workspace(name="kelvin_hw")
+
+load("//rules:repos.bzl", "kelvin_repos")
+kelvin_repos()
+
+# Scala setup
+load("@io_bazel_rules_scala//:scala_config.bzl", "scala_config")
+scala_config(scala_version = "2.13.6")
+load("@io_bazel_rules_scala//scala:scala.bzl", "rules_scala_setup", "rules_scala_toolchain_deps_repositories")
+rules_scala_setup()
+rules_scala_toolchain_deps_repositories(fetch_sources = True)
+load("@io_bazel_rules_scala//scala:toolchains.bzl", "scala_register_toolchains")
+scala_register_toolchains()
+load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
+rules_proto_dependencies()
+rules_proto_toolchains()
+
+load("//rules:deps.bzl", "kelvin_deps")
+kelvin_deps()
+
diff --git a/external/0001-Update-version-of-Googletest-for-bazel-compatitibili.patch b/external/0001-Update-version-of-Googletest-for-bazel-compatitibili.patch
new file mode 100644
index 0000000..6524ee1
--- /dev/null
+++ b/external/0001-Update-version-of-Googletest-for-bazel-compatitibili.patch
@@ -0,0 +1,27 @@
+From 17ec1b6631933d745a419835b1f88c4fffa5bc40 Mon Sep 17 00:00:00 2001
+From: Derek Chow <derekjchow@google.com>
+Date: Mon, 24 Jul 2023 13:44:30 -0700
+Subject: [PATCH] Update version of Googletest for bazel compatibility.
+
+---
+ .../com_google_googletest/com_google_googletest.bzl | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git dependency_support/com_google_googletest/com_google_googletest.bzl dependency_support/com_google_googletest/com_google_googletest.bzl
+index e57c403..6ba524c 100644
+--- dependency_support/com_google_googletest/com_google_googletest.bzl
++++ dependency_support/com_google_googletest/com_google_googletest.bzl
+@@ -21,7 +21,7 @@ def com_google_googletest():
+ maybe(
+ http_archive,
+ name = "com_google_googletest",
+- urls = ["https://github.com/google/googletest/archive/0eea2e9fc63461761dea5f2f517bd6af2ca024fa.zip"], # 2020-04-30
+- strip_prefix = "googletest-0eea2e9fc63461761dea5f2f517bd6af2ca024fa",
+- sha256 = "9463ff914d7c3db02de6bd40a3c412a74e979e3c76eaa89920a49ff8488d6d69",
++ urls = ["https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip"],
++ strip_prefix = "googletest-1.13.0",
++ sha256 = "ffa17fbc5953900994e2deec164bb8949879ea09b411e07f215bfbb1f87f4632",
+ )
+--
+2.41.0.487.g6d72f3e995-goog
+
diff --git a/external/0002-SystemC-support-for-verilator.patch b/external/0002-SystemC-support-for-verilator.patch
new file mode 100644
index 0000000..7e414a7
--- /dev/null
+++ b/external/0002-SystemC-support-for-verilator.patch
@@ -0,0 +1,32 @@
+From 123df7a8075ee82f5e8988c77bc5e17c06078506 Mon Sep 17 00:00:00 2001
+From: Derek Chow <derekjchow@google.com>
+Date: Mon, 24 Jul 2023 17:09:47 -0700
+Subject: [PATCH 2/2] SystemC support for verilator.
+
+---
+ dependency_support/verilator/verilator.BUILD | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git dependency_support/verilator/verilator.BUILD dependency_support/verilator/verilator.BUILD
+index 371a1dd..3fc5aa1 100644
+--- dependency_support/verilator/verilator.BUILD
++++ dependency_support/verilator/verilator.BUILD
+@@ -200,6 +200,7 @@ cc_library(
+ "include/verilated_imp.h",
+ "include/verilated_syms.h",
+ "include/verilated_vcd_c.cpp",
++ "include/verilated_vcd_sc.cpp",
+ ],
+ hdrs = [
+ "include/verilated.h",
+@@ -215,6 +216,7 @@ cc_library(
+ # Needed for verilated_vcd_c.cpp and verilated_fst_c.cpp
+ "include/verilated_trace_imp.h",
+ "include/verilated_vcd_c.h",
++ "include/verilated_vcd_sc.h",
+ "include/verilatedos.h",
+ "include/verilated_types.h",
+ "include/verilated_funcs.h",
+--
+2.41.0.487.g6d72f3e995-goog
+
diff --git a/external/systemc.BUILD b/external/systemc.BUILD
new file mode 100644
index 0000000..2a48cf8
--- /dev/null
+++ b/external/systemc.BUILD
@@ -0,0 +1,22 @@
+load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
+
+filegroup(
+ name = "all_srcs",
+ srcs = glob(["**"]),
+)
+
+# TODO(derekjchow): Set isystem for systemc headers.
+cmake(
+ name = "systemc",
+ cache_entries = {
+ "CMAKE_CXX_STANDARD": "17",
+ "BUILD_SHARED_LIBS": "False",
+ },
+ generate_args = [
+ "-G Ninja",
+ ],
+ install = True,
+ lib_source = "@accellera_systemc//:all_srcs",
+ out_static_libs = ["libsystemc.a"],
+ visibility = ["//visibility:public"],
+)
diff --git a/hdl/chisel/.scalafmt.conf b/hdl/chisel/.scalafmt.conf
new file mode 100644
index 0000000..3ccfeff
--- /dev/null
+++ b/hdl/chisel/.scalafmt.conf
@@ -0,0 +1,4 @@
+version = "3.6.1"
+maxColumn = 80
+runner.dialect = scala3
+project.git = true
diff --git a/hdl/chisel/BUILD b/hdl/chisel/BUILD
new file mode 100644
index 0000000..8f5a41f
--- /dev/null
+++ b/hdl/chisel/BUILD
@@ -0,0 +1,123 @@
+load("@io_bazel_rules_scala//scala:scala.bzl", "scala_library", "scala_binary")
+load("@kelvin_hw//rules:chisel.bzl", "chisel_binary",
+ "chisel_library",
+ "chisel_cc_library")
+load("@rules_hdl//verilog:providers.bzl", "verilog_library")
+load("@rules_hdl//verilator:defs.bzl", "verilator_cc_library")
+
+chisel_library(
+ name = "common",
+ srcs = glob(["src/common/*.scala"]),
+)
+
+chisel_library(
+ name = "kelvin",
+ srcs = glob(["src/kelvin/**/*.scala"]),
+ deps = [
+ ":common",
+ ]
+)
+
+chisel_cc_library(
+ name = "core_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitCore",
+ module_name = "Core",
+)
+
+chisel_cc_library(
+ name = "dbus2axi_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitDBus2Axi",
+ module_name = "DBus2Axi",
+)
+
+chisel_cc_library(
+ name = "l1dcache_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitL1DCache",
+ module_name = "L1DCache",
+ verilog_deps = [
+ "//hdl/verilog:sram_1rw_256x288",
+ ],
+)
+
+chisel_cc_library(
+ name = "l1icache_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitL1ICache",
+ module_name = "L1ICache",
+ verilog_deps = [
+ "//hdl/verilog:sram_1rw_256x256",
+ ],
+)
+
+chisel_cc_library(
+ name = "vcmdq_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitVCmdq",
+ module_name = "VCmdq",
+)
+
+chisel_cc_library(
+ name = "vconvalu_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitVConvAlu",
+ module_name = "VConvAlu",
+)
+
+chisel_cc_library(
+ name = "vconvctrl_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitVConvCtrl",
+ module_name = "VConvCtrl",
+)
+
+chisel_cc_library(
+ name = "vdecode_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitVDecode",
+ module_name = "VDecode",
+)
+
+chisel_cc_library(
+ name = "vdecodeinstruction_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitVDecodeInstruction",
+ module_name = "VDecodeInstruction",
+)
+
+chisel_cc_library(
+ name = "vldst_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitVLdSt",
+ module_name = "VLdSt",
+)
+
+chisel_cc_library(
+ name = "vld_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitVLd",
+ module_name = "VLd",
+)
+
+chisel_cc_library(
+ name = "vregfile_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitVRegfile",
+ module_name = "VRegfile",
+)
+
+chisel_cc_library(
+ name = "vregfilesegment_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitVRegfileSegment",
+ module_name = "VRegfileSegment",
+)
+
+chisel_cc_library(
+ name = "vst_cc_library",
+ chisel_lib = ":kelvin",
+ emit_class = "kelvin.EmitVSt",
+ module_name = "VSt",
+)
\ No newline at end of file
diff --git a/hdl/chisel/src/common/Fifo.scala b/hdl/chisel/src/common/Fifo.scala
new file mode 100644
index 0000000..f1efd98
--- /dev/null
+++ b/hdl/chisel/src/common/Fifo.scala
@@ -0,0 +1,102 @@
+package common
+
+import chisel3._
+import chisel3.util._
+
+object Fifo {
+ def apply[T <: Data](t: T, n: Int, passReady: Boolean = false) = {
+ Module(new Fifo(t, n, passReady))
+ }
+}
+
+class Fifo[T <: Data](t: T, n: Int, passReady: Boolean) extends Module {
+ val io = IO(new Bundle {
+ val in = Flipped(Decoupled(t))
+ val out = Decoupled(t)
+ val count = Output(UInt(log2Ceil(n+1).W))
+ })
+
+ // An (n-1) queue with a registered output stage.
+ val m = n - 1 // n = Mem(n-1) + Out
+
+ val mem = Mem(m, t)
+ val rdata = Reg(t)
+
+ val rvalid = RegInit(false.B)
+ val wready = RegInit(false.B)
+ val raddr = RegInit(0.U(log2Ceil(m).W))
+ val waddr = RegInit(0.U(log2Ceil(m).W))
+ val count = RegInit(0.U(log2Ceil(n+1).W))
+
+ // ---------------------------------------------------------------------------
+ // Memory Addresses.
+ val winc = io.in.valid && io.in.ready
+ val rinc = (!rvalid || io.out.ready) && (winc || count > 1.U)
+
+ when (winc) {
+ waddr := Mux(waddr === (m - 1).U, 0.U, waddr + 1.U)
+ }
+
+ when (rinc) {
+ raddr := Mux(raddr === (m - 1).U, 0.U, raddr + 1.U)
+ }
+
+ val forward = rinc && winc && count <= 1.U
+
+ // ---------------------------------------------------------------------------
+ // FIFO Control.
+ val ien = io.in.valid && io.in.ready
+ val oen = io.out.valid && io.out.ready
+
+ when (ien && !oen) {
+ count := count + 1.U
+ } .elsewhen (!ien && oen) {
+ count := count - 1.U
+ }
+
+ when (ien) {
+ rvalid := true.B
+ } .elsewhen (io.out.ready && count === 1.U) {
+ rvalid := false.B
+ }
+
+ wready := count < (n - 1).U ||
+ count === (n - 1).U && !(ien && !oen) ||
+ (oen && !ien)
+
+ // ---------------------------------------------------------------------------
+ // Memory.
+ when (winc && !forward) {
+ mem(waddr) := io.in.bits
+ }
+
+ when (forward) {
+ rdata := io.in.bits
+ } .elsewhen (rinc) {
+ rdata := mem(raddr)
+ }
+
+ // ---------------------------------------------------------------------------
+ // Interface.
+ io.out.valid := rvalid
+ io.out.bits := rdata
+
+ if (passReady) {
+ io.in.ready := wready || io.out.ready // pass-through
+ } else {
+ io.in.ready := wready
+ }
+
+ io.count := count
+
+ assert(count <= n.U)
+ assert(!(!passReady.B && io.in.ready && count === n.U))
+}
+
+object EmitFifo extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Fifo(UInt(8.W), 11, false), args)
+}
+
+object EmitFifo_1 extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Fifo(UInt(8.W), 11, true), args)
+}
diff --git a/hdl/chisel/src/common/Fifo4.scala b/hdl/chisel/src/common/Fifo4.scala
new file mode 100644
index 0000000..a434364
--- /dev/null
+++ b/hdl/chisel/src/common/Fifo4.scala
@@ -0,0 +1,189 @@
+package common
+
+import chisel3._
+import chisel3.util._
+
+object Fifo4 {
+ def apply[T <: Data](t: T, n: Int) = {
+ Module(new Fifo4(t, n))
+ }
+}
+
+// 4way decode, used for Fifo4 style input controls.
+object Fifo4Valid {
+ def apply(in: UInt): (UInt, UInt, UInt, UInt) = {
+ assert(in.getWidth == 4)
+
+ val in0 = Cat(in(3,0) === 8.U, // 8
+ in(2,0) === 4.U, // 4, 12
+ in(1,0) === 2.U, // 2, 6, 10, 14
+ in(0)) // 1, 3, 5, 7, 9, 11, 13, 15
+
+ val in1 = Cat(in(3,0) === 12.U ||
+ in(3,0) === 10.U ||
+ in(3,0) === 9.U, // 9, 10, 12
+ in(2,0) === 6.U ||
+ in(2,0) === 5.U, // 5, 6, 13, 14
+ in(1,0) === 3.U, // 3, 7, 11, 15
+ false.B)
+
+ val in2 = Cat(in(3,0) === 14.U ||
+ in(3,0) === 13.U ||
+ in(3,0) === 11.U, // 11, 13, 14
+ in(2,0) === 15.U ||
+ in(2,0) === 7.U, // 7, 15
+ false.B, false.B)
+
+ val in3 = Cat(in(3,0) === 15.U, // 15
+ false.B, false.B, false.B)
+
+ (in0.asUInt, in1.asUInt, in2.asUInt, in3.asUInt)
+ }
+}
+
+class Fifo4[T <: Data](t: T, n: Int) extends Module {
+ val io = IO(new Bundle {
+ val in = Flipped(Decoupled(Vec(4, Valid(t))))
+ val out = Decoupled(t)
+ val count = Output(UInt(log2Ceil(n+1).W))
+ })
+
+ val m = n - 1 // n = Mem(n-1) + Slice
+
+ def Increment(a: UInt, b: UInt): UInt = {
+ val c = a +& b
+ val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
+ d
+ }
+
+ val mem = Mem(m, t)
+ val mslice = Slice(t, false, true)
+
+ val in0pos = RegInit(0.U(log2Ceil(m).W))
+ val in1pos = RegInit(1.U(log2Ceil(m).W))
+ val in2pos = RegInit(2.U(log2Ceil(m).W))
+ val in3pos = RegInit(3.U(log2Ceil(m).W))
+ val outpos = RegInit(0.U(log2Ceil(m).W))
+ val mcount = RegInit(0.U(log2Ceil(n+1).W))
+
+ io.count := mcount + io.out.valid
+
+ val ivalid = io.in.valid && io.in.ready
+ val ovalid = mslice.io.in.valid && mslice.io.in.ready
+
+ val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid,
+ io.in.bits(1).valid, io.in.bits(0).valid).asUInt
+
+ val icount = io.in.bits(0).valid +& io.in.bits(1).valid +
+ io.in.bits(2).valid +& io.in.bits(3).valid
+
+ // ---------------------------------------------------------------------------
+ // Fifo Control.
+ when (ivalid) {
+ in0pos := Increment(in0pos, icount)
+ in1pos := Increment(in1pos, icount)
+ in2pos := Increment(in2pos, icount)
+ in3pos := Increment(in3pos, icount)
+ }
+
+ when (ovalid) {
+ outpos := Increment(outpos, 1.U)
+ }
+
+ val inc = MuxOR(ivalid, icount)
+ val dec = mslice.io.in.valid && mslice.io.in.ready
+
+ when (ivalid || ovalid) {
+ mcount := mcount + inc - dec
+ }
+
+ // ---------------------------------------------------------------------------
+ // Fifo Input.
+ val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive)
+
+ for (i <- 0 until m) {
+ val valid = Cat(in0pos === i.U && in0valid(3) ||
+ in1pos === i.U && in1valid(3) ||
+ in2pos === i.U && in2valid(3) ||
+ in3pos === i.U && in3valid(3),
+ in0pos === i.U && in0valid(2) ||
+ in1pos === i.U && in1valid(2) ||
+ in2pos === i.U && in2valid(2),
+ in0pos === i.U && in0valid(1) ||
+ in1pos === i.U && in1valid(1),
+ in0pos === i.U && in0valid(0))
+
+ // Couldn't get the following to work properly.
+ //
+ // val data = MuxOR(valid(0), io.in.bits(0).bits.asUInt) |
+ // MuxOR(valid(1), io.in.bits(1).bits.asUInt) |
+ // MuxOR(valid(2), io.in.bits(2).bits.asUInt) |
+ // MuxOR(valid(3), io.in.bits(3).bits.asUInt)
+ //
+ // when (ivalid && valid =/= 0.U) {
+ // mem(i) := data.asTypeOf(t)
+ // }
+ when (ivalid) {
+ when (valid(0)) {
+ mem(i) := io.in.bits(0).bits
+ } .elsewhen (valid(1)) {
+ mem(i) := io.in.bits(1).bits
+ } .elsewhen (valid(2)) {
+ mem(i) := io.in.bits(2).bits
+ } .elsewhen (valid(3)) {
+ mem(i) := io.in.bits(3).bits
+ }
+ }
+ }
+
+ mslice.io.in.valid := false.B
+ mslice.io.in.bits := io.in.bits(0).bits // defaults
+
+ when (mcount > 0.U) {
+ when (io.out.ready) {
+ mslice.io.in.valid := true.B
+ }
+ } .otherwise {
+ when (ivalid && iactive =/= 0.U) {
+ mslice.io.in.valid := true.B
+ }
+ }
+
+ when (mcount > 0.U) {
+ mslice.io.in.bits := mem(outpos)
+ } .elsewhen (ivalid) {
+ // As above, couldn't get MuxOR to work.
+ when (iactive(0)) {
+ mslice.io.in.bits := io.in.bits(0).bits
+ } .elsewhen (iactive(1)) {
+ mslice.io.in.bits := io.in.bits(1).bits
+ } .elsewhen (iactive(2)) {
+ mslice.io.in.bits := io.in.bits(2).bits
+ } .elsewhen (iactive(3)) {
+ mslice.io.in.bits := io.in.bits(3).bits
+ }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Valid Entries.
+ val active = RegInit(0.U(m.W))
+
+ val activeSet = MuxOR(ivalid,
+ ((icount >= 1.U) << in0pos) | ((icount >= 2.U) << in1pos) |
+ ((icount >= 3.U) << in2pos) | ((icount >= 4.U) << in3pos))
+
+ val activeClr = MuxOR(mslice.io.in.valid && mslice.io.in.ready, 1.U << outpos)
+
+ active := (active | activeSet) & ~activeClr
+
+ // ---------------------------------------------------------------------------
+ // Interface.
+ io.in.ready := mcount <= (m.U - icount)
+ io.out <> mslice.io.out
+
+ assert(mcount <= m.U)
+}
+
+object EmitFifo4 extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Fifo4(UInt(8.W), 11), args)
+}
diff --git a/hdl/chisel/src/common/Fifo4e.scala b/hdl/chisel/src/common/Fifo4e.scala
new file mode 100644
index 0000000..bca120d
--- /dev/null
+++ b/hdl/chisel/src/common/Fifo4e.scala
@@ -0,0 +1,143 @@
+package common
+
+import chisel3._
+import chisel3.util._
+
+// Fifo4 with entry output and no output registration stage.
+
+object Fifo4e {
+ def apply[T <: Data](t: T, n: Int) = {
+ Module(new Fifo4e(t, n))
+ }
+}
+
+class Fifo4e[T <: Data](t: T, n: Int) extends Module {
+ val io = IO(new Bundle {
+ val in = Flipped(Decoupled(Vec(4, Valid(t))))
+ val out = Decoupled(t)
+ val count = Output(UInt(log2Ceil(n+1).W))
+ val entry = Output(Vec(n, Valid(t)))
+ val nempty = Output(Bool())
+ })
+
+ def Increment(a: UInt, b: UInt): UInt = {
+ val c = a +& b
+ val d = Mux(c < n.U, c, c - n.U)(a.getWidth - 1, 0)
+ d
+ }
+
+ val mem = Mem(n, t)
+
+ val in0pos = RegInit(0.U(log2Ceil(n).W))
+ val in1pos = RegInit(1.U(log2Ceil(n).W))
+ val in2pos = RegInit(2.U(log2Ceil(n).W))
+ val in3pos = RegInit(3.U(log2Ceil(n).W))
+ val outpos = RegInit(0.U(log2Ceil(n).W))
+ val mcount = RegInit(0.U(log2Ceil(n+1).W))
+ val nempty = RegInit(false.B)
+
+ io.count := mcount
+ io.nempty := nempty
+
+ val ivalid = io.in.valid && io.in.ready
+ val ovalid = io.out.valid && io.out.ready
+
+ val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid,
+ io.in.bits(1).valid, io.in.bits(0).valid).asUInt
+
+ val icount = io.in.bits(0).valid +& io.in.bits(1).valid +
+ io.in.bits(2).valid +& io.in.bits(3).valid
+
+ // ---------------------------------------------------------------------------
+ // Fifo Control.
+ when (ivalid) {
+ in0pos := Increment(in0pos, icount)
+ in1pos := Increment(in1pos, icount)
+ in2pos := Increment(in2pos, icount)
+ in3pos := Increment(in3pos, icount)
+ }
+
+ when (ovalid) {
+ outpos := Increment(outpos, 1.U)
+ }
+
+ val inc = MuxOR(ivalid, icount)
+ val dec = ovalid
+
+ when (ivalid || ovalid) {
+ val nxtcount = mcount + inc - dec
+ mcount := nxtcount
+ nempty := nxtcount =/= 0.U
+ }
+
+ // ---------------------------------------------------------------------------
+ // Fifo Input.
+ val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive)
+
+ for (i <- 0 until n) {
+ val valid = Cat(in0pos === i.U && in0valid(3) ||
+ in1pos === i.U && in1valid(3) ||
+ in2pos === i.U && in2valid(3) ||
+ in3pos === i.U && in3valid(3),
+ in0pos === i.U && in0valid(2) ||
+ in1pos === i.U && in1valid(2) ||
+ in2pos === i.U && in2valid(2),
+ in0pos === i.U && in0valid(1) ||
+ in1pos === i.U && in1valid(1),
+ in0pos === i.U && in0valid(0))
+
+ // Couldn't get the following to work properly.
+ //
+ // val data = MuxOR(valid(0), io.in.bits(0).bits.asUInt) |
+ // MuxOR(valid(1), io.in.bits(1).bits.asUInt) |
+ // MuxOR(valid(2), io.in.bits(2).bits.asUInt) |
+ // MuxOR(valid(3), io.in.bits(3).bits.asUInt)
+ //
+ // when (ivalid && valid =/= 0.U) {
+ // mem(i) := data.asTypeOf(t)
+ // }
+ when (ivalid) {
+ when (valid(0)) {
+ mem(i) := io.in.bits(0).bits
+ } .elsewhen (valid(1)) {
+ mem(i) := io.in.bits(1).bits
+ } .elsewhen (valid(2)) {
+ mem(i) := io.in.bits(2).bits
+ } .elsewhen (valid(3)) {
+ mem(i) := io.in.bits(3).bits
+ }
+ }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Valid Entries.
+ val active = RegInit(0.U(n.W))
+
+ val activeSet = MuxOR(ivalid,
+ ((icount >= 1.U) << in0pos) | ((icount >= 2.U) << in1pos) |
+ ((icount >= 3.U) << in2pos) | ((icount >= 4.U) << in3pos))
+
+ val activeClr = MuxOR(io.out.valid && io.out.ready, 1.U << outpos)
+
+ when (io.in.valid && io.in.ready || io.out.valid && io.out.ready) {
+ active := (active | activeSet) & ~activeClr
+ }
+
+ // ---------------------------------------------------------------------------
+ // Interface.
+ io.in.ready := mcount <= (n.U - icount)
+
+ io.out.valid := mcount =/= 0.U
+ io.out.bits := mem(outpos)
+
+ assert(mcount <= n.U)
+
+ for (i <- 0 until n) {
+ io.entry(i).valid := active(i)
+ io.entry(i).bits := mem(i)
+ }
+}
+
+object EmitFifo4e extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Fifo4e(UInt(8.W), 10), args)
+}
diff --git a/hdl/chisel/src/common/Fifo4x4.scala b/hdl/chisel/src/common/Fifo4x4.scala
new file mode 100644
index 0000000..5d02731
--- /dev/null
+++ b/hdl/chisel/src/common/Fifo4x4.scala
@@ -0,0 +1,183 @@
+package common
+
+import chisel3._
+import chisel3.util._
+
+object Fifo4x4 {
+ def apply[T <: Data](t: T, n: Int) = {
+ Module(new Fifo4x4(t, n))
+ }
+}
+
+// Input accepted with a common handshake and per lane select.
+// Outputs are transacted independently, and ordered {[0], [0,1], [0,1,2], [0,1,2,3]}.
+// Outputs are not registered, assumes passes directly into shallow combinatorial.
+class Fifo4x4[T <: Data](t: T, n: Int) extends Module {
+ val io = IO(new Bundle {
+ val in = Flipped(Decoupled(Vec(4, Valid(t))))
+ val out = Vec(4, Decoupled(t))
+ val count = Output(UInt(log2Ceil(n+1).W))
+ val nempty = Output(Bool())
+ })
+
+ val m = n
+
+ val mb = log2Ceil(m)
+ val n1b = log2Ceil(n + 1)
+
+ def Increment(a: UInt, b: UInt): UInt = {
+ val c = a +& b
+ val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
+ d
+ }
+
+ val mem = Reg(Vec(n, t))
+
+ val inpos = Reg(Vec(4, UInt(mb.W))) // reset below
+ val outpos = Reg(Vec(4, UInt(mb.W))) // reset below
+
+ val mcount = RegInit(0.U(n1b.W))
+ val nempty = RegInit(false.B)
+ val inready = RegInit(false.B)
+ val outvalid = RegInit(0.U(4.W))
+
+ val ivalid = io.in.valid && io.in.ready
+
+ val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid,
+ io.in.bits(1).valid, io.in.bits(0).valid).asUInt
+
+ val icount = (io.in.bits(0).valid +& io.in.bits(1).valid +&
+ io.in.bits(2).valid +& io.in.bits(3).valid)(2,0)
+
+ val oactiveBits = Cat(io.out(3).valid && io.out(3).ready,
+ io.out(2).valid && io.out(2).ready,
+ io.out(1).valid && io.out(1).ready,
+ io.out(0).valid && io.out(0).ready)
+
+ val ovalid = oactiveBits =/= 0.U
+
+ val ocount = (oactiveBits(0) +& oactiveBits(1) +&
+ oactiveBits(2) +& oactiveBits(3))(2,0)
+
+ assert(!(oactiveBits(1) === 1.U && oactiveBits(0,0) =/= 1.U))
+ assert(!(oactiveBits(2) === 1.U && oactiveBits(1,0) =/= 3.U))
+ assert(!(oactiveBits(3) === 1.U && oactiveBits(2,0) =/= 7.U))
+
+ val ovalidBits = Cat(io.out(3).valid, io.out(2).valid,
+ io.out(1).valid, io.out(0).valid)
+
+ assert(!(ovalidBits(1) === 1.U && ovalidBits(0,0) =/= 1.U))
+ assert(!(ovalidBits(2) === 1.U && ovalidBits(1,0) =/= 3.U))
+ assert(!(ovalidBits(3) === 1.U && ovalidBits(2,0) =/= 7.U))
+
+ val oreadyBits = Cat(io.out(3).ready, io.out(2).ready,
+ io.out(1).ready, io.out(0).ready)
+
+ assert(!(oreadyBits(1) === 1.U && oreadyBits(0,0) =/= 1.U))
+ assert(!(oreadyBits(2) === 1.U && oreadyBits(1,0) =/= 3.U))
+ assert(!(oreadyBits(3) === 1.U && oreadyBits(2,0) =/= 7.U))
+
+ // ---------------------------------------------------------------------------
+ // Fifo Control.
+ when (reset.asBool) {
+ for (i <- 0 until 4) {
+ inpos(i) := i.U
+ }
+ } .elsewhen (ivalid) {
+ for (i <- 0 until 4) {
+ inpos(i) := Increment(inpos(i), icount)
+ }
+ }
+
+ when (reset.asBool) {
+ for (i <- 0 until 4) {
+ outpos(i) := i.U
+ }
+ } .elsewhen (ovalid) {
+ for (i <- 0 until 4) {
+ outpos(i) := Increment(outpos(i), ocount)
+ }
+ }
+
+ val inc = MuxOR(ivalid, icount)
+ val dec = MuxOR(ovalid, ocount)
+
+ when (ivalid || ovalid) {
+ val nxtmcount = mcount + inc - dec
+ inready := nxtmcount <= (m.U - 4.U)
+ mcount := nxtmcount
+ nempty := nxtmcount =/= 0.U
+ outvalid := Cat(nxtmcount >= 4.U,
+ nxtmcount >= 3.U,
+ nxtmcount >= 2.U,
+ nxtmcount >= 1.U)
+ } .otherwise {
+ inready := mcount <= (m.U - 4.U)
+ outvalid := Cat(mcount >= 4.U,
+ mcount >= 3.U,
+ mcount >= 2.U,
+ mcount >= 1.U)
+ }
+
+ // ---------------------------------------------------------------------------
+ // Fifo Input.
+ val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive)
+
+ for (i <- 0 until m) {
+ val valid = Cat(inpos(0) === i.U && in0valid(3) ||
+ inpos(1) === i.U && in1valid(3) ||
+ inpos(2) === i.U && in2valid(3) ||
+ inpos(3) === i.U && in3valid(3),
+
+ inpos(0) === i.U && in0valid(2) ||
+ inpos(1) === i.U && in1valid(2) ||
+ inpos(2) === i.U && in2valid(2),
+
+ inpos(0) === i.U && in0valid(1) ||
+ inpos(1) === i.U && in1valid(1),
+
+ inpos(0) === i.U && in0valid(0))
+
+ if (true) {
+ val data = MuxOR(valid(0), io.in.bits(0).bits.asUInt) |
+ MuxOR(valid(1), io.in.bits(1).bits.asUInt) |
+ MuxOR(valid(2), io.in.bits(2).bits.asUInt) |
+ MuxOR(valid(3), io.in.bits(3).bits.asUInt)
+
+ when (ivalid && valid =/= 0.U) {
+ mem(i) := data.asTypeOf(t)
+ }
+ } else {
+ when (ivalid) {
+ when (valid(0)) {
+ mem(i) := io.in.bits(0).bits
+ } .elsewhen (valid(1)) {
+ mem(i) := io.in.bits(1).bits
+ } .elsewhen (valid(2)) {
+ mem(i) := io.in.bits(2).bits
+ } .elsewhen (valid(3)) {
+ mem(i) := io.in.bits(3).bits
+ }
+ }
+ }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Interface.
+ io.in.ready := inready
+
+ for (i <- 0 until 4) {
+ io.out(i).valid := outvalid(i)
+ io.out(i).bits := mem(outpos(i)) // TODO: VecAt()
+ }
+
+ io.count := mcount
+
+ io.nempty := nempty
+
+ assert(io.count <= m.U)
+}
+
+object EmitFifo4x4 extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Fifo4x4(UInt(32.W), 24), args)
+}
diff --git a/hdl/chisel/src/common/IDiv.scala b/hdl/chisel/src/common/IDiv.scala
new file mode 100644
index 0000000..070eb3e
--- /dev/null
+++ b/hdl/chisel/src/common/IDiv.scala
@@ -0,0 +1,175 @@
+package common
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// An integer divide unit, to be fused with fdiv.
+
+object IDiv {
+ def apply(n: Int): IDiv = {
+ return Module(new IDiv(n))
+ }
+
+ val Stages = 4
+ val Rcnt = 32 / Stages
+}
+
+case class IDivOp() {
+ val DIV = 0
+ val DIVU = 1
+ val REM = 2
+ val REMU = 3
+ val Entries = 4
+}
+
+class IDiv(n: Int) extends Module {
+ val io = IO(new Bundle {
+ val req = Input(UInt(new IDivOp().Entries.W))
+ val ina = Flipped(Decoupled(Vec(n, UInt(32.W))))
+ val inb = Flipped(Decoupled(Vec(n, UInt(32.W))))
+ val out = Decoupled(Vec(n, UInt(32.W)))
+ })
+
+ val dvu = new IDivOp()
+
+ val active = RegInit(false.B)
+ val result = RegInit(false.B)
+ val count = Reg(UInt(6.W))
+
+ val state = Reg(Vec(n, new IDivState()))
+
+ val ivalid = io.ina.valid && io.ina.ready && io.inb.valid && io.inb.ready
+ val ovalid = io.out.valid && io.out.ready
+
+ when (ivalid) {
+ active := true.B
+ } .elsewhen (active && count === IDiv.Rcnt.U) {
+ active := false.B
+ }
+
+ when (ovalid) {
+ result := false.B
+ } .elsewhen (active && count === IDiv.Rcnt.U) {
+ result := true.B
+ }
+
+ when (ivalid) {
+ count := 0.U
+ } .elsewhen (active) {
+ count := count + 1.U
+ }
+
+ for (i <- 0 until n) {
+ val ina = io.ina.bits(i)
+ val inb = io.inb.bits(i)
+ val st = state(i)
+
+ when (ivalid) {
+ val divide = io.req(dvu.DIV) || io.req(dvu.DIVU)
+ val signed = io.req(dvu.DIV) || io.req(dvu.REM)
+ state(i) := IDivComb1(ina, inb, signed, divide)
+ } .elsewhen (active) {
+ state(i) := IDivComb2(state(i), count)
+ }
+ }
+
+ io.ina.ready := io.inb.valid && !active && (!result || io.out.ready)
+ io.inb.ready := io.ina.valid && !active && (!result || io.out.ready)
+
+ io.out.valid := result
+
+ for (i <- 0 until n) {
+ io.out.bits(i) := IDivComb3(state(i))
+ }
+}
+
+class IDivState extends Bundle {
+ val denom = UInt(32.W) // output is placed first
+ val divide = UInt(32.W)
+ val remain = UInt(32.W)
+ val opdiv = Bool()
+ val opneg = Bool()
+}
+
+object IDivComb1 {
+ def apply(ina: UInt, inb: UInt, signed: Bool, divide: Bool): IDivState = {
+ val out = Wire(new IDivState())
+
+ val divByZero = inb === 0.U
+ val divsign = signed && (ina(31) =/= inb(31)) && !divByZero
+ val remsign = signed && ina(31)
+ val inp = Mux(signed && ina(31), ~ina + 1.U, ina)
+
+ out.opdiv := divide
+ out.opneg := Mux(divide, divsign, remsign)
+ out.denom := Mux(signed && inb(31), ~inb + 1.U, inb)
+ out.divide := inp
+ out.remain := 0.U
+
+ out
+ }
+}
+
+object IDivComb2 {
+ def apply(in: IDivState, count: UInt): IDivState = {
+ val out = Wire(new IDivState())
+ out := in
+
+ when (count < IDiv.Rcnt.U) {
+ val (div1, rem1) = Divide(in.divide, in.remain, in.denom)
+ if (IDiv.Stages == 1) {
+ out.divide := div1
+ out.remain := rem1
+ } else if (IDiv.Stages == 2) {
+ val (div2, rem2) = Divide(div1, rem1, in.denom)
+ out.divide := div2
+ out.remain := rem2
+ } else if (IDiv.Stages == 4) {
+ val (div2, rem2) = Divide(div1, rem1, in.denom)
+ val (div3, rem3) = Divide(div2, rem2, in.denom)
+ val (div4, rem4) = Divide(div3, rem3, in.denom)
+ out.divide := div4
+ out.remain := rem4
+ } else {
+ assert(false)
+ }
+ } .otherwise {
+ val div = Mux(in.opneg, ~in.divide + 1.U, in.divide)
+ val rem = Mux(in.opneg, ~in.remain + 1.U, in.remain)
+ out.denom := Mux(in.opdiv, div, rem)
+ }
+
+ out
+ }
+
+ def Divide(prvDivide: UInt, prvRemain: UInt, denom: UInt): (UInt, UInt) = {
+ val shfRemain = Cat(prvRemain(30,0), prvDivide(31))
+ val subtract = shfRemain -& denom
+ assert(subtract.getWidth == 33)
+ val divDivide = Wire(UInt(32.W))
+ val divRemain = Wire(UInt(32.W))
+
+ when (!subtract(32)) {
+ divDivide := Cat(prvDivide(30,0), 1.U(1.W))
+ divRemain := subtract(31,0)
+ } .otherwise {
+ divDivide := Cat(prvDivide(30,0), 0.U(1.W))
+ divRemain := shfRemain
+ }
+
+ (divDivide, divRemain)
+ }
+}
+
+object IDivComb3 {
+ def apply(in: IDivState): UInt = {
+ val result = in.denom
+ assert(result.getWidth == 32)
+ result
+ }
+}
+
+object EmitIDiv extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new IDiv(1), args)
+}
diff --git a/hdl/chisel/src/common/Library.scala b/hdl/chisel/src/common/Library.scala
new file mode 100644
index 0000000..9fa4682
--- /dev/null
+++ b/hdl/chisel/src/common/Library.scala
@@ -0,0 +1,14 @@
+package common
+
+import chisel3._
+import chisel3.util._
+
+object MuxOR {
+ def apply(valid: Bool, data: UInt): UInt = {
+ Mux(valid, data, 0.U(data.getWidth))
+ }
+
+ def apply(valid: Bool, data: Bool): Bool = {
+ Mux(valid, data, false.B)
+ }
+}
diff --git a/hdl/chisel/src/common/Slice.scala b/hdl/chisel/src/common/Slice.scala
new file mode 100644
index 0000000..ac93641
--- /dev/null
+++ b/hdl/chisel/src/common/Slice.scala
@@ -0,0 +1,128 @@
+package common
+
+import chisel3._
+import chisel3.util._
+
+object Slice {
+ def apply[T <: Data](t: T, doubleBuffered: Boolean = true,
+ passReady: Boolean = false, passValid: Boolean = false) = {
+ Module(new Slice(t, doubleBuffered, passReady, passValid))
+ }
+}
+
+class Slice[T <: Data](t: T, doubleBuffered: Boolean,
+ passReady: Boolean, passValid: Boolean) extends Module {
+ val io = IO(new Bundle {
+ val in = Flipped(Decoupled(t))
+ val out = Decoupled(t)
+ val count = Output(UInt(2.W))
+ val value = Output(Vec(if (doubleBuffered) 2 else 1, Valid(t)))
+ })
+
+ val size = if (doubleBuffered) 2 else 1
+
+ val ipos = RegInit(0.U(size.W))
+ val opos = RegInit(0.U(size.W))
+ val count = RegInit(0.U(size.W))
+ val mem = Reg(Vec(size, t))
+
+ val empty = ipos === opos
+ val bypass = if (passValid) io.in.valid && io.out.ready && empty else false.B
+ val ivalid = io.in.valid && io.in.ready && !bypass
+ val ovalid = io.out.valid && io.out.ready && !bypass
+
+ when (ivalid) {
+ ipos := ipos + 1.U
+ }
+
+ when (ovalid) {
+ opos := opos + 1.U
+ }
+
+ when (ivalid =/= ovalid) {
+ count := count + ivalid - ovalid
+ }
+
+ if (doubleBuffered) {
+ val full = ipos(0) === opos(0) && ipos(1) =/= opos(1)
+ if (passReady) {
+ io.in.ready := !full || io.out.ready // pass-through
+ } else {
+ io.in.ready := !full
+ }
+
+ when (ovalid && full) {
+ mem(0) := mem(1)
+ }
+
+ when (ivalid && !ovalid && empty ||
+ ivalid && ovalid && !full) {
+ mem(0) := io.in.bits
+ }
+
+ when (ivalid && !ovalid && !empty ||
+ ivalid && ovalid && full) {
+ mem(1) := io.in.bits
+ }
+
+ io.value(0).valid := !empty
+ io.value(1).valid := full
+ io.value(0).bits := mem(0)
+ io.value(1).bits := mem(1)
+ } else {
+ if (passReady) {
+ io.in.ready := empty || io.out.ready // pass-through
+ } else {
+ io.in.ready := empty
+ }
+
+ when (ivalid) {
+ mem(0) := io.in.bits
+ }
+
+ io.value(0).valid := !empty
+ io.value(0).bits := mem(0)
+ }
+
+ if (!passValid) {
+ io.out.valid := !empty
+ io.out.bits := mem(0)
+ } else {
+ io.out.valid := !empty || io.in.valid // pass-through
+ io.out.bits := Mux(!empty, mem(0), io.in.bits) // pass-through
+ }
+
+ io.count := count
+}
+
+object EmitSlice extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), false, false, false), args)
+}
+
+object EmitSlice_1 extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), false, false, true), args)
+}
+
+object EmitSlice_2 extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), false, true, false), args)
+}
+
+object EmitSlice_3 extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), false, true, true), args)
+}
+
+object EmitSlice_4 extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), true, false, false), args)
+}
+
+object EmitSlice_5 extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), true, false, true), args)
+}
+
+object EmitSlice_6 extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), true, true, false), args)
+}
+
+object EmitSlice_7 extends App {
+ (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), true, true, true), args)
+}
diff --git a/hdl/chisel/src/kelvin/Axi.scala b/hdl/chisel/src/kelvin/Axi.scala
new file mode 100644
index 0000000..bd5009a
--- /dev/null
+++ b/hdl/chisel/src/kelvin/Axi.scala
@@ -0,0 +1,169 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+// AXI xRESP encodings. okay/slverr match the standard OKAY/SLVERR codes;
+// rsvd and mmuerr occupy the EXOKAY/DECERR slots -- NOTE(review): the names
+// are project-specific; confirm the intended mapping against the AXI spec.
+case object AxiResponse {
+  val okay = 0
+  val rsvd = 1
+  val slverr = 2
+  val mmuerr = 3
+}
+
+// case object AxiBurst {
+// val fixed = 0
+// val incr = 1
+// val wrap = 2
+// }
+
+// case object AxiSize {
+// val bytes1 = 0
+// val bytes2 = 1
+// val bytes4 = 2
+// val bytes8 = 3
+// val bytes16 = 4
+// val bytes32 = 5
+// val bytes64 = 6
+// val bytes128 = 7
+// }
+
+// AXI address channel payload (shared by the AW and AR channels). The burst
+// and size fields are commented out: masters here issue single-beat,
+// fixed-size transfers only.
+class AxiAddress(addrWidthBits: Int, idBits: Int) extends Bundle {
+  val addr = UInt(addrWidthBits.W)
+  val id = UInt(idBits.W)
+  // val burst = UInt(2.W)
+  // val size = UInt(3.W)
+
+  // Drive quiescent master-side values onto all fields.
+  def defaults() = {
+    addr := 0.U
+    id := 0.U
+    // burst := new AxiBurst().fixed
+    // size := new AxiSize().bytes4
+  }
+}
+
+// AXI write data channel payload (W). strb carries one valid-byte lane bit
+// per data byte; defaults() drives zero data with all byte lanes enabled.
+class AxiWriteData(dataWidthBits: Int) extends Bundle {
+  val data = UInt(dataWidthBits.W)
+  val strb = UInt((dataWidthBits/8).W)
+
+  def defaults() = {
+    data := 0.U
+    // Build the all-ones strobe with BigInt: for dataWidthBits >= 256 the
+    // strobe is >= 32 bits wide and the Int expression (1 << 32) - 1 silently
+    // wraps to 0 because Scala Int shift counts are taken modulo 32.
+    strb := ((BigInt(1) << (dataWidthBits/8)) - 1).U
+  }
+}
+
+// AXI write response channel payload (B).
+class AxiWriteResponse(idBits: Int) extends Bundle {
+  val id = UInt(idBits.W)
+  val resp = UInt(2.W)
+
+  // Quiescent values driven from the master side.
+  def defaults() = {
+    id := 0.U
+    resp := 0.U
+  }
+
+  // Quiescent values driven from the slave side (channel is Flipped at the
+  // master); identical to defaults() since all fields flow the same way here.
+  def defaultsFlipped() = {
+    id := 0.U
+    resp := 0.U
+  }
+}
+
+// AXI read data channel payload (R). last is omitted because only
+// single-beat reads are issued by this interface.
+class AxiReadData(dataWidthBits: Int, idBits: Int) extends Bundle {
+  val resp = UInt(2.W)
+  val id = UInt(idBits.W)
+  val data = UInt(dataWidthBits.W)
+  // val last = Bool()
+
+  // Quiescent values driven from the slave side (channel is Flipped at the
+  // master).
+  def defaultsFlipped() = {
+    resp := 0.U
+    id := 0.U
+    data := 0.U
+    // last := false.B
+  }
+}
+
+// AXI4-Lite address channel payload (AW/AR): address plus the 3-bit AxPROT
+// protection qualifier.
+class AxiLiteAddress(addrWidthBits: Int) extends Bundle {
+  val addr = UInt(addrWidthBits.W)
+  val prot = UInt(3.W)
+}
+
+// AXI4-Lite write data channel payload (W): data plus per-byte strobes.
+class AxiLiteWriteData(dataWidthBits: Int) extends Bundle {
+  val data = UInt(dataWidthBits.W)
+  val strb = UInt((dataWidthBits/8).W)
+}
+
+// AXI4-Lite read data channel payload (R): data plus the 2-bit response.
+class AxiLiteReadData(dataWidthBits: Int) extends Bundle {
+  val data = UInt(dataWidthBits.W)
+  val resp = UInt(2.W)
+}
+
+// Full AXI master port: independent write and read halves. defaults() /
+// defaultsFlipped() drive quiescent values when one side is unused.
+class AxiMasterIO(addrWidthBits: Int, dataWidthBits: Int, idBits: Int)
+    extends Bundle {
+  val write = new AxiMasterWriteIO(addrWidthBits, dataWidthBits, idBits)
+  val read = new AxiMasterReadIO(addrWidthBits, dataWidthBits, idBits)
+
+  // Master-side quiescent values for both halves.
+  def defaults() = {
+    write.defaults()
+    read.defaults()
+  }
+
+  // Slave-side quiescent values for both halves.
+  def defaultsFlipped() = {
+    write.defaultsFlipped()
+    read.defaultsFlipped()
+  }
+}
+
+// AXI master write half: AW and W flow outward (Decoupled), B flows back
+// (Flipped).
+class AxiMasterWriteIO(addrWidthBits: Int, dataWidthBits: Int, idBits: Int)
+    extends Bundle {
+  val addr = Decoupled(new AxiAddress(addrWidthBits, idBits))
+  val data = Decoupled(new AxiWriteData(dataWidthBits))
+  val resp = Flipped(Decoupled(new AxiWriteResponse(idBits)))
+
+  // Idle master: no requests posted, but always willing to sink responses.
+  def defaults() = {
+    addr.bits.defaults()
+    data.bits.defaults()
+    addr.valid := false.B
+    data.valid := false.B
+    resp.ready := true.B
+  }
+
+  // Idle slave: accepts nothing and returns no responses.
+  def defaultsFlipped() = {
+    addr.ready := false.B
+    data.ready := false.B
+    resp.valid := false.B
+    resp.bits.defaultsFlipped()
+  }
+}
+
+// AXI master read half: AR flows outward (Decoupled), R flows back (Flipped).
+class AxiMasterReadIO(addrWidthBits: Int, dataWidthBits: Int, idBits: Int)
+    extends Bundle {
+  val addr = Decoupled(new AxiAddress(addrWidthBits, idBits))
+  val data = Flipped(Decoupled(new AxiReadData(dataWidthBits, idBits)))
+
+  // Idle master: no address posted, read data not accepted.
+  def defaults() = {
+    addr.bits.defaults()
+    addr.valid := false.B
+    data.ready := false.B
+  }
+
+  // Idle slave: accepts no addresses, returns no data.
+  def defaultsFlipped() = {
+    addr.ready := false.B
+    data.valid := false.B
+    data.bits.defaultsFlipped()
+  }
+}
+
+// AXI4-Lite master port: read and write halves, no ID fields (Lite has no
+// transaction IDs) and no default() helpers.
+class AxiLiteMasterIO(val addrWidthBits: Int, val dataWidthBits: Int) extends Bundle {
+  val read = new AxiLiteMasterReadIO(addrWidthBits, dataWidthBits)
+  val write = new AxiLiteMasterWriteIO(addrWidthBits, dataWidthBits)
+}
+
+// AXI4-Lite master write half; resp is the bare 2-bit BRESP code.
+class AxiLiteMasterWriteIO(val addrWidthBits: Int, val dataWidthBits: Int) extends Bundle {
+  val addr = Decoupled(new AxiLiteAddress(addrWidthBits))
+  val data = Decoupled(new AxiLiteWriteData(dataWidthBits))
+  val resp = Flipped(Decoupled(UInt(2.W)))
+}
+
+// AXI4-Lite master read half.
+class AxiLiteMasterReadIO(addrWidthBits: Int, dataWidthBits: Int)
+    extends Bundle {
+  val addr = Decoupled(new AxiLiteAddress(addrWidthBits))
+  val data = Flipped(Decoupled(new AxiLiteReadData(dataWidthBits)))
+}
diff --git a/hdl/chisel/src/kelvin/ClockGate.scala b/hdl/chisel/src/kelvin/ClockGate.scala
new file mode 100644
index 0000000..87a576e
--- /dev/null
+++ b/hdl/chisel/src/kelvin/ClockGate.scala
@@ -0,0 +1,13 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// BlackBox shell for a technology clock-gating cell; the actual
+// implementation is supplied as external Verilog at synthesis time.
+class ClockGate extends BlackBox {
+  val io = IO(new Bundle {
+    val clk_i = Input(Clock())
+    val enable = Input(Bool()) // '1' passthrough, '0' disable.
+    val clk_o = Output(Clock())
+  })
+}
diff --git a/hdl/chisel/src/kelvin/Core.scala b/hdl/chisel/src/kelvin/Core.scala
new file mode 100644
index 0000000..16c8ece
--- /dev/null
+++ b/hdl/chisel/src/kelvin/Core.scala
@@ -0,0 +1,76 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Factory helper matching the Module(...) wrapping convention used by the
+// other companions in this package.
+object Core {
+  def apply(p: Parameters): Core = {
+    return Module(new Core(p))
+  }
+}
+
+// Top-level Kelvin core: instantiates the scalar core, the vector core, the
+// shared local data-bus mux, and the scalar uncached-bus-to-AXI bridge.
+// axi0 carries vector load/store traffic; axi1 carries scalar uncached
+// traffic through DBus2Axi.
+class Core(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val csr = new CsrInOutIO(p)
+    val halted = Output(Bool())
+    val fault = Output(Bool())
+
+    val ibus = new IBusIO(p)
+    val dbus = new DBusIO(p)
+    val axi0 = new AxiMasterIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+    val axi1 = new AxiMasterIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+
+    val iflush = new IFlushIO(p)
+    val dflush = new DFlushIO(p)
+    val slog = new SLogIO(p)
+
+    val debug = new DebugIO(p)
+  })
+
+  val score = SCore(p)
+  val vcore = VCore(p)
+  val dbusmux = DBusMux(p)
+
+  // ---------------------------------------------------------------------------
+  // Scalar Core outputs.
+  io.csr <> score.io.csr
+  io.ibus <> score.io.ibus
+  io.halted := score.io.halted
+  io.fault := score.io.fault
+  io.iflush <> score.io.iflush
+  io.dflush <> score.io.dflush
+  io.slog := score.io.slog
+  io.debug := score.io.debug
+
+  // ---------------------------------------------------------------------------
+  // Vector core.
+  score.io.vcore <> vcore.io.score
+
+  // ---------------------------------------------------------------------------
+  // Local Data Bus Port
+  // The mux arbitrates the single DBus between scalar and vector accesses,
+  // steered by the scalar core's vldst indication.
+  dbusmux.io.vldst := score.io.vldst
+  dbusmux.io.vlast := vcore.io.last
+
+  dbusmux.io.vcore <> vcore.io.dbus
+  dbusmux.io.score <> score.io.dbus
+
+  io.dbus <> dbusmux.io.dbus
+
+  // ---------------------------------------------------------------------------
+  // Scalar DBus to AXI.
+  val dbus2axi = DBus2Axi(p)
+  dbus2axi.io.dbus <> score.io.ubus
+
+  // ---------------------------------------------------------------------------
+  // AXI ports.
+  io.axi0.read <> vcore.io.ld
+  io.axi0.write <> vcore.io.st
+
+  io.axi1 <> dbus2axi.io.axi
+}
+
+// Standalone Verilog emitter for the full core.
+object EmitCore extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new Core(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/DBus2Axi.scala b/hdl/chisel/src/kelvin/DBus2Axi.scala
new file mode 100644
index 0000000..6fc89c0
--- /dev/null
+++ b/hdl/chisel/src/kelvin/DBus2Axi.scala
@@ -0,0 +1,65 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Factory helper matching the Module(...) wrapping convention used by the
+// other companions in this package.
+object DBus2Axi {
+  def apply(p: Parameters): DBus2Axi = {
+    return Module(new DBus2Axi(p))
+  }
+}
+
+// Bridges the simple valid/ready DBus onto an AXI master port. At most one
+// read is outstanding at a time, tracked by sraddrActive; writes post the
+// AW and W beats in the same cycle.
+class DBus2Axi(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val dbus = Flipped(new DBusIO(p))
+    val axi = new AxiMasterIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+  })
+
+  // Number of byte-offset bits within one bus line.
+  val linebit = log2Ceil(p.lsuDataBits / 8)
+
+  // sraddrActive: a read address has been accepted and its data beat is
+  // still pending. sdata: capture register for the returned read data.
+  val sraddrActive = RegInit(false.B)
+  val sdata = Reg(UInt(p.axi2DataBits.W))
+
+  when (io.axi.read.data.valid && io.axi.read.data.ready) {
+    sraddrActive := false.B
+    assert(sraddrActive)
+    assert(!io.axi.read.addr.valid)
+  } .elsewhen (io.axi.read.addr.valid && io.axi.read.addr.ready) {
+    sraddrActive := true.B
+    assert(!sraddrActive)
+    assert(!io.axi.read.data.valid)
+  }
+
+  when (io.axi.read.data.valid && io.axi.read.data.ready) {
+    sdata := io.axi.read.data.bits.data
+  }
+
+  // Writes complete when the address beat is accepted; reads complete when
+  // the data beat arrives for the outstanding request.
+  io.dbus.ready := Mux(io.dbus.write,
+    io.axi.write.addr.valid && io.axi.write.addr.ready,
+    io.axi.read.data.valid && sraddrActive)
+  // NOTE(review): rdata is served from the capture register, i.e. one cycle
+  // after the AXI read beat -- assumed to match DBusIO read timing; confirm.
+  io.dbus.rdata := sdata
+
+  // Line-align the address; sub-line selection is handled by wmask/size.
+  val saddr = Cat(io.dbus.addr(31, linebit), 0.U(linebit.W))
+
+  io.axi.write.addr.valid := io.dbus.valid && io.dbus.write
+  io.axi.write.addr.bits.addr := saddr
+  io.axi.write.addr.bits.id := 0.U
+
+  io.axi.write.data.valid := io.dbus.valid && io.dbus.write
+  io.axi.write.data.bits.strb := io.dbus.wmask
+  io.axi.write.data.bits.data := io.dbus.wdata
+
+  // Write responses are always drained; no error handling on BRESP.
+  io.axi.write.resp.ready := true.B
+
+  io.axi.read.addr.valid := io.dbus.valid && !io.dbus.write && !sraddrActive
+  io.axi.read.addr.bits.addr := saddr
+  io.axi.read.addr.bits.id := 0.U
+
+  io.axi.read.data.ready := true.B
+}
+
+// Standalone Verilog emitter.
+object EmitDBus2Axi extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new DBus2Axi(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/DBusMux.scala b/hdl/chisel/src/kelvin/DBusMux.scala
new file mode 100644
index 0000000..30f523f
--- /dev/null
+++ b/hdl/chisel/src/kelvin/DBusMux.scala
@@ -0,0 +1,41 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Factory helper matching the Module(...) wrapping convention used by the
+// other companions in this package.
+object DBusMux {
+  def apply(p: Parameters): DBusMux = {
+    return Module(new DBusMux(p))
+  }
+}
+
+// Two-way combinational mux of the local data bus between the scalar and
+// vector cores. vldst (from the scalar core) selects the vector side; vlast
+// marks the final beat of a vector load/store sequence.
+class DBusMux(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val vldst = Input(Bool())  // score.lsu
+    val vlast = Input(Bool())  // vcore.vldst
+    val vcore = Flipped(new DBusIO(p))
+    val score = Flipped(new DBusIO(p))
+    val dbus = new DBusIO(p)
+  })
+
+  // Forward all request fields from the selected requester.
+  io.dbus.valid := Mux(io.vldst, io.vcore.valid, io.score.valid)
+  io.dbus.write := Mux(io.vldst, io.vcore.write, io.score.write)
+  io.dbus.addr  := Mux(io.vldst, io.vcore.addr, io.score.addr)
+  io.dbus.adrx  := Mux(io.vldst, io.vcore.adrx, io.score.adrx)
+  io.dbus.size  := Mux(io.vldst, io.vcore.size, io.score.size)
+  io.dbus.wdata := Mux(io.vldst, io.vcore.wdata, io.score.wdata)
+  io.dbus.wmask := Mux(io.vldst, io.vcore.wmask, io.score.wmask)
+
+  // Read data is broadcast; only the selected side's handshake completes.
+  io.score.rdata := io.dbus.rdata
+  io.vcore.rdata := io.dbus.rdata
+
+  // Scalar core fifo syncs to vector core vldst, removed on last transaction.
+  io.score.ready := io.dbus.ready && (!io.vldst || io.vcore.valid && io.vlast)
+  io.vcore.ready := io.dbus.ready && io.vldst
+}
+
+// Standalone Verilog emitter.
+object EmitDBusMux extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new DBusMux(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/L1DCache.scala b/hdl/chisel/src/kelvin/L1DCache.scala
new file mode 100644
index 0000000..ac006e7
--- /dev/null
+++ b/hdl/chisel/src/kelvin/L1DCache.scala
@@ -0,0 +1,676 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import chisel3.experimental.ChiselEnum
+import common._
+
+// Factory helpers matching the Module(...) wrapping convention used by the
+// other companions in this package.
+object L1DCache {
+  def apply(p: Parameters): L1DCache = {
+    return Module(new L1DCache(p))
+  }
+}
+
+object L1DCacheBank {
+  def apply(p: Parameters): L1DCacheBank = {
+    return Module(new L1DCacheBank(p))
+  }
+}
+
+// Two-bank L1 data cache. The bank is selected by address bit `linebit`;
+// unaligned accesses that straddle a line boundary hit both banks and the
+// result bytes are re-interleaved by rsel. The two banks' AXI ports are
+// multiplexed onto a single AXI master, with the top id bit naming the bank.
+class L1DCache(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val dbus = Flipped(new DBusIO(p))
+    val axi = new AxiMasterIO(p.axi1AddrBits, p.axi1DataBits, p.axi1IdBits)
+    val flush = Flipped(new DFlushIO(p))
+  })
+
+  assert(p.axi1IdBits == 4)
+  assert(p.axi1DataBits == 256)
+
+  val bank0 = Module(new L1DCacheBank(p))
+  val bank1 = Module(new L1DCacheBank(p))
+
+  val linebit = log2Ceil(p.lsuDataBits / 8)
+  val linebytes = 1 << linebit
+
+  // Remove bank select bit from address.
+  def BankInAddress(addr: UInt): UInt = {
+    assert(addr.getWidth == 32)
+    val output = Cat(addr(31, linebit + 1), addr(linebit - 1, 0))
+    assert(output.getWidth == 31)
+    output
+  }
+
+  // Add bank select bit to address.
+  def BankOutAddress(addr: UInt, bank: Int): UInt = {
+    assert(addr.getWidth == 31)
+    val output = Cat(addr(30, linebit), bank.U(1.W), addr(linebit - 1, 0))
+    assert(output.getWidth == 32)
+    output
+  }
+
+  assert(io.dbus.size <= linebytes.U)
+
+  // ---------------------------------------------------------------------------
+  // Data bus multiplexor.
+  // lineend: the access crosses into the next line, so both banks are needed.
+  val lineend = (io.dbus.addr(linebit - 1, 0) + io.dbus.size) > linebytes.U
+  val dempty = io.dbus.size === 0.U
+  val dsel0 = io.dbus.addr(linebit) === 0.U && !dempty || lineend
+  val dsel1 = io.dbus.addr(linebit) === 1.U && !dempty || lineend
+  val preread = ~io.dbus.addr(11, linebit) =/= 0.U && !io.dbus.write && !dempty // Within 4KB
+  // addrA/addrB: primary and next-line addresses, swapped per starting bank.
+  val addrA = Mux(io.dbus.addr(linebit), BankInAddress(io.dbus.adrx), BankInAddress(io.dbus.addr))
+  val addrB = Mux(io.dbus.addr(linebit), BankInAddress(io.dbus.addr), BankInAddress(io.dbus.adrx))
+  // rsel: per-byte bank select, registered on the request for the read return.
+  val rsel = Reg(Vec(linebytes, Bool()))
+
+  assert(!(io.dbus.valid && io.dbus.adrx =/= (io.dbus.addr + linebytes.U)))
+
+  // Write masks
+  // wmaskSA selects bytes at/after the start offset; wmaskSB the wrap-around
+  // bytes before it. Together they partition the lane mask exactly.
+  val wmaskSA = ((~0.U(linebytes.W)) << io.dbus.addr(linebit - 1, 0))(linebytes - 1, 0)
+  val wmaskSB = ((~0.U(linebytes.W)) >> (linebytes.U - io.dbus.addr(linebit - 1, 0)))(linebytes - 1, 0)
+  val wmaskA = io.dbus.wmask & wmaskSA
+  val wmaskB = io.dbus.wmask & wmaskSB
+  assert(wmaskSA.getWidth == io.dbus.wmask.getWidth)
+  assert(wmaskSB.getWidth == io.dbus.wmask.getWidth)
+  assert(wmaskA.getWidth == io.dbus.wmask.getWidth)
+  assert(wmaskB.getWidth == io.dbus.wmask.getWidth)
+  assert((wmaskSA | wmaskSB) === ~0.U(linebytes.W))
+  assert((wmaskSA & wmaskSB) === 0.U)
+
+  bank0.io.dbus.valid := io.dbus.valid && (dsel0 || preread)
+  bank0.io.dbus.write := io.dbus.write
+  bank0.io.dbus.wmask := Mux(io.dbus.addr(linebit), wmaskB, wmaskA)
+  bank0.io.dbus.size := io.dbus.size
+  bank0.io.dbus.addr := addrA
+  bank0.io.dbus.adrx := addrB
+  bank0.io.dbus.wdata := io.dbus.wdata
+
+  bank1.io.dbus.valid := io.dbus.valid && (dsel1 || preread)
+  bank1.io.dbus.write := io.dbus.write
+  bank1.io.dbus.wmask := Mux(io.dbus.addr(linebit), wmaskA, wmaskB)
+  bank1.io.dbus.size := io.dbus.size
+  bank1.io.dbus.addr := addrB
+  bank1.io.dbus.adrx := addrA
+  bank1.io.dbus.wdata := io.dbus.wdata
+
+  // Only the banks actually addressed gate readiness; preread hits may miss.
+  val dbusready = (bank0.io.dbus.ready || !dsel0) &&
+                  (bank1.io.dbus.ready || !dsel1)
+
+  // Read bank selection.
+  when (io.dbus.valid && dbusready && !io.dbus.write) {
+    val addr = io.dbus.addr(linebit, 0)
+    for (i <- 0 until linebytes) {
+      // reverse order to index usage
+      rsel(linebytes - 1 - i) := (addr + i.U)(linebit)
+    }
+  }
+
+  // Recursively concatenate the per-byte bank-selected read data.
+  def RData(data: UInt = 0.U(1.W), i: Int = 0): UInt = {
+    if (i < p.lsuDataBits / 8) {
+      val d0 = bank0.io.dbus.rdata(8 * i + 7, 8 * i)
+      val d1 = bank1.io.dbus.rdata(8 * i + 7, 8 * i)
+      val d = Mux(rsel(i), d1, d0)
+      val r = if (i == 0) d else Cat(d, data)
+      assert(d.getWidth == 8)
+      assert(r.getWidth == (i + 1) * 8)
+      RData(r, i + 1)
+    } else {
+      data
+    }
+  }
+
+  io.dbus.rdata := RData()
+
+  io.dbus.ready := dbusready
+
+  // dbus transaction must latch until completion.
+  val addrLatchActive = RegInit(false.B)
+  val addrLatchData = Reg(UInt(32.W))
+
+  when (io.dbus.valid && !io.dbus.ready && !addrLatchActive) {
+    addrLatchActive := true.B
+    addrLatchData := io.dbus.addr
+  } .elsewhen (addrLatchActive && io.dbus.ready) {
+    addrLatchActive := false.B
+  }
+
+  // assert(!(addrLatchActive && !io.dbus.valid)) -- do not use, allow temporary deassertion
+  assert(!(addrLatchActive && addrLatchData =/= io.dbus.addr))
+
+  // ---------------------------------------------------------------------------
+  // AXI read bus multiplexor.
+  // The top id bit routes the return beat to the originating bank.
+  val rresp0 = io.axi.read.data.bits.id(p.axi1IdBits - 1) === 0.U
+  val rresp1 = io.axi.read.data.bits.id(p.axi1IdBits - 1) === 1.U
+
+  // Fixed priority: bank0 wins when both post a read address.
+  val raxi0 = bank0.io.axi.read.addr.valid
+  val raxi1 = !raxi0
+
+  io.axi.read.addr.valid := bank0.io.axi.read.addr.valid || bank1.io.axi.read.addr.valid
+  io.axi.read.addr.bits.addr := Mux(raxi0, BankOutAddress(bank0.io.axi.read.addr.bits.addr, 0),
+                                           BankOutAddress(bank1.io.axi.read.addr.bits.addr, 1))
+  io.axi.read.addr.bits.id := Mux(raxi0, Cat(0.U(1.W), bank0.io.axi.read.addr.bits.id), Cat(1.U(1.W), bank1.io.axi.read.addr.bits.id))
+
+  bank0.io.axi.read.addr.ready := io.axi.read.addr.ready && raxi0
+  bank1.io.axi.read.addr.ready := io.axi.read.addr.ready && raxi1
+
+  bank0.io.axi.read.data.valid := io.axi.read.data.valid && rresp0
+  bank0.io.axi.read.data.bits := io.axi.read.data.bits
+
+  bank1.io.axi.read.data.valid := io.axi.read.data.valid && rresp1
+  bank1.io.axi.read.data.bits := io.axi.read.data.bits
+
+  io.axi.read.data.ready := bank0.io.axi.read.data.ready && rresp0 ||
+                            bank1.io.axi.read.data.ready && rresp1
+
+  // ---------------------------------------------------------------------------
+  // AXI write bus multiplexor.
+  val waxi0 = Wire(Bool())
+  val waxi1 = Wire(Bool())
+  val wresp0 = io.axi.write.resp.bits.id(p.axi1IdBits - 1) === 0.U
+  val wresp1 = io.axi.write.resp.bits.id(p.axi1IdBits - 1) === 1.U
+
+  // Simple fixed-priority write arbitration is active; the interleaving
+  // arbiter below is kept for reference but disabled.
+  if (true) {
+    waxi0 := bank0.io.axi.write.addr.valid
+    waxi1 := !waxi0
+  } else {
+    // Flushes interleave banks for whole line writes.
+    // Change when selected bank not active and other is active.
+    // Change on last transaction in a line write.
+    val wsel = RegInit(false.B)
+
+    when (wsel) {
+      when (bank0.io.axi.write.addr.valid && !bank1.io.axi.write.addr.valid) {
+        wsel := false.B
+      } .elsewhen (bank1.io.axi.write.addr.valid && bank1.io.axi.write.addr.ready && bank1.io.axi.write.addr.bits.id === ~0.U((p.axi1IdBits - 1).W)) {
+        wsel := false.B
+      }
+    } .otherwise {
+      when (bank1.io.axi.write.addr.valid && !bank0.io.axi.write.addr.valid) {
+        wsel := true.B
+      } .elsewhen (bank0.io.axi.write.addr.valid && bank0.io.axi.write.addr.ready && bank0.io.axi.write.addr.bits.id === ~0.U((p.axi1IdBits - 1).W)) {
+        wsel := true.B
+      }
+    }
+
+    waxi0 := wsel === false.B
+    waxi1 := wsel === true.B
+  }
+
+  io.axi.write.addr.valid := bank0.io.axi.write.addr.valid && waxi0 ||
+                             bank1.io.axi.write.addr.valid && waxi1
+  io.axi.write.addr.bits.addr := Mux(waxi0, BankOutAddress(bank0.io.axi.write.addr.bits.addr, 0),
+                                            BankOutAddress(bank1.io.axi.write.addr.bits.addr, 1))
+  io.axi.write.addr.bits.id := Mux(waxi0, Cat(0.U(1.W), bank0.io.axi.write.addr.bits.id),
+                                          Cat(1.U(1.W), bank1.io.axi.write.addr.bits.id))
+
+  io.axi.write.data.valid := bank0.io.axi.write.data.valid && waxi0 ||
+                             bank1.io.axi.write.data.valid && waxi1
+  io.axi.write.data.bits := Mux(waxi0, bank0.io.axi.write.data.bits, bank1.io.axi.write.data.bits)
+
+  bank0.io.axi.write.addr.ready := io.axi.write.addr.ready && waxi0
+  bank1.io.axi.write.addr.ready := io.axi.write.addr.ready && waxi1
+  bank0.io.axi.write.data.ready := io.axi.write.data.ready && waxi0
+  bank1.io.axi.write.data.ready := io.axi.write.data.ready && waxi1
+
+  bank0.io.axi.write.resp.valid := io.axi.write.resp.valid && wresp0
+  bank0.io.axi.write.resp.bits := io.axi.write.resp.bits
+
+  bank1.io.axi.write.resp.valid := io.axi.write.resp.valid && wresp1
+  bank1.io.axi.write.resp.bits := io.axi.write.resp.bits
+
+  io.axi.write.resp.ready := bank0.io.axi.write.resp.ready && wresp0 ||
+                             bank1.io.axi.write.resp.ready && wresp1
+
+  assert(!(io.axi.write.addr.valid && !io.axi.write.data.valid))
+  assert(!(io.axi.write.addr.valid && (io.axi.write.addr.ready =/= io.axi.write.data.ready)))
+
+  // ---------------------------------------------------------------------------
+  // Flush controls.
+  // Both banks flush in parallel; completion is the AND of their readys.
+  // bank0.io.flush.valid := io.flush.valid && bank1.io.flush.ready
+  // bank1.io.flush.valid := io.flush.valid && bank0.io.flush.ready
+  bank0.io.flush.valid := io.flush.valid
+  bank0.io.flush.all := io.flush.all
+  bank0.io.flush.clean := io.flush.clean
+
+  bank1.io.flush.valid := io.flush.valid
+  bank1.io.flush.all := io.flush.all
+  bank1.io.flush.clean := io.flush.clean
+
+  io.flush.ready := bank0.io.flush.ready && bank1.io.flush.ready
+}
+
+// One cache bank: a set-associative CAM of line addresses in flip-flops with
+// line data in a 9-bits-per-byte SRAM (8 data bits + 1 dirty-strobe bit per
+// byte, so writebacks can use sparse AXI strobes). One miss may be in flight
+// at a time; a flush FSM walks dirty lines out over the same AXI write port.
+class L1DCacheBank(p: Parameters) extends Module {
+  // A relatively simple cache block. Only one transaction may post at a time.
+  // 2^8 * 256 / 8 = 8KiB 4-way Tag[31,12] + Index[11,6] + Data[5,0]
+  val slots = p.l1dslots
+  val slotBits = log2Ceil(slots)
+  val assoc = 4 // 2, 4, 8, 16, slots
+  val sets = slots / assoc
+  val setLsb = log2Ceil(p.lsuDataBits / 8)
+  val setMsb = log2Ceil(sets) + setLsb - 1
+  val tagLsb = setMsb + 1
+  // tagMsb is 30 because the bank-select bit was stripped by L1DCache (31-bit
+  // in-bank addresses).
+  val tagMsb = 30
+
+  val io = IO(new Bundle {
+    val dbus = Flipped(new DBusIO(p, true))
+    val axi = new AxiMasterIO(p.axi1AddrBits - 1, p.axi1DataBits, p.axi1IdBits - 1)
+    val flush = Flipped(new DFlushIO(p))
+  })
+
+  // AXI memory consistency, maintain per-byte strobes.
+  val bytes = p.lsuDataBits / 8
+
+  // Pack 8-bit data plus a per-byte mask bit into 9-bit SRAM lanes.
+  def Mem8to9(d: UInt, m: UInt): UInt = {
+    assert(d.getWidth == p.lsuDataBits)
+    assert(m.getWidth == p.lsuDataBits / 8)
+    val data = Wire(Vec(bytes, UInt(9.W)))
+    for (i <- 0 until bytes) {
+      data(i) := Cat(m(i), d(7 + i * 8, 0 + i * 8))
+    }
+    data.asUInt
+  }
+
+  // Extract the 8 data bits from each 9-bit SRAM lane.
+  def Mem9to8(d: UInt): UInt = {
+    assert(d.getWidth == p.lsuDataBits * 9 / 8)
+    val data = Wire(Vec(bytes, UInt(8.W)))
+    for (i <- 0 until bytes) {
+      data(i) := d(7 + i * 9, 0 + i * 9)
+    }
+    data.asUInt
+  }
+
+  // Extract the per-byte mask bit from each 9-bit SRAM lane (writeback strobes).
+  def Mem9to1(d: UInt): UInt = {
+    assert(d.getWidth == p.lsuDataBits * 9 / 8)
+    val data = Wire(Vec(bytes, UInt(1.W)))
+    for (i <- 0 until bytes) {
+      data(i) := Cat(d(8 + i * 9))
+    }
+    data.asUInt
+  }
+
+  // Elaboration-time sanity checks on the geometry parameters.
+  val checkBit = if (p.lsuDataBits == 128) 4
+                 else if (p.lsuDataBits == 256) 5 else 6
+  assert(assoc == 2 || assoc == 4 || assoc == 8 || assoc == 16 || assoc == slots)
+  assert(assoc != 2 || setLsb == checkBit && setMsb == (checkBit + 6) && tagLsb == (checkBit + 7))
+  assert(assoc != 4 || setLsb == checkBit && setMsb == (checkBit + 5) && tagLsb == (checkBit + 6))
+  assert(assoc != 8 || setLsb == checkBit && setMsb == (checkBit + 4) && tagLsb == (checkBit + 5))
+  assert(assoc != 16 || setLsb == checkBit && setMsb == (checkBit + 3) && tagLsb == (checkBit + 4))
+  assert(assoc != slots || tagLsb == checkBit)
+
+  // External SRAM macro: 1 read/write port, per-byte (9-bit lane) write mask.
+  class Sram_1rwm_256x288 extends BlackBox {
+    val io = IO(new Bundle {
+      val clock = Input(Clock())
+      val valid = Input(Bool())
+      val write = Input(Bool())
+      val addr = Input(UInt(slotBits.W))
+      val wdata = Input(UInt((p.axi1DataBits * 9 / 8).W))
+      val wmask = Input(UInt((p.axi1DataBits * 1 / 8).W))
+      val rdata = Output(UInt((p.axi1DataBits * 9 / 8).W))
+    })
+  }
+
+  // Check io.dbus.wmask is in range of addr and size.
+  val busbytes = p.lsuDataBits / 8
+  val linemsb = log2Ceil(busbytes)
+  val chkmask0 = (~0.U(busbytes.W)) >> (busbytes.U - io.dbus.size)
+  val chkmask1 = Cat(chkmask0, chkmask0) << io.dbus.addr(linemsb - 1, 0)
+  val chkmask = chkmask1(2 * busbytes - 1, busbytes)
+  assert(!(io.dbus.valid && io.dbus.write) || (io.dbus.wmask & ~chkmask) === 0.U)
+
+  // ---------------------------------------------------------------------------
+  // CAM state.
+  // valid/dirty per slot; camaddr holds each slot's line address.
+  val valid = RegInit(VecInit(Seq.fill(slots)(false.B)))
+  val dirty = RegInit(VecInit(Seq.fill(slots)(false.B)))
+  val camaddr = Reg(Vec(slots, UInt(32.W)))
+  // val mem = Mem1RWM(slots, p.lsuDataBits * 9 / 8, 9)
+  val mem = Module(new Sram_1rwm_256x288())
+
+  // Pseudo-LRU: per-set counters; 0 = next victim, assoc-1 = most recent.
+  val history = Reg(Vec(slots / assoc, Vec(assoc, UInt(log2Ceil(assoc).W))))
+
+  val matchSet = Wire(Vec(slots, Bool()))
+  val matchAddr = Wire(Vec(assoc, Bool()))
+
+  val matchSlotB = Wire(Vec(slots, Bool()))
+  val matchSlot = matchSlotB.asUInt
+  val replaceSlotB = Wire(Vec(slots, Bool()))
+  val replaceSlot = replaceSlotB.asUInt
+
+  // OR mux lookup of associative entries.
+  def camaddrRead(i: Int, value: UInt = 0.U(32.W)): UInt = {
+    if (i < slots) {
+      camaddrRead(i + assoc, value | MuxOR(matchSet(i), camaddr(i)))
+    } else {
+      value
+    }
+  }
+
+  for (i <- 0 until assoc) {
+    val ca = camaddrRead(i)
+    matchAddr(i) := io.dbus.addr(tagMsb, tagLsb) === ca(tagMsb, tagLsb)
+  }
+
+  for (i <- 0 until slots) {
+    val set = i / assoc
+    val setMatch = if (assoc == slots) true.B else io.dbus.addr(setMsb, setLsb) === set.U
+    matchSet(i) := setMatch
+  }
+
+  for (i <- 0 until slots) {
+    val set = i / assoc
+    val index = i % assoc
+
+    matchSlotB(i) := valid(i) && matchSet(i) && matchAddr(index)
+
+    // Victim is the set's way with LRU count zero.
+    val historyMatch = history(set)(index) === 0.U
+    replaceSlotB(i) := matchSet(i) && historyMatch
+    assert((i - set * assoc) == index)
+  }
+
+  assert(PopCount(matchSlot) <= 1.U)
+  assert(PopCount(replaceSlot) <= 1.U)
+
+  val found = matchSlot =/= 0.U
+
+  // One-hot to binary slot index of the replacement victim.
+  val replaceNum = Wire(Vec(slots, UInt(slotBits.W)))
+  for (i <- 0 until slots) {
+    replaceNum(i) := MuxOR(replaceSlot(i), i.U)
+  }
+
+  val replaceId = VecOR(replaceNum, slots)
+  assert(replaceId.getWidth == slotBits)
+
+  // One-hot to binary slot index of the hit.
+  val readNum = Wire(Vec(slots, UInt(slotBits.W)))
+  for (i <- 0 until slots) {
+    readNum(i) := MuxOR(matchSlotB(i), i.U)
+  }
+  val foundId = VecOR(readNum, slots)
+
+  for (i <- 0 until slots / assoc) {
+    // Get the matched value from the OneHot encoding of the set.
+    val matchSet = matchSlot((i + 1) * assoc - 1, i * assoc)
+    assert(PopCount(matchSet) <= 1.U)
+    val matchIndices = Wire(Vec(assoc, UInt(log2Ceil(assoc).W)))
+    for (j <- 0 until assoc) {
+      matchIndices(j) := MuxOR(matchSet(j), j.U)
+    }
+    val matchIndex = VecOR(matchIndices, assoc)
+    assert(matchIndex.getWidth == log2Ceil(assoc))
+    val matchValue = history(i)(matchIndex)
+
+    // History based on count values so that high set size has less DFF usage.
+    when (io.dbus.valid && io.dbus.ready && (if (assoc == slots) true.B else io.dbus.addr(setMsb, setLsb) === i.U)) {
+      for (j <- 0 until assoc) {
+        when (matchSet(j)) {
+          history(i)(j) := (assoc - 1).U
+        } .elsewhen (history(i)(j) > matchValue) {
+          history(i)(j) := history(i)(j) - 1.U
+          assert(history(i)(j) > 0.U)
+        }
+      }
+    }
+  }
+
+  // Reset history to unique values within sets.
+  // Must be placed below all other assignments.
+  // Note the definition is Reg() so will generate an asynchronous reset.
+  when (reset.asBool) {
+    for (i <- 0 until slots / assoc) {
+      for (j <- 0 until assoc) {
+        history(i)(j) := j.U
+      }
+    }
+  }
+
+  // These checks are extremely slow to compile.
+  if (false) {
+    for (i <- 0 until slots / assoc) {
+      for (j <- 0 until assoc) {
+        for (k <- 0 until assoc) {
+          if (j != k) {
+            assert(history(i)(j) =/= history(i)(k))
+          }
+        }
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Flush interface.
+  object FlushState extends ChiselEnum {
+    val sNone, sCapture, sProcess, sMemwaddr, sMemwdata, sAxiready, sAxiresp, sEnd = Value
+  }
+
+  val fstate = RegInit(FlushState.sNone)
+  // flush(i): slot i still needs to be written back by the flush walk.
+  val flush = RegInit(VecInit(Seq.fill(slots)(false.B)))
+
+  // ---------------------------------------------------------------------------
+  // AXI interface.
+  // ractive: miss fill in flight; wactive: victim writeback in flight.
+  val ractive = RegInit(false.B)
+  val wactive = RegInit(false.B)
+  val active = ractive || wactive
+
+  assert(!(ractive && fstate =/= FlushState.sNone))
+
+  val axiraddrvalid = RegInit(false.B)
+  val axirdataready = RegInit(false.B)
+
+  // Two-cycle SRAM readout pulse chain for writeback data capture.
+  val memwaddrEn = RegInit(false.B)
+  val memwdataEn = RegInit(false.B)
+  val axiwaddrvalid = RegInit(false.B)
+  val axiwdatavalid = RegInit(false.B)
+  val axiwdatabuf = Reg(UInt(p.axi1DataBits.W))
+  val axiwstrbbuf = Reg(UInt((p.axi1DataBits / 8).W))
+
+  val axiraddr = Reg(UInt(32.W))
+  val axiwaddr = Reg(UInt(32.W))
+
+  val replaceIdReg = Reg(UInt(slotBits.W))
+
+  val alignedAddr = Cat(io.dbus.addr(tagMsb, setLsb), 0.U(setLsb.W))
+
+  // Miss: start the fill, and a writeback too when the victim is dirty.
+  when (io.dbus.valid && !io.dbus.ready && !active) {
+    ractive := true.B
+    wactive := dirty(replaceId)
+    assert(!(dirty(replaceId) && !valid(replaceId)))
+    axiraddrvalid := true.B
+    axirdataready := true.B
+    valid(replaceId) := false.B
+    dirty(replaceId) := false.B
+    replaceIdReg := replaceId
+    camaddr(replaceId) := alignedAddr
+    axiraddr := alignedAddr
+    axiwaddr := camaddr(replaceId)
+  }
+
+  // Writeback pulsed controls to memory.
+  memwaddrEn := io.dbus.valid && !io.dbus.ready && !active && dirty(replaceId)
+  memwdataEn := memwaddrEn
+
+  when (io.dbus.valid && io.dbus.ready && io.dbus.write) {
+    dirty(foundId) := true.B
+  }
+
+  when (io.axi.read.addr.valid && io.axi.read.addr.ready) {
+    axiraddrvalid := false.B
+  }
+
+  when (io.axi.read.data.valid && io.axi.read.data.ready) {
+    valid(replaceIdReg) := true.B
+    axirdataready := false.B
+    ractive := false.B
+  }
+
+  // Capture the victim line (data + dirty strobes) from the SRAM readout.
+  when (memwdataEn) {
+    val rdata = mem.io.rdata
+    axiwdatabuf := Mem9to8(rdata)
+    axiwstrbbuf := Mem9to1(rdata)
+    axiwaddrvalid := true.B
+    axiwdatavalid := true.B
+  }
+
+  when (io.axi.write.addr.valid && io.axi.write.addr.ready) {
+    axiwaddrvalid := false.B
+  }
+
+  when (io.axi.write.data.valid && io.axi.write.data.ready) {
+    axiwdatavalid := false.B
+  }
+
+  when (io.axi.write.resp.valid && io.axi.write.resp.ready) {
+    wactive := false.B
+  }
+
+  io.axi.read.addr.valid := axiraddrvalid
+  io.axi.read.addr.bits.addr := axiraddr
+  io.axi.read.addr.bits.id := 0.U
+  io.axi.read.data.ready := axirdataready
+  assert(!(io.axi.read.data.valid && !io.axi.read.data.ready))
+
+  io.axi.write.addr.valid := axiwaddrvalid
+  io.axi.write.addr.bits.id := 0.U
+  io.axi.write.addr.bits.addr := axiwaddr
+
+  io.axi.write.resp.ready := true.B
+
+  io.axi.write.data.valid := axiwdatavalid
+  io.axi.write.data.bits.data := axiwdatabuf.asUInt
+  io.axi.write.data.bits.strb := axiwstrbbuf.asUInt
+
+  assert(!(io.axi.read.addr.valid && !ractive))
+  assert(!(io.axi.read.data.ready && !ractive))
+  assert(!(io.axi.write.addr.valid && !wactive && fstate === FlushState.sNone))
+
+  // ---------------------------------------------------------------------------
+  // Axi Write Response Count.
+  // Outstanding write responses; flush completion waits for this to drain.
+  val wrespcnt = RegInit(0.U((slotBits + 1).W))
+  val wrespinc = io.axi.write.addr.valid && io.axi.write.addr.ready
+  val wrespdec = io.axi.write.resp.valid && io.axi.write.resp.ready
+
+  when (wrespinc && !wrespdec) {
+    wrespcnt := wrespcnt + 1.U
+  } .elsewhen (!wrespinc && wrespdec) {
+    wrespcnt := wrespcnt - 1.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Flush interface.
+  // Next dirty slot to write back (count-trailing-zeros of the pending mask).
+  val flushId = Ctz(flush.asUInt)(slotBits - 1, 0)
+
+  for (i <- 0 until slots) {
+    assert(!(flush(i) && !dirty(i)))
+  }
+
+  switch(fstate) {
+    is (FlushState.sNone) {
+      // Only start once all miss/writeback machinery is idle.
+      when (io.flush.valid && !axiwaddrvalid && !axiwdatavalid && !axiraddrvalid && !axirdataready) {
+        fstate := FlushState.sCapture
+        replaceIdReg := foundId
+      }
+    }
+
+    is (FlushState.sCapture) {
+      fstate := FlushState.sProcess
+      flush(replaceIdReg) := dirty(replaceIdReg) // matched (without .all)
+      when (io.flush.all) {
+        for (i <- 0 until slots) {
+          flush(i) := dirty(i)
+        }
+      }
+    }
+
+    is (FlushState.sProcess) {
+      // Loop: pick the next pending slot, or finish when none remain.
+      when (flush.asUInt === 0.U) {
+        fstate := FlushState.sAxiresp
+      } .otherwise {
+        fstate := FlushState.sMemwaddr
+        memwaddrEn := true.B
+      }
+      replaceIdReg := flushId
+    }
+
+    is (FlushState.sMemwaddr) {
+      assert(memwaddrEn)
+      fstate := FlushState.sMemwdata
+      axiwaddr := camaddr(replaceIdReg)
+      flush(replaceIdReg) := false.B
+      dirty(replaceIdReg) := false.B
+      when (io.flush.clean) {
+        valid(replaceIdReg) := false.B
+      }
+    }
+
+    is (FlushState.sMemwdata) {
+      assert(memwdataEn)
+      fstate := FlushState.sAxiready
+    }
+
+    is (FlushState.sAxiready) {
+      // Wait for both AW and W beats of this writeback to be accepted.
+      when ((!axiwaddrvalid || io.axi.write.addr.valid && io.axi.write.addr.ready) &&
+            (!axiwdatavalid || io.axi.write.data.valid && io.axi.write.data.ready)) {
+        fstate := FlushState.sProcess
+      }
+    }
+
+    is (FlushState.sAxiresp) {
+      when (wrespcnt === 0.U) {
+        fstate := FlushState.sEnd
+      }
+    }
+
+    is (FlushState.sEnd) {
+      // Must complete the handshake as there are multiple banks.
+      when (io.flush.ready && !io.flush.valid) {
+        fstate := FlushState.sNone
+      }
+      when (io.flush.clean) {
+        when (io.flush.all) {
+          for (i <- 0 until slots) {
+            valid(i) := false.B
+            assert(!dirty(i))
+            assert(!flush(i))
+          }
+        }
+      }
+    }
+  }
+
+  io.flush.ready := fstate === FlushState.sEnd
+
+  assert(!(io.flush.valid && io.dbus.valid))
+
+  // ---------------------------------------------------------------------------
+  // Core Data Bus.
+  io.dbus.ready := found && !ractive
+  io.dbus.rdata := Mem9to8(mem.io.rdata)
+  assert(!(io.dbus.valid && io.dbus.size === 0.U))
+
+  // ---------------------------------------------------------------------------
+  // Memory controls.
+  // Single SRAM port shared by: bus read/write hits, miss fill writes, and
+  // flush/victim readouts.
+  val axiwrite = memwaddrEn
+  val axiread = io.axi.read.data.valid && io.axi.read.data.ready
+  val buswrite = io.dbus.valid && io.dbus.ready && io.dbus.write
+  val busread = io.dbus.valid && !io.dbus.write && !ractive
+
+  val wdbits = p.axi1DataBits
+  val wmbits = p.axi1DataBits / 8
+  val id = io.axi.read.data.bits.id
+  val rsel = axirdataready
+  mem.io.clock := clock
+  mem.io.valid := busread || buswrite || axiread || axiwrite
+  mem.io.write := rsel && !axiwrite || io.dbus.valid && io.dbus.write && !ractive
+  mem.io.addr := Mux(rsel || axiwrite, replaceIdReg, foundId)
+  // Fill writes set all lanes with clean strobes; bus writes set dirty ones.
+  mem.io.wmask := Mux(rsel, ~0.U(wmbits.W), io.dbus.wmask)
+  mem.io.wdata := Mux(rsel, Mem8to9(io.axi.read.data.bits.data, 0.U(wmbits.W)),
+                            Mem8to9(io.dbus.wdata, ~0.U(wmbits.W)))
+
+  assert(PopCount(busread +& buswrite +& axiread) <= 1.U)
+}
+
+// Standalone Verilog emitters for the cache and a single bank.
+object EmitL1DCache extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new L1DCache(p), args)
+}
+
+object EmitL1DCacheBank extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new L1DCacheBank(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/L1ICache.scala b/hdl/chisel/src/kelvin/L1ICache.scala
new file mode 100644
index 0000000..cc20969
--- /dev/null
+++ b/hdl/chisel/src/kelvin/L1ICache.scala
@@ -0,0 +1,256 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Factory: instantiates an L1ICache module (must be called inside a Chisel
+// builder context).
+object L1ICache {
+  def apply(p: Parameters): L1ICache = {
+    return Module(new L1ICache(p))
+  }
+}
+
+// L1 instruction cache: CAM tag lookup over `slots` cache lines backed by a
+// single-port SRAM, refilled over the AXI read channel. Replacement is a
+// per-set aging scheme (counter-based pseudo-LRU).
+class L1ICache(p: Parameters) extends Module {
+  // A relatively simple cache block. Only one transaction may post at a time.
+  // 2^8 * 256 / 8 = 8KiB 4-way Tag[31,12] + Index[11,6] + Data[5,0]
+  assert(p.axi0IdBits == 4)
+  assert(p.axi0DataBits == 256)
+
+  val slots = p.l1islots
+  val slotBits = log2Ceil(slots)
+  val assoc = 4 // 2, 4, 8, 16, slots
+  val sets = slots / assoc
+  val setLsb = log2Ceil(p.fetchDataBits / 8)
+  val setMsb = log2Ceil(sets) + setLsb - 1
+  val tagLsb = setMsb + 1
+  val tagMsb = 31
+
+  val io = IO(new Bundle {
+    val ibus = Flipped(new IBusIO(p))
+    val flush = Flipped(new IFlushIO(p))
+    val axi = new Bundle {
+      val read = new AxiMasterReadIO(p.axi0AddrBits, p.axi0DataBits, p.axi0IdBits)
+    }
+  })
+
+  // Elaboration-time sanity checks of the tag/index split for each supported
+  // associativity (asserts evaluate while building, not in hardware).
+  assert(assoc == 2 || assoc == 4 || assoc == 8 || assoc == 16 || assoc == slots)
+  assert(assoc != 2 || setLsb == 5 && setMsb == 11 && tagLsb == 12)
+  assert(assoc != 4 || setLsb == 5 && setMsb == 10 && tagLsb == 11)
+  assert(assoc != 8 || setLsb == 5 && setMsb == 9 && tagLsb == 10)
+  assert(assoc != 16 || setLsb == 5 && setMsb == 8 && tagLsb == 9)
+  assert(assoc != slots || tagLsb == 5)
+
+  // Single-port SRAM macro: one cache line per slot address.
+  class Sram_1rw_256x256 extends BlackBox {
+    val io = IO(new Bundle {
+      val clock = Input(Clock())
+      val valid = Input(Bool())
+      val write = Input(Bool())
+      val addr = Input(UInt(slotBits.W))
+      val wdata = Input(UInt(p.axi0DataBits.W))
+      val rdata = Output(UInt(p.axi0DataBits.W))
+    })
+  }
+
+  // ---------------------------------------------------------------------------
+  // CAM state.
+  val valid = RegInit(VecInit(Seq.fill(slots)(false.B)))
+  val camaddr = Reg(Vec(slots, UInt(32.W)))
+  // val mem = Mem1RW(slots, UInt(p.axi0DataBits.W))
+  val mem = Module(new Sram_1rw_256x256())
+
+  // Per-set replacement ages: 0 = next victim, assoc-1 = most recently used.
+  val history = Reg(Vec(slots / assoc, Vec(assoc, UInt(log2Ceil(assoc).W))))
+
+  val matchSet = Wire(Vec(slots, Bool()))
+  val matchAddr = Wire(Vec(assoc, Bool()))
+
+  val matchSlotB = Wire(Vec(slots, Bool()))
+  val matchSlot = matchSlotB.asUInt
+  val replaceSlotB = Wire(Vec(slots, Bool()))
+  val replaceSlot = replaceSlotB.asUInt
+
+  // OR mux lookup of associative entries.
+  def camaddrRead(i: Int, value: UInt = 0.U(32.W)): UInt = {
+    if (i < slots) {
+      camaddrRead(i + assoc, value | MuxOR(matchSet(i), camaddr(i)))
+    } else {
+      value
+    }
+  }
+
+  // Compare the fetch tag against each way's CAM address within the set.
+  for (i <- 0 until assoc) {
+    val ca = camaddrRead(i)
+    matchAddr(i) := io.ibus.addr(tagMsb, tagLsb) === ca(tagMsb, tagLsb)
+  }
+
+  for (i <- 0 until slots) {
+    val set = i / assoc
+    val setMatch = if (assoc == slots) true.B else io.ibus.addr(setMsb, setLsb) === set.U
+    matchSet(i) := setMatch
+  }
+
+  for (i <- 0 until slots) {
+    val set = i / assoc
+    val index = i % assoc
+
+    matchSlotB(i) := valid(i) && matchSet(i) && matchAddr(index)
+
+    // Age zero marks the replacement victim within the indexed set.
+    val historyMatch = history(set)(index) === 0.U
+    replaceSlotB(i) := matchSet(i) && historyMatch
+    assert((i - set * assoc) == index)
+  }
+
+  assert(PopCount(matchSlot) <= 1.U)
+  assert(PopCount(replaceSlot) <= 1.U)
+
+  val found = io.ibus.valid && matchSlot =/= 0.U
+
+  // Encode the one-hot slot masks into binary slot numbers via OR-mux.
+  val replaceNum = Wire(Vec(slots, UInt(slotBits.W)))
+  for (i <- 0 until slots) {
+    replaceNum(i) := MuxOR(replaceSlot(i), i.U)
+  }
+
+  val replaceId = VecOR(replaceNum, slots)
+  assert(replaceId.getWidth == slotBits)
+
+  val readNum = Wire(Vec(slots, UInt(slotBits.W)))
+  for (i <- 0 until slots) {
+    readNum(i) := MuxOR(matchSlotB(i), i.U)
+  }
+  val readId = VecOR(readNum, slots)
+
+  for (i <- 0 until slots / assoc) {
+    // Get the matched value from the OneHot encoding of the set.
+    val matchSet = matchSlot((i + 1) * assoc - 1, i * assoc)
+    assert(PopCount(matchSet) <= 1.U)
+    val matchIndices = Wire(Vec(assoc, UInt(log2Ceil(assoc).W)))
+    for (j <- 0 until assoc) {
+      matchIndices(j) := MuxOR(matchSet(j), j.U)
+    }
+    val matchIndex = VecOR(matchIndices, assoc)
+    assert(matchIndex.getWidth == log2Ceil(assoc))
+    val matchValue = history(i)(matchIndex)
+
+    // History based on count values so that high set size has less DFF usage.
+    // On a hit, the touched way becomes most-recent (assoc-1) and every way
+    // older than it ages down by one, keeping ages unique within the set.
+    when (io.ibus.valid && io.ibus.ready && (if (assoc == slots) true.B else io.ibus.addr(setMsb, setLsb) === i.U)) {
+      for (j <- 0 until assoc) {
+        when (matchSet(j)) {
+          history(i)(j) := (assoc - 1).U
+        } .elsewhen (history(i)(j) > matchValue) {
+          history(i)(j) := history(i)(j) - 1.U
+          assert(history(i)(j) > 0.U)
+        }
+      }
+    }
+  }
+
+  // Reset history to unique values within sets.
+  // Must be placed below all other assignments.
+  // Note the definition is Reg() so will generate an asynchronous reset.
+  when (reset.asBool) {
+    for (i <- 0 until slots / assoc) {
+      for (j <- 0 until assoc) {
+        history(i)(j) := j.U
+      }
+    }
+  }
+
+  // These checks are extremely slow to compile.
+  if (false) {
+    for (i <- 0 until slots / assoc) {
+      for (j <- 0 until assoc) {
+        for (k <- 0 until assoc) {
+          if (j != k) {
+            assert(history(i)(j) =/= history(i)(k))
+          }
+        }
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Core Instruction Bus.
+  io.ibus.ready := found
+
+  io.ibus.rdata := mem.io.rdata
+
+  // ---------------------------------------------------------------------------
+  // axi interface.
+  val axivalid = RegInit(false.B) // io.axi.read.addr.valid
+  val axiready = RegInit(false.B) // io.axi.read.data.ready
+  val axiaddr = Reg(UInt(32.W))
+
+  val replaceIdReg = Reg(UInt(slotBits.W))
+
+  // Miss with no AXI transaction in flight: latch the victim slot, then the
+  // two when-blocks below launch the address phase and arm the data phase.
+  when (io.ibus.valid && !io.ibus.ready && !axivalid && !axiready) {
+    replaceIdReg := replaceId
+  }
+
+  when (io.axi.read.addr.valid && io.axi.read.addr.ready) {
+    axivalid := false.B
+  } .elsewhen (io.ibus.valid && !io.ibus.ready && !axivalid && !axiready) {
+    axivalid := true.B
+  }
+
+  when (io.axi.read.data.valid && io.axi.read.data.ready) {
+    axiready := false.B
+  } .elsewhen (io.axi.read.addr.valid && io.axi.read.addr.ready && !axiready) {
+    axiready := true.B
+  }
+
+  // Valid bits: flush clears all lines; a miss invalidates the victim; fill
+  // completion revalidates it. (Chisel last-connect: flush wins.)
+  when (io.flush.valid) {
+    for (i <- 0 until slots) {
+      valid(i) := false.B
+    }
+  } .elsewhen (io.ibus.valid && !io.ibus.ready && !axivalid && !axiready) {
+    valid(replaceId) := false.B
+  } .elsewhen (io.axi.read.data.valid && io.axi.read.data.ready) {
+    valid(replaceIdReg) := true.B
+  }
+
+  // Fill address: line-aligned at miss time, then incremented per data beat.
+  when (io.ibus.valid && !io.ibus.ready && !axivalid && !axiready) {
+    val alignedAddr = Cat(io.ibus.addr(31, setLsb), 0.U(setLsb.W))
+    axiaddr := alignedAddr
+    camaddr(replaceId) := alignedAddr
+  } .elsewhen (io.axi.read.addr.valid && io.axi.read.addr.ready) {
+    axiaddr := axiaddr + (p.axi0DataBits / 8).U
+  }
+
+  io.axi.read.defaults()
+  io.axi.read.addr.valid := axivalid
+  io.axi.read.addr.bits.addr := axiaddr
+  io.axi.read.addr.bits.id := 0.U
+  io.axi.read.data.ready := axiready
+
+  io.flush.ready := true.B
+
+  // IBus transaction must latch until completion.
+  val addrLatchActive = RegInit(false.B)
+  val addrLatchData = Reg(UInt(32.W))
+
+  when (io.flush.valid) {
+    addrLatchActive := false.B
+  } .elsewhen (io.ibus.valid && !io.ibus.ready && !addrLatchActive) {
+    addrLatchActive := true.B
+    addrLatchData := io.ibus.addr
+  } .elsewhen (addrLatchActive && io.ibus.ready) {
+    addrLatchActive := false.B
+  }
+
+  assert(!(addrLatchActive && !io.ibus.valid))
+  assert(!(addrLatchActive && addrLatchData =/= io.ibus.addr))
+
+  // ---------------------------------------------------------------------------
+  // Memory controls. The single SRAM port is shared: fill writes (data beat
+  // accepted) take the port via `axiready`, otherwise ibus lookups read it.
+  val memwrite = io.axi.read.data.valid && io.axi.read.data.ready
+  val memread = io.ibus.valid && !axivalid && !axiready
+  mem.io.clock := clock
+  mem.io.valid := memread || memwrite
+  mem.io.write := axiready
+  mem.io.addr := Mux(axiready, replaceIdReg, readId)
+  mem.io.wdata := io.axi.read.data.bits.data
+}
+
+// Command-line entry point: elaborates the L1ICache and emits Verilog.
+object EmitL1ICache extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new L1ICache(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/Library.scala b/hdl/chisel/src/kelvin/Library.scala
new file mode 100644
index 0000000..f0e04d5
--- /dev/null
+++ b/hdl/chisel/src/kelvin/Library.scala
@@ -0,0 +1,331 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+// Gate: returns `data` when `valid`, otherwise zero/false.
+// NOTE(review): identical to MuxOR below — consider consolidating.
+object Mux0 {
+  def apply(valid: Bool, data: UInt): UInt = {
+    Mux(valid, data, 0.U(data.getWidth))
+  }
+
+  def apply(valid: Bool, data: Bool): Bool = {
+    Mux(valid, data, false.B)
+  }
+}
+
+// Gate used to build OR-mux trees: each candidate is masked to zero unless
+// selected, so the results can be combined with `|`.
+object MuxOR {
+  def apply(valid: Bool, data: UInt): UInt = {
+    Mux(valid, data, 0.U(data.getWidth))
+  }
+
+  def apply(valid: Bool, data: Bool): Bool = {
+    Mux(valid, data, false.B)
+  }
+}
+
+// Unsigned minimum of two equal-width values.
+object Min {
+  def apply(a: UInt, b: UInt): UInt = {
+    assert(a.getWidth == b.getWidth)
+    Mux(a < b, a, b)
+  }
+}
+
+// Unsigned maximum of two equal-width values.
+object Max {
+  def apply(a: UInt, b: UInt): UInt = {
+    assert(a.getWidth == b.getWidth)
+    Mux(a > b, a, b)
+  }
+}
+
+// Replicates a single bit n times, producing an n-bit value.
+object Repeat {
+  def apply(b: Bool, n: Int): UInt = {
+    val r = VecInit(Seq.fill(n)(b))
+    r.asUInt
+  }
+}
+
+// Sign-extends `v` to `n` bits by replicating its top bit.
+object SignExt {
+  def apply(v: UInt, n: Int): UInt = {
+    val s = v.getWidth
+    val r = Cat(Repeat(v(s - 1), n - s), v)
+    assert(r.getWidth == n)
+    r.asUInt
+  }
+}
+
+// ORs vector lanes.
+// Vec(4, UInt(7.W)) -> UInt(7.W)
+// for (i <- 0 until count) out |= in(i)
+// ORs vector lanes.
+// Vec(4, UInt(7.W)) -> UInt(7.W)
+// for (i <- 0 until count) out |= in(i)
+// The 4-argument overloads are the recursive workers; the public entry points
+// supply the accumulator (zero/false) and optional element count.
+object VecOR {
+  def apply(vec: Vec[UInt], count: Int, index: Int, bits: UInt): UInt = {
+    if (index < count) {
+      apply(vec, count, index+1, bits | vec(index))
+    } else {
+      bits
+    }
+  }
+
+  def apply(vec: Vec[Bool], count: Int, index: Int, bits: Bool): Bool = {
+    if (index < count) {
+      apply(vec, count, index+1, bits || vec(index))
+    } else {
+      bits
+    }
+  }
+
+  def apply(vec: Vec[UInt], count: Int): UInt = {
+    apply(vec, count, 0, 0.U)
+  }
+
+  def apply(vec: Vec[Bool], count: Int): Bool = {
+    apply(vec, count, 0, false.B)
+  }
+
+  def apply(vec: Vec[UInt]): UInt = {
+    val count = vec.length
+    apply(vec, count, 0, 0.U)
+  }
+
+  def apply(vec: Vec[Bool]): Bool = {
+    val count = vec.length
+    apply(vec, count, 0, false.B)
+  }
+}
+
+// Masks a vector so that only the element at `index` keeps its value; all
+// other lanes are forced to zero. Building block for OR-mux dynamic indexing.
+object IndexMask {
+  def apply(data: Vec[UInt], index: UInt): Vec[UInt] = {
+    val count = data.length
+    val width = data(0).getWidth.W
+    val value = Wire(Vec(count, UInt(width)))
+    for (i <- 0 until count) {
+      value(i) := Mux(i.U === index, data(i), 0.U)
+    }
+    value
+  }
+}
+
+// Binary-tree OR reduction of a vector, halving the lane count per level
+// (the odd leftover lane passes through) until one value remains.
+object OrReduce {
+  def apply(data: Vec[UInt]): UInt = {
+    if (data.length > 1) {
+      val count = data.length / 2
+      val odd = data.length & 1
+      val width = data(0).getWidth.W
+      val value = Wire(Vec(count + odd, UInt(width)))
+      for (i <- 0 until count) {
+        value(i) := data(2 * i + 0) | data(2 * i + 1)
+      }
+      if (odd != 0) {
+        value(count) := data(2 * count)
+      }
+      OrReduce(value)
+    } else {
+      data(0)
+    }
+  }
+}
+
+// Dynamic vector indexing via IndexMask + OrReduce (mask-and-OR instead of a
+// mux tree). Requires the vector to cover the full index space.
+object VecAt {
+  def apply(data: Vec[Bool], index: UInt): Bool = {
+    assert(data.length == (1 << index.getWidth))
+    val count = data.length
+    val dataUInt = Wire(Vec(count, UInt(1.W)))
+    for (i <- 0 until count) {
+      dataUInt(i) := data(i)
+    }
+    OrReduce(IndexMask(dataUInt, index)) =/= 0.U
+  }
+
+  def apply(data: Vec[UInt], index: UInt): UInt = {
+    assert(data.length == (1 << index.getWidth))
+    OrReduce(IndexMask(data, index))
+  }
+}
+
+// Dynamic bit-select from a UInt, same mask-and-OR scheme as VecAt.
+// Requires the UInt width to cover the full index space.
+object BoolAt {
+  def apply(udata: UInt, index: UInt): Bool = {
+    assert(udata.getWidth == (1 << index.getWidth))
+    val width = udata.getWidth
+    val data = Wire(Vec(width, UInt(1.W)))
+    for (i <- 0 until width) {
+      data(i) := udata(i)
+    }
+    OrReduce(IndexMask(data, index)) =/= 0.U
+  }
+}
+
+// AND-reduction of all bits, built as an explicit balanced binary tree
+// (an odd leftover lane passes through each level unchanged).
+object WiredAND {
+  def apply(bits: UInt): Bool = {
+    WiredAND(VecInit(bits.asBools))
+  }
+
+  def apply(bits: Vec[Bool]): Bool = {
+    val count = bits.length
+    if (count > 1) {
+      val limit = (count + 1) / 2
+      val value = Wire(Vec(limit, Bool()))
+      for (i <- 0 until limit) {
+        if (i * 2 + 1 >= count) {
+          value(i) := bits(2 * i + 0)
+        } else {
+          value(i) := bits(2 * i + 0) & bits(2 * i + 1)
+        }
+      }
+      WiredAND(value)
+    } else {
+      bits(0)
+    }
+  }
+}
+
+// OR-reduction of all bits, same balanced-tree construction as WiredAND.
+object WiredOR {
+  def apply(bits: UInt): Bool = {
+    WiredOR(VecInit(bits.asBools))
+  }
+
+  def apply(bits: Vec[Bool]): Bool = {
+    val count = bits.length
+    if (count > 1) {
+      val limit = (count + 1) / 2
+      val value = Wire(Vec(limit, Bool()))
+      for (i <- 0 until limit) {
+        if (i * 2 + 1 >= count) {
+          value(i) := bits(2 * i + 0)
+        } else {
+          value(i) := bits(2 * i + 0) | bits(2 * i + 1)
+        }
+      }
+      WiredOR(value)
+    } else {
+      bits(0)
+    }
+  }
+}
+
+// Binary-to-one-hot encode; thin wrapper over chisel3.util.UIntToOH.
+// The commented-out manual expansion is kept for reference/debug.
+object OneHot {
+  def apply(bits: UInt, count: Int): UInt = {
+    // // UIntToOH(bits, count)
+    // val bools = Wire(Vec(count, Bool()))
+    // for (i <- 0 until count) {
+    //   bools(i) := bits === i.U
+    // }
+    // val r = bools.asUInt
+    // assert(r.getWidth == count)
+    // r
+
+    UIntToOH(bits, count)
+  }
+}
+
+// Page mask for two address ranges, factoring unaligned address overflow.
+object PageMaskShift {
+ def apply(address: UInt, length: UInt): UInt = {
+ assert(address.getWidth == 32)
+
+ // Find the power2 page size that contains the range offset+length.
+ // The address width is one less than length, as we want to use the
+ // page base of zero and length to match the page size.
+ val psel = Cat((address(9,0) +& length) <= 1024.U,
+ (address(8,0) +& length) <= 512.U,
+ (address(7,0) +& length) <= 256.U,
+ (address(6,0) +& length) <= 128.U,
+ (address(5,0) +& length) <= 64.U,
+ (address(4,0) +& length) <= 32.U,
+ (address(3,0) +& length) <= 16.U,
+ (address(2,0) +& length) <= 8.U,
+ (address(1,0) +& length) <= 4.U)
+
+ val pshift =
+ Mux(psel(0), 2.U, Mux(psel(1), 3.U, Mux(psel(2), 4.U, Mux(psel(3), 5.U,
+ Mux(psel(4), 6.U, Mux(psel(5), 7.U, Mux(psel(6), 8.U, Mux(psel(7), 9.U,
+ Mux(psel(8), 10.U, 0.U)))))))))
+
+ // Determine the longest run of lsb 1's. We OR 1's of the address lsb so
+ // that base+length overflow ripple extends as far as needed.
+ // Include an additional lsb 1 to round us to the next page size, as we will
+ // not perform in page test beyond the segmentBits size.
+ val addrmask = Cat(address(31,10), ~0.U(10.W), 1.U(1.W))
+ val cto = PriorityEncoder(~addrmask)
+ assert(cto.getWidth == 6)
+
+ // Mask shift value.
+ val shift = Mux(psel =/= 0.U, pshift, cto)
+ assert(shift.getWidth == 6)
+
+ shift
+ }
+}
+
+// Count trailing ones (position of the lowest 0 bit; the prepended 1
+// bounds the search when all bits are set).
+object Cto {
+  def apply(bits: UInt): UInt = {
+    PriorityEncoder(Cat(1.U(1.W), ~bits))
+  }
+}
+
+// Count trailing zeros (position of the lowest 1 bit; the prepended 1
+// bounds the search when no bit is set).
+object Ctz {
+  def apply(bits: UInt): UInt = {
+    PriorityEncoder(Cat(1.U(1.W), bits))
+  }
+}
+
+// Unused
+// Count leading bits equal to the sign bit (leading ones if negative,
+// leading zeros otherwise).
+object Clb {
+  def apply(bits: UInt): UInt = {
+    val clo = Clo(bits)
+    val clz = Clz(bits)
+    Mux(bits(bits.getWidth - 1), clo, clz)
+  }
+}
+
+// Unused
+// Count leading ones: invert, reverse, then priority-encode.
+object Clo {
+  def apply(bits: UInt): UInt = {
+    PriorityEncoder(Cat(1.U(1.W), Reverse(~bits)))
+  }
+}
+
+// Count leading zeros: reverse the bits, then priority-encode.
+object Clz {
+  def apply(bits: UInt): UInt = {
+    PriorityEncoder(Cat(1.U(1.W), Reverse(bits)))
+  }
+}
+
+// Word-wise count-trailing-zeros over a multiple-of-32-bit value: scans one
+// 32-bit word per recursion level; z(5) set means the word was all zeros.
+object WCtz {
+  def apply(bits: UInt, offset: Int = 0): UInt = {
+    assert((bits.getWidth % 32) == 0)
+    val z = Ctz(bits(31, 0))
+    val v = z | offset.U
+    assert(z.getWidth == 6)
+    if (bits.getWidth > 32) {
+      Mux(!z(5), v, WCtz(bits(bits.getWidth - 1, 32), offset + 32))
+    } else {
+      Mux(!z(5), v, (offset + 32).U)
+    }
+  }
+}
+
+// Matches `inst` against a 32-character bit-pattern string, MSB first.
+//   '0' — bit must be clear    '1' — bit must be set
+//   'x' — don't care           '_' — visual spacer, consumes no bit
+// Returns a Bool true when every constrained bit matches. `v` and `index`
+// are recursion accumulators; callers should use the defaults.
+object DecodeBits {
+  def apply(inst: UInt, bitPattern: String, v: Bool = true.B, index: Int = 31):
+      Bool = {
+    if (bitPattern.length > 0) {
+      if (bitPattern(0) == '0') {
+        val bit = ~inst(index)
+        DecodeBits(inst, bitPattern.drop(1), v && bit, index - 1)
+      } else if (bitPattern(0) == '1') {
+        val bit = inst(index)
+        DecodeBits(inst, bitPattern.drop(1), v && bit, index - 1)
+      } else if (bitPattern(0) == 'x') {
+        // Don't-care: consume the instruction bit without constraining it.
+        DecodeBits(inst, bitPattern.drop(1), v, index - 1)
+      } else if (bitPattern(0) == '_') {
+        // Spacer for readability; consumes no instruction bit.
+        DecodeBits(inst, bitPattern.drop(1), v, index)
+      } else {
+        // Invalid pattern character: fail at elaboration time with context.
+        assert(false, "invalid DecodeBits pattern character: " + bitPattern(0))
+        v
+      }
+    } else {
+      // The pattern must consume exactly the full 32-bit instruction.
+      assert(index == -1)
+      v
+    }
+  }
+}
diff --git a/hdl/chisel/src/kelvin/Parameters.scala b/hdl/chisel/src/kelvin/Parameters.scala
new file mode 100644
index 0000000..2024883
--- /dev/null
+++ b/hdl/chisel/src/kelvin/Parameters.scala
@@ -0,0 +1,72 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+// Global hardware configuration for the Kelvin core. The SIMD width (and
+// thus the core variant) is chosen by the KELVIN_SIMD environment variable
+// at elaboration time, defaulting to 256.
+case class Parameters() {
+  case object Core {
+    val tiny = 0
+    val little = 1
+    val big = 2
+  }
+
+  // NOTE(review): any KELVIN_SIMD value other than 128/256/512 throws a
+  // scala.MatchError here — consider a match default with a clearer message.
+  val core = sys.env.get("KELVIN_SIMD").getOrElse("256").toInt match {
+    case 128 => Core.tiny
+    case 256 => Core.little
+    case 512 => Core.big
+  }
+
+  // Machine.
+  val programCounterBits = 32
+  val instructionBits = 32
+  val instructionLanes = 4
+
+  // Vector Length (register-file and compute).
+  val vectorBits = core match {
+    case Core.tiny => 128
+    case Core.little => 256
+    case Core.big => 512
+  }
+
+  val vectorCountBits = log2Ceil(vectorBits / 8) + 1 + 2  // +2 stripmine
+  assert(vectorBits == 512 || vectorBits == 256
+      || vectorBits == 128)  // 128 = faster builds, but not production(?).
+
+  // Vector queue.
+  val vectorFifoDepth = 16
+
+  // L0ICache Fetch unit.
+  // val fetchCacheBytes = 2048
+  val fetchCacheBytes = 1024
+  // val fetchCacheBytes = 128
+
+  // Scalar Core Fetch bus.
+  val fetchAddrBits = 32   // do not change
+  val fetchDataBits = 256  // do not change
+
+  // Scalar Core Load Store Unit bus.
+  val lsuAddrBits = 32  // do not change
+  val lsuDataBits = vectorBits
+
+  // [External] Core AXI interface.
+  val axiSysIdBits = 7
+  val axiSysAddrBits = 32
+  val axiSysDataBits = vectorBits
+
+  // [Internal] L1ICache interface.
+  val l1islots = 256
+  val axi0IdBits = 4  // (1x banks, 4 bits unused)
+  val axi0AddrBits = 32
+  val axi0DataBits = fetchDataBits
+
+  // [Internal] L1DCache interface.
+  val l1dslots = 256  // (x2 banks)
+  val axi1IdBits = 4  // (x2 banks, 3 bits unused)
+  val axi1AddrBits = 32
+  val axi1DataBits = vectorBits
+
+  // [Internal] TCM[Vector,Scalar] interface.
+  val axi2IdBits = 6
+  val axi2AddrBits = 32
+  val axi2DataBits = vectorBits
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Alu.scala b/hdl/chisel/src/kelvin/scalar/Alu.scala
new file mode 100644
index 0000000..2ac5855
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Alu.scala
@@ -0,0 +1,117 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Factory: instantiates an Alu module (must be called inside a Chisel
+// builder context).
+object Alu {
+  def apply(p: Parameters): Alu = {
+    return Module(new Alu(p))
+  }
+}
+
+// ALU opcode bit positions; the op bus is a one-hot UInt of width Entries.
+case class AluOp() {
+  val ADD = 0
+  val SUB = 1
+  val SLT = 2
+  val SLTU = 3
+  val XOR = 4
+  val OR = 5
+  val AND = 6
+  val SLL = 7
+  val SRL = 8
+  val SRA = 9
+  val LUI = 10
+  val CLZ = 11
+  val CTZ = 12
+  val PCNT = 13
+  val MIN = 14
+  val MINU = 15
+  val MAX = 16
+  val MAXU = 17
+  val Entries = 18
+}
+
+// Decode-cycle request into the ALU: destination register and one-hot op.
+class AluIO(p: Parameters) extends Bundle {
+  val valid = Input(Bool())
+  val addr = Input(UInt(5.W))
+  val op = Input(UInt(new AluOp().Entries.W))
+}
+
+// Scalar ALU: accepts a one-hot op in the decode cycle and produces the
+// writeback result the following (execute) cycle from the registered op and
+// the register-file read ports.
+class Alu(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val req = new AluIO(p)
+
+    // Execute cycle.
+    val rs1 = Flipped(new RegfileReadDataIO)
+    val rs2 = Flipped(new RegfileReadDataIO)
+    val rd = Flipped(new RegfileWriteDataIO)
+  })
+
+  val alu = new AluOp()
+
+  val valid = RegInit(false.B)
+  val addr = Reg(UInt(5.W))
+  val op = RegInit(0.U(alu.Entries.W))
+
+  // Pulse the cycle after the decoded request.
+  valid := io.req.valid
+
+  // Avoid output toggles by not updating state between uses.
+  // The Regfile has the same behavior, leaving read ports unchanged.
+  when (io.req.valid) {
+    addr := io.req.addr
+    op := io.req.op
+  }
+
+  // val rs1 = MuxOR(valid, io.rs1.data)
+  // val rs2 = MuxOR(valid, io.rs2.data)
+  val rs1 = io.rs1.data
+  val rs2 = io.rs2.data
+  val shamt = rs2(4,0)
+
+  // TODO: should we be masking like this for energy?
+  // TODO: a single addsub for add/sub/slt/sltu
+  // val add  = MuxOR(op(alu.ADD), rs1) + MuxOR(op(alu.ADD), rs2)
+  // val sub  = MuxOR(op(alu.SUB), rs1) - MuxOR(op(alu.SUB), rs2)
+  // val sll  = MuxOR(op(alu.SLL), rs1) << MuxOR(op(alu.SLL), shamt)
+  // val srl  = MuxOR(op(alu.SRL), rs1) >> MuxOR(op(alu.SRL), shamt)
+  // val sra  = (MuxOR(op(alu.SRA), rs1.asSInt, 0.S) >> MuxOR(op(alu.SRA), shamt)).asUInt
+  // val slt  = MuxOR(op(alu.SLT), rs1.asSInt, 0.S) < MuxOR(op(alu.SLT), rs2.asSInt, 0.S)
+  // val sltu = MuxOR(op(alu.SLTU), rs1) < MuxOR(op(alu.SLTU), rs2)
+  // val and  = MuxOR(op(alu.AND), rs1) & MuxOR(op(alu.AND), rs2)
+  // val or   = MuxOR(op(alu.OR), rs1) | MuxOR(op(alu.OR), rs2)
+  // val xor  = MuxOR(op(alu.XOR), rs1) ^ MuxOR(op(alu.XOR), rs2)
+  // val lui  = MuxOR(op(alu.LUI), rs2)
+  // val clz  = MuxOR(op(alu.CLZ), CLZ(rs1))
+  // val ctz  = MuxOR(op(alu.CTZ), CTZ(rs1))
+  // val pcnt = MuxOR(op(alu.PCNT), PopCount(rs1))
+
+  // io.rd.data := add | sub | sll | srl | sra | slt | sltu | and | or | xor | lui
+
+  // One-hot op select: at most one MuxOR term is non-zero, so the OR-reduce
+  // yields the selected operation's result.
+  io.rd.valid := valid
+  io.rd.addr  := addr
+  io.rd.data  := MuxOR(op(alu.ADD),  rs1 + rs2) |
+                 MuxOR(op(alu.SUB),  rs1 - rs2) |
+                 MuxOR(op(alu.SLT),  rs1.asSInt < rs2.asSInt) |
+                 MuxOR(op(alu.SLTU), rs1 < rs2) |
+                 MuxOR(op(alu.XOR),  rs1 ^ rs2) |
+                 MuxOR(op(alu.OR),   rs1 | rs2) |
+                 MuxOR(op(alu.AND),  rs1 & rs2) |
+                 MuxOR(op(alu.SLL),  rs1 << shamt) |
+                 MuxOR(op(alu.SRL),  rs1 >> shamt) |
+                 MuxOR(op(alu.SRA),  (rs1.asSInt >> shamt).asUInt) |
+                 MuxOR(op(alu.LUI),  rs2) |
+                 MuxOR(op(alu.CLZ),  Clz(rs1)) |
+                 MuxOR(op(alu.CTZ),  Ctz(rs1)) |
+                 MuxOR(op(alu.PCNT), PopCount(rs1)) |
+                 MuxOR(op(alu.MIN),  Mux(rs1.asSInt < rs2.asSInt, rs1, rs2)) |
+                 MuxOR(op(alu.MAX),  Mux(rs1.asSInt > rs2.asSInt, rs1, rs2)) |
+                 MuxOR(op(alu.MINU), Mux(rs1 < rs2, rs1, rs2)) |
+                 MuxOR(op(alu.MAXU), Mux(rs1 > rs2, rs1, rs2))
+
+  // Assertions.
+  assert(!(valid && !io.rs1.valid && !op(alu.LUI)))
+  assert(!(valid && !io.rs2.valid))
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Bru.scala b/hdl/chisel/src/kelvin/scalar/Bru.scala
new file mode 100644
index 0000000..02d7121
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Bru.scala
@@ -0,0 +1,222 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Factory: instantiates a Bru (branch/system unit) module.
+object Bru {
+  def apply(p: Parameters): Bru = {
+    return Module(new Bru(p))
+  }
+}
+
+// Branch/system opcode bit positions; the op bus is a one-hot UInt of
+// width Entries.
+case class BruOp() {
+  val JAL = 0
+  val JALR = 1
+  val BEQ = 2
+  val BNE = 3
+  val BLT = 4
+  val BGE = 5
+  val BLTU = 6
+  val BGEU = 7
+  val EBREAK = 8
+  val ECALL = 9
+  val EEXIT = 10
+  val EYIELD = 11
+  val ECTXSW = 12
+  val MPAUSE = 13
+  val MRET = 14
+  val FENCEI = 15
+  val UNDEF = 16
+  val Entries = 17
+}
+
+// Decode-cycle request into the Bru: one-hot op, instruction pc, branch
+// target, link register index, and the branch-prediction (fwd) hint.
+class BruIO(p: Parameters) extends Bundle {
+  val valid = Input(Bool())
+  val fwd = Input(Bool())
+  val op = Input(UInt(new BruOp().Entries.W))
+  val pc = Input(UInt(p.programCounterBits.W))
+  val target = Input(UInt(p.programCounterBits.W))
+  val link = Input(UInt(5.W))
+}
+
+// Redirect interface: asserted with the new program counter value when the
+// pipeline must change flow.
+class BranchTakenIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val value = Output(UInt(p.programCounterBits.W))
+}
+
+// Branch/system unit: resolves branches, jumps, fences and machine/user-mode
+// system instructions (ecall/ebreak/mret/...), drives pipeline redirects and
+// CSR fault/halt updates. Decode registers the request; execute resolves it.
+class Bru(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val req = new BruIO(p)
+
+    // Execute cycle.
+    val csr = new CsrBruIO(p)
+    val rs1 = Input(new RegfileReadDataIO)
+    val rs2 = Input(new RegfileReadDataIO)
+    val rd = Flipped(new RegfileWriteDataIO)
+    val taken = new BranchTakenIO(p)
+    val target = Flipped(new RegfileBranchTargetIO)
+    val interlock = Output(Bool())
+    val iflush = Output(Bool())
+  })
+
+  val branch = new BruOp()
+
+  val interlock = RegInit(false.B)
+
+  val readRs = RegInit(false.B)
+  val fwd = RegInit(false.B)
+  val op = RegInit(0.U(branch.Entries.W))
+  val target = Reg(UInt(p.programCounterBits.W))
+  val linkValid = RegInit(false.B)
+  val linkAddr = Reg(UInt(5.W))
+  val linkData = Reg(UInt(p.programCounterBits.W))
+  val pcEx = Reg(UInt(32.W))
+
+  // Link register writeback occurs only for JAL/JALR with a non-zero rd.
+  linkValid := io.req.valid && io.req.link =/= 0.U &&
+               (io.req.op(branch.JAL) || io.req.op(branch.JALR))
+
+  op := Mux(io.req.valid, io.req.op, 0.U)
+  fwd := io.req.valid && io.req.fwd
+
+  readRs := Mux(io.req.valid,
+    io.req.op(branch.BEQ) || io.req.op(branch.BNE) ||
+    io.req.op(branch.BLT) || io.req.op(branch.BGE) ||
+    io.req.op(branch.BLTU) || io.req.op(branch.BGEU), false.B)
+
+  val mode = io.csr.out.mode // (0) machine, (1) user
+
+  val pcDe = io.req.pc
+  val pc4De = io.req.pc + 4.U
+
+  // Resolve the redirect target in decode; it is consumed in execute.
+  // MRET (from machine mode) returns to mepc; traps/calls vector to mtvec;
+  // predicted-taken (fwd) and FENCE.I resume at pc+4; JALR is indirect.
+  when (io.req.valid) {
+    val mret = io.req.op(branch.MRET) && !mode
+    val call = io.req.op(branch.MRET) && mode ||
+               io.req.op(branch.EBREAK) ||
+               io.req.op(branch.ECALL) ||
+               io.req.op(branch.EEXIT) ||
+               io.req.op(branch.EYIELD) ||
+               io.req.op(branch.ECTXSW) ||
+               io.req.op(branch.MPAUSE)
+    target := Mux(mret, io.csr.out.mepc,
+              Mux(call, io.csr.out.mtvec,
+              Mux(io.req.fwd || io.req.op(branch.FENCEI), pc4De,
+              Mux(io.req.op(branch.JALR), io.target.data,
+                  io.req.target))))
+    linkAddr := io.req.link
+    linkData := pc4De
+    pcEx := pcDe
+  }
+
+  // System instructions stall issue for one cycle while CSR state updates.
+  interlock := io.req.valid && (io.req.op(branch.EBREAK) ||
+               io.req.op(branch.ECALL) || io.req.op(branch.EEXIT) ||
+               io.req.op(branch.EYIELD) || io.req.op(branch.ECTXSW) ||
+               io.req.op(branch.MPAUSE) || io.req.op(branch.MRET))
+
+  io.interlock := interlock
+
+  // This mux sits on the critical path.
+  // val rs1 = Mux(readRs, io.rs1.data, 0.U)
+  // val rs2 = Mux(readRs, io.rs2.data, 0.U)
+  val rs1 = io.rs1.data
+  val rs2 = io.rs2.data
+
+  val eq = rs1 === rs2
+  val neq = !eq
+  val lt = rs1.asSInt < rs2.asSInt
+  val ge = !lt
+  val ltu = rs1 < rs2
+  val geu = !ltu
+
+  // The final `=/= fwd` XORs against the prediction: redirect only when the
+  // resolved direction disagrees with the predicted-taken hint.
+  io.taken.valid := op(branch.EBREAK) && mode ||
+                    op(branch.ECALL) && mode ||
+                    op(branch.EEXIT) && mode ||
+                    op(branch.EYIELD) && mode ||
+                    op(branch.ECTXSW) && mode ||
+                    op(branch.MRET) && !mode ||
+                    op(branch.MRET) && mode ||    // fault
+                    op(branch.MPAUSE) && mode ||  // fault
+                    op(branch.FENCEI) ||
+                    (op(branch.JAL) ||
+                     op(branch.JALR) ||
+                     op(branch.BEQ) && eq ||
+                     op(branch.BNE) && neq ||
+                     op(branch.BLT) && lt ||
+                     op(branch.BGE) && ge ||
+                     op(branch.BLTU) && ltu ||
+                     op(branch.BGEU) && geu) =/= fwd
+
+  io.taken.value := target
+
+  io.rd.valid := linkValid
+  io.rd.addr := linkAddr
+  io.rd.data := linkData
+
+  // Undefined Fault.
+  val undefFault = op(branch.UNDEF)
+
+  // Usage Fault: system instructions executed from the wrong privilege mode.
+  val usageFault = op(branch.EBREAK) && !mode ||
+                   op(branch.ECALL) && !mode ||
+                   op(branch.EEXIT) && !mode ||
+                   op(branch.EYIELD) && !mode ||
+                   op(branch.ECTXSW) && !mode ||
+                   op(branch.MPAUSE) && mode ||
+                   op(branch.MRET) && mode
+
+  // Mode transitions: traps from user mode enter machine mode (bits=false);
+  // only MRET from machine mode re-enters user mode (bits=true).
+  io.csr.in.mode.valid := op(branch.EBREAK) && mode ||
+                          op(branch.ECALL) && mode ||
+                          op(branch.EEXIT) && mode ||
+                          op(branch.EYIELD) && mode ||
+                          op(branch.ECTXSW) && mode ||
+                          op(branch.MPAUSE) && mode ||  // fault
+                          op(branch.MRET) && mode ||    // fault
+                          op(branch.MRET) && !mode
+  io.csr.in.mode.bits := MuxOR(op(branch.MRET) && !mode, true.B)
+
+  io.csr.in.mepc.valid := op(branch.EBREAK) && mode ||
+                          op(branch.ECALL) && mode ||
+                          op(branch.EEXIT) && mode ||
+                          op(branch.EYIELD) && mode ||
+                          op(branch.ECTXSW) && mode ||
+                          op(branch.MPAUSE) && mode ||  // fault
+                          op(branch.MRET) && mode       // fault
+  // EYIELD resumes after the yielding instruction (pc+4); others retry pc.
+  io.csr.in.mepc.bits := Mux(op(branch.EYIELD), linkData, pcEx)
+
+  io.csr.in.mcause.valid := undefFault || usageFault ||
+                            op(branch.EBREAK) && mode ||
+                            op(branch.ECALL) && mode ||
+                            op(branch.EEXIT) && mode ||
+                            op(branch.EYIELD) && mode ||
+                            op(branch.ECTXSW) && mode
+
+  // Faults set mcause bit 31; service causes use small enumerated values.
+  val faultMsb = 1.U << 31
+  io.csr.in.mcause.bits := Mux(undefFault, 2.U | faultMsb,
+                           Mux(usageFault, 16.U | faultMsb,
+                               MuxOR(op(branch.EBREAK), 1.U) |
+                               MuxOR(op(branch.ECALL), 2.U) |
+                               MuxOR(op(branch.EEXIT), 3.U) |
+                               MuxOR(op(branch.EYIELD), 4.U) |
+                               MuxOR(op(branch.ECTXSW), 5.U)))
+
+  io.csr.in.mtval.valid := undefFault || usageFault
+  io.csr.in.mtval.bits := pcEx
+
+  io.iflush := op(branch.FENCEI)
+
+  // Pipeline will be halted.
+  io.csr.in.halt := op(branch.MPAUSE) && !mode || io.csr.in.fault
+  io.csr.in.fault := undefFault && !mode || usageFault && !mode
+
+  // Assertions.
+  val valid = RegInit(false.B)
+  val ignore = op(branch.JAL) || op(branch.JALR) || op(branch.EBREAK) ||
+               op(branch.ECALL) || op(branch.EEXIT) || op(branch.EYIELD) ||
+               op(branch.ECTXSW) || op(branch.MPAUSE) || op(branch.MRET) ||
+               op(branch.FENCEI) || op(branch.UNDEF)
+
+  valid := io.req.valid
+  assert(!(valid && !io.rs1.valid) || ignore)
+  assert(!(valid && !io.rs2.valid) || ignore)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Csr.scala b/hdl/chisel/src/kelvin/scalar/Csr.scala
new file mode 100644
index 0000000..7c78680
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Csr.scala
@@ -0,0 +1,273 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Factory: instantiates a Csr module.
+object Csr {
+  def apply(p: Parameters): Csr = {
+    return Module(new Csr(p))
+  }
+}
+
+// CSR opcode bit positions (write / set-bits / clear-bits); one-hot bus.
+case class CsrOp() {
+  val CSRRW = 0
+  val CSRRS = 1
+  val CSRRC = 2
+  val Entries = 3
+}
+
+// Parallel-load values latched into the context CSRs at reset (12 words).
+class CsrInIO(p: Parameters) extends Bundle {
+  val value = Input(Vec(12, UInt(32.W)))
+}
+
+// Context CSR values exported to the system at shutdown/inspection (8 words).
+class CsrOutIO(p: Parameters) extends Bundle {
+  val value = Output(Vec(8, UInt(32.W)))
+}
+
+// Combined reset-load inputs and exported CSR outputs.
+class CsrInOutIO(p: Parameters) extends Bundle {
+  val in = new CsrInIO(p)
+  val out = new CsrOutIO(p)
+}
+
+// Bru <-> Csr interface: `in` carries trap-state updates from the branch
+// unit; `out` forwards mode/mepc/mtvec back for redirect resolution.
+class CsrBruIO(p: Parameters) extends Bundle {
+  val in = new Bundle {
+    val mode = Valid(Bool())
+    val mcause = Valid(UInt(32.W))
+    val mepc = Valid(UInt(32.W))
+    val mtval = Valid(UInt(32.W))
+    val halt = Output(Bool())
+    val fault = Output(Bool())
+  }
+  val out = new Bundle {
+    val mode = Input(Bool())
+    val mepc = Input(UInt(32.W))
+    val mtvec = Input(UInt(32.W))
+  }
+  // Tie-off helper for unconnected instances.
+  def defaults() = {
+    out.mode := false.B
+    out.mepc := 0.U
+    out.mtvec := 0.U
+  }
+}
+
+// Decode-cycle CSR access request: rd address, 12-bit CSR index, one-hot op.
+class CsrIO(p: Parameters) extends Bundle {
+  val valid = Input(Bool())
+  val addr = Input(UInt(5.W))
+  val index = Input(UInt(12.W))
+  val op = Input(UInt(new CsrOp().Entries.W))
+}
+
+// Control and status registers: CSRRW/S/C access, trap state (mcause, mepc,
+// mtval), machine/user mode tracking, and pipeline halt/fault latching.
+// Requests register in decode; read-modify-write completes in execute.
+class Csr(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Reset and shutdown.
+    val csr = new CsrInOutIO(p)
+
+    // Decode cycle.
+    val req = new CsrIO(p)
+
+    // Execute cycle.
+    val rs1 = Flipped(new RegfileReadDataIO)
+    val rd = Flipped(new RegfileWriteDataIO)
+    val bru = Flipped(new CsrBruIO(p))
+
+    // Vector core.
+    val vcore = Input(new Bundle { val undef = Bool() })
+
+    // Pipeline Control.
+    val halted = Output(Bool())
+    val fault = Output(Bool())
+  })
+
+  val csr = new CsrOp()
+
+  val valid = RegInit(false.B)
+  val addr = Reg(UInt(5.W))
+  val index = Reg(UInt(12.W))
+  val op = RegInit(0.U(csr.Entries.W))
+
+  // Pipeline Control.
+  val halted = RegInit(false.B)
+  val fault = RegInit(false.B)
+
+  // Machine(0)/User(1) Mode.
+  val mode = RegInit(false.B)
+
+  // CSRs parallel loaded when(reset).
+  val mpc = Reg(UInt(32.W))
+  val msp = Reg(UInt(32.W))
+  val mcause = Reg(UInt(32.W))
+  val mtval = Reg(UInt(32.W))
+  val mcontext0 = Reg(UInt(32.W))
+  val mcontext1 = Reg(UInt(32.W))
+  val mcontext2 = Reg(UInt(32.W))
+  val mcontext3 = Reg(UInt(32.W))
+  val mcontext4 = Reg(UInt(32.W))
+  val mcontext5 = Reg(UInt(32.W))
+  val mcontext6 = Reg(UInt(32.W))
+  val mcontext7 = Reg(UInt(32.W))
+
+  // CSRs with initialization.
+  val fflags = RegInit(0.U(5.W))
+  val frm = RegInit(0.U(3.W))
+  val mie = RegInit(0.U(1.W))
+  val mtvec = RegInit(0.U(32.W))
+  val mscratch = RegInit(0.U(32.W))
+  val mepc = RegInit(0.U(32.W))
+
+  val fcsr = Cat(frm, fflags)
+
+  // Decode the Index.
+  val fflagsEn = index === 0x001.U
+  val frmEn = index === 0x002.U
+  val fcsrEn = index === 0x003.U
+  val mieEn = index === 0x304.U
+  val mtvecEn = index === 0x305.U
+  val mscratchEn = index === 0x340.U
+  val mepcEn = index === 0x341.U
+  val mcauseEn = index === 0x342.U
+  val mtvalEn = index === 0x343.U
+  val mcontext0En = index === 0x7C0.U
+  val mcontext1En = index === 0x7C1.U
+  val mcontext2En = index === 0x7C2.U
+  val mcontext3En = index === 0x7C3.U
+  val mcontext4En = index === 0x7C4.U
+  val mcontext5En = index === 0x7C5.U
+  val mcontext6En = index === 0x7C6.U
+  val mcontext7En = index === 0x7C7.U
+  val mpcEn = index === 0x7E0.U
+  val mspEn = index === 0x7E1.U
+
+  // Control registers: latch the request for one cycle, then self-clear.
+  when (io.req.valid) {
+    valid := io.req.valid
+    addr := io.req.addr
+    index := io.req.index
+    op := io.req.op
+  } .elsewhen (valid) {
+    valid := false.B
+    addr := 0.U
+    index := 0.U
+    op := 0.U
+  }
+
+  // Pipeline Control: halt/fault latch and stay set until reset.
+  when (io.bru.in.halt || io.vcore.undef) {
+    halted := true.B
+  }
+
+  when (io.bru.in.fault || io.vcore.undef) {
+    fault := true.B
+  }
+
+  io.halted := halted
+  io.fault := fault
+
+  assert(!(io.fault && !io.halted))
+
+  // Register state.
+  val rs1 = io.rs1.data
+
+  // One-hot index select OR-muxed into the read value.
+  val rdata = MuxOR(fflagsEn, fflags) |
+              MuxOR(frmEn, frm) |
+              MuxOR(fcsrEn, fcsr) |
+              MuxOR(mieEn, mie) |
+              MuxOR(mtvecEn, mtvec) |
+              MuxOR(mscratchEn, mscratch) |
+              MuxOR(mepcEn, mepc) |
+              MuxOR(mcauseEn, mcause) |
+              MuxOR(mtvalEn, mtval) |
+              MuxOR(mcontext0En, mcontext0) |
+              MuxOR(mcontext1En, mcontext1) |
+              MuxOR(mcontext2En, mcontext2) |
+              MuxOR(mcontext3En, mcontext3) |
+              MuxOR(mcontext4En, mcontext4) |
+              MuxOR(mcontext5En, mcontext5) |
+              MuxOR(mcontext6En, mcontext6) |
+              MuxOR(mcontext7En, mcontext7) |
+              MuxOR(mpcEn, mpc) |
+              MuxOR(mspEn, msp)
+
+  // CSRRW writes rs1, CSRRS ORs bits in, CSRRC clears bits.
+  val wdata = MuxOR(op(csr.CSRRW), rs1) |
+              MuxOR(op(csr.CSRRS), rdata | rs1) |
+              MuxOR(op(csr.CSRRC), rdata & ~rs1)
+
+  when (valid) {
+    when (fflagsEn) { fflags := wdata }
+    when (frmEn) { frm := wdata }
+    when (fcsrEn) { fflags := wdata(4,0)
+                    frm := wdata(7,5) }
+    when (mieEn) { mie := wdata }
+    when (mtvecEn) { mtvec := wdata }
+    when (mscratchEn) { mscratch := wdata }
+    when (mepcEn) { mepc := wdata }
+    when (mcauseEn) { mcause := wdata }
+    when (mtvalEn) { mtval := wdata }
+    when (mpcEn) { mpc := wdata }
+    when (mspEn) { msp := wdata }
+    when (mcontext0En) { mcontext0 := wdata }
+    when (mcontext1En) { mcontext1 := wdata }
+    when (mcontext2En) { mcontext2 := wdata }
+    when (mcontext3En) { mcontext3 := wdata }
+    when (mcontext4En) { mcontext4 := wdata }
+    when (mcontext5En) { mcontext5 := wdata }
+    when (mcontext6En) { mcontext6 := wdata }
+    when (mcontext7En) { mcontext7 := wdata }
+  }
+
+  when (io.bru.in.mode.valid) {
+    mode := io.bru.in.mode.bits
+  }
+
+  // Only record the first fault: once mcause bit 31 is set, later fault
+  // updates to mcause/mtval are suppressed.
+  val firstFault = !mcause(31)
+
+  when (io.bru.in.mcause.valid && firstFault) {
+    mcause := io.bru.in.mcause.bits
+  }
+
+  when (io.bru.in.mtval.valid && firstFault) {
+    mtval := io.bru.in.mtval.bits
+  }
+
+  when (io.bru.in.mepc.valid) {
+    mepc := io.bru.in.mepc.bits
+  }
+
+  // This pattern of separate when() blocks requires resets after the data.
+  // (Chisel last-connect: the reset load must win over the writes above.)
+  when (reset.asBool) {
+    mpc := io.csr.in.value(0)
+    msp := io.csr.in.value(1)
+    mcause := io.csr.in.value(2)
+    mtval := io.csr.in.value(3)
+    mcontext0 := io.csr.in.value(4)
+    mcontext1 := io.csr.in.value(5)
+    mcontext2 := io.csr.in.value(6)
+    mcontext3 := io.csr.in.value(7)
+    mcontext4 := io.csr.in.value(8)
+    mcontext5 := io.csr.in.value(9)
+    mcontext6 := io.csr.in.value(10)
+    mcontext7 := io.csr.in.value(11)
+  }
+
+  // Forwarding: an in-flight CSR write to mepc/mtvec is visible to the Bru
+  // in the same cycle.
+  io.bru.out.mode := mode
+  io.bru.out.mepc := Mux(mepcEn, wdata, mepc)
+  io.bru.out.mtvec := Mux(mtvecEn, wdata, mtvec)
+
+  io.csr.out.value(0) := mpc
+  io.csr.out.value(1) := msp
+  io.csr.out.value(2) := mcause
+  io.csr.out.value(3) := mtval
+  io.csr.out.value(4) := mcontext0
+  io.csr.out.value(5) := mcontext1
+  io.csr.out.value(6) := mcontext2
+  io.csr.out.value(7) := mcontext3
+
+  // Write port.
+  io.rd.valid := valid
+  io.rd.addr := addr
+  io.rd.data := rdata
+
+  // Assertions.
+  assert(!(valid && !io.rs1.valid))
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Debug.scala b/hdl/chisel/src/kelvin/scalar/Debug.scala
new file mode 100644
index 0000000..d7fedf3
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Debug.scala
@@ -0,0 +1,19 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Debug signals for HDL development.
+class DebugIO(p: Parameters) extends Bundle { // Debug visibility bundle; all fields are outputs from the core.
+  val en = Output(UInt(4.W))      // 4 enable bits — presumably one per addr/inst slot below; confirm against producer.
+  val addr0 = Output(UInt(32.W))  // Instruction address, slot 0.
+  val addr1 = Output(UInt(32.W))  // Instruction address, slot 1.
+  val addr2 = Output(UInt(32.W))  // Instruction address, slot 2.
+  val addr3 = Output(UInt(32.W))  // Instruction address, slot 3.
+  val inst0 = Output(UInt(32.W))  // Instruction word, slot 0.
+  val inst1 = Output(UInt(32.W))  // Instruction word, slot 1.
+  val inst2 = Output(UInt(32.W))  // Instruction word, slot 2.
+  val inst3 = Output(UInt(32.W))  // Instruction word, slot 3.
+  val cycles = Output(UInt(32.W)) // Free-running cycle counter value.
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Decode.scala b/hdl/chisel/src/kelvin/scalar/Decode.scala
new file mode 100644
index 0000000..c7a7373
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Decode.scala
@@ -0,0 +1,660 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Decode { // Companion factory so callers write Decode(p, n) instead of Module(new Decode(p, n)).
+  def apply(p: Parameters, pipeline: Int): Decode = {
+    return Module(new Decode(p, pipeline)) // NOTE(review): explicit `return` is non-idiomatic Scala but harmless here.
+  }
+}
+
+class DecodeSerializeIO extends Bundle { // Serialization flags chained between decode lanes (serializeIn -> serializeOut).
+  val lsu = Output(Bool())     // A load/store is in flight in an earlier lane.
+  val mul = Output(Bool())     // A multiply is in flight in an earlier lane.
+  val jump = Output(Bool())    // An unconditional jump / trap-style op was decoded earlier.
+  val brcond = Output(Bool())  // A conditional branch was decoded earlier.
+  val vinst = Output(Bool())   // all vector instructions
+
+  def defaults() = {           // Drive all flags false (start of the serialize chain).
+    lsu := false.B
+    mul := false.B
+    jump := false.B
+    brcond := false.B
+    vinst := false.B
+  }
+}
+
+class Decode(p: Parameters, pipeline: Int) extends Module { // One decode lane: routes a fetched instruction to ALU/BRU/CSR/LSU/MLU/DVU/vector units.
+  val io = IO(new Bundle {
+    // Core controls.
+    val halted = Input(Bool())
+
+    // Decode input interface.
+    val inst = Flipped(new FetchInstruction(p))
+    val scoreboard = new Bundle {
+      val regd = Input(UInt(32.W))  // Registered (committed) scoreboard bits, one per register.
+      val comb = Input(UInt(32.W))  // Combinational (forwarded) scoreboard bits.
+      val spec = Output(UInt(32.W)) // Speculative mark produced by this lane.
+    }
+    val mactive = Input(Bool())  // memory active
+
+    // Register file decode cycle interface.
+    val rs1Read = Flipped(new RegfileReadAddrIO)
+    val rs1Set = Flipped(new RegfileReadSetIO)
+    val rs2Read = Flipped(new RegfileReadAddrIO)
+    val rs2Set = Flipped(new RegfileReadSetIO)
+    val rdMark = Flipped(new RegfileWriteAddrIO)
+    val busRead = Flipped(new RegfileBusAddrIO)
+
+    // ALU interface.
+    val alu = Flipped(new AluIO(p))
+
+    // Branch interface.
+    val bru = Flipped(new BruIO(p))
+
+    // CSR interface.
+    val csr = Flipped(new CsrIO(p))
+
+    // LSU interface.
+    val lsu = Flipped(new LsuIO(p))
+
+    // Multiplier interface.
+    val mlu = Flipped(new MluIO(p))
+
+    // Divide interface.
+    val dvu = Flipped(new DvuIO(p))
+
+    // Vector interface.
+    val vinst = Flipped(new VInstIO)
+
+    // Branch status.
+    val branchTaken = Input(Bool())
+
+    // Interlock Controls
+    val interlock = Input(Bool())
+    val serializeIn = Flipped(new DecodeSerializeIO)
+    val serializeOut = new DecodeSerializeIO
+
+    // Scalar logging.
+    val slog = Output(Bool())
+  })
+
+  val decodeEn = io.inst.valid && io.inst.ready && !io.branchTaken // Instruction accepted this cycle and not squashed by a taken branch.
+
+  // The decode logic.
+  val d = Module(new DecodedInstruction(p, pipeline))
+  d.io.addr := io.inst.addr
+  d.io.inst := io.inst.inst
+
+  val vldst = d.io.vld || d.io.vst            // Any vector load/store.
+  val vldst_wb = vldst && io.inst.inst(28)    // Vector ld/st with scalar writeback form (bit 28) — confirm encoding.
+
+  val rdAddr = Mux(vldst, io.inst.inst(19,15), io.inst.inst(11,7)) // Vector ld/st writes back through the rs1 field.
+  val rs1Addr = io.inst.inst(19,15)
+  val rs2Addr = io.inst.inst(24,20)
+  val rs3Addr = io.inst.inst(31,27)
+
+  val isAluImm = d.io.addi || d.io.slti || d.io.sltiu || d.io.xori ||
+                 d.io.ori || d.io.andi || d.io.slli || d.io.srli || d.io.srai
+
+  val isAluReg = d.io.add || d.io.sub || d.io.slt || d.io.sltu || d.io.xor ||
+                 d.io.or || d.io.and || d.io.sll || d.io.srl || d.io.sra
+
+  val isAlu1Bit = d.io.clz || d.io.ctz || d.io.pcnt            // Single-operand bit ops.
+  val isAlu2Bit = d.io.min || d.io.minu || d.io.max || d.io.maxu // Two-operand bit ops.
+
+  val isCondBr = d.io.beq || d.io.bne || d.io.blt || d.io.bge ||
+                 d.io.bltu || d.io.bgeu
+
+  val isCsr = d.io.csrrw || d.io.csrrs || d.io.csrrc
+  val isCsrImm = isCsr && io.inst.inst(14)   // funct3 bit 2 selects the immediate CSR forms.
+  val isCsrReg = isCsr && !io.inst.inst(14)
+
+  val isLoad = d.io.lb || d.io.lh || d.io.lw || d.io.lbu || d.io.lhu
+  val isStore = d.io.sb || d.io.sh || d.io.sw
+  val isLsu = isLoad || isStore || d.io.vld || d.io.vst || d.io.flushat || d.io.flushall
+
+  val isMul = d.io.mul || d.io.mulh || d.io.mulhsu || d.io.mulhu || d.io.mulhr || d.io.mulhsur || d.io.mulhur || d.io.dmulh || d.io.dmulhr
+
+  val isDvu = d.io.div || d.io.divu || d.io.rem || d.io.remu
+
+  val isVIop = io.vinst.op(new VInstOp().VIOP)
+
+  val isVIopVs1 = isVIop
+  val isVIopVs2 = isVIop && io.inst.inst(1,0) === 0.U  // exclude: .vv
+  val isVIopVs3 = isVIop && io.inst.inst(2,0) === 1.U  // exclude: .vvv
+
+  // Use the forwarded scoreboard to interlock on multicycle operations.
+  val aluRdEn  = !io.scoreboard.comb(rdAddr) || isVIopVs1 || isStore || isCondBr
+  val aluRs1En = !io.scoreboard.comb(rs1Addr) || isVIopVs1 || isLsu || d.io.auipc
+  val aluRs2En = !io.scoreboard.comb(rs2Addr) || isVIopVs2 || isLsu || d.io.auipc || isAluImm || isAlu1Bit
+  // val aluRs3En = !io.scoreboard.comb(rs3Addr) || isVIopVs3
+  // val aluEn = aluRdEn && aluRs1En && aluRs2En && aluRs3En  // TODO: is aluRs3En needed?
+  val aluEn = aluRdEn && aluRs1En && aluRs2En
+
+  // Interlock jalr but special case return.
+  val bruEn = !d.io.jalr || !io.scoreboard.regd(rs1Addr) ||
+              io.inst.inst(31,20) === 0.U  // Zero immediate: plain "ret"-style jalr may proceed.
+
+  // Require interlock on address generation as there is no write forwarding.
+  val lsuEn = !isLsu ||
+              !io.serializeIn.lsu && io.lsu.ready &&
+              (!isLsu || !io.serializeIn.brcond) &&  // TODO: can this line be removed?
+              !(Mux(io.busRead.bypass, io.scoreboard.comb(rs1Addr),
+                    io.scoreboard.regd(rs1Addr)) ||
+                io.scoreboard.comb(rs2Addr) && (isStore || vldst))
+
+  // Interlock mul, only one lane accepted.
+  val mulEn = !isMul || !io.serializeIn.mul
+
+
+  // Vector extension interlock.
+  val vinstEn = !(io.serializeIn.vinst || isVIop && io.serializeIn.brcond) &&
+                !(io.vinst.op =/= 0.U && !io.vinst.ready)
+
+  // Fence interlock.
+  // Input mactive used passthrough, prefer to avoid registers in Decode.
+  val fenceEn = !(d.io.fence && io.mactive)
+
+  // ALU opcode.
+  val alu = new AluOp()
+  val aluOp = Wire(Vec(alu.Entries, Bool()))
+  val aluValid = WiredOR(io.alu.op)  // used without decodeEn
+  io.alu.valid := decodeEn && aluValid
+  io.alu.addr := rdAddr
+  io.alu.op := aluOp.asUInt
+
+  aluOp(alu.ADD) := d.io.auipc || d.io.addi || d.io.add
+  aluOp(alu.SUB) := d.io.sub
+  aluOp(alu.SLT) := d.io.slti || d.io.slt
+  aluOp(alu.SLTU) := d.io.sltiu || d.io.sltu
+  aluOp(alu.XOR) := d.io.xori || d.io.xor
+  aluOp(alu.OR) := d.io.ori || d.io.or
+  aluOp(alu.AND) := d.io.andi || d.io.and
+  aluOp(alu.SLL) := d.io.slli || d.io.sll
+  aluOp(alu.SRL) := d.io.srli || d.io.srl
+  aluOp(alu.SRA) := d.io.srai || d.io.sra
+  aluOp(alu.LUI) := d.io.lui
+  aluOp(alu.CLZ) := d.io.clz
+  aluOp(alu.CTZ) := d.io.ctz
+  aluOp(alu.PCNT) := d.io.pcnt
+  aluOp(alu.MIN) := d.io.min
+  aluOp(alu.MINU) := d.io.minu
+  aluOp(alu.MAX) := d.io.max
+  aluOp(alu.MAXU) := d.io.maxu
+
+  // Branch conditional opcode.
+  val bru = new BruOp()
+  val bruOp = Wire(Vec(bru.Entries, Bool()))
+  val bruValid = WiredOR(io.bru.op)  // used without decodeEn
+  io.bru.valid := decodeEn && bruValid
+  io.bru.fwd := io.inst.brchFwd
+  io.bru.op := bruOp.asUInt
+  io.bru.pc := io.inst.addr
+  io.bru.target := io.inst.addr + Mux(io.inst.inst(2), d.io.immjal, d.io.immbr) // Bit 2 distinguishes JAL from branch immediates.
+  io.bru.link := rdAddr
+
+  bruOp(bru.JAL) := d.io.jal
+  bruOp(bru.JALR) := d.io.jalr
+  bruOp(bru.BEQ) := d.io.beq
+  bruOp(bru.BNE) := d.io.bne
+  bruOp(bru.BLT) := d.io.blt
+  bruOp(bru.BGE) := d.io.bge
+  bruOp(bru.BLTU) := d.io.bltu
+  bruOp(bru.BGEU) := d.io.bgeu
+  bruOp(bru.EBREAK) := d.io.ebreak
+  bruOp(bru.ECALL) := d.io.ecall
+  bruOp(bru.EEXIT) := d.io.eexit
+  bruOp(bru.EYIELD) := d.io.eyield
+  bruOp(bru.ECTXSW) := d.io.ectxsw
+  bruOp(bru.MPAUSE) := d.io.mpause
+  bruOp(bru.MRET) := d.io.mret
+  bruOp(bru.FENCEI) := d.io.fencei
+  bruOp(bru.UNDEF) := d.io.undef
+
+  // CSR opcode.
+  val csr = new CsrOp()
+  val csrOp = Wire(Vec(csr.Entries, Bool()))
+  val csrValid = WiredOR(io.csr.op)  // used without decodeEn
+  io.csr.valid := decodeEn && csrValid
+  io.csr.addr := rdAddr
+  io.csr.index := io.inst.inst(31,20)
+  io.csr.op := csrOp.asUInt
+
+  csrOp(csr.CSRRW) := d.io.csrrw
+  csrOp(csr.CSRRS) := d.io.csrrs
+  csrOp(csr.CSRRC) := d.io.csrrc
+
+  // LSU opcode.
+  val lsu = new LsuOp()
+  val lsuOp = Wire(Vec(lsu.Entries, Bool()))
+  val lsuValid = WiredOR(io.lsu.op)  // used without decodeEn
+  io.lsu.valid := decodeEn && lsuValid
+  io.lsu.store := io.inst.inst(5)   // Opcode bit 5 distinguishes stores from loads.
+  io.lsu.addr := rdAddr
+  io.lsu.op := lsuOp.asUInt
+
+  lsuOp(lsu.LB) := d.io.lb
+  lsuOp(lsu.LH) := d.io.lh
+  lsuOp(lsu.LW) := d.io.lw
+  lsuOp(lsu.LBU) := d.io.lbu
+  lsuOp(lsu.LHU) := d.io.lhu
+  lsuOp(lsu.SB) := d.io.sb
+  lsuOp(lsu.SH) := d.io.sh
+  lsuOp(lsu.SW) := d.io.sw
+  lsuOp(lsu.FENCEI) := d.io.fencei
+  lsuOp(lsu.FLUSHAT) := d.io.flushat
+  lsuOp(lsu.FLUSHALL) := d.io.flushall
+
+  lsuOp(lsu.VLDST) := d.io.vld || d.io.vst
+
+  // MLU opcode.
+  val mlu = new MluOp()
+  val mluOp = Wire(Vec(mlu.Entries, Bool()))
+  val mluValid = WiredOR(io.mlu.op)  // used without decodeEn
+  io.mlu.valid := decodeEn && mluValid
+  io.mlu.addr := rdAddr
+  io.mlu.op := mluOp.asUInt
+
+  mluOp(mlu.MUL) := d.io.mul
+  mluOp(mlu.MULH) := d.io.mulh
+  mluOp(mlu.MULHSU) := d.io.mulhsu
+  mluOp(mlu.MULHU) := d.io.mulhu
+  mluOp(mlu.MULHR) := d.io.mulhr
+  mluOp(mlu.MULHSUR) := d.io.mulhsur
+  mluOp(mlu.MULHUR) := d.io.mulhur
+  mluOp(mlu.DMULH) := d.io.dmulh
+  mluOp(mlu.DMULHR) := d.io.dmulhr
+
+  // DIV opcode.
+  val dvu = new DvuOp()
+  val dvuOp = Wire(Vec(dvu.Entries, Bool()))
+  val dvuValid = WiredOR(io.dvu.op)  // used without decodeEn
+  io.dvu.valid := decodeEn && dvuValid
+  io.dvu.addr := rdAddr
+  io.dvu.op := dvuOp.asUInt
+
+  dvuOp(dvu.DIV) := d.io.div
+  dvuOp(dvu.DIVU) := d.io.divu
+  dvuOp(dvu.REM) := d.io.rem
+  dvuOp(dvu.REMU) := d.io.remu
+
+  val dvuEn = WiredOR(io.dvu.op) === 0.U || io.dvu.ready  // Stall only when a divide is requested and the divider is busy.
+
+  // Vector instructions.
+  val vinst = new VInstOp()
+  val vinstOp = Wire(Vec(vinst.Entries, Bool()))
+  val vinstValid = WiredOR(vinstOp)  // used without decodeEn
+
+  io.vinst.valid := decodeEn && vinstValid
+  io.vinst.addr := rdAddr
+  io.vinst.inst := io.inst.inst
+  io.vinst.op := vinstOp.asUInt
+
+  vinstOp(vinst.VLD) := d.io.vld
+  vinstOp(vinst.VST) := d.io.vst
+  vinstOp(vinst.VIOP) := d.io.viop
+  vinstOp(vinst.GETVL) := d.io.getvl
+  vinstOp(vinst.GETMAXVL) := d.io.getmaxvl
+
+  // Scalar logging.
+  io.slog := decodeEn && d.io.slog
+
+  // Register file read ports.
+  io.rs1Read.valid := decodeEn && (isCondBr || isAluReg || isAluImm || isAlu1Bit || isAlu2Bit ||
+                      isCsrImm || isCsrReg || isMul || isDvu || d.io.slog ||
+                      d.io.getvl || d.io.vld || d.io.vst)
+  io.rs2Read.valid := decodeEn && (isCondBr || isAluReg || isAlu2Bit || isStore ||
+                      isCsrReg || isMul || isDvu || d.io.slog || d.io.getvl ||
+                      d.io.vld || d.io.vst || d.io.viop)
+
+  // rs1 is on critical path to busPortAddr.
+  io.rs1Read.addr := Mux(io.inst.inst(0), rs1Addr, rs3Addr)
+
+  // rs2 is used for the vector operation scalar value.
+  io.rs2Read.addr := rs2Addr
+
+  // Register file set ports.
+  io.rs1Set.valid := decodeEn && (d.io.auipc || isCsrImm)
+  io.rs2Set.valid := io.rs1Set.valid || decodeEn && (isAluImm || isAlu1Bit || d.io.lui)
+
+  io.rs1Set.value := Mux(isCsr, d.io.immcsr, io.inst.addr)  // Program Counter (PC)
+
+  io.rs2Set.value := MuxCase(d.io.imm12,
+                      IndexedSeq((d.io.auipc || d.io.lui) -> d.io.imm20))
+
+  // Register file write address ports. We speculate without knowing the decode
+  // enable status to improve timing, and under a branch is ignored anyway.
+  val rdMark_valid =
+      aluValid || csrValid || mluValid || dvuValid && io.dvu.ready ||
+      lsuValid && isLoad ||
+      d.io.getvl || d.io.getmaxvl || vldst_wb ||
+      bruValid && (bruOp(bru.JAL) || bruOp(bru.JALR)) && rdAddr =/= 0.U
+
+  // val scoreboard_spec = Mux(rdMark_valid || d.io.vst, OneHot(rdAddr, 32), 0.U)  // TODO: why was d.io.vst included?
+  val scoreboard_spec = Mux(rdMark_valid, OneHot(rdAddr, 32), 0.U)
+  io.scoreboard.spec := Cat(scoreboard_spec(31,1), 0.U(1.W))  // x0 is never marked busy.
+
+  io.rdMark.valid := decodeEn && rdMark_valid
+  io.rdMark.addr := rdAddr
+
+  // Register file bus address port.
+  // Pointer chasing bypass if immediate is zero.
+  // Load/Store immediate selection keys off bit5, and RET off bit6.
+  io.busRead.valid := lsuValid
+  io.busRead.bypass := io.inst.inst(31,25) === 0.U &&
+    Mux(!io.inst.inst(5) || io.inst.inst(6), io.inst.inst(24,20) === 0.U,
+        io.inst.inst(11,7) === 0.U)
+
+  // SB,SH,SW   0100011
+  // FSW        0100111  // TODO(hoangm)
+  val storeSelect = io.inst.inst(6,3) === 4.U && io.inst.inst(1,0) === 3.U
+  io.busRead.immen := !d.io.flushat
+  io.busRead.immed := Cat(d.io.imm12(31,5),
+                          Mux(storeSelect, d.io.immst(4,0), d.io.imm12(4,0)))
+
+  // Decode ready signalling to fetch.
+  // This must not factor branchTaken, which will be done directly in the
+  // fetch unit. Note above decodeEn resolves for branch for execute usage.
+  io.inst.ready := aluEn && bruEn && lsuEn && mulEn && dvuEn && vinstEn && fenceEn &&
+                   !io.serializeIn.jump && !io.halted && !io.interlock &&
+                   (pipeline.U === 0.U || !d.io.undef)  // Only pipeline 0 accepts undefined opcodes.
+
+  // Serialize Interface.
+  // io.serializeOut.lsu := io.serializeIn.lsu || lsuValid || vldst  // vldst interlock for address generation cycle in vinst
+  // io.serializeOut.lsu := io.serializeIn.lsu || vldst  // vldst interlock for address generation cycle in vinst
+  io.serializeOut.lsu := io.serializeIn.lsu
+  io.serializeOut.mul := io.serializeIn.mul || mluValid
+  io.serializeOut.jump := io.serializeIn.jump || d.io.jal || d.io.jalr ||
+                          d.io.ebreak || d.io.ecall || d.io.eexit ||
+                          d.io.eyield || d.io.ectxsw || d.io.mpause || d.io.mret
+  io.serializeOut.brcond := io.serializeIn.brcond |  // NOTE(review): `|` vs `||` — equivalent for chisel Bool, but inconsistent style.
+      d.io.beq || d.io.bne || d.io.blt || d.io.bge || d.io.bltu || d.io.bgeu
+  io.serializeOut.vinst := io.serializeIn.vinst
+}
+
+class DecodedInstruction(p: Parameters, pipeline: Int) extends Module { // One-hot combinational decode of a single 32-bit instruction word.
+  val io = IO(new Bundle {
+    val addr = Input(UInt(32.W))  // Instruction address (for the failure printf only).
+    val inst = Input(UInt(32.W))  // Raw instruction word.
+
+    // Immediates
+    val imm12 = Output(UInt(32.W))   // Sign-extended I-type immediate.
+    val imm20 = Output(UInt(32.W))   // U-type immediate (upper 20 bits).
+    val immjal = Output(UInt(32.W))  // Sign-extended J-type immediate.
+    val immbr = Output(UInt(32.W))   // Sign-extended B-type immediate.
+    val immcsr = Output(UInt(32.W))  // Zero-extended 5-bit CSR immediate (zimm).
+    val immst = Output(UInt(32.W))   // Sign-extended S-type immediate.
+
+    // RV32I
+    val lui = Output(Bool())
+    val auipc = Output(Bool())
+    val jal = Output(Bool())
+    val jalr = Output(Bool())
+    val beq = Output(Bool())
+    val bne = Output(Bool())
+    val blt = Output(Bool())
+    val bge = Output(Bool())
+    val bltu = Output(Bool())
+    val bgeu = Output(Bool())
+    val csrrw = Output(Bool())
+    val csrrs = Output(Bool())
+    val csrrc = Output(Bool())
+    val lb = Output(Bool())
+    val lh = Output(Bool())
+    val lw = Output(Bool())
+    val lbu = Output(Bool())
+    val lhu = Output(Bool())
+    val sb = Output(Bool())
+    val sh = Output(Bool())
+    val sw = Output(Bool())
+    val fence = Output(Bool())
+    val addi = Output(Bool())
+    val slti = Output(Bool())
+    val sltiu = Output(Bool())
+    val xori = Output(Bool())
+    val ori = Output(Bool())
+    val andi = Output(Bool())
+    val slli = Output(Bool())
+    val srli = Output(Bool())
+    val srai = Output(Bool())
+    val add = Output(Bool())
+    val sub = Output(Bool())
+    val slt = Output(Bool())
+    val sltu = Output(Bool())
+    val xor = Output(Bool())
+    val or = Output(Bool())
+    val and = Output(Bool())
+    val sll = Output(Bool())
+    val srl = Output(Bool())
+    val sra = Output(Bool())
+
+    // RV32M
+    val mul = Output(Bool())
+    val mulh = Output(Bool())
+    val mulhsu = Output(Bool())
+    val mulhu = Output(Bool())
+    val mulhr = Output(Bool())     // Rounding variants — non-standard extension; confirm spec.
+    val mulhsur = Output(Bool())
+    val mulhur = Output(Bool())
+    val dmulh = Output(Bool())
+    val dmulhr = Output(Bool())
+    val div = Output(Bool())
+    val divu = Output(Bool())
+    val rem = Output(Bool())
+    val remu = Output(Bool())
+
+    // RV32B
+    val clz = Output(Bool())
+    val ctz = Output(Bool())
+    val pcnt = Output(Bool())
+    val min = Output(Bool())
+    val minu = Output(Bool())
+    val max = Output(Bool())
+    val maxu = Output(Bool())
+
+    // Vector instructions.
+    val getvl = Output(Bool())
+    val getmaxvl = Output(Bool())
+    val vld = Output(Bool())
+    val vst = Output(Bool())
+    val viop = Output(Bool())
+
+    // Core controls.
+    val ebreak = Output(Bool())
+    val ecall = Output(Bool())
+    val eexit = Output(Bool())
+    val eyield = Output(Bool())
+    val ectxsw = Output(Bool())
+    val mpause = Output(Bool())
+    val mret = Output(Bool())
+    val undef = Output(Bool())   // No pattern matched: undefined opcode.
+
+    // Fences.
+    val fencei = Output(Bool())
+    val flushat = Output(Bool())
+    val flushall = Output(Bool())
+
+    // Scalar logging.
+    val slog = Output(Bool())
+  })
+
+  val op = io.inst
+
+  // Immediates
+  io.imm12 := Cat(Fill(20, op(31)), op(31,20))
+  io.imm20 := Cat(op(31,12), 0.U(12.W))
+  io.immjal := Cat(Fill(12, op(31)), op(19,12), op(20), op(30,21), 0.U(1.W))
+  io.immbr := Cat(Fill(20, op(31)), op(7), op(30,25), op(11,8), 0.U(1.W))
+  io.immcsr := op(19,15)
+  io.immst := Cat(Fill(20, op(31)), op(31,25), op(11,7))
+
+  // RV32I
+  io.lui := DecodeBits(op, "xxxxxxxxxxxxxxxxxxxx_xxxxx_0110111")
+  io.auipc := DecodeBits(op, "xxxxxxxxxxxxxxxxxxxx_xxxxx_0010111")
+  io.jal := DecodeBits(op, "xxxxxxxxxxxxxxxxxxxx_xxxxx_1101111")
+  io.jalr := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_000_xxxxx_1100111")
+  io.beq := DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_000_xxxxx_1100011")
+  io.bne := DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_001_xxxxx_1100011")
+  io.blt := DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_100_xxxxx_1100011")
+  io.bge := DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_101_xxxxx_1100011")
+  io.bltu := DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_110_xxxxx_1100011")
+  io.bgeu := DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_111_xxxxx_1100011")
+  io.csrrw := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_x01_xxxxx_1110011")
+  io.csrrs := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_x10_xxxxx_1110011")
+  io.csrrc := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_x11_xxxxx_1110011")
+  io.lb := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_000_xxxxx_0000011")
+  io.lh := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_001_xxxxx_0000011")
+  io.lw := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_010_xxxxx_0000011")
+  io.lbu := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_100_xxxxx_0000011")
+  io.lhu := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_101_xxxxx_0000011")
+  io.sb := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_000_xxxxx_0100011")
+  io.sh := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_001_xxxxx_0100011")
+  io.sw := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_010_xxxxx_0100011")
+  io.fence := DecodeBits(op, "0000_xxxx_xxxx_00000_000_00000_0001111")
+  io.addi := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_000_xxxxx_0010011")
+  io.slti := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_010_xxxxx_0010011")
+  io.sltiu := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_011_xxxxx_0010011")
+  io.xori := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_100_xxxxx_0010011")
+  io.ori := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_110_xxxxx_0010011")
+  io.andi := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_111_xxxxx_0010011")
+  io.slli := DecodeBits(op, "0000000_xxxxx_xxxxx_001_xxxxx_0010011")
+  io.srli := DecodeBits(op, "0000000_xxxxx_xxxxx_101_xxxxx_0010011")
+  io.srai := DecodeBits(op, "0100000_xxxxx_xxxxx_101_xxxxx_0010011")
+  io.add := DecodeBits(op, "0000000_xxxxx_xxxxx_000_xxxxx_0110011")
+  io.sub := DecodeBits(op, "0100000_xxxxx_xxxxx_000_xxxxx_0110011")
+  io.slt := DecodeBits(op, "0000000_xxxxx_xxxxx_010_xxxxx_0110011")
+  io.sltu := DecodeBits(op, "0000000_xxxxx_xxxxx_011_xxxxx_0110011")
+  io.xor := DecodeBits(op, "0000000_xxxxx_xxxxx_100_xxxxx_0110011")
+  io.or := DecodeBits(op, "0000000_xxxxx_xxxxx_110_xxxxx_0110011")
+  io.and := DecodeBits(op, "0000000_xxxxx_xxxxx_111_xxxxx_0110011")
+  io.sll := DecodeBits(op, "0000000_xxxxx_xxxxx_001_xxxxx_0110011")
+  io.srl := DecodeBits(op, "0000000_xxxxx_xxxxx_101_xxxxx_0110011")
+  io.sra := DecodeBits(op, "0100000_xxxxx_xxxxx_101_xxxxx_0110011")
+
+  // RV32M
+  io.mul := DecodeBits(op, "0000_001_xxxxx_xxxxx_000_xxxxx_0110011")
+  io.mulh := DecodeBits(op, "0000_001_xxxxx_xxxxx_001_xxxxx_0110011")
+  io.mulhsu := DecodeBits(op, "0000_001_xxxxx_xxxxx_010_xxxxx_0110011")
+  io.mulhu := DecodeBits(op, "0000_001_xxxxx_xxxxx_011_xxxxx_0110011")
+  io.mulhr := DecodeBits(op, "0010_001_xxxxx_xxxxx_001_xxxxx_0110011")
+  io.mulhsur := DecodeBits(op, "0010_001_xxxxx_xxxxx_010_xxxxx_0110011")
+  io.mulhur := DecodeBits(op, "0010_001_xxxxx_xxxxx_011_xxxxx_0110011")
+  io.dmulh := DecodeBits(op, "0000_010_xxxxx_xxxxx_001_xxxxx_0110011")
+  io.dmulhr := DecodeBits(op, "0010_010_xxxxx_xxxxx_001_xxxxx_0110011")
+  io.div := DecodeBits(op, "0000_001_xxxxx_xxxxx_100_xxxxx_0110011")
+  io.divu := DecodeBits(op, "0000_001_xxxxx_xxxxx_101_xxxxx_0110011")
+  io.rem := DecodeBits(op, "0000_001_xxxxx_xxxxx_110_xxxxx_0110011")
+  io.remu := DecodeBits(op, "0000_001_xxxxx_xxxxx_111_xxxxx_0110011")
+
+  // RV32B
+  io.clz := DecodeBits(op, "0110000_00000_xxxxx_001_xxxxx_0010011")
+  io.ctz := DecodeBits(op, "0110000_00001_xxxxx_001_xxxxx_0010011")
+  io.pcnt := DecodeBits(op, "0110000_00010_xxxxx_001_xxxxx_0010011")
+  io.min := DecodeBits(op, "0000101_xxxxx_xxxxx_100_xxxxx_0110011")
+  io.minu := DecodeBits(op, "0000101_xxxxx_xxxxx_101_xxxxx_0110011")
+  io.max := DecodeBits(op, "0000101_xxxxx_xxxxx_110_xxxxx_0110011")
+  io.maxu := DecodeBits(op, "0000101_xxxxx_xxxxx_111_xxxxx_0110011")
+
+  // Decode scalar log.
+  val slog = DecodeBits(op, "01111_00_00000_xxxxx_0xx_00000_11101_11")
+
+  // Vector length.
+  io.getvl := DecodeBits(op, "0001x_xx_xxxxx_xxxxx_000_xxxxx_11101_11") && op(26,25) =/= 3.U && (op(24,20) =/= 0.U || op(19,15) =/= 0.U)
+  io.getmaxvl := DecodeBits(op, "0001x_xx_00000_00000_000_xxxxx_11101_11") && op(26,25) =/= 3.U
+
+  // Vector load/store.
+  io.vld := DecodeBits(op, "000xxx_0xxxxx_xxxxx0_xx_xxxxxx_x_111_11")  // vld
+
+  io.vst := DecodeBits(op, "001xxx_0xxxxx_xxxxx0_xx_xxxxxx_x_111_11") ||  // vst
+            DecodeBits(op, "011xxx_0xxxxx_xxxxx0_xx_xxxxxx_x_111_11")  // vstq
+
+  // Convolution transfer accumulators to vregs. Also decodes acset/actr ops.
+  val vconv = DecodeBits(op, "010100_000000_000000_xx_xxxxxx_x_111_11")
+
+  // Duplicate
+  val vdup = DecodeBits(op, "01000x_0xxxxx_000000_xx_xxxxxx_x_111_11") && op(13,12) <= 2.U
+  val vdupi = vdup && op(26) === 0.U
+
+  // Vector instructions.
+  io.viop := op(0) === 0.U ||      // .vv .vx
+             op(1,0) === 1.U ||    // .vvv .vxv
+             vconv || vdupi
+
+  // [extensions] Core controls.
+  io.ebreak := DecodeBits(op, "000000000001_00000_000_00000_11100_11")
+  io.ecall := DecodeBits(op, "000000000000_00000_000_00000_11100_11")
+  io.eexit := DecodeBits(op, "000000100000_00000_000_00000_11100_11")
+  io.eyield := DecodeBits(op, "000001000000_00000_000_00000_11100_11")
+  io.ectxsw := DecodeBits(op, "000001100000_00000_000_00000_11100_11")
+  io.mpause := DecodeBits(op, "000010000000_00000_000_00000_11100_11")
+  io.mret := DecodeBits(op, "001100000010_00000_000_00000_11100_11")
+
+  // Fences.
+  io.fencei := DecodeBits(op, "0000_0000_0000_00000_001_00000_0001111")
+  io.flushat := DecodeBits(op, "0010x_xx_00000_xxxxx_000_00000_11101_11") && op(19,15) =/= 0.U
+  io.flushall := DecodeBits(op, "0010x_xx_00000_00000_000_00000_11101_11")
+
+  // [extensions] Scalar logging.
+  io.slog := slog
+
+  // Stub out decoder state not used beyond pipeline0.
+  if (pipeline > 0) {  // Elaboration-time pruning: later pipes never see these ops.
+    io.csrrw := false.B
+    io.csrrs := false.B
+    io.csrrc := false.B
+
+    io.div := false.B
+    io.divu := false.B
+    io.rem := false.B
+    io.remu := false.B
+
+    io.ebreak := false.B
+    io.ecall := false.B
+    io.eexit := false.B
+    io.eyield := false.B
+    io.ectxsw := false.B
+    io.mpause := false.B
+    io.mret := false.B
+
+    io.fence := false.B
+    io.fencei := false.B
+    io.flushat := false.B
+    io.flushall := false.B
+
+    io.slog := false.B
+  }
+
+  // Generate the undefined opcode.
+  val decoded = Cat(io.lui, io.auipc,
+                    io.jal, io.jalr,
+                    io.beq, io.bne, io.blt, io.bge, io.bltu, io.bgeu,
+                    io.csrrw, io.csrrs, io.csrrc,
+                    io.lb, io.lh, io.lw, io.lbu, io.lhu,
+                    io.sb, io.sh, io.sw, io.fence,
+                    io.addi, io.slti, io.sltiu, io.xori, io.ori, io.andi,
+                    io.add, io.sub, io.slt, io.sltu, io.xor, io.or, io.and,
+                    io.slli, io.srli, io.srai, io.sll, io.srl, io.sra,
+                    io.mul, io.mulh, io.mulhsu, io.mulhu, io.mulhr, io.mulhsur, io.mulhur, io.dmulh, io.dmulhr,
+                    io.div, io.divu, io.rem, io.remu,
+                    io.clz, io.ctz, io.pcnt, io.min, io.minu, io.max, io.maxu,
+                    io.viop, io.vld, io.vst,
+                    io.getvl, io.getmaxvl,
+                    io.ebreak, io.ecall, io.eexit, io.eyield, io.ectxsw,
+                    io.mpause, io.mret, io.fencei, io.flushat, io.flushall, io.slog)
+
+  io.undef := !WiredOR(decoded)  // Undefined iff no pattern fired.
+
+  // Delay the assert until the next cycle, so that logs appear on console.
+  val onehot_failed = RegInit(false.B)
+  assert(!onehot_failed)
+
+  val onehot_decode = PopCount(decoded)
+  when ((onehot_decode + io.undef) =/= 1.U) {  // Exactly one decode bit, or undef, must be set.
+    onehot_failed := true.B
+    printf("[FAIL] decode inst=%x addr=%x decoded=0b%b pipeline=%d\n",
+           io.inst, io.addr, decoded, pipeline.U)
+  }
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Dvu.scala b/hdl/chisel/src/kelvin/scalar/Dvu.scala
new file mode 100644
index 0000000..8f117d1
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Dvu.scala
@@ -0,0 +1,145 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Dvu { // Companion factory so callers write Dvu(p) instead of Module(new Dvu(p)).
+  def apply(p: Parameters): Dvu = {
+    return Module(new Dvu(p)) // NOTE(review): explicit `return` is non-idiomatic Scala but harmless here.
+  }
+}
+
+case class DvuOp() { // One-hot bit indices for the divide-unit opcode vector.
+  val DIV = 0      // Signed divide.
+  val DIVU = 1     // Unsigned divide.
+  val REM = 2      // Signed remainder.
+  val REMU = 3     // Unsigned remainder.
+  val Entries = 4  // Width of the op vector.
+}
+
+class DvuIO(p: Parameters) extends Bundle { // Decode-side request interface to the divider (valid/ready handshake).
+  val valid = Input(Bool())
+  val ready = Output(Bool())
+  val addr = Input(UInt(5.W))                  // Destination register for the eventual result.
+  val op = Input(UInt(new DvuOp().Entries.W))  // One-hot DvuOp selector.
+}
+
+class Dvu(p: Parameters) extends Module { // Multi-cycle divide/remainder unit, one quotient bit per cycle, with early termination via leading-zero skip.
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val req = new DvuIO(p)
+
+    // Execute cycle.
+    val rs1 = Flipped(new RegfileReadDataIO)  // Dividend operand.
+    val rs2 = Flipped(new RegfileReadDataIO)  // Divisor operand.
+    val rd = new Bundle {  // RegfileWriteDataIO
+      val valid = Output(Bool())
+      val ready = Input(Bool())
+      val addr = Output(UInt(5.W))
+      val data = Output(UInt(32.W))
+    }
+  })
+
+  // This implemention differs to common::idiv by supporting early termination,
+  // and only performs one bit per cycle.
+  val dvu = new DvuOp()
+
+  // One restoring-division step: shift a dividend bit into the partial
+  // remainder, subtract the denominator, keep the difference if non-negative.
+  def Divide(prvDivide: UInt, prvRemain: UInt, denom: UInt): (UInt, UInt) = {
+    val shfRemain = Cat(prvRemain(30,0), prvDivide(31))
+    val subtract = shfRemain -& denom  // Widening subtract; bit 32 is the borrow.
+    assert(subtract.getWidth == 33)
+    val divDivide = Wire(UInt(32.W))
+    val divRemain = Wire(UInt(32.W))
+
+    when (!subtract(32)) {   // No borrow: denom fits, emit a 1 quotient bit.
+      divDivide := Cat(prvDivide(30,0), 1.U(1.W))
+      divRemain := subtract(31,0)
+    } .otherwise {           // Borrow: restore, emit a 0 quotient bit.
+      divDivide := Cat(prvDivide(30,0), 0.U(1.W))
+      divRemain := shfRemain
+    }
+
+    (divDivide, divRemain)
+  }
+
+  val active = RegInit(false.B)   // Request accepted, operands being latched.
+  val compute = RegInit(false.B)  // Iterating division steps.
+
+  val addr1 = Reg(UInt(5.W))      // Stage-1 (decode) captured destination.
+  val signed1 = Reg(Bool())
+  val divide1 = Reg(Bool())       // True for DIV/DIVU, false for REM/REMU.
+  val addr2 = Reg(UInt(5.W))      // Stage-2 (operand) captured destination.
+  val signed2d = Reg(Bool())      // Negate the quotient at the end.
+  val signed2r = Reg(Bool())      // Negate the remainder at the end.
+  val divide2 = Reg(Bool())
+
+  val count = Reg(UInt(6.W))      // Iteration counter; bit 5 set means 32 steps done.
+
+  val divide = Reg(UInt(32.W))    // Quotient shift register.
+  val remain = Reg(UInt(32.W))    // Partial remainder.
+  val denom = Reg(UInt(32.W))     // Latched (absolute) divisor.
+
+  val divByZero = io.rs2.data === 0.U
+
+  io.req.ready := !active && !compute && !count(5)  // Idle and no result pending.
+
+  // This is not a Clz, one value too small.
+  def Clz1(bits: UInt): UInt = {
+    val msb = bits.getWidth - 1
+    Mux(bits(msb), 0.U, PriorityEncoder(Reverse(bits(msb - 1, 0))))
+  }
+
+  // Disable active second to last cycle.
+  when (io.req.valid && io.req.ready) {
+    active := true.B
+  } .elsewhen (count === 30.U) {
+    active := false.B
+  }
+
+  // Compute is delayed by one cycle.
+  compute := active
+
+  when (io.req.valid && io.req.ready) {  // Stage 1: latch the request.
+    addr1 := io.req.addr
+    signed1 := io.req.op(dvu.DIV) || io.req.op(dvu.REM)
+    divide1 := io.req.op(dvu.DIV) || io.req.op(dvu.DIVU)
+  }
+
+  when (active && !compute) {  // Stage 2: latch operands, normalize signs, skip leading zeroes.
+    addr2 := addr1
+    signed2d := signed1 && (io.rs1.data(31) =/= io.rs2.data(31)) && !divByZero
+    signed2r := signed1 && io.rs1.data(31)
+    divide2 := divide1
+
+    val inp = Mux(signed1 && io.rs1.data(31), ~io.rs1.data + 1.U, io.rs1.data)  // |rs1|
+
+    // The divBy0 uses full latency to simplify logic.
+    // Count the leading zeroes, which is one less than the priority encoding.
+    val clz = Mux(io.rs2.data === 0.U, 0.U, Clz1(inp))
+
+    denom := Mux(signed1 && io.rs2.data(31), ~io.rs2.data + 1.U, io.rs2.data)  // |rs2|
+    divide := inp << clz  // Pre-shift so iterations start at the first significant bit.
+    remain := 0.U
+    count := clz
+  } .elsewhen (compute && count < 32.U) {  // Iterate one restoring step per cycle.
+    val (div, rem) = Divide(divide, remain, denom)
+    divide := div
+    remain := rem
+    count := count + 1.U
+  } .elsewhen (io.rd.valid && io.rd.ready) {  // Result consumed: clear pending state.
+    count := 0.U
+  }
+
+  val div = Mux(signed2d, ~divide + 1.U, divide)  // Re-apply quotient sign.
+  val rem = Mux(signed2r, ~remain + 1.U, remain)  // Re-apply remainder sign.
+
+  io.rd.valid := count(5)  // count reached 32: result ready.
+  io.rd.addr := addr2
+  io.rd.data := Mux(divide2, div, rem)
+}
+
+object EmitDvu extends App { // Standalone entry point: elaborate Dvu and emit Verilog via ChiselStage.
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new Dvu(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Fetch.scala b/hdl/chisel/src/kelvin/scalar/Fetch.scala
new file mode 100644
index 0000000..0351317
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Fetch.scala
@@ -0,0 +1,507 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Factory companion: wraps construction in Module(...) as chisel3 requires.
+object Fetch {
+  def apply(p: Parameters): Fetch = {
+    // Explicit 'return' is non-idiomatic Scala; the last expression is the result.
+    Module(new Fetch(p))
+  }
+}
+
+// Instruction bus: two-phase read port (request, then data the next cycle).
+class IBusIO(p: Parameters) extends Bundle {
+  // Control Phase.
+  val valid = Output(Bool())                   // request valid; accepted when valid && ready
+  val ready = Input(Bool())
+  val addr = Output(UInt(p.fetchAddrBits.W))   // line-aligned fill address
+  // Read Phase.
+  val rdata = Input(UInt(p.fetchDataBits.W))   // full line, sampled the cycle after accept
+}
+
+// One instruction lane handed from fetch to decode.
+class FetchInstruction(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val ready = Input(Bool())
+  val addr = Output(UInt(p.programCounterBits.W))  // instruction address
+  val inst = Output(UInt(p.instructionBits.W))     // raw instruction bits
+  val brchFwd = Output(Bool())  // branch already resolved by fetch predecode
+}
+
+// Fetch-to-decode interface: one FetchInstruction per issue lane.
+class FetchIO(p: Parameters) extends Bundle {
+  val lanes = Vec(p.instructionLanes, new FetchInstruction(p))
+}
+
+// Instruction fetch unit.
+//
+// Maintains a small direct-mapped L0 instruction cache filled over the
+// instruction bus, predecodes branches (jal / ret / conditional) from the
+// currently-presented window, and offers up to four sequential instructions
+// per cycle to decode. Execute-stage redirects arrive on io.branch.
+class Fetch(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val csr = new CsrInIO(p)  // csr.value(0) supplies the reset fetch address
+    val ibus = new IBusIO(p)  // line-fill bus
+    val inst = new FetchIO(p)  // instruction lanes to decode
+    val branch = Flipped(Vec(4, new BranchTakenIO(p)))  // execute-stage redirects, lane 0 highest priority
+    val linkPort = Flipped(new RegfileLinkPortIO)  // link register snoop for 'ret' predecode
+    val iflush = Flipped(new IFlushIO(p))  // instruction-cache flush request
+  })
+
+  // This is the only compiled and tested configuration (at this time).
+  assert(p.fetchAddrBits == 32)
+  assert(p.fetchDataBits == 256)
+
+  // Fill-request address queue and the registered ibus read phase.
+  val aslice = Slice(UInt(p.fetchAddrBits.W), true)
+  val readAddr = Reg(UInt(p.fetchAddrBits.W))  // address of the outstanding fill
+  val readDataEn = RegInit(false.B)  // rdata is consumed the cycle after request accept
+
+  val readAddrEn = io.ibus.valid && io.ibus.ready
+  val readData = io.ibus.rdata
+  readDataEn := readAddrEn && !io.iflush.valid  // drop the fill response on a flush
+
+  io.iflush.ready := !aslice.io.out.valid  // flush completes once no fill request is pending
+
+  // L0 cache
+  // ____________________________________
+  // |            Tag            |Index|xxxxx|
+  // ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
+  val lanes = p.fetchDataBits / p.instructionBits  // input lanes
+  val indices = p.fetchCacheBytes * 8 / p.fetchDataBits  // number of cache lines
+  val indexLsb = log2Ceil(p.fetchDataBits / 8)
+  val indexMsb = log2Ceil(indices) + indexLsb - 1
+  val tagLsb = indexMsb + 1
+  val tagMsb = p.fetchAddrBits - 1
+  val indexCountBits = log2Ceil(indices - 1)
+
+  if (p.fetchCacheBytes == 1024) {
+    assert(indexLsb == 5)
+    assert(indexMsb == 9)
+    assert(tagLsb == 10)
+    assert(tagMsb == 31)
+    assert(indices == 32)
+    assert(indexCountBits == 5)
+    assert(lanes == 8)
+  }
+
+  val l0valid = RegInit(0.U(indices.W))  // per-line valid bits
+  val l0req = RegInit(0.U(indices.W))    // per-line fill-inflight bits
+  val l0tag = Reg(Vec(indices, UInt((tagMsb - tagLsb + 1).W)))
+  val l0data = Reg(Vec(indices, UInt(p.fetchDataBits.W)))
+
+  // Instruction outputs.
+  val instValid = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val instAddr = Reg(Vec(4, UInt(p.instructionBits.W)))
+  val instBits = Reg(Vec(4, UInt(p.instructionBits.W)))
+
+  // Line-aligned base of the current window, and the following line.
+  val instAligned0 = Cat(instAddr(0)(31, indexLsb), 0.U(indexLsb.W))
+  val instAligned1 = instAligned0 + Cat(1.U, 0.U(indexLsb.W))
+
+  val instIndex0 = instAligned0(indexMsb, indexLsb)
+  val instIndex1 = instAligned1(indexMsb, indexLsb)
+
+  val instTag0 = instAligned0(tagMsb, tagLsb)
+  val instTag1 = instAligned1(tagMsb, tagLsb)
+
+  val l0valid0 = l0valid(instIndex0)
+  val l0valid1 = l0valid(instIndex1)
+
+  val l0tag0 = VecAt(l0tag, instIndex0)
+  val l0tag1 = VecAt(l0tag, instIndex1)
+
+  val match0 = l0valid0 && instTag0 === l0tag0  // current line hits in L0
+  val match1 = l0valid1 && instTag1 === l0tag1  // next line hits in L0
+
+  // Read interface.
+  // Do not request entries that are already inflight.
+  // Perform a branch tag lookup to see if target is in cache.
+  // Lightweight predecode: detect jal and compute its J-type target.
+  def Predecode(addr: UInt, op: UInt): (Bool, UInt) = {
+    val jal = DecodeBits(op, "xxxxxxxxxxxxxxxxxxxx_xxxxx_1101111")
+    // J-type immediate imm[20|10:1|11|19:12], sign-extended, LSB zero.
+    val immed = Cat(Fill(12, op(31)), op(19,12), op(20), op(30,21), 0.U(1.W))
+    val target = addr + immed
+    (jal, target)
+  }
+
+  val (preBranchTaken0, preBranchTarget0) =
+      Predecode(instAddr(0), instBits(0))
+  val (preBranchTaken1, preBranchTarget1) =
+      Predecode(instAddr(1), instBits(1))
+  val (preBranchTaken2, preBranchTarget2) =
+      Predecode(instAddr(2), instBits(2))
+  val (preBranchTaken3, preBranchTarget3) =
+      Predecode(instAddr(3), instBits(3))
+
+  // A jal in any currently-valid lane triggers a prefetch of its target line.
+  val preBranchTaken = io.inst.lanes(0).valid && preBranchTaken0 ||
+                       io.inst.lanes(1).valid && preBranchTaken1 ||
+                       io.inst.lanes(2).valid && preBranchTaken2 ||
+                       io.inst.lanes(3).valid && preBranchTaken3
+
+  // First taken lane wins the target mux.
+  val preBranchTarget = Mux(preBranchTaken0, preBranchTarget0,
+                        Mux(preBranchTaken1, preBranchTarget1,
+                        Mux(preBranchTaken2, preBranchTarget2,
+                        preBranchTarget3)))
+
+  val preBranchTag = preBranchTarget(tagMsb, tagLsb)
+  val preBranchIndex = preBranchTarget(indexMsb, indexLsb)
+
+  val branchTag0 = io.branch(0).value(tagMsb, tagLsb)
+  val branchTag1 = io.branch(1).value(tagMsb, tagLsb)
+  val branchTag2 = io.branch(2).value(tagMsb, tagLsb)
+  val branchTag3 = io.branch(3).value(tagMsb, tagLsb)
+  val branchIndex0 = io.branch(0).value(indexMsb, indexLsb)
+  val branchIndex1 = io.branch(1).value(indexMsb, indexLsb)
+  val branchIndex2 = io.branch(2).value(indexMsb, indexLsb)
+  val branchIndex3 = io.branch(3).value(indexMsb, indexLsb)
+
+  val l0validB0 = l0valid(branchIndex0)
+  val l0validB1 = l0valid(branchIndex1)
+  val l0validB2 = l0valid(branchIndex2)
+  val l0validB3 = l0valid(branchIndex3)
+  val l0validP = l0valid(preBranchIndex)
+
+  val l0tagB0 = VecAt(l0tag, branchIndex0)
+  val l0tagB1 = VecAt(l0tag, branchIndex1)
+  val l0tagB2 = VecAt(l0tag, branchIndex2)
+  val l0tagB3 = VecAt(l0tag, branchIndex3)
+  val l0tagP = VecAt(l0tag, preBranchIndex)
+
+  // Fill requests, priority order: branch lanes 0..3, predecode target, then
+  // the current and next sequential lines. A line that misses is requested
+  // only if not already inflight (l0req).
+  val reqB0 = io.branch(0).valid && !l0req(branchIndex0) &&
+              (branchTag0 =/= l0tagB0 || !l0validB0)
+  val reqB1 = io.branch(1).valid && !l0req(branchIndex1) &&
+              (branchTag1 =/= l0tagB1 || !l0validB1) &&
+              !io.branch(0).valid
+  val reqB2 = io.branch(2).valid && !l0req(branchIndex2) &&
+              (branchTag2 =/= l0tagB2 || !l0validB2) &&
+              !io.branch(0).valid && !io.branch(1).valid
+  val reqB3 = io.branch(3).valid && !l0req(branchIndex3) &&
+              (branchTag3 =/= l0tagB3 || !l0validB3) &&
+              !io.branch(0).valid && !io.branch(1).valid && !io.branch(2).valid
+  val reqP = preBranchTaken && !l0req(preBranchIndex) && (preBranchTag =/= l0tagP || !l0validP)
+  val req0 = !match0 && !l0req(instIndex0)
+  val req1 = !match1 && !l0req(instIndex1)
+
+  aslice.io.in.valid := (reqB0 || reqB1 || reqB2 || reqB3 || reqP || req0 || req1) && !io.iflush.valid
+  aslice.io.in.bits := Mux(reqB0, Cat(io.branch(0).value(31,indexLsb), 0.U(indexLsb.W)),
+                       Mux(reqB1, Cat(io.branch(1).value(31,indexLsb), 0.U(indexLsb.W)),
+                       Mux(reqB2, Cat(io.branch(2).value(31,indexLsb), 0.U(indexLsb.W)),
+                       Mux(reqB3, Cat(io.branch(3).value(31,indexLsb), 0.U(indexLsb.W)),
+                       Mux(reqP, Cat(preBranchTarget(31,indexLsb), 0.U(indexLsb.W)),
+                       Mux(req0, instAligned0, instAligned1))))))
+
+  when (readAddrEn) {
+    readAddr := io.ibus.addr
+  }
+
+  io.ibus.valid := aslice.io.out.valid
+  aslice.io.out.ready := io.ibus.ready || io.iflush.valid  // flush drains queued requests
+  io.ibus.addr := aslice.io.out.bits
+
+  // initialize tags to 1s as 0xfffxxxxx are invalid instruction addresses
+  val l0validClr = WireInit(0.U(indices.W))
+  val l0validSet = WireInit(0.U(indices.W))
+  val l0reqClr = WireInit(0.U(indices.W))
+  val l0reqSet = WireInit(0.U(indices.W))
+
+  val readIdx = readAddr(indexMsb, indexLsb)
+
+  // Fill completion: capture tag and line data into the indexed entry.
+  for (i <- 0 until indices) {
+    when (readDataEn && readIdx === i.U) {
+      l0tag(i.U) := readAddr(tagMsb, tagLsb)
+      l0data(i.U) := readData
+    }
+  }
+
+  // Fill completion marks the line valid and no longer inflight.
+  when (readDataEn) {
+    val bits = OneHot(readIdx, indices)
+    l0validSet := bits
+    l0reqClr := bits
+  }
+
+  // Flush invalidates every line and cancels inflight bookkeeping.
+  when (io.iflush.valid) {
+    val clr = ~(0.U(l0validClr.getWidth.W))
+    l0validClr := clr
+    l0reqClr := clr
+  }
+
+  // An accepted fill request marks its line inflight.
+  when (aslice.io.in.valid && aslice.io.in.ready) {
+    l0reqSet := OneHot(aslice.io.in.bits(indexMsb, indexLsb), indices)
+  }
+
+  when (l0validClr =/= 0.U || l0validSet =/= 0.U) {
+    l0valid := (l0valid | l0validSet) & ~l0validClr
+  }
+
+  when (l0reqClr =/= 0.U || l0reqSet =/= 0.U) {
+    l0req := (l0req | l0reqSet) & ~l0reqClr
+  }
+
+  // Instruction Outputs
+  // Do not use the next instruction address directly in the lookup, as that
+  // creates excessive timing pressure. We know that the match is either on
+  // the old line or the next line, so can late mux on lookups of prior.
+  // Widen the arithmetic paths and select from results.
+  val fetchEn = Wire(Vec(4, Bool()))
+
+  for (i <- 0 until 4) {
+    fetchEn(i) := io.inst.lanes(i).valid && io.inst.lanes(i).ready
+  }
+
+  // One-hot over {none, 1, 2, 3, 4} instructions consumed this cycle.
+  val fsel = Cat(fetchEn(3),
+                 fetchEn(2) && !fetchEn(3),
+                 fetchEn(1) && !fetchEn(2) && !fetchEn(3),
+                 fetchEn(0) && !fetchEn(1) && !fetchEn(2) && !fetchEn(3),
+                 !fetchEn(0) && !fetchEn(1) && !fetchEn(2) && !fetchEn(3))
+
+  // Candidate next-window addresses: hold, or advance by one window (16B).
+  val nxtInstAddr0 = instAddr(0)  // 0
+  val nxtInstAddr1 = instAddr(1)  // 4
+  val nxtInstAddr2 = instAddr(2)  // 8
+  val nxtInstAddr3 = instAddr(3)  // 12
+  val nxtInstAddr4 = instAddr(0) + 16.U  // 16
+  val nxtInstAddr5 = instAddr(1) + 16.U  // 20
+  val nxtInstAddr6 = instAddr(2) + 16.U  // 24
+  val nxtInstAddr7 = instAddr(3) + 16.U  // 28
+
+  val nxtInstAddr = Wire(Vec(4, UInt(p.instructionBits.W)))
+
+  // Shift the window by the number of consumed instructions (one-hot OR mux).
+  nxtInstAddr(0) := Mux(fsel(4), nxtInstAddr4, 0.U) |
+                    Mux(fsel(3), nxtInstAddr3, 0.U) |
+                    Mux(fsel(2), nxtInstAddr2, 0.U) |
+                    Mux(fsel(1), nxtInstAddr1, 0.U) |
+                    Mux(fsel(0), nxtInstAddr0, 0.U)
+
+  nxtInstAddr(1) := Mux(fsel(4), nxtInstAddr5, 0.U) |
+                    Mux(fsel(3), nxtInstAddr4, 0.U) |
+                    Mux(fsel(2), nxtInstAddr3, 0.U) |
+                    Mux(fsel(1), nxtInstAddr2, 0.U) |
+                    Mux(fsel(0), nxtInstAddr1, 0.U)
+
+  nxtInstAddr(2) := Mux(fsel(4), nxtInstAddr6, 0.U) |
+                    Mux(fsel(3), nxtInstAddr5, 0.U) |
+                    Mux(fsel(2), nxtInstAddr4, 0.U) |
+                    Mux(fsel(1), nxtInstAddr3, 0.U) |
+                    Mux(fsel(0), nxtInstAddr2, 0.U)
+
+  nxtInstAddr(3) := Mux(fsel(4), nxtInstAddr7, 0.U) |
+                    Mux(fsel(3), nxtInstAddr6, 0.U) |
+                    Mux(fsel(2), nxtInstAddr5, 0.U) |
+                    Mux(fsel(1), nxtInstAddr4, 0.U) |
+                    Mux(fsel(0), nxtInstAddr3, 0.U)
+
+  val nxtInstIndex0 = nxtInstAddr(0)(indexMsb, indexLsb)
+  val nxtInstIndex1 = nxtInstAddr(3)(indexMsb, indexLsb)
+
+  // Forward a completing fill that targets the current or next line.
+  val readFwd0 =
+      readDataEn && readAddr(31,indexLsb) === instAligned0(31,indexLsb)
+  val readFwd1 =
+      readDataEn && readAddr(31,indexLsb) === instAligned1(31,indexLsb)
+
+  val nxtMatch0Fwd = match0 || readFwd0
+  val nxtMatch1Fwd = match1 || readFwd1
+
+  // Late-mux hit results onto the next window using only index bit 0.
+  val nxtMatch0 =
+      Mux(instIndex0(0) === nxtInstIndex0(0), nxtMatch0Fwd, nxtMatch1Fwd)
+  val nxtMatch1 =
+      Mux(instIndex0(0) === nxtInstIndex1(0), nxtMatch0Fwd, nxtMatch1Fwd)
+
+  val nxtInstValid = Wire(Vec(4, Bool()))
+
+  // Two candidate lines (current, next) split into 8 words each.
+  val nxtInstBits0 = Mux(readFwd0, readData, VecAt(l0data, instIndex0))
+  val nxtInstBits1 = Mux(readFwd1, readData, VecAt(l0data, instIndex1))
+  val nxtInstBits = Wire(Vec(16, UInt(p.instructionBits.W)))
+
+  for (i <- 0 until 8) {
+    val offset = 32 * i
+    nxtInstBits(i + 0) := nxtInstBits0(31 + offset, offset)
+    nxtInstBits(i + 8) := nxtInstBits1(31 + offset, offset)
+  }
+
+  // Redirect lookup for a decode-stage predecoded branch: produce per-lane
+  // validity, the four sequential addresses, and instruction bits selected
+  // from the target line and the line after it.
+  def BranchMatchDe(valid: Bool, value: UInt):
+      (Bool, UInt, Vec[UInt], Vec[UInt]) = {
+
+    val addr = VecInit(value,
+                       value + 4.U,
+                       value + 8.U,
+                       value + 12.U)
+
+    val match0 = l0valid(addr(0)(indexMsb,indexLsb)) &&
+        addr(0)(tagMsb,tagLsb) === VecAt(l0tag, addr(0)(indexMsb,indexLsb))
+    val match1 = l0valid(addr(3)(indexMsb,indexLsb)) &&
+        addr(3)(tagMsb,tagLsb) === VecAt(l0tag, addr(3)(indexMsb,indexLsb))
+
+    // Lane i falls on the base line while its word offset fits, else the next.
+    val vvalid = VecInit(Mux(addr(0)(4,2) <= 7.U, match0, match1),
+                         Mux(addr(0)(4,2) <= 6.U, match0, match1),
+                         Mux(addr(0)(4,2) <= 5.U, match0, match1),
+                         Mux(addr(0)(4,2) <= 4.U, match0, match1))
+
+    val muxbits0 = VecAt(l0data, addr(0)(indexMsb,indexLsb))
+    val muxbits1 = VecAt(l0data, addr(3)(indexMsb,indexLsb))
+    val muxbits = Wire(Vec(16, UInt(p.instructionBits.W)))
+
+    for (i <- 0 until 8) {
+      val offset = 32 * i
+      muxbits(i + 0) := muxbits0(31 + offset, offset)
+      muxbits(i + 8) := muxbits1(31 + offset, offset)
+    }
+
+    val bits = Wire(Vec(4, UInt(p.instructionBits.W)))
+    for (i <- 0 until 4) {
+      val idx = Cat(addr(0)(5) =/= addr(i)(5), addr(i)(4,2))
+      bits(i) := VecAt(muxbits, idx)
+    }
+
+    (valid, vvalid.asUInt, addr, bits)
+  }
+
+  // As BranchMatchDe, but for execute-stage redirects; branch lane 0 has the
+  // highest priority in the address selection.
+  def BranchMatchEx(branch: Vec[BranchTakenIO]):
+      (Bool, UInt, Vec[UInt], Vec[UInt]) = {
+    val valid = branch(0).valid || branch(1).valid ||
+                branch(2).valid || branch(3).valid
+
+    val addr = VecInit(Mux(branch(0).valid, branch(0).value,
+                       Mux(branch(1).valid, branch(1).value,
+                       Mux(branch(2).valid, branch(2).value,
+                       branch(3).value))),
+                       Mux(branch(0).valid, branch(0).value + 4.U,
+                       Mux(branch(1).valid, branch(1).value + 4.U,
+                       Mux(branch(2).valid, branch(2).value + 4.U,
+                       branch(3).value + 4.U))),
+                       Mux(branch(0).valid, branch(0).value + 8.U,
+                       Mux(branch(1).valid, branch(1).value + 8.U,
+                       Mux(branch(2).valid, branch(2).value + 8.U,
+                       branch(3).value + 8.U))),
+                       Mux(branch(0).valid, branch(0).value + 12.U,
+                       Mux(branch(1).valid, branch(1).value + 12.U,
+                       Mux(branch(2).valid, branch(2).value + 12.U,
+                       branch(3).value + 12.U))))
+
+    val match0 = l0valid(addr(0)(indexMsb,indexLsb)) &&
+        addr(0)(tagMsb,tagLsb) === VecAt(l0tag, addr(0)(indexMsb,indexLsb))
+    val match1 = l0valid(addr(3)(indexMsb,indexLsb)) &&
+        addr(3)(tagMsb,tagLsb) === VecAt(l0tag, addr(3)(indexMsb,indexLsb))
+
+    val vvalid = VecInit(Mux(addr(0)(4,2) <= 7.U, match0, match1),
+                         Mux(addr(0)(4,2) <= 6.U, match0, match1),
+                         Mux(addr(0)(4,2) <= 5.U, match0, match1),
+                         Mux(addr(0)(4,2) <= 4.U, match0, match1))
+
+    val muxbits0 = VecAt(l0data, addr(0)(indexMsb,indexLsb))
+    val muxbits1 = VecAt(l0data, addr(3)(indexMsb,indexLsb))
+    val muxbits = Wire(Vec(16, UInt(p.instructionBits.W)))
+
+    for (i <- 0 until 8) {
+      val offset = 32 * i
+      muxbits(i + 0) := muxbits0(31 + offset, offset)
+      muxbits(i + 8) := muxbits1(31 + offset, offset)
+    }
+
+    val bits = Wire(Vec(4, UInt(p.instructionBits.W)))
+    for (i <- 0 until 4) {
+      val idx = Cat(addr(0)(5) =/= addr(i)(5), addr(i)(4,2))
+      bits(i) := VecAt(muxbits, idx)
+    }
+
+    (valid, vvalid.asUInt, addr, bits)
+  }
+
+  // Decode-stage predecode: jal, ret (jalr x0, 0(ra), using the link port),
+  // and conditional branches statically predicted taken when backwards
+  // (op(31) set); op(14,13) === 1 excludes the illegal funct3 encodings.
+  def PredecodeDe(addr: UInt, op: UInt): (Bool, UInt) = {
+    val jal = DecodeBits(op, "xxxxxxxxxxxxxxxxxxxx_xxxxx_1101111")
+    val ret = DecodeBits(op, "000000000000_00001_000_00000_1100111") &&
+              io.linkPort.valid
+    val bxx = DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_xxx_xxxxx_1100011") &&
+              op(31) && op(14,13) =/= 1.U
+    val immjal = Cat(Fill(12, op(31)), op(19,12), op(20), op(30,21), 0.U(1.W))
+    val immbxx = Cat(Fill(20, op(31)), op(7), op(30,25), op(11,8), 0.U(1.W))
+    val immed = Mux(op(2), immjal, immbxx)
+    val target = Mux(ret, io.linkPort.value, addr + immed)
+    (jal || ret || bxx, target)
+  }
+
+  val (brchTakenDe0, brchTargetDe0) = PredecodeDe(instAddr(0), instBits(0))
+  val (brchTakenDe1, brchTargetDe1) = PredecodeDe(instAddr(1), instBits(1))
+  val (brchTakenDe2, brchTargetDe2) = PredecodeDe(instAddr(2), instBits(2))
+  val (brchTakenDe3, brchTargetDe3) = PredecodeDe(instAddr(3), instBits(3))
+
+  // A predecoded branch redirects only if its lane is actually consumed.
+  val brchTakenDeOr =
+      io.inst.lanes(0).valid && io.inst.lanes(0).ready && brchTakenDe0 ||
+      io.inst.lanes(1).valid && io.inst.lanes(1).ready && brchTakenDe1 ||
+      io.inst.lanes(2).valid && io.inst.lanes(2).ready && brchTakenDe2 ||
+      io.inst.lanes(3).valid && io.inst.lanes(3).ready && brchTakenDe3
+
+  val brchTargetDe = Mux(brchTakenDe0, brchTargetDe0,
+                     Mux(brchTakenDe1, brchTargetDe1,
+                     Mux(brchTakenDe2, brchTargetDe2,
+                     brchTargetDe3)))
+
+  val (brchTakenDe, brchValidDe, brchAddrDe, brchBitsDe) =
+      BranchMatchDe(brchTakenDeOr, brchTargetDe)
+
+  val (brchTakenEx, brchValidEx, brchAddrEx, brchBitsEx) =
+      BranchMatchEx(io.branch)
+
+  // Lanes behind the first predecoded-taken branch are squashed.
+  val brchValidDeMask =
+      Cat(!brchTakenDe0 && !brchTakenDe1 && !brchTakenDe2,
+          !brchTakenDe0 && !brchTakenDe1,
+          !brchTakenDe0,
+          true.B)
+
+  // Marks the first taken predecoded branch lane (reported to decode).
+  val brchFwd = Cat(
+      brchTakenDe3 && !brchTakenDe0 && !brchTakenDe1 && !brchTakenDe2,
+      brchTakenDe2 && !brchTakenDe0 && !brchTakenDe1,
+      brchTakenDe1 && !brchTakenDe0,
+      brchTakenDe0)
+
+  // Next-state selection: execute redirect > decode predecode > sequential.
+  for (i <- 0 until 4) {
+    // 1, 11, 111, ...
+    nxtInstValid(i) := Mux(nxtInstAddr(0)(4,2) <= (7 - i).U, nxtMatch0, nxtMatch1)
+
+    // Lane i is valid only if all lower lanes are valid too (prefix AND).
+    val nxtInstValidUInt = nxtInstValid.asUInt
+    instValid(i) := Mux(brchTakenEx, brchValidEx(i,0) === ~0.U((i+1).W),
+                    Mux(brchTakenDe, brchValidDe(i,0) === ~0.U((i+1).W),
+                    nxtInstValidUInt(i,0) === ~0.U((i+1).W))) && !io.iflush.valid
+
+    instAddr(i) := Mux(brchTakenEx, brchAddrEx(i),
+                   Mux(brchTakenDe, brchAddrDe(i), nxtInstAddr(i)))
+
+    // The (2,0) bits are the offset within the base line plus the next line.
+    // The (3) bit of the index must factor the base difference of addresses
+    // instAddr and nxtInstAddr which are line aligned.
+    val idx = Cat(instAddr(0)(5) =/= nxtInstAddr(i)(5), nxtInstAddr(i)(4,2))
+    instBits(i) := Mux(brchTakenEx, brchBitsEx(i),
+                   Mux(brchTakenDe, brchBitsDe(i),
+                   VecAt(nxtInstBits, idx)))
+  }
+
+  // This pattern of separate when() blocks requires resets after the data.
+  when (reset.asBool) {
+    val addr = Cat(io.csr.value(0)(31,2), 0.U(2.W))  // word-aligned boot address
+    instAddr(0) := addr
+    instAddr(1) := addr + 4.U
+    instAddr(2) := addr + 8.U
+    instAddr(3) := addr + 12.U
+  }
+
+  // Outputs
+  for (i <- 0 until 4) {
+    io.inst.lanes(i).valid := instValid(i) & brchValidDeMask(i)
+    io.inst.lanes(i).addr := instAddr(i)
+    io.inst.lanes(i).inst := instBits(i)
+    io.inst.lanes(i).brchFwd := brchFwd(i)
+  }
+
+  // Assertions.
+  assert(instAddr(0) + 4.U === instAddr(1))
+  assert(instAddr(0) + 8.U === instAddr(2))
+  assert(instAddr(0) + 12.U === instAddr(3))
+
+  assert(fsel.getWidth == 5)
+  assert(PopCount(fsel) <= 1.U)
+
+  // Valid and ready must be contiguous from lane 0.
+  val instValidUInt = instValid.asUInt
+  assert(!(!instValidUInt(0) && (instValidUInt(3,1) =/= 0.U)))
+  assert(!(!instValidUInt(1) && (instValidUInt(3,2) =/= 0.U)))
+  assert(!(!instValidUInt(2) && (instValidUInt(3,3) =/= 0.U)))
+
+  val instLanesReady = Cat(io.inst.lanes(3).ready, io.inst.lanes(2).ready,
+                           io.inst.lanes(1).ready, io.inst.lanes(0).ready)
+  assert(!(!instLanesReady(0) && (instLanesReady(3,1) =/= 0.U)))
+  assert(!(!instLanesReady(1) && (instLanesReady(3,2) =/= 0.U)))
+  assert(!(!instLanesReady(2) && (instLanesReady(3,3) =/= 0.U)))
+}
+
+// Command-line entry point: elaborates the Fetch module and emits Verilog.
+object EmitFetch extends App {
+  val params = new Parameters
+  val stage = new chisel3.stage.ChiselStage
+  stage.emitVerilog(new Fetch(params), args)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Flush.scala b/hdl/chisel/src/kelvin/scalar/Flush.scala
new file mode 100644
index 0000000..dddfcdf
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Flush.scala
@@ -0,0 +1,21 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Instruction-cache flush handshake (request held until ready).
+class IFlushIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val ready = Input(Bool())
+}
+
+// Data-cache flush handshake.
+class DFlushIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val ready = Input(Bool())
+  val all = Output(Bool()) // all=0, see io.dbus.addr for line address.
+  val clean = Output(Bool()) // clean and flush
+}
+
+// DFlushIO plus an indicator that the flush originates from a fence.i.
+class DFlushFenceiIO(p: Parameters) extends DFlushIO(p) {
+  val fencei = Output(Bool())
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
new file mode 100644
index 0000000..7b6f82e
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -0,0 +1,288 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Factory companion: wraps construction in Module(...) as chisel3 requires.
+object Lsu {
+  def apply(p: Parameters): Lsu = {
+    // Explicit 'return' is non-idiomatic Scala; the last expression is the result.
+    Module(new Lsu(p))
+  }
+}
+
+// Data bus: two-phase read/write port. 'bank' drops one address bit for
+// banked instantiations.
+class DBusIO(p: Parameters, bank: Boolean = false) extends Bundle {
+  // Control Phase.
+  val valid = Output(Bool())
+  val ready = Input(Bool())
+  val write = Output(Bool())
+  val addr = Output(UInt((p.lsuAddrBits - (if (bank) 1 else 0)).W))
+  val adrx = Output(UInt((p.lsuAddrBits - (if (bank) 1 else 0)).W))  // addr + one line, for unaligned crossings
+  val size = Output(UInt((log2Ceil(p.lsuDataBits / 8) + 1).W))  // transfer size in bytes
+  val wdata = Output(UInt(p.lsuDataBits.W))
+  val wmask = Output(UInt((p.lsuDataBits / 8).W))  // byte write strobes
+  // Read Phase.
+  val rdata = Input(UInt(p.lsuDataBits.W))
+}
+
+// Bit positions of the one-hot LSU op word (see LsuIO.op).
+case class LsuOp() {
+  val LB = 0        // load byte, sign-extended
+  val LH = 1        // load half, sign-extended
+  val LW = 2        // load word
+  val LBU = 3       // load byte, zero-extended
+  val LHU = 4       // load half, zero-extended
+  val SB = 5        // store byte
+  val SH = 6        // store half
+  val SW = 7        // store word
+  val FENCEI = 8    // instruction fence: full flush signalled with fencei
+  val FLUSHAT = 9   // flush the line at the request address
+  val FLUSHALL = 10 // flush the whole data cache
+  val VLDST = 11    // vector load/store (issued to the vector unit via dbus)
+  val Entries = 12  // width of the one-hot op field
+}
+
+// Per-lane LSU request from decode.
+class LsuIO(p: Parameters) extends Bundle {
+  val valid = Input(Bool())
+  val ready = Output(Bool())
+  val store = Input(Bool())
+  val addr = Input(UInt(5.W))  // destination register index (for loads)
+  val op = Input(UInt(new LsuOp().Entries.W))  // one-hot LsuOp
+}
+
+// Command-FIFO entry carried from decode to the bus-issue stage.
+class LsuCtrl(p: Parameters) extends Bundle {
+  val addr = UInt(32.W)
+  val adrx = UInt(32.W)   // addr + one line offset, for unaligned crossings
+  val data = UInt(32.W)   // store data, pre-rotation
+  val index = UInt(5.W)   // load destination register
+  val size = UInt((log2Ceil(p.lsuDataBits / 8) + 1).W)  // access size in bytes
+  val write = Bool()
+  val sext = Bool()       // sign-extend the loaded value
+  val iload = Bool()      // integer (scalar) load, writes the regfile
+  val fencei = Bool()
+  val flushat = Bool()
+  val flushall = Bool()
+  val sldst = Bool() // scalar load/store cached
+  val vldst = Bool() // vector load/store
+  val suncd = Bool() // scalar load/store uncached
+}
+
+// Load-response entry: everything needed to rotate, extend and write back.
+class LsuReadData(p: Parameters) extends Bundle {
+  val addr = UInt(32.W)   // original access address (selects byte rotation)
+  val index = UInt(5.W)   // destination register
+  val size = UInt((log2Ceil(p.lsuDataBits / 8) + 1).W)
+  val sext = Bool()       // sign-extend
+  val iload = Bool()      // integer load, eligible for regfile writeback
+  val sldst = Bool()      // response arrives on dbus
+  val suncd = Bool()      // response arrives on ubus
+}
+
+// Load/store unit.
+//
+// Queues up to four scalar load/store/flush commands per decode cycle into a
+// command FIFO, issues them to the cached (dbus), uncached (ubus) or flush
+// interfaces, then rotates and sign-extends load data for register
+// writeback. Address bit 31 selects uncached (1) vs cached (0) space.
+class Lsu(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val req = Vec(4, new LsuIO(p))
+    val busPort = Flipped(new RegfileBusPortIO)
+
+    // Execute cycle(s).
+    val rd = Flipped(new RegfileWriteDataIO)
+
+    // Cached interface.
+    val dbus = new DBusIO(p)
+    val flush = new DFlushFenceiIO(p)
+
+    // Uncached interface.
+    val ubus = new DBusIO(p)
+
+    // Vector switch.
+    val vldst = Output(Bool())
+  })
+
+  val lsu = new LsuOp()
+
+  // AXI Queues.
+  val n = 8  // command FIFO depth
+  val ctrl = Fifo4(new LsuCtrl(p), n)
+  val data = Slice(new LsuReadData(p), true, true)
+
+  // Match and mask.
+  // Lane i may enqueue only if the FIFO has room for lanes 0..i this cycle.
+  val ctrlready = Cat(ctrl.io.count <= (n - 4).U,
+                      ctrl.io.count <= (n - 3).U,
+                      ctrl.io.count <= (n - 2).U,
+                      ctrl.io.count <= (n - 1).U)
+
+  io.req(0).ready := ctrlready(0) && data.io.in.ready
+  io.req(1).ready := ctrlready(1) && data.io.in.ready
+  io.req(2).ready := ctrlready(2) && data.io.in.ready
+  io.req(3).ready := ctrlready(3) && data.io.in.ready
+
+  // Address phase must use simple logic to resolve mask for unaligned address.
+  val linebit = log2Ceil(p.lsuDataBits / 8)
+  val lineoffset = (p.lsuDataBits / 8)
+
+  // ---------------------------------------------------------------------------
+  // Control Port Inputs.
+  ctrl.io.in.valid := io.req(0).valid || io.req(1).valid ||
+                      io.req(2).valid || io.req(3).valid
+
+  for (i <- 0 until 4) {
+    val uncached = io.busPort.addr(i)(31)  // bit31 selects the uncached space
+
+    val opstore = io.req(i).op(lsu.SW) || io.req(i).op(lsu.SH) || io.req(i).op(lsu.SB)
+    val opiload = io.req(i).op(lsu.LW) || io.req(i).op(lsu.LH) || io.req(i).op(lsu.LB) || io.req(i).op(lsu.LHU) || io.req(i).op(lsu.LBU)
+    val opload = opiload
+    val opfencei = io.req(i).op(lsu.FENCEI)
+    val opflushat = io.req(i).op(lsu.FLUSHAT)
+    val opflushall = io.req(i).op(lsu.FLUSHALL)
+    val opsldst = opstore || opload
+    val opvldst = io.req(i).op(lsu.VLDST)
+    val opsext = io.req(i).op(lsu.LB) || io.req(i).op(lsu.LH)
+    // One-hot byte count: 4 / 2 / 1.
+    val opsize = Cat(io.req(i).op(lsu.LW) || io.req(i).op(lsu.SW),
+                     io.req(i).op(lsu.LH) || io.req(i).op(lsu.LHU) || io.req(i).op(lsu.SH),
+                     io.req(i).op(lsu.LB) || io.req(i).op(lsu.LBU) || io.req(i).op(lsu.SB))
+
+    // Uncached vector accesses are not queued here.
+    ctrl.io.in.bits(i).valid := io.req(i).valid && ctrlready(i) && !(opvldst && uncached)
+
+    ctrl.io.in.bits(i).bits.addr := io.busPort.addr(i)
+    ctrl.io.in.bits(i).bits.adrx := io.busPort.addr(i) + lineoffset.U
+    ctrl.io.in.bits(i).bits.data := io.busPort.data(i)
+    ctrl.io.in.bits(i).bits.index := io.req(i).addr
+    ctrl.io.in.bits(i).bits.sext := opsext
+    ctrl.io.in.bits(i).bits.size := opsize
+    ctrl.io.in.bits(i).bits.iload := opiload
+    ctrl.io.in.bits(i).bits.fencei := opfencei
+    ctrl.io.in.bits(i).bits.flushat := opflushat
+    ctrl.io.in.bits(i).bits.flushall := opflushall
+    ctrl.io.in.bits(i).bits.sldst := opsldst && !uncached
+    ctrl.io.in.bits(i).bits.vldst := opvldst
+    ctrl.io.in.bits(i).bits.suncd := opsldst && uncached
+    ctrl.io.in.bits(i).bits.write := !opload
+  }
+
+  // ---------------------------------------------------------------------------
+  // Control Port Outputs.
+  // Rotate the 32-bit store data into position by the low address bits, then
+  // replicate it across the full bus width; wmask enables the target bytes.
+  val wsel = ctrl.io.out.bits.addr(1,0)
+  val wda = ctrl.io.out.bits.data
+  val wdataS =
+      MuxOR(wsel === 0.U, wda(31,0)) |
+      MuxOR(wsel === 1.U, Cat(wda(23,16), wda(15,8), wda(7,0), wda(31,24))) |
+      MuxOR(wsel === 2.U, Cat(wda(15,8), wda(7,0), wda(31,24), wda(23,16))) |
+      MuxOR(wsel === 3.U, Cat(wda(7,0), wda(31,24), wda(23,16), wda(15,8)))
+  val wmaskB = p.lsuDataBits / 8
+  val wmaskT = (~0.U(wmaskB.W)) >> (wmaskB.U - ctrl.io.out.bits.size)  // 'size' low ones
+  val wmaskS = (wmaskT << ctrl.io.out.bits.addr(linebit-1,0)) |
+               (wmaskT >> (lineoffset.U - ctrl.io.out.bits.addr(linebit-1,0)))  // rotate, wrap on overflow
+  val wdata = Wire(UInt(p.lsuDataBits.W))
+  val wmask = wmaskS(lineoffset - 1, 0)
+
+  if (p.lsuDataBits == 128) {
+    wdata := Cat(wdataS, wdataS, wdataS, wdataS)
+  } else if (p.lsuDataBits == 256) {
+    wdata := Cat(wdataS, wdataS, wdataS, wdataS,
+                 wdataS, wdataS, wdataS, wdataS)
+  } else if (p.lsuDataBits == 512) {
+    wdata := Cat(wdataS, wdataS, wdataS, wdataS,
+                 wdataS, wdataS, wdataS, wdataS,
+                 wdataS, wdataS, wdataS, wdataS,
+                 wdataS, wdataS, wdataS, wdataS)
+  } else {
+    assert(false)  // elaboration-time failure: unsupported lsuDataBits
+  }
+
+  // Cached accesses present the address with bit31 cleared.
+  io.dbus.valid := ctrl.io.out.valid && ctrl.io.out.bits.sldst
+  io.dbus.write := ctrl.io.out.bits.write
+  io.dbus.addr := Cat(0.U(1.W), ctrl.io.out.bits.addr(30,0))
+  io.dbus.adrx := Cat(0.U(1.W), ctrl.io.out.bits.adrx(30,0))
+  io.dbus.size := ctrl.io.out.bits.size
+  io.dbus.wdata := wdata
+  io.dbus.wmask := wmask
+  assert(!(io.dbus.valid && ctrl.io.out.bits.addr(31)))
+  assert(!(io.dbus.valid && io.dbus.addr(31)))
+  assert(!(io.dbus.valid && io.dbus.adrx(31)))
+
+  io.ubus.valid := ctrl.io.out.valid && ctrl.io.out.bits.suncd
+  io.ubus.write := ctrl.io.out.bits.write
+  io.ubus.addr := Cat(0.U(1.W), ctrl.io.out.bits.addr(30,0))
+  io.ubus.adrx := Cat(0.U(1.W), ctrl.io.out.bits.adrx(30,0))
+  io.ubus.size := ctrl.io.out.bits.size
+  io.ubus.wdata := wdata
+  io.ubus.wmask := wmask
+  assert(!(io.ubus.valid && !ctrl.io.out.bits.addr(31)))
+  // Fix: these sanity checks previously probed io.dbus.addr/adrx under
+  // io.ubus.valid (copy-paste from the dbus block above), making them vacuous
+  // for the uncached path; check the uncached bus's own outputs.
+  assert(!(io.ubus.valid && io.ubus.addr(31)))
+  assert(!(io.ubus.valid && io.ubus.adrx(31)))
+
+  io.flush.valid := ctrl.io.out.valid && (ctrl.io.out.bits.fencei || ctrl.io.out.bits.flushat || ctrl.io.out.bits.flushall)
+  io.flush.all := ctrl.io.out.bits.fencei || ctrl.io.out.bits.flushall
+  io.flush.clean := true.B
+  io.flush.fencei := ctrl.io.out.bits.fencei
+
+  // Dequeue when the selected interface accepts the command.
+  ctrl.io.out.ready := io.flush.valid && io.flush.ready ||
+                       io.dbus.valid && io.dbus.ready ||
+                       io.ubus.valid && io.ubus.ready ||
+                       ctrl.io.out.bits.vldst && io.dbus.ready
+
+  io.vldst := ctrl.io.out.valid && ctrl.io.out.bits.vldst
+
+  // ---------------------------------------------------------------------------
+  // Load response.
+  data.io.in.valid := io.dbus.valid && io.dbus.ready && !io.dbus.write ||
+                      io.ubus.valid && io.ubus.ready && !io.ubus.write
+
+  data.io.in.bits.addr := ctrl.io.out.bits.addr
+  data.io.in.bits.index := ctrl.io.out.bits.index
+  data.io.in.bits.sext := ctrl.io.out.bits.sext
+  data.io.in.bits.size := ctrl.io.out.bits.size
+  data.io.in.bits.iload := ctrl.io.out.bits.iload
+  data.io.in.bits.sldst := ctrl.io.out.bits.sldst
+  data.io.in.bits.suncd := ctrl.io.out.bits.suncd
+
+  data.io.out.ready := true.B
+
+  assert(!(ctrl.io.in.valid && !data.io.in.ready))
+
+  // ---------------------------------------------------------------------------
+  // Register file ports.
+  val rvalid = data.io.out.valid
+  val rsext = data.io.out.bits.sext
+  val rsize = data.io.out.bits.size
+  val rsel = data.io.out.bits.addr(linebit - 1, 0)
+
+  // Rotate and sign extend.
+  // Elaboration-time recursion: builds the 32-bit candidate for every byte
+  // offset of the line and one-hot selects (rsel) the matching one.
+  def RotSignExt(datain: UInt, dataout: UInt = 0.U(p.lsuDataBits.W), i: Int = 0): UInt = {
+    assert(datain.getWidth == p.lsuDataBits)
+    assert(dataout.getWidth == p.lsuDataBits)
+
+    if (i < p.lsuDataBits / 8) {
+      val mod = p.lsuDataBits
+
+      // Four consecutive bytes starting at offset i, wrapping at line end.
+      val rdata = Cat(datain((8 * (i + 3) + 7) % mod, (8 * (i + 3)) % mod),
+                      datain((8 * (i + 2) + 7) % mod, (8 * (i + 2)) % mod),
+                      datain((8 * (i + 1) + 7) % mod, (8 * (i + 1)) % mod),
+                      datain((8 * (i + 0) + 7) % mod, (8 * (i + 0)) % mod))
+
+      val sizeMask = Mux(rsize === 4.U, 0xffffffff.S(32.W).asUInt,
+                     Mux(rsize === 2.U, 0x0000ffff.U(32.W), 0x000000ff.U(32.W)))
+
+      val signExtend = Mux(rsext,
+                       Mux(rsize === 2.U,
+                           Mux(rdata(15), 0xffff0000.S(32.W).asUInt, 0.U(32.W)),
+                           Mux(rdata(7), 0xffffff00.S(32.W).asUInt, 0.U(32.W))),
+                       0.U)
+      assert(sizeMask.getWidth == 32)
+      assert(signExtend.getWidth == 32)
+
+      val sdata = MuxOR(rsel === i.U, rdata & sizeMask | signExtend)
+      RotSignExt(datain, dataout | sdata, i + 1)
+    } else {
+      dataout
+    }
+  }
+
+  val rdata = RotSignExt(MuxOR(data.io.out.bits.sldst, io.dbus.rdata) |
+                         MuxOR(data.io.out.bits.suncd, io.ubus.rdata))
+
+  // pass-through
+  io.rd.valid := rvalid && data.io.out.bits.iload
+  io.rd.addr := data.io.out.bits.index
+  io.rd.data := rdata
+
+  // A command targets at most one of the three datapaths.
+  assert(!ctrl.io.out.valid || PopCount(Cat(ctrl.io.out.bits.sldst, ctrl.io.out.bits.vldst, ctrl.io.out.bits.suncd)) <= 1.U)
+  assert(!data.io.out.valid || PopCount(Cat(data.io.out.bits.sldst, data.io.out.bits.suncd)) <= 1.U)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Mlu.scala b/hdl/chisel/src/kelvin/scalar/Mlu.scala
new file mode 100644
index 0000000..173909d
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Mlu.scala
@@ -0,0 +1,140 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Factory companion: wraps construction in Module(...) as chisel3 requires.
+object Mlu {
+  def apply(p: Parameters): Mlu = {
+    // Explicit 'return' is non-idiomatic Scala; the last expression is the result.
+    Module(new Mlu(p))
+  }
+}
+
+// Bit positions of the one-hot multiply op word (see MluIO.op).
+case class MluOp() {
+  val MUL = 0      // low 32 bits of the product
+  val MULH = 1     // high 32 bits, signed x signed
+  val MULHSU = 2   // high 32 bits, signed x unsigned
+  val MULHU = 3    // high 32 bits, unsigned x unsigned
+  val MULHR = 4    // MULH with rounding
+  val MULHSUR = 5  // MULHSU with rounding
+  val MULHUR = 6   // MULHU with rounding
+  val DMULH = 7    // doubling multiply-high: prod(62,31), saturating
+  val DMULHR = 8   // DMULH with rounding
+  val Entries = 9  // width of the one-hot op field
+}
+
+// Per-lane multiply request from decode (no ready: cannot backpressure).
+class MluIO(p: Parameters) extends Bundle {
+  val valid = Input(Bool())
+  val addr = Input(UInt(5.W))  // destination register
+  val op = Input(UInt(new MluOp().Entries.W))  // one-hot MluOp
+}
+
+// Multiply unit: two-stage pipeline (operand select / multiply, then
+// registered writeback). At most one lane issues per cycle, lane 0 first.
+class Mlu(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val req = Vec(4, new MluIO(p))
+
+    // Execute cycle.
+    val rs1 = Vec(4, Flipped(new RegfileReadDataIO))
+    val rs2 = Vec(4, Flipped(new RegfileReadDataIO))
+    val rd = Flipped(new RegfileWriteDataIO)
+  })
+
+  val mlu = new MluOp()
+
+  val op = RegInit(0.U(mlu.Entries.W))  // captured one-hot op
+  val valid1 = RegInit(false.B)         // stage 1: operands / multiply
+  val valid2 = RegInit(false.B)         // stage 2: writeback
+  val addr1 = Reg(UInt(5.W))
+  val addr2 = Reg(UInt(5.W))
+  val sel = Reg(UInt(4.W))              // one-hot: which lane issued
+
+  valid1 := io.req(0).valid || io.req(1).valid ||
+            io.req(2).valid || io.req(3).valid
+  valid2 := valid1
+
+  // Lowest-numbered requesting lane wins.
+  when (io.req(0).valid) {
+    op := io.req(0).op
+    addr1 := io.req(0).addr
+    sel := 1.U
+  } .elsewhen (io.req(1).valid) {
+    op := io.req(1).op
+    addr1 := io.req(1).addr
+    sel := 2.U
+  } .elsewhen (io.req(2).valid) {
+    op := io.req(2).op
+    addr1 := io.req(2).addr
+    sel := 4.U
+  } .elsewhen (io.req(3).valid) {
+    op := io.req(3).op
+    addr1 := io.req(3).addr
+    sel := 8.U
+  } .otherwise {
+    op := 0.U
+    sel := 0.U
+  }
+
+  // Operand muxes from the selected lane's regfile read ports.
+  val rs1 = MuxOR(valid1 & sel(0), io.rs1(0).data) |
+            MuxOR(valid1 & sel(1), io.rs1(1).data) |
+            MuxOR(valid1 & sel(2), io.rs1(2).data) |
+            MuxOR(valid1 & sel(3), io.rs1(3).data)
+
+  val rs2 = MuxOR(valid1 & sel(0), io.rs2(0).data) |
+            MuxOR(valid1 & sel(1), io.rs2(1).data) |
+            MuxOR(valid1 & sel(2), io.rs2(2).data) |
+            MuxOR(valid1 & sel(3), io.rs2(3).data)
+
+  // Multiplier has a registered output.
+  val mul2 = Reg(UInt(32.W))
+  val round2 = Reg(UInt(1.W))  // rounding increment, added at writeback
+
+  when (valid1) {
+    // 33x33 signed multiply covers all signed/unsigned op combinations.
+    val rs2signed = op(mlu.MULH) || op(mlu.MULHR) || op(mlu.DMULH) || op(mlu.DMULHR)
+    val rs1signed = op(mlu.MULHSU) || op(mlu.MULHSUR) || rs2signed
+    val rs1s = Cat(rs1signed && rs1(31), rs1).asSInt
+    val rs2s = Cat(rs2signed && rs2(31), rs2).asSInt
+    val prod = rs1s.asSInt * rs2s.asSInt
+    assert(prod.getWidth == 66)
+
+    addr2 := addr1
+    // Round-to-nearest: the bit just below the extracted result slice.
+    round2 := prod(30) && op(mlu.DMULHR) ||
+              prod(31) && (op(mlu.MULHR) || op(mlu.MULHSUR) || op(mlu.MULHUR))
+
+    when (op(mlu.MUL)) {
+      mul2 := prod(31,0)
+    } .elsewhen (op(mlu.MULH) || op(mlu.MULHSU) || op(mlu.MULHU) || op(mlu.MULHR) || op(mlu.MULHSUR) || op(mlu.MULHUR)) {
+      mul2 := prod(63,32)
+    } .elsewhen (op(mlu.DMULH) || op(mlu.DMULHR)) {
+      // Saturation candidates: low 30 bits zero and top-two-bit patterns
+      // 10 ('maxneg', 0x80000000) / 01 ('halfneg', 0x40000000) combinations.
+      val maxneg = 2.U(2.W)
+      val halfneg = 1.U(2.W)
+      val sat = rs1(29,0) === 0.U && rs2(29,0) === 0.U &&
+                (rs1(31,30) === maxneg && rs2(31,30) === maxneg ||
+                 rs1(31,30) === maxneg && rs2(31,30) === halfneg ||
+                 rs2(31,30) === maxneg && rs1(31,30) === halfneg)
+      when (sat) {
+        // NOTE(review): this select looks inverted -- for 0x80000000 *
+        // 0x80000000 the 66-bit product is positive (prod(65) = 0), yet the
+        // result saturates to 0x80000000 rather than 0x7fffffff. Confirm the
+        // intended DMULH sign/saturation convention against the spec/cosim.
+        when (prod(65)) {
+          mul2 := 0x7fffffff.U(32.W)
+        } .otherwise {
+          mul2 := Cat(1.U(1.W), 0.U(31.W))
+        }
+      } .otherwise {
+        mul2 := prod(62,31)
+      }
+    }
+  }
+
+  io.rd.valid := valid2
+  io.rd.addr := addr2
+  io.rd.data := mul2 + round2
+
+  // Assertions.
+  // The selected lane's regfile read ports must be valid in stage 1.
+  for (i <- 0 until 4) {
+    assert(!(valid1 && sel(i) && !io.rs1(i).valid))
+    assert(!(valid1 && sel(i) && !io.rs2(i).valid))
+  }
+}
+
+// Command-line entry point: elaborates the Mlu module and emits Verilog.
+object EmitMlu extends App {
+  val params = new Parameters
+  val stage = new chisel3.stage.ChiselStage
+  stage.emitVerilog(new Mlu(params), args)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Regfile.scala b/hdl/chisel/src/kelvin/scalar/Regfile.scala
new file mode 100644
index 0000000..800aaac
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Regfile.scala
@@ -0,0 +1,255 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Regfile {
+ // Factory: instantiate a Regfile hardware module for parameters `p`.
+ def apply(p: Parameters): Regfile = Module(new Regfile(p))
+}
+
+// Decode-cycle read request: register index (0-31) to be read next cycle.
+class RegfileReadAddrIO extends Bundle {
+ val valid = Input(Bool())
+ val addr = Input(UInt(5.W))
+}
+
+// Decode-cycle read override: when valid, the next-cycle read data is
+// forced to `value` instead of the register file contents.
+class RegfileReadSetIO extends Bundle {
+ val valid = Input(Bool())
+ val value = Input(UInt(32.W))
+}
+
+// Execute-cycle read response (registered one cycle after the request).
+class RegfileReadDataIO extends Bundle {
+ val valid = Output(Bool())
+ val data = Output(UInt(32.W))
+}
+
+// Decode-cycle scoreboard mark: destination register that will be written.
+class RegfileWriteAddrIO extends Bundle {
+ val valid = Input(Bool())
+ val addr = Input(UInt(5.W))
+}
+
+// Execute-cycle write port: address and data of a retiring result.
+class RegfileWriteDataIO extends Bundle {
+ val valid = Input(Bool())
+ val addr = Input(UInt(5.W))
+ val data = Input(UInt(32.W))
+}
+
+// Decode-cycle bus address generation controls. `bypass` selects the
+// write-forwarded register value; otherwise `immen` adds `immed` to the
+// raw register read.
+class RegfileBusAddrIO extends Bundle {
+ val valid = Input(Bool())
+ val bypass = Input(Bool())
+ val immen = Input(Bool())
+ val immed = Input(UInt(32.W))
+}
+
+// Per-lane (4x) bus address and store-data outputs.
+class RegfileBusPortIO extends Bundle {
+ val addr = Output(Vec(4, UInt(32.W)))
+ val data = Output(Vec(4, UInt(32.W)))
+}
+
+// Snoop port for register x1: current value plus a not-busy indication
+// derived from the scoreboard.
+class RegfileLinkPortIO extends Bundle {
+ val valid = Output(Bool())
+ val value = Output(UInt(32.W))
+}
+
+// Combinational branch target address (same value as the bus address).
+class RegfileBranchTargetIO extends Bundle {
+ val data = Output(UInt(32.W))
+}
+
+// 8-read/6-write (8R6W) scalar register file with same-cycle write
+// forwarding, a 32-bit busy scoreboard for hazard tracking, and
+// combinational bus/branch-target address generation. Register x0 is
+// never tracked by the scoreboard and reads back as zero.
+class Regfile(p: Parameters) extends Module {
+ val io = IO(new Bundle {
+ // Decode cycle.
+ val readAddr = Vec(8, new RegfileReadAddrIO)
+ val readSet = Vec(8, new RegfileReadSetIO)
+ val writeAddr = Vec(4, new RegfileWriteAddrIO)
+ val busAddr = Vec(4, new RegfileBusAddrIO)
+ val target = Vec(4, new RegfileBranchTargetIO)
+ val linkPort = new RegfileLinkPortIO
+ val busPort = new RegfileBusPortIO
+
+ // Execute cycle.
+ val readData = Vec(8, new RegfileReadDataIO)
+ val writeData = Vec(6, new RegfileWriteDataIO)
+ val writeMask = Vec(5, new Bundle {val valid = Input(Bool())})
+ val scoreboard = new Bundle {
+ val regd = Output(UInt(32.W))
+ val comb = Output(UInt(32.W))
+ }
+ })
+
+ // 8R6W
+ // 8 read ports
+ // 6 write ports
+
+ // The scalar registers, integer (and float todo).
+ val regfile = Reg(Vec(32, UInt(32.W)))
+
+ // ***************************************************************************
+ // The scoreboard.
+ // ***************************************************************************
+ // One busy bit per architectural register; set at decode, cleared at
+ // writeback.
+ val scoreboard = RegInit(0.U(32.W))
+
+ // The write Addr:Data contract is against speculated opcodes. If an opcode
+ // is in the shadow of a taken branch it will still Set:Clr the scoreboard,
+ // but the actual write will be Masked.
+ val scoreboard_set =
+ MuxOR(io.writeAddr(0).valid, OneHot(io.writeAddr(0).addr, 32)) |
+ MuxOR(io.writeAddr(1).valid, OneHot(io.writeAddr(1).addr, 32)) |
+ MuxOR(io.writeAddr(2).valid, OneHot(io.writeAddr(2).addr, 32)) |
+ MuxOR(io.writeAddr(3).valid, OneHot(io.writeAddr(3).addr, 32))
+
+ val scoreboard_clr0 =
+ MuxOR(io.writeData(0).valid, OneHot(io.writeData(0).addr, 32)) |
+ MuxOR(io.writeData(1).valid, OneHot(io.writeData(1).addr, 32)) |
+ MuxOR(io.writeData(2).valid, OneHot(io.writeData(2).addr, 32)) |
+ MuxOR(io.writeData(3).valid, OneHot(io.writeData(3).addr, 32)) |
+ MuxOR(io.writeData(4).valid, OneHot(io.writeData(4).addr, 32)) |
+ MuxOR(io.writeData(5).valid, OneHot(io.writeData(5).addr, 32))
+
+ // Bit zero (x0) is forced to zero: x0 is never marked busy.
+ val scoreboard_clr = Cat(scoreboard_clr0(31,1), 0.U(1.W))
+
+ when (scoreboard_set =/= 0.U || scoreboard_clr =/= 0.U) {
+ val nxtScoreboard = (scoreboard & ~scoreboard_clr) | scoreboard_set
+ scoreboard := Cat(nxtScoreboard(31,1), 0.U(1.W))
+ }
+
+ // `regd` is the registered view; `comb` also removes this cycle's clears.
+ io.scoreboard.regd := scoreboard
+ io.scoreboard.comb := scoreboard & ~scoreboard_clr
+
+ // ***************************************************************************
+ // The read port response.
+ // ***************************************************************************
+ val readDataReady = RegInit(VecInit(Seq.fill(8){false.B}))
+ val readDataBits = Reg(Vec(8, UInt(32.W)))
+ val nxtReadDataBits = Wire(Vec(8, UInt(32.W)))
+
+ for (i <- 0 until 8) {
+ io.readData(i).valid := readDataReady(i)
+ io.readData(i).data := readDataBits(i)
+ }
+
+ // ***************************************************************************
+ // One hot write ports.
+ // ***************************************************************************
+ val writeValid = Wire(Vec(32, Bool()))
+ val writeData = Wire(Vec(32, UInt(32.W)))
+
+ writeValid(0) := true.B // do not require special casing of indices
+ writeData(0) := 0.U // regfile(0) is optimized away
+
+ for (i <- 1 until 32) {
+ // Ports 0-4 honor their writeMask (branch-shadow squash); port 5 has no
+ // mask entry and always writes when valid.
+ val valid = Cat(io.writeData(5).valid && io.writeData(5).addr === i.U,
+ io.writeData(4).valid && io.writeData(4).addr === i.U &&
+ !io.writeMask(4).valid,
+ io.writeData(3).valid && io.writeData(3).addr === i.U &&
+ !io.writeMask(3).valid,
+ io.writeData(2).valid && io.writeData(2).addr === i.U &&
+ !io.writeMask(2).valid,
+ io.writeData(1).valid && io.writeData(1).addr === i.U &&
+ !io.writeMask(1).valid,
+ io.writeData(0).valid && io.writeData(0).addr === i.U &&
+ !io.writeMask(0).valid)
+
+ val data = MuxOR(valid(0), io.writeData(0).data) |
+ MuxOR(valid(1), io.writeData(1).data) |
+ MuxOR(valid(2), io.writeData(2).data) |
+ MuxOR(valid(3), io.writeData(3).data) |
+ MuxOR(valid(4), io.writeData(4).data) |
+ MuxOR(valid(5), io.writeData(5).data)
+
+ writeValid(i) := valid =/= 0.U
+ writeData(i) := data
+
+ // At most one port may write a given register in a cycle.
+ assert(PopCount(valid) <= 1.U)
+ }
+
+ for (i <- 0 until 32) {
+ when (writeValid(i)) {
+ regfile(i) := writeData(i)
+ }
+ }
+
+ // ***************************************************************************
+ // Read ports with write forwarding.
+ // ***************************************************************************
+ val rdata = Wire(Vec(8, UInt(32.W)))
+ val wdata = Wire(Vec(8, UInt(32.W)))
+ val rwdata = Wire(Vec(8, UInt(32.W)))
+ for (i <- 0 until 8) {
+ val idx = io.readAddr(i).addr
+ val write = VecAt(writeValid, idx)
+ rdata(i) := VecAt(regfile, idx)
+ wdata(i) := VecAt(writeData, idx)
+ // Forward this cycle's write so a back-to-back dependent read sees it.
+ rwdata(i) := Mux(write, wdata(i), rdata(i))
+ }
+
+ for (i <- 0 until 8) {
+ val setValid = io.readSet(i).valid
+ val setValue = io.readSet(i).value
+
+ val nxtReadDataReady = io.readAddr(i).valid || setValid
+
+ readDataReady(i) := nxtReadDataReady
+
+ // readSet override takes precedence over the register contents.
+ nxtReadDataBits(i) := Mux(setValid, setValue, rwdata(i))
+
+ when (nxtReadDataReady) {
+ readDataBits(i) := nxtReadDataBits(i)
+ }
+ }
+
+ // Bus port priority encoded address.
+ val busAddr = Wire(Vec(4, UInt(32.W)))
+ val busValid = Cat(io.busAddr(3).valid, io.busAddr(2).valid,
+ io.busAddr(1).valid, io.busAddr(0).valid)
+
+ // Lane i address comes from even read port 2*i; data from odd port 2*i+1.
+ for (i <- 0 until 4) {
+ busAddr(i) := Mux(io.busAddr(i).bypass, rwdata(2 * i),
+ Mux(io.busAddr(i).immen, rdata(2 * i) + io.busAddr(i).immed,
+ rdata(2 * i)))
+ }
+
+ for (i <- 0 until 4) {
+ io.busPort.addr(i) := busAddr(i)
+ io.busPort.data(i) := nxtReadDataBits(2 * i + 1)
+ }
+
+ // Branch target address combinatorial.
+ for (i <- 0 until 4) {
+ io.target(i).data := busAddr(i)
+ }
+
+ // ***************************************************************************
+ // Link port.
+ // ***************************************************************************
+ // Exposes x1: valid when x1 is not busy in the scoreboard.
+ io.linkPort.valid := !scoreboard(1)
+ io.linkPort.value := regfile(1)
+
+ // ***************************************************************************
+ // Assertions.
+ // ***************************************************************************
+ for (i <- 0 until 4) {
+ assert(busAddr(i).getWidth == p.lsuAddrBits)
+ }
+
+ // No two ports may retire to the same (non-x0) register in one cycle.
+ for (i <- 0 until 6) {
+ for (j <- (i+1) until 6) {
+ // Delay the failure a cycle for debugging purposes.
+ val write_fail = RegInit(false.B)
+ write_fail := io.writeData(i).valid && io.writeData(j).valid &&
+ io.writeData(i).addr === io.writeData(j).addr &&
+ io.writeData(i).addr =/= 0.U
+ assert(!write_fail)
+ }
+ }
+
+ // Every cleared bit must have been marked busy.
+ val scoreboard_error = RegInit(false.B)
+ scoreboard_error := (scoreboard & scoreboard_clr) =/= scoreboard_clr
+ assert(!scoreboard_error)
+}
+
+object EmitRegfile extends App {
+ // Elaborate the Regfile module and emit Verilog; command-line args are
+ // forwarded to the ChiselStage driver.
+ val stage = new chisel3.stage.ChiselStage
+ stage.emitVerilog(new Regfile(new Parameters), args)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala
new file mode 100644
index 0000000..786bbe6
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -0,0 +1,350 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object SCore {
+ // Factory: instantiate a scalar-core module for parameters `p`.
+ def apply(p: Parameters): SCore = Module(new SCore(p))
+}
+
+// Scalar core top level: instantiates and wires the fetch unit, four
+// decode lanes, four ALUs, four branch units, CSR, LSU, MLU and DVU
+// around an 8R6W register file, and exposes the bus, vector-core, flush,
+// logging and debug interfaces.
+class SCore(p: Parameters) extends Module {
+ val io = IO(new Bundle {
+ val csr = new CsrInOutIO(p)
+ val halted = Output(Bool())
+ val fault = Output(Bool())
+
+ val ibus = new IBusIO(p)
+ val dbus = new DBusIO(p)
+ val ubus = new DBusIO(p)
+ val vldst = Output(Bool())
+
+ val vcore = Flipped(new VCoreIO(p))
+
+ val iflush = new IFlushIO(p)
+ val dflush = new DFlushIO(p)
+ val slog = new SLogIO(p)
+
+ val debug = new DebugIO(p)
+ })
+
+ // The functional units that make up the core.
+ val regfile = Regfile(p)
+ val fetch = Fetch(p)
+ val decode = Seq(Decode(p, 0), Decode(p, 1), Decode(p, 2), Decode(p, 3))
+ val alu = Seq.fill(4)(Alu(p))
+ val bru = Seq.fill(4)(Bru(p))
+ val csr = Csr(p)
+ val lsu = Lsu(p)
+ val mlu = Mlu(p)
+ val dvu = Dvu(p)
+
+ // Wire up the core.
+ // Any lane taking a branch redirects fetch and squashes younger work.
+ val branchTaken = bru(0).io.taken.valid || bru(1).io.taken.valid ||
+ bru(2).io.taken.valid || bru(3).io.taken.valid
+
+ // ---------------------------------------------------------------------------
+ // IFlush
+ // Sticky request: held until fetch, external iflush and LSU fencei all
+ // report ready.
+ val iflush = RegInit(false.B)
+
+ when (bru(0).io.iflush) {
+ iflush := true.B
+ } .elsewhen (fetch.io.iflush.ready && io.iflush.ready &&
+ lsu.io.flush.ready && lsu.io.flush.fencei) {
+ iflush := false.B
+ }
+
+ io.dflush.valid := lsu.io.flush.valid
+ io.dflush.all := lsu.io.flush.all
+ io.dflush.clean := lsu.io.flush.clean
+ lsu.io.flush.ready := io.dflush.ready
+
+ // Only lane 0 may request an instruction flush (fence.i is serialized).
+ assert(!bru(1).io.iflush)
+ assert(!bru(2).io.iflush)
+ assert(!bru(3).io.iflush)
+
+ // ---------------------------------------------------------------------------
+ // Fetch
+ fetch.io.csr := io.csr.in
+
+ for (i <- 0 until 4) {
+ fetch.io.branch(i) := bru(i).io.taken
+ }
+
+ fetch.io.linkPort := regfile.io.linkPort
+
+ fetch.io.iflush.valid := iflush
+
+ // ---------------------------------------------------------------------------
+ // Decode
+ // A lane may only issue if all older lanes are ready (in-order issue).
+ val mask = VecInit(true.B,
+ decode(0).io.inst.ready,
+ decode(0).io.inst.ready && decode(1).io.inst.ready,
+ decode(0).io.inst.ready && decode(1).io.inst.ready &&
+ decode(2).io.inst.ready)
+
+ for (i <- 0 until 4) {
+ decode(i).io.inst.valid := fetch.io.inst.lanes(i).valid && mask(i)
+ fetch.io.inst.lanes(i).ready := decode(i).io.inst.ready && mask(i)
+ decode(i).io.inst.addr := fetch.io.inst.lanes(i).addr
+ decode(i).io.inst.inst := fetch.io.inst.lanes(i).inst
+ decode(i).io.inst.brchFwd := fetch.io.inst.lanes(i).brchFwd
+
+ decode(i).io.branchTaken := branchTaken
+ decode(i).io.halted := csr.io.halted
+ }
+
+ // Interlock based on regfile write port dependencies.
+ decode(0).io.interlock := bru(0).io.interlock
+ decode(1).io.interlock := decode(0).io.interlock
+ decode(2).io.interlock := decode(1).io.interlock
+ decode(3).io.interlock := decode(2).io.interlock
+
+ // Serialize opcodes with only one pipeline.
+ decode(0).io.serializeIn.defaults()
+ decode(1).io.serializeIn := decode(0).io.serializeOut
+ decode(2).io.serializeIn := decode(1).io.serializeOut
+ decode(3).io.serializeIn := decode(2).io.serializeOut
+
+ // In decode update multi-issue scoreboard state.
+ // Each lane sees the speculative marks of all older lanes this cycle.
+ val scoreboard_spec1 = decode(0).io.scoreboard.spec
+ val scoreboard_spec2 = decode(1).io.scoreboard.spec | scoreboard_spec1
+ val scoreboard_spec3 = decode(2).io.scoreboard.spec | scoreboard_spec2
+ assert(scoreboard_spec1.getWidth == 32)
+ assert(scoreboard_spec2.getWidth == 32)
+ assert(scoreboard_spec3.getWidth == 32)
+
+ decode(0).io.scoreboard.comb := regfile.io.scoreboard.comb
+ decode(0).io.scoreboard.regd := regfile.io.scoreboard.regd
+ decode(1).io.scoreboard.comb := regfile.io.scoreboard.comb | scoreboard_spec1
+ decode(1).io.scoreboard.regd := regfile.io.scoreboard.regd | scoreboard_spec1
+ decode(2).io.scoreboard.comb := regfile.io.scoreboard.comb | scoreboard_spec2
+ decode(2).io.scoreboard.regd := regfile.io.scoreboard.regd | scoreboard_spec2
+ decode(3).io.scoreboard.comb := regfile.io.scoreboard.comb | scoreboard_spec3
+ decode(3).io.scoreboard.regd := regfile.io.scoreboard.regd | scoreboard_spec3
+
+
+ // Only lane 0 observes vector-core activity.
+ decode(0).io.mactive := io.vcore.mactive
+ decode(1).io.mactive := false.B
+ decode(2).io.mactive := false.B
+ decode(3).io.mactive := false.B
+
+ // ---------------------------------------------------------------------------
+ // ALU
+ // Lane i uses read ports 2i (rs1) and 2i+1 (rs2).
+ for (i <- 0 until 4) {
+ alu(i).io.req := decode(i).io.alu
+ alu(i).io.rs1 := regfile.io.readData(2 * i + 0)
+ alu(i).io.rs2 := regfile.io.readData(2 * i + 1)
+ }
+
+ // ---------------------------------------------------------------------------
+ // Branch Unit
+ for (i <- 0 until 4) {
+ bru(i).io.req := decode(i).io.bru
+ bru(i).io.rs1 := regfile.io.readData(2 * i + 0)
+ bru(i).io.rs2 := regfile.io.readData(2 * i + 1)
+ bru(i).io.target := regfile.io.target(i)
+ }
+
+ // Only lane 0 talks to the CSR unit; other lanes get tie-offs.
+ bru(0).io.csr <> csr.io.bru
+ bru(1).io.csr.defaults()
+ bru(2).io.csr.defaults()
+ bru(3).io.csr.defaults()
+
+ io.iflush.valid := iflush
+
+ // ---------------------------------------------------------------------------
+ // Control Status Unit
+ csr.io.csr <> io.csr
+
+ csr.io.req <> decode(0).io.csr
+ csr.io.rs1 := regfile.io.readData(0)
+
+ csr.io.vcore.undef := io.vcore.undef
+
+ // ---------------------------------------------------------------------------
+ // Status
+ io.halted := csr.io.halted
+ io.fault := csr.io.fault
+
+ // ---------------------------------------------------------------------------
+ // Load/Store Unit
+ lsu.io.busPort := regfile.io.busPort
+
+ for (i <- 0 until 4) {
+ lsu.io.req(i).valid := decode(i).io.lsu.valid
+ lsu.io.req(i).store := decode(i).io.lsu.store
+ lsu.io.req(i).addr := decode(i).io.lsu.addr
+ lsu.io.req(i).op := decode(i).io.lsu.op
+ decode(i).io.lsu.ready := lsu.io.req(i).ready
+ }
+
+ // ---------------------------------------------------------------------------
+ // Multiplier Unit
+ mlu.io.req(0) := decode(0).io.mlu
+ mlu.io.req(1) := decode(1).io.mlu
+ mlu.io.req(2) := decode(2).io.mlu
+ mlu.io.req(3) := decode(3).io.mlu
+ mlu.io.rs1(0) := regfile.io.readData(0)
+ mlu.io.rs1(1) := regfile.io.readData(2)
+ mlu.io.rs1(2) := regfile.io.readData(4)
+ mlu.io.rs1(3) := regfile.io.readData(6)
+ mlu.io.rs2(0) := regfile.io.readData(1)
+ mlu.io.rs2(1) := regfile.io.readData(3)
+ mlu.io.rs2(2) := regfile.io.readData(5)
+ mlu.io.rs2(3) := regfile.io.readData(7)
+
+ // On taken branches, multicycle MLU execute must be masked
+ val mluInvalidate = RegInit(false.B)
+ mluInvalidate := branchTaken
+
+ // ---------------------------------------------------------------------------
+ // Divide Unit
+ dvu.io.req <> decode(0).io.dvu
+ dvu.io.rs1 := regfile.io.readData(0)
+ dvu.io.rs2 := regfile.io.readData(1)
+ // MLU result has priority on the shared writeback port (port 4 below).
+ dvu.io.rd.ready := !mlu.io.rd.valid
+
+ // TODO: make port conditional on pipeline index.
+ for (i <- 1 until 4) {
+ decode(i).io.dvu.ready := false.B
+ }
+
+ // ---------------------------------------------------------------------------
+ // Register File
+ for (i <- 0 until 4) {
+ regfile.io.readAddr(2 * i + 0) := decode(i).io.rs1Read
+ regfile.io.readAddr(2 * i + 1) := decode(i).io.rs2Read
+ regfile.io.readSet(2 * i + 0) := decode(i).io.rs1Set
+ regfile.io.readSet(2 * i + 1) := decode(i).io.rs2Set
+ regfile.io.writeAddr(i) := decode(i).io.rdMark
+ regfile.io.busAddr(i) := decode(i).io.busRead
+
+ // CSR results retire only through lane 0.
+ val csr0Valid = if (i == 0) csr.io.rd.valid else false.B
+ val csr0Addr = if (i == 0) csr.io.rd.addr else 0.U
+ val csr0Data = if (i == 0) csr.io.rd.data else 0.U
+
+
+ regfile.io.writeData(i).valid := csr0Valid ||
+ alu(i).io.rd.valid || bru(i).io.rd.valid ||
+ io.vcore.rd(i).valid
+
+ regfile.io.writeData(i).addr :=
+ MuxOR(csr0Valid, csr0Addr) |
+ MuxOR(alu(i).io.rd.valid, alu(i).io.rd.addr) |
+ MuxOR(bru(i).io.rd.valid, bru(i).io.rd.addr) |
+ MuxOR(io.vcore.rd(i).valid, io.vcore.rd(i).addr)
+
+ regfile.io.writeData(i).data :=
+ MuxOR(csr0Valid, csr0Data) |
+ MuxOR(alu(i).io.rd.valid, alu(i).io.rd.data) |
+ MuxOR(bru(i).io.rd.valid, bru(i).io.rd.data) |
+ MuxOR(io.vcore.rd(i).valid, io.vcore.rd(i).data)
+
+ // The OR-merge above is only safe if at most one producer fires.
+ assert((csr0Valid +&
+ alu(i).io.rd.valid +& bru(i).io.rd.valid +&
+ io.vcore.rd(i).valid) <= 1.U)
+ }
+
+ // Write port 4 is shared by MLU (priority) and DVU; port 5 is the LSU.
+ regfile.io.writeData(4).valid := mlu.io.rd.valid || dvu.io.rd.valid
+ regfile.io.writeData(4).addr := Mux(mlu.io.rd.valid, mlu.io.rd.addr, dvu.io.rd.addr)
+ regfile.io.writeData(4).data := Mux(mlu.io.rd.valid, mlu.io.rd.data, dvu.io.rd.data)
+ assert(!(mlu.io.rd.valid && (dvu.io.rd.valid && dvu.io.rd.ready))) // TODO: stall dvu on mlu write
+
+ regfile.io.writeData(5).valid := lsu.io.rd.valid
+ regfile.io.writeData(5).addr := lsu.io.rd.addr
+ regfile.io.writeData(5).data := lsu.io.rd.data
+
+ // Mask writebacks of lanes in the shadow of an older taken branch.
+ regfile.io.writeMask(0).valid := false.B
+ regfile.io.writeMask(1).valid := regfile.io.writeMask(0).valid ||
+ bru(0).io.taken.valid
+ regfile.io.writeMask(2).valid := regfile.io.writeMask(1).valid ||
+ bru(1).io.taken.valid
+ regfile.io.writeMask(3).valid := regfile.io.writeMask(2).valid ||
+ bru(2).io.taken.valid
+ regfile.io.writeMask(4).valid := mluInvalidate
+
+ // ---------------------------------------------------------------------------
+ // Vector Extension
+ for (i <- 0 until 4) {
+ io.vcore.vinst(i) <> decode(i).io.vinst
+ }
+
+ for (i <- 0 until 8) {
+ io.vcore.rs(i) := regfile.io.readData(i)
+ }
+
+ // ---------------------------------------------------------------------------
+ // Fetch Bus
+ io.ibus <> fetch.io.ibus
+
+ // ---------------------------------------------------------------------------
+ // Local Data Bus Port
+ io.dbus <> lsu.io.dbus
+ io.ubus <> lsu.io.ubus
+
+ io.vldst := lsu.io.vldst
+
+ // ---------------------------------------------------------------------------
+ // Scalar logging interface
+ val slogValid = RegInit(false.B)
+ // NOTE(review): slogAddr is declared 2 bits wide but is loaded from
+ // inst(14,12), a 3-bit field, and drives the 5-bit SLogIO.addr -- the
+ // top bit of the field is truncated. Confirm 2.W is intentional.
+ val slogAddr = Reg(UInt(2.W))
+ val slogEn = decode(0).io.slog
+
+ slogValid := slogEn
+ when (slogEn) {
+ slogAddr := decode(0).io.inst.inst(14,12)
+ }
+
+ io.slog.valid := slogValid
+ io.slog.addr := MuxOR(slogValid, slogAddr)
+ io.slog.data := MuxOR(slogValid, regfile.io.readData(0).data)
+
+ // ---------------------------------------------------------------------------
+ // DEBUG
+ // Free-running cycle counter plus a registered snapshot of issued lanes.
+ val cycles = RegInit(0.U(32.W))
+ cycles := cycles + 1.U
+ io.debug.cycles := cycles
+
+ val debugEn = RegInit(0.U(4.W))
+ val debugAddr = Reg(Vec(4, UInt(32.W)))
+ val debugInst = Reg(Vec(4, UInt(32.W)))
+
+ // Lane i is in the shadow of a taken branch from any older lane.
+ val debugBrch =
+ Cat(bru(0).io.taken.valid || bru(1).io.taken.valid || bru(2).io.taken.valid,
+ bru(0).io.taken.valid || bru(1).io.taken.valid,
+ bru(0).io.taken.valid,
+ false.B)
+
+ debugEn := Cat(fetch.io.inst.lanes(3).valid && fetch.io.inst.lanes(3).ready && !branchTaken,
+ fetch.io.inst.lanes(2).valid && fetch.io.inst.lanes(2).ready && !branchTaken,
+ fetch.io.inst.lanes(1).valid && fetch.io.inst.lanes(1).ready && !branchTaken,
+ fetch.io.inst.lanes(0).valid && fetch.io.inst.lanes(0).ready && !branchTaken)
+
+ debugAddr(0) := fetch.io.inst.lanes(0).addr
+ debugAddr(1) := fetch.io.inst.lanes(1).addr
+ debugAddr(2) := fetch.io.inst.lanes(2).addr
+ debugAddr(3) := fetch.io.inst.lanes(3).addr
+ debugInst(0) := fetch.io.inst.lanes(0).inst
+ debugInst(1) := fetch.io.inst.lanes(1).inst
+ debugInst(2) := fetch.io.inst.lanes(2).inst
+ debugInst(3) := fetch.io.inst.lanes(3).inst
+
+ io.debug.en := debugEn & ~debugBrch
+
+ io.debug.addr0 := debugAddr(0)
+ io.debug.addr1 := debugAddr(1)
+ io.debug.addr2 := debugAddr(2)
+ io.debug.addr3 := debugAddr(3)
+ io.debug.inst0 := debugInst(0)
+ io.debug.inst1 := debugInst(1)
+ io.debug.inst2 := debugInst(2)
+ io.debug.inst3 := debugInst(3)
+}
+
+object EmitSCore extends App {
+ // Elaborate the scalar core and emit Verilog; command-line args are
+ // forwarded to the ChiselStage driver.
+ val stage = new chisel3.stage.ChiselStage
+ stage.emitVerilog(new SCore(new Parameters), args)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/SLog.scala b/hdl/chisel/src/kelvin/scalar/SLog.scala
new file mode 100644
index 0000000..b52963a
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/SLog.scala
@@ -0,0 +1,12 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Scalar instrumentation logging (printf).
+class SLogIO(p: Parameters) extends Bundle {
+ val valid = Output(Bool()) // a log event is being emitted this cycle
+ val addr = Output(UInt(5.W)) // log channel selector
+ val data = Output(UInt(32.W)) // logged scalar value
+}
diff --git a/hdl/chisel/src/kelvin/vector/VAlu.scala b/hdl/chisel/src/kelvin/vector/VAlu.scala
new file mode 100644
index 0000000..b074a13
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VAlu.scala
@@ -0,0 +1,395 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VAlu {
+ // Factory: instantiate a vector ALU module for parameters `p`.
+ def apply(p: Parameters): VAlu = Module(new VAlu(p))
+}
+
+// Vector ALU front end: accepts up to four decoded vector ops per cycle,
+// validates that each opcode is supported, then interleaves them across
+// two command queues feeding two VAluInt pipelines. cmdsync ops are
+// mirrored into both queues and must issue from both simultaneously.
+class VAlu(p: Parameters) extends Module {
+ val io = IO(new Bundle {
+ // Instructions.
+ val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+ val active = Output(UInt(64.W))
+
+ // VRegfile.
+ val vrfsb = Input(UInt(128.W))
+ val read = Vec(7, new VRegfileReadIO(p))
+ val write = Vec(4, new VRegfileWriteIO(p))
+ val whint = Vec(4, new VRegfileWhintIO(p))
+ val scalar = Vec(2, new VRegfileScalarIO(p))
+
+ // Testbench signals.
+ val read_0_ready = Output(Bool())
+ val read_1_ready = Output(Bool())
+ val read_2_ready = Output(Bool())
+ val read_3_ready = Output(Bool())
+ val read_4_ready = Output(Bool())
+ val read_5_ready = Output(Bool())
+ val read_6_ready = Output(Bool())
+ })
+
+ val cmdqDepth = 8
+
+ val e = new VEncodeOp()
+
+ // ---------------------------------------------------------------------------
+ // Tie-offs.
+ for (i <- 0 until 7) {
+ io.read(i).valid := false.B
+ io.read(i).addr := 0.U
+ io.read(i).tag := 0.U
+ }
+
+ for (i <- 0 until 4) {
+ io.write(i).valid := false.B
+ io.write(i).addr := 0.U
+ io.write(i).data := 0.U
+ }
+
+ for (i <- 0 until 4) {
+ io.whint(i).valid := false.B
+ io.whint(i).addr := 0.U
+ }
+
+ // ---------------------------------------------------------------------------
+ // Opcode checks.
+ // Simulation-only guard: every accepted op must be in the supported set.
+ for (i <- 0 until 4) {
+ when (io.in.valid && io.in.ready) {
+ when (io.in.bits(i).valid) {
+ val op = io.in.bits(i).bits.op
+ val supported =
+ // Arithmetic
+ op === e.vabsd.U ||
+ op === e.vacc.U ||
+ op === e.vadd.U ||
+ op === e.vadds.U ||
+ op === e.vaddw.U ||
+ op === e.vadd3.U ||
+ op === e.vdup.U ||
+ op === e.vhadd.U ||
+ op === e.vhsub.U ||
+ op === e.vmax.U ||
+ op === e.vmin.U ||
+ op === e.vpadd.U ||
+ op === e.vpsub.U ||
+ op === e.vrsub.U ||
+ op === e.vsub.U ||
+ op === e.vsubs.U ||
+ op === e.vsubw.U ||
+ // Compare.
+ op === e.veq.U ||
+ op === e.vne.U ||
+ op === e.vlt.U ||
+ op === e.vle.U ||
+ op === e.vgt.U ||
+ op === e.vge.U ||
+ // Logical.
+ op === e.vand.U ||
+ op === e.vclb.U ||
+ op === e.vclz.U ||
+ op === e.vcpop.U ||
+ op === e.vmv.U ||
+ op === e.vmv2.U ||
+ op === e.vmvp.U ||
+ op === e.adwinit.U ||
+ op === e.vnot.U ||
+ op === e.vor.U ||
+ op === e.vrev.U ||
+ op === e.vror.U ||
+ op === e.vxor.U ||
+ // Shift.
+ op === e.vshl.U ||
+ op === e.vshr.U ||
+ op === e.vshf.U ||
+ op === e.vsrans.U ||
+ op === e.vsraqs.U ||
+ // Multiply.
+ op === e.vdmulh.U ||
+ op === e.vdmulh2.U ||
+ op === e.vmadd.U ||
+ op === e.vmul.U ||
+ op === e.vmul2.U ||
+ op === e.vmulh.U ||
+ op === e.vmulh2.U ||
+ op === e.vmuls.U ||
+ op === e.vmuls2.U ||
+ op === e.vmulw.U ||
+ // Shuffle.
+ op === e.vslidevn.U ||
+ op === e.vslidevp.U ||
+ op === e.vslidehn2.U ||
+ op === e.vslidehp2.U ||
+ op === e.vsel.U ||
+ op === e.vevn.U ||
+ op === e.vodd.U ||
+ op === e.vevnodd.U ||
+ op === e.vzip.U ||
+ // ML
+ op === e.vdwconv.U ||
+ op === e.adwconv.U
+
+ when (!supported) {
+ printf("**Op=%d unsupported\n", op)
+ }
+ assert(supported)
+
+ assert(!(io.in.bits(i).bits.vt.valid && io.in.bits(i).bits.sv.valid))
+
+ // Depthwise-convolution ops have additional operand constraints.
+ when (op === e.vdwconv.U || op === e.adwconv.U) {
+ val sparse = io.in.bits(i).bits.sv.data(3,2)
+ assert(io.in.bits(i).bits.m === false.B)
+ assert(io.in.bits(i).bits.sz === 4.U)
+ assert(io.in.bits(i).bits.sv.valid === false.B)
+ assert(sparse < 3.U)
+ }
+ }
+ }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Command Queue.
+ // Per-op payload held in each ALU's command queue.
+ class VAluCmdq extends Bundle {
+ val op = UInt(new VEncodeOp().bits.W)
+ val f2 = UInt(3.W)
+ val sz = UInt(3.W)
+ val vd = new VAddr()
+ val ve = new VAddr()
+ val vs = new VAddrTag()
+ val vt = new VAddrTag()
+ val vu = new VAddrTag()
+ val sv = new SData()
+ val cmdsync = Bool()
+ }
+
+ // Queue-input mapping: for cmdsync ops ALU 1 takes the secondary
+ // register set (vf/vg/vx/vy/vz); otherwise both take the primary set.
+ def Fin(in: VDecodeBits, alu: Int): VAluCmdq = {
+ val out = Wire(new VAluCmdq)
+ out.op := in.op
+ out.f2 := in.f2
+ out.sz := in.sz
+ out.cmdsync := in.cmdsync
+ when ((alu == 0).B || !in.cmdsync) {
+ out.vd := in.vd
+ out.ve := in.ve
+ out.vs := in.vs
+ out.vt := in.vt
+ out.vu := in.vu
+ } .otherwise {
+ out.vd := in.vf
+ out.ve := in.vg
+ out.vs := in.vx
+ out.vt := in.vy
+ out.vu := in.vz
+ }
+ out.sv := in.sv
+ out
+ }
+
+ def Fin0(in: VDecodeBits): VAluCmdq = {
+ Fin(in, 0)
+ }
+
+ def Fin1(in: VDecodeBits): VAluCmdq = {
+ Fin(in, 1)
+ }
+
+ // Stepping function: advances register addresses across a stripmined op;
+ // returns the next command state and whether this was the last step.
+ def Fout(in: VAluCmdq, m: Bool, step: UInt, valid: Bool): (VAluCmdq, Bool) = {
+ val vevnodd = in.op === e.vevn.U || in.op === e.vodd.U || in.op === e.vevnodd.U
+ val vzip = in.op === e.vzip.U
+ val out = Wire(new VAluCmdq)
+ val last = !m || step === 3.U
+ out := in
+ out.vd.addr := in.vd.addr + 1.U
+ out.ve.addr := in.ve.addr + 1.U
+ out.vs.addr := in.vs.addr + 1.U
+ out.vt.addr := in.vt.addr + 1.U
+ out.vu.addr := in.vu.addr + 1.U
+ when (m && vevnodd) {
+ out.vu.addr := in.vu.addr
+ when (step === 1.U) { // halfway
+ out.vs.addr := in.vu.addr + 0.U
+ out.vt.addr := in.vu.addr + 1.U
+ } .otherwise {
+ out.vs.addr := in.vs.addr + 2.U
+ out.vt.addr := in.vt.addr + 2.U
+ }
+ }
+ when (vzip) {
+ assert(in.ve.addr === (in.vd.addr + 1.U))
+ out.vd.addr := in.vd.addr + 2.U
+ out.ve.addr := in.ve.addr + 2.U
+ }
+ (out, last)
+ }
+
+ def Factive(in: VAluCmdq, m: Bool, step: UInt): UInt = {
+ assert(step.getWidth == 5)
+ assert(step <= 4.U)
+ // Only reads are reported in active, vrfsb tracks writes.
+ val active = MuxOR(in.vs.valid, RegActive(m, step(2,0), in.vs.addr)) |
+ MuxOR(in.vt.valid, RegActive(m, step(2,0), in.vt.addr)) |
+ MuxOR(in.vu.valid, RegActive(m, step(2,0), in.vu.addr))
+ assert(active.getWidth == 64)
+ active
+ }
+
+ val q0 = VCmdq(cmdqDepth, new VAluCmdq, Fin0, Fout, Factive)
+ val q1 = VCmdq(cmdqDepth, new VAluCmdq, Fin1, Fout, Factive)
+
+ // Accept input only when both queues can take it (they enqueue in step).
+ q0.io.in.valid := io.in.valid && q1.io.in.ready
+ q1.io.in.valid := io.in.valid && q0.io.in.ready
+ io.in.ready := q0.io.in.ready && q1.io.in.ready
+
+ q0.io.in.bits := io.in.bits
+ q1.io.in.bits := io.in.bits
+
+ // A head-of-queue command may issue once all its source registers clear
+ // the write scoreboard.
+ val q0ready = ScoreboardReady(q0.io.out.bits.vs, io.vrfsb) &&
+ ScoreboardReady(q0.io.out.bits.vt, io.vrfsb) &&
+ ScoreboardReady(q0.io.out.bits.vu, io.vrfsb)
+
+ val q1ready = ScoreboardReady(q1.io.out.bits.vs, io.vrfsb) &&
+ ScoreboardReady(q1.io.out.bits.vt, io.vrfsb) &&
+ ScoreboardReady(q1.io.out.bits.vu, io.vrfsb)
+
+ // cmdsync: both halves of a dual command must be ready before either issues.
+ q0.io.out.ready := q0ready && (!q0.io.out.bits.cmdsync || q1.io.out.valid && q1ready && q1.io.out.bits.cmdsync)
+ q1.io.out.ready := q1ready && (!q1.io.out.bits.cmdsync || q0.io.out.valid && q0ready && q0.io.out.bits.cmdsync)
+
+ // ---------------------------------------------------------------------------
+ // ALU Selection interleaving.
+ val alureg = RegInit(false.B)
+ val alusel = Wire(Vec(5, Bool()))
+
+ // Toggle if previous was valid and was not a synchronized dual command.
+ alusel(0) := alureg
+ alusel(1) := Mux(io.in.bits(0).valid && !io.in.bits(0).bits.cmdsync, !alusel(0), alusel(0))
+ alusel(2) := Mux(io.in.bits(1).valid && !io.in.bits(1).bits.cmdsync, !alusel(1), alusel(1))
+ alusel(3) := Mux(io.in.bits(2).valid && !io.in.bits(2).bits.cmdsync, !alusel(2), alusel(2))
+ alusel(4) := Mux(io.in.bits(3).valid && !io.in.bits(3).bits.cmdsync, !alusel(3), alusel(3))
+
+ when (io.in.valid && io.in.ready) {
+ alureg := alusel(4)
+ }
+
+ // Route each lane to its selected queue; cmdsync ops go to both.
+ for (i <- 0 until 4) {
+ q0.io.in.bits(i).valid := io.in.bits(i).valid && (alusel(i) === 0.U || io.in.bits(i).bits.cmdsync)
+ q1.io.in.bits(i).valid := io.in.bits(i).valid && (alusel(i) === 1.U || io.in.bits(i).bits.cmdsync)
+ }
+
+ // ---------------------------------------------------------------------------
+ // Read ports.
+ // Ports 0-2 serve q0 (vs/vt/vu); ports 3-5 serve q1; port 6 is unused here.
+ io.read(0).valid := q0.io.out.bits.vs.valid
+ io.read(1).valid := q0.io.out.bits.vt.valid
+ io.read(2).valid := q0.io.out.bits.vu.valid
+ io.read(3).valid := q1.io.out.bits.vs.valid
+ io.read(4).valid := q1.io.out.bits.vt.valid
+ io.read(5).valid := q1.io.out.bits.vu.valid
+
+ io.read(0).addr := q0.io.out.bits.vs.addr
+ io.read(1).addr := q0.io.out.bits.vt.addr
+ io.read(2).addr := q0.io.out.bits.vu.addr
+ io.read(3).addr := q1.io.out.bits.vs.addr
+ io.read(4).addr := q1.io.out.bits.vt.addr
+ io.read(5).addr := q1.io.out.bits.vu.addr
+
+ io.read(0).tag := OutTag(q0.io.out.bits.vs)
+ io.read(1).tag := OutTag(q0.io.out.bits.vt)
+ io.read(2).tag := OutTag(q0.io.out.bits.vu)
+ io.read(3).tag := OutTag(q1.io.out.bits.vs)
+ io.read(4).tag := OutTag(q1.io.out.bits.vt)
+ io.read(5).tag := OutTag(q1.io.out.bits.vu)
+
+ io.scalar(0).valid := q0.io.out.bits.sv.valid
+ io.scalar(1).valid := q1.io.out.bits.sv.valid
+
+ io.scalar(0).data := q0.io.out.bits.sv.data
+ io.scalar(1).data := q1.io.out.bits.sv.data
+
+ io.read_0_ready := io.read(0).valid && q0.io.out.ready
+ io.read_1_ready := io.read(1).valid && q0.io.out.ready
+ io.read_2_ready := io.read(2).valid && q0.io.out.ready
+ io.read_3_ready := io.read(3).valid && q1.io.out.ready
+ io.read_4_ready := io.read(4).valid && q1.io.out.ready
+ io.read_5_ready := io.read(5).valid && q1.io.out.ready
+ io.read_6_ready := false.B
+
+ // ---------------------------------------------------------------------------
+ // Alu0.
+ val alu0 = Module(new VAluInt(p, 0))
+
+ alu0.io.in.valid := q0.io.out.valid && q0.io.out.ready
+ alu0.io.in.op := q0.io.out.bits.op
+ alu0.io.in.f2 := q0.io.out.bits.f2
+ alu0.io.in.sz := q0.io.out.bits.sz
+ alu0.io.in.vd.addr := q0.io.out.bits.vd.addr
+ alu0.io.in.ve.addr := q0.io.out.bits.ve.addr
+ alu0.io.in.sv.data := q0.io.out.bits.sv.data
+
+ alu0.io.read(0).data := io.read(0).data
+ alu0.io.read(1).data := io.read(1).data
+ alu0.io.read(2).data := io.read(2).data
+ alu0.io.read(3).data := io.read(3).data
+ alu0.io.read(4).data := io.read(4).data
+ alu0.io.read(5).data := io.read(5).data
+ alu0.io.read(6).data := io.read(6).data
+
+ io.write(0).valid := alu0.io.write(0).valid
+ io.write(0).addr := alu0.io.write(0).addr
+ io.write(0).data := alu0.io.write(0).data
+
+ io.write(1).valid := alu0.io.write(1).valid
+ io.write(1).addr := alu0.io.write(1).addr
+ io.write(1).data := alu0.io.write(1).data
+
+ io.whint(0).valid := alu0.io.whint(0).valid
+ io.whint(0).addr := alu0.io.whint(0).addr
+
+ io.whint(1).valid := alu0.io.whint(1).valid
+ io.whint(1).addr := alu0.io.whint(1).addr
+
+ // ---------------------------------------------------------------------------
+ // Alu1.
+ // Alu1's read ports 0-2 are its own (3-5); 3-5 cross over to alu0's.
+ val alu1 = Module(new VAluInt(p, 1))
+
+ alu1.io.in.valid := q1.io.out.valid && q1.io.out.ready
+ alu1.io.in.op := q1.io.out.bits.op
+ alu1.io.in.f2 := q1.io.out.bits.f2
+ alu1.io.in.sz := q1.io.out.bits.sz
+ alu1.io.in.vd.addr := q1.io.out.bits.vd.addr
+ alu1.io.in.ve.addr := q1.io.out.bits.ve.addr
+ alu1.io.in.sv.data := q1.io.out.bits.sv.data
+
+ alu1.io.read(0).data := io.read(3).data
+ alu1.io.read(1).data := io.read(4).data
+ alu1.io.read(2).data := io.read(5).data
+ alu1.io.read(3).data := io.read(0).data
+ alu1.io.read(4).data := io.read(1).data
+ alu1.io.read(5).data := io.read(2).data
+ alu1.io.read(6).data := io.read(6).data
+
+ io.write(2).valid := alu1.io.write(0).valid
+ io.write(2).addr := alu1.io.write(0).addr
+ io.write(2).data := alu1.io.write(0).data
+
+ io.write(3).valid := alu1.io.write(1).valid
+ io.write(3).addr := alu1.io.write(1).addr
+ io.write(3).data := alu1.io.write(1).data
+
+ io.whint(2).valid := alu1.io.whint(0).valid
+ io.whint(2).addr := alu1.io.whint(0).addr
+
+ io.whint(3).valid := alu1.io.whint(1).valid
+ io.whint(3).addr := alu1.io.whint(1).addr
+
+ // ---------------------------------------------------------------------------
+ // Active.
+ io.active := q0.io.active | q1.io.active
+}
+
+object EmitVAlu extends App {
+ // Elaborate the vector ALU and emit Verilog; command-line args are
+ // forwarded to the ChiselStage driver.
+ val stage = new chisel3.stage.ChiselStage
+ stage.emitVerilog(new VAlu(new Parameters), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VAluInt.scala b/hdl/chisel/src/kelvin/vector/VAluInt.scala
new file mode 100644
index 0000000..619f5d1
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VAluInt.scala
@@ -0,0 +1,1529 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+// VAluInt is foremost an ML depthwise and activation unit with pipelining
+// behaviors optimized to this functionality. All operations are pipelined with
+// a result latency of 2cc geared towards the goal of simplicity of design.
+//
+// Note: widening operations modify the size from ISA defined destination to
+// source read registers of sz/2.
+
+class VAluInt(p: Parameters, aluid: Int) extends Module {
+  // Op decode encodings shared with the rest of the vector unit.
+  val e = new VEncodeOp()
+
+  val io = IO(new Bundle {
+    val in = Input(new Bundle {
+      val valid = Bool()
+      val op = UInt(e.bits.W)
+      val f2 = UInt(3.W)
+      val sz = UInt(3.W)
+      val vd = new AluAddr() // write port 0
+      val ve = new AluAddr() // write port 1
+      val sv = new Bundle { val data = UInt(32.W) } // scalar value
+    })
+    // Seven full-width register read ports, shared with the sibling ALU
+    // (port mapping is swapped in VAlu for aluid == 1 — see caller).
+    val read = Vec(7, Input(new Bundle {
+      val data = UInt(p.vectorBits.W)
+    }))
+    val write = Vec(2, Output(new Bundle {
+      val valid = Bool()
+      val addr = UInt(6.W)
+      val data = UInt(p.vectorBits.W)
+    }))
+    // Early write hints, asserted one cycle before the matching io.write.
+    val whint = Vec(2, Output(new Bundle {
+      val valid = Bool()
+      val addr = UInt(6.W)
+    }))
+  })
+
+  // 6-bit vector register-file address.
+  class AluAddr extends Bundle {
+    val addr = UInt(6.W)
+  }
+
+  // Number of 32-bit lanes in a vector register.
+  val lanes = p.vectorBits / 32
+  assert(lanes == 4 || lanes == 8 || lanes == 16)
+
+  // sz is one-hot (or zero) by construction.
+  assert(!io.in.valid || PopCount(io.in.sz) <= 1.U)
+
+  // ---------------------------------------------------------------------------
+  // Tie-offs.
+  for (i <- 0 until 2) {
+    io.write(i).valid := false.B
+    io.write(i).addr := 0.U
+    io.write(i).data := 0.U
+  }
+  for (i <- 0 until 2) {
+    io.whint(i).valid := false.B
+    io.whint(i).addr := 0.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Encodings: decode io.in.op into per-operation strobes.
+  val e_absd   = io.in.op === e.vabsd.U
+  val e_acc    = io.in.op === e.vacc.U
+  val e_dup    = io.in.op === e.vdup.U
+  val e_max    = io.in.op === e.vmax.U
+  val e_min    = io.in.op === e.vmin.U
+  val e_rsub   = io.in.op === e.vrsub.U
+  val e_srans  = io.in.op === e.vsrans.U
+  val e_sraqs  = if (aluid == 0) io.in.op === e.vsraqs.U else false.B // vsraqs executes on alu0 only
+
+  val e_slidevn  = io.in.op === e.vslidevn.U || io.in.op === e.vslidehn.U || io.in.op === e.vslidehn2.U
+  val e_slidevp  = io.in.op === e.vslidevp.U || io.in.op === e.vslidehp.U || io.in.op === e.vslidehp2.U
+  val e_slidehn2 = io.in.op === e.vslidehn2.U
+  val e_slidehp2 = io.in.op === e.vslidehp2.U
+  val e_sel = io.in.op === e.vsel.U
+  val e_evn = io.in.op === e.vevn.U || io.in.op === e.vevnodd.U
+  val e_odd = io.in.op === e.vodd.U || io.in.op === e.vevnodd.U
+  val e_zip = io.in.op === e.vzip.U
+
+  val e_dwinit  = io.in.op === e.adwinit.U
+  val e_dwconv  = io.in.op === e.vdwconv.U || io.in.op === e.adwconv.U
+  val e_dwconva = io.in.op === e.adwconv.U
+
+  val e_add_add  = io.in.op === e.vadd.U
+  val e_add_adds = io.in.op === e.vadds.U
+  val e_add_addw = io.in.op === e.vaddw.U
+  val e_add_add3 = io.in.op === e.vadd3.U
+  val e_add_hadd = io.in.op === e.vhadd.U
+  val e_add = e_add_add || e_add_adds || e_add_addw || e_add_add3 || e_add_hadd
+
+  val e_cmp_eq = io.in.op === e.veq.U
+  val e_cmp_ne = io.in.op === e.vne.U
+  val e_cmp_lt = io.in.op === e.vlt.U
+  val e_cmp_le = io.in.op === e.vle.U
+  val e_cmp_gt = io.in.op === e.vgt.U
+  val e_cmp_ge = io.in.op === e.vge.U
+  val e_cmp = e_cmp_eq || e_cmp_ne || e_cmp_lt || e_cmp_le || e_cmp_gt || e_cmp_ge
+  assert(PopCount(Cat(e_cmp_eq, e_cmp_ne, e_cmp_lt, e_cmp_le, e_cmp_gt, e_cmp_ge)) <= 1.U)
+
+  val e_log_and  = io.in.op === e.vand.U
+  val e_log_or   = io.in.op === e.vor.U
+  val e_log_xor  = io.in.op === e.vxor.U
+  val e_log_not  = io.in.op === e.vnot.U
+  val e_log_rev  = io.in.op === e.vrev.U
+  val e_log_ror  = io.in.op === e.vror.U
+  val e_log_clb  = io.in.op === e.vclb.U
+  val e_log_clz  = io.in.op === e.vclz.U
+  val e_log_cpop = io.in.op === e.vcpop.U
+  val e_log = e_log_and || e_log_or || e_log_xor || e_log_not || e_log_rev || e_log_ror || e_log_clb || e_log_clz || e_log_cpop
+  assert(PopCount(Cat(e_log_and, e_log_or, e_log_xor, e_log_not, e_log_rev, e_log_ror, e_log_clb, e_log_clz, e_log_cpop)) <= 1.U)
+
+  // The *2 op variants drive both multiplier units (mul0 and mul1).
+  val e_mul0_dmulh = io.in.op === e.vdmulh.U || io.in.op === e.vdmulh2.U
+  val e_mul0_mul   = io.in.op === e.vmul.U || io.in.op === e.vmul2.U
+  val e_mul0_mulh  = io.in.op === e.vmulh.U || io.in.op === e.vmulh2.U
+  val e_mul0_muls  = io.in.op === e.vmuls.U || io.in.op === e.vmuls2.U
+  val e_mul0_mulw  = io.in.op === e.vmulw.U
+  val e_mul0_madd  = io.in.op === e.vmadd.U
+  val e_mul0 = e_mul0_dmulh || e_mul0_mul || e_mul0_mulh || e_mul0_muls || e_mul0_mulw || e_mul0_madd
+
+  val e_mul1_dmulh = io.in.op === e.vdmulh2.U
+  val e_mul1_mul   = io.in.op === e.vmul2.U
+  val e_mul1_mulh  = io.in.op === e.vmulh2.U
+  val e_mul1_muls  = io.in.op === e.vmuls2.U
+  val e_mul1 = e_mul1_dmulh || e_mul1_mul || e_mul1_mulh || e_mul1_muls
+
+  val e_mv2 = io.in.op === e.vmv2.U
+  val e_mvp = io.in.op === e.vmvp.U
+  val e_mv = io.in.op === e.vmv.U || e_mv2 || e_mvp
+
+  val e_padd_add = io.in.op === e.vpadd.U
+  val e_padd_sub = io.in.op === e.vpsub.U
+  val e_padd = e_padd_add || e_padd_sub
+
+  val e_shf_shl = io.in.op === e.vshl.U
+  val e_shf_shr = io.in.op === e.vshr.U
+  val e_shf_shf = io.in.op === e.vshf.U
+  val e_shf_l = e_shf_shl || e_shf_shf
+  val e_shf_r = e_shf_shr || e_shf_shf
+
+  val e_sub_sub  = io.in.op === e.vsub.U
+  val e_sub_subs = io.in.op === e.vsubs.U
+  val e_sub_subw = io.in.op === e.vsubw.U
+  val e_sub_hsub = io.in.op === e.vhsub.U
+  val e_sub = e_sub_sub || e_sub_subs || e_sub_subw || e_sub_hsub
+
+  // f2 side-band modifiers: negative/round/signed qualify the ops above.
+  val e_negative = io.in.f2(0) && e_mul0_dmulh
+  val e_round = io.in.f2(1) && (e_add_hadd || e_sub_hsub || e_mul0_dmulh || e_mul0_mulh || e_shf_shf || e_srans || e_sraqs)
+  val e_signed = !io.in.f2(0) || e_mul0_dmulh
+
+  // mul1 strobes must be a subset of the matching mul0 strobes.
+  assert(!(e_mul1_dmulh && !e_mul0_dmulh))
+  assert(!(e_mul1_mul && !e_mul0_mul))
+  assert(!(e_mul1_mulh && !e_mul0_mulh))
+  assert(!(e_mul1_muls && !e_mul0_muls))
+
+  // ---------------------------------------------------------------------------
+  // Control.
+  // Two-stage result pipeline: *valid0/*addr0 latch the issued op, and
+  // *valid1/*addr1 align with the registered lane results one cycle later.
+  val vdvalid0 = RegInit(false.B)
+  val vdvalid1 = RegInit(false.B)
+  val vevalid0 = RegInit(false.B)
+  val vevalid1 = RegInit(false.B)
+  val wmask = RegInit(false.B)
+  val vdaddr0 = Reg(new AluAddr())
+  val vdaddr1 = Reg(new AluAddr())
+  val veaddr0 = Reg(new AluAddr())
+  val veaddr1 = Reg(new AluAddr())
+  val sz = RegInit(0.U(3.W))
+  val f2 = RegInit(0.U(3.W))
+  val sv = RegInit(0.U(32.W))
+
+  when (io.in.valid) {
+    // Note: sz is source size, not destination as is ISA defined.
+    val nxt_vdvalid = e_dwconv || e_mul0 || e_absd || e_acc || e_add || e_cmp || e_dup || e_log || e_evn || e_max || e_min || e_mv || e_padd || e_rsub || e_sel || e_shf_l || e_shf_r || e_slidevn || e_slidevp || e_srans || e_sraqs || e_sub || e_zip
+    val nxt_vevalid = e_dwconv || e_mul1 || e_mul0_mulw || e_acc || e_add_addw || e_mv2 || e_mvp || e_odd || e_slidehn2 || e_slidehp2 || e_sub_subw || e_zip
+    val nxt_widen = e_acc || e_add_addw || e_mul0_mulw || e_sub_subw
+    vdvalid0 := nxt_vdvalid
+    vevalid0 := nxt_vevalid
+    wmask := e_dwconva // adwconv: suppress result writeback and write hints
+    sz := MuxOR(nxt_vdvalid || nxt_vevalid, Mux(nxt_widen, io.in.sz >> 1.U, io.in.sz))
+    f2 := io.in.f2
+    sv := io.in.sv.data
+  } .elsewhen (vdvalid0 || vevalid0) {
+    vdvalid0 := false.B
+    vevalid0 := false.B
+    wmask := false.B
+    sz := 0.U
+    f2 := 0.U
+    sv := 0.U
+  }
+
+  // Register VAluIntLane results, but mask io.write.valid outputs.
+  vdvalid1 := vdvalid0 && !wmask
+  vevalid1 := vevalid0 && !wmask
+
+  when (io.in.valid) {
+    vdaddr0 := io.in.vd
+    veaddr0 := io.in.ve
+  }
+
+  when (vdvalid0) {
+    vdaddr1 := vdaddr0
+  }
+
+  when (vevalid0) {
+    veaddr1 := veaddr0
+  }
+
+  // ---------------------------------------------------------------------------
+  // Side-bands: registered copies of the f2-derived modifiers.
+  val negative = Reg(Bool())
+  val round = Reg(Bool())
+  val signed = Reg(Bool())
+
+  when (io.in.valid) {
+    negative := e_negative
+    round := e_round
+    signed := e_signed
+  }
+
+  // ---------------------------------------------------------------------------
+  // Operations: one-hot op flags registered at issue; these gate the lane
+  // datapaths below.
+  val absd  = Reg(Bool())
+  val acc   = Reg(Bool())
+  val dup   = Reg(Bool())
+  val max   = Reg(Bool())
+  val min   = Reg(Bool())
+  val srans = Reg(Bool())
+  val sraqs = Reg(Bool())
+
+  val slidevn  = Reg(Bool())
+  val slidevp  = Reg(Bool())
+  val slidehn2 = Reg(Bool())
+  val slidehp2 = Reg(Bool())
+  val sel = Reg(Bool())
+  val evn = Reg(Bool())
+  val odd = Reg(Bool())
+  val zip = Reg(Bool())
+
+  val dwinit = Reg(Bool())
+  val dwconv = Reg(Bool())
+  val dwconvData = Reg(Bool())
+
+  val add      = Reg(Bool())
+  val add_add  = Reg(Bool())
+  val add_adds = Reg(Bool())
+  val add_addw = Reg(Bool())
+  val add_add3 = Reg(Bool())
+  val add_hadd = Reg(Bool())
+
+  val padd     = Reg(Bool())
+  val padd_add = Reg(Bool())
+  val padd_sub = Reg(Bool())
+
+  val rsub      = Reg(Bool())
+  val rsub_rsub = Reg(Bool())
+
+  val sub      = Reg(Bool())
+  val sub_sub  = Reg(Bool())
+  val sub_subs = Reg(Bool())
+  val sub_subw = Reg(Bool())
+  val sub_hsub = Reg(Bool())
+
+  val cmp    = Reg(Bool())
+  val cmp_eq = Reg(Bool())
+  val cmp_ne = Reg(Bool())
+  val cmp_lt = Reg(Bool())
+  val cmp_le = Reg(Bool())
+  val cmp_gt = Reg(Bool())
+  val cmp_ge = Reg(Bool())
+
+  val log      = Reg(Bool())
+  val log_and  = Reg(Bool())
+  val log_or   = Reg(Bool())
+  val log_xor  = Reg(Bool())
+  val log_not  = Reg(Bool())
+  val log_rev  = Reg(Bool())
+  val log_ror  = Reg(Bool())
+  val log_clb  = Reg(Bool())
+  val log_clz  = Reg(Bool())
+  val log_cpop = Reg(Bool())
+
+  val mul0       = Reg(Bool())
+  val mul0_dmulh = Reg(Bool())
+  val mul0_mul   = Reg(Bool())
+  val mul0_mulh  = Reg(Bool())
+  val mul0_muls  = Reg(Bool())
+  val mul0_mulw  = Reg(Bool())
+  val mul0_madd  = Reg(Bool())
+
+  val mul1       = Reg(Bool())
+  val mul1_dmulh = Reg(Bool())
+  val mul1_mul   = Reg(Bool())
+  val mul1_mulh  = Reg(Bool())
+  val mul1_muls  = Reg(Bool())
+
+  val mv  = Reg(Bool())
+  val mv2 = Reg(Bool())
+  val mvp = Reg(Bool())
+
+  val shf_l   = Reg(Bool())
+  val shf_r   = Reg(Bool())
+  val shf_shl = Reg(Bool())
+  val shf_shr = Reg(Bool())
+  val shf_shf = Reg(Bool())
+
+  // One-cycle pulse so the op flags deassert on the cycle after an issue
+  // (the flags otherwise hold their value and avoid toggling).
+  val validClr = RegInit(false.B)
+  validClr := io.in.valid
+
+  when (io.in.valid || validClr) {
+    val valid = io.in.valid
+
+    absd  := valid && e_absd
+    acc   := valid && e_acc
+    dup   := valid && e_dup
+    max   := valid && e_max
+    min   := valid && e_min
+    srans := valid && e_srans
+    sraqs := valid && e_sraqs
+
+    slidevn  := valid && e_slidevn
+    slidevp  := valid && e_slidevp
+    slidehn2 := valid && e_slidehn2
+    slidehp2 := valid && e_slidehp2
+    sel := valid && e_sel
+    evn := valid && e_evn
+    odd := valid && e_odd
+    zip := valid && e_zip
+
+    dwinit := valid && e_dwinit
+    dwconv := valid && e_dwconv
+
+    add      := valid && e_add // unit activation
+    add_add  := valid && e_add_add
+    add_adds := valid && e_add_adds
+    add_addw := valid && e_add_addw
+    add_add3 := valid && e_add_add3
+    add_hadd := valid && e_add_hadd
+
+    padd     := valid && e_padd
+    padd_add := valid && e_padd_add
+    padd_sub := valid && e_padd_sub
+
+    cmp    := valid && (e_cmp || e_absd || e_max || e_min) // unit activation
+    cmp_eq := valid && e_cmp_eq
+    cmp_ne := valid && e_cmp_ne
+    cmp_lt := valid && e_cmp_lt
+    cmp_le := valid && e_cmp_le
+    cmp_gt := valid && e_cmp_gt
+    cmp_ge := valid && e_cmp_ge
+
+    log      := valid && e_log // unit activation
+    log_and  := valid && e_log_and
+    log_or   := valid && e_log_or
+    log_xor  := valid && e_log_xor
+    log_not  := valid && e_log_not
+    log_rev  := valid && e_log_rev
+    log_ror  := valid && e_log_ror
+    log_clb  := valid && e_log_clb
+    log_clz  := valid && e_log_clz
+    log_cpop := valid && e_log_cpop
+
+    mul0       := valid && e_mul0 // unit activation
+    mul0_dmulh := valid && e_mul0_dmulh
+    mul0_mul   := valid && e_mul0_mul
+    mul0_mulh  := valid && e_mul0_mulh
+    mul0_muls  := valid && e_mul0_muls
+    mul0_mulw  := valid && e_mul0_mulw
+    mul0_madd  := valid && e_mul0_madd
+
+    mul1       := valid && e_mul1 // unit activation
+    mul1_dmulh := valid && e_mul1_dmulh
+    mul1_mul   := valid && e_mul1_mul
+    mul1_mulh  := valid && e_mul1_mulh
+    mul1_muls  := valid && e_mul1_muls
+
+    mv  := valid && e_mv
+    mv2 := valid && e_mv2
+    mvp := valid && e_mvp
+
+    rsub      := valid && (e_rsub || e_absd) // unit activation
+    rsub_rsub := valid && e_rsub
+
+    shf_l   := valid && e_shf_l // unit activation
+    shf_r   := valid && e_shf_r // unit activation
+    shf_shl := valid && e_shf_shl
+    shf_shr := valid && e_shf_shr
+    shf_shf := valid && e_shf_shf
+
+    sub      := valid && (e_sub || e_absd)
+    sub_sub  := valid && e_sub_sub
+    sub_subs := valid && e_sub_subs
+    sub_subw := valid && e_sub_subw
+    sub_hsub := valid && e_sub_hsub
+  }
+
+  // Second cycle of ALU pipeline.
+  dwconvData := dwconv
+
+  // ---------------------------------------------------------------------------
+  // ALU segments: one 32-bit datapath lane per 32 bits of vector width.
+  val valu = for (i <- 0 until lanes) yield {
+    Module(new VAluIntLane)
+  }
+
+  // Cross-lane results (slide/sel/evn/odd/zip/dwconv) are computed at full
+  // vector width below and parallel-loaded into the lanes via io.load.
+  val load = Wire(Vec(2, UInt(p.vectorBits.W)))
+
+  for (i <- 0 until lanes) {
+    val msb = 32 * i + 31
+    val lsb = 32 * i
+    valu(i).io.in.vdvalid := vdvalid0
+    valu(i).io.in.vevalid := vevalid0
+    valu(i).io.in.sz := sz
+    for (j <- 0 until 7) {
+      valu(i).io.read(j).data := io.read(j).data(msb, lsb)
+    }
+    for (j <- 0 until 2) {
+      valu(i).io.load(j) := load(j)(msb, lsb)
+    }
+  }
+
+  for (i <- 0 until lanes) {
+    valu(i).io.in.negative := negative
+    valu(i).io.in.round := round
+    valu(i).io.in.signed := signed
+  }
+
+  // Broadcast the registered op flags to every lane.
+  for (i <- 0 until lanes) {
+    valu(i).io.op.absd := absd
+    valu(i).io.op.acc := acc
+    valu(i).io.op.dup := dup
+    valu(i).io.op.max := max
+    valu(i).io.op.min := min
+    valu(i).io.op.mv := mv
+    valu(i).io.op.mv2 := mv2
+    valu(i).io.op.mvp := mvp
+    valu(i).io.op.srans := srans
+    valu(i).io.op.sraqs := sraqs
+
+    valu(i).io.op.dwinit := dwinit
+    valu(i).io.op.dwconv := dwconv
+    valu(i).io.op.dwconvData := dwconvData
+
+    valu(i).io.op.add.en := add
+    valu(i).io.op.add.add := add_add
+    valu(i).io.op.add.adds := add_adds
+    valu(i).io.op.add.addw := add_addw
+    valu(i).io.op.add.add3 := add_add3
+    valu(i).io.op.add.hadd := add_hadd
+
+    valu(i).io.op.cmp.en := cmp
+    valu(i).io.op.cmp.eq := cmp_eq
+    valu(i).io.op.cmp.ne := cmp_ne
+    valu(i).io.op.cmp.lt := cmp_lt
+    valu(i).io.op.cmp.le := cmp_le
+    valu(i).io.op.cmp.gt := cmp_gt
+    valu(i).io.op.cmp.ge := cmp_ge
+
+    valu(i).io.op.log.en := log
+    valu(i).io.op.log.and := log_and
+    valu(i).io.op.log.or := log_or
+    valu(i).io.op.log.xor := log_xor
+    valu(i).io.op.log.not := log_not
+    valu(i).io.op.log.rev := log_rev
+    valu(i).io.op.log.ror := log_ror
+    valu(i).io.op.log.clb := log_clb
+    valu(i).io.op.log.clz := log_clz
+    valu(i).io.op.log.cpop := log_cpop
+
+    valu(i).io.op.mul0.en := mul0
+    valu(i).io.op.mul0.dmulh := mul0_dmulh
+    valu(i).io.op.mul0.mul := mul0_mul
+    valu(i).io.op.mul0.mulh := mul0_mulh
+    valu(i).io.op.mul0.muls := mul0_muls
+    valu(i).io.op.mul0.mulw := mul0_mulw
+    valu(i).io.op.mul0.madd := mul0_madd
+
+    valu(i).io.op.mul1.en := mul1
+    valu(i).io.op.mul1.dmulh := mul1_dmulh
+    valu(i).io.op.mul1.mul := mul1_mul
+    valu(i).io.op.mul1.mulh := mul1_mulh
+    valu(i).io.op.mul1.muls := mul1_muls
+
+    valu(i).io.op.padd.en := padd
+    valu(i).io.op.padd.add := padd_add
+    valu(i).io.op.padd.sub := padd_sub
+
+    valu(i).io.op.rsub.en := rsub
+    valu(i).io.op.rsub.rsub := rsub_rsub
+
+    valu(i).io.op.shf.en.l := shf_l
+    valu(i).io.op.shf.en.r := shf_r
+    valu(i).io.op.shf.shl := shf_shl
+    valu(i).io.op.shf.shr := shf_shr
+    valu(i).io.op.shf.shf := shf_shf
+
+    valu(i).io.op.sub.en := sub
+    valu(i).io.op.sub.sub := sub_sub
+    valu(i).io.op.sub.subs := sub_subs
+    valu(i).io.op.sub.subw := sub_subw
+    valu(i).io.op.sub.hsub := sub_hsub
+  }
+
+  // ---------------------------------------------------------------------------
+  // VSlide.
+  // Slide down: result element i is element (i + sel + 1) of the element-wise
+  // concatenation a:b, with element width 8 << sz.
+  def VSliden(sz: Int, sel: UInt, a: UInt, b: UInt): UInt = {
+    val size = 8 << sz
+    assert(sz == 0 || sz == 1 || sz == 2)
+    assert(size == 8 || size == 16 || size == 32)
+    assert(sel.getWidth == 2)
+
+    val cnt = a.getWidth / size
+    val cnt2 = cnt * 2
+    val in = Wire(Vec(cnt2, UInt(size.W)))
+    val sout1 = Wire(Vec(cnt, UInt(size.W)))
+    val sout2 = Wire(Vec(cnt, UInt(size.W)))
+    val sout3 = Wire(Vec(cnt, UInt(size.W)))
+    val sout4 = Wire(Vec(cnt, UInt(size.W)))
+
+    for (i <- 0 until cnt) {
+      val l = i * size          // lsb
+      val m = l + size - 1      // msb
+      in(i) := a(m,l)
+      in(i + cnt) := b(m,l)
+    }
+
+    for (i <- 0 until cnt) {
+      sout1(i) := in(i + 1)
+      sout2(i) := in(i + 2)
+      sout3(i) := in(i + 3)
+      sout4(i) := in(i + 4)
+    }
+
+    val out = MuxOR(sel === 0.U, sout1.asUInt) |
+              MuxOR(sel === 1.U, sout2.asUInt) |
+              MuxOR(sel === 2.U, sout3.asUInt) |
+              MuxOR(sel === 3.U, sout4.asUInt)
+    assert(out.getWidth == a.getWidth)
+
+    out
+  }
+
+  // Slide up: result element i is element (cnt + i - (sel + 1)) of a:b,
+  // i.e. b shifted up with a's top elements filling the bottom positions.
+  def VSlidep(sz: Int, sel: UInt, a: UInt, b: UInt): UInt = {
+    val size = 8 << sz
+    assert(sz == 0 || sz == 1 || sz == 2)
+    assert(size == 8 || size == 16 || size == 32)
+    assert(sel.getWidth == 2)
+
+    val cnt = a.getWidth / size
+    val cnt2 = cnt * 2
+    val in = Wire(Vec(cnt2, UInt(size.W)))
+    val sout1 = Wire(Vec(cnt, UInt(size.W)))
+    val sout2 = Wire(Vec(cnt, UInt(size.W)))
+    val sout3 = Wire(Vec(cnt, UInt(size.W)))
+    val sout4 = Wire(Vec(cnt, UInt(size.W)))
+
+    for (i <- 0 until cnt) {
+      val l = i * size          // lsb
+      val m = l + size - 1      // msb
+      in(i) := a(m,l)
+      in(i + cnt) := b(m,l)
+    }
+
+    for (i <- 0 until cnt) {
+      sout1(i) := in(i - 1 + cnt)
+      sout2(i) := in(i - 2 + cnt)
+      sout3(i) := in(i - 3 + cnt)
+      sout4(i) := in(i - 4 + cnt)
+    }
+
+    val out = MuxOR(sel === 0.U, sout1.asUInt) |
+              MuxOR(sel === 1.U, sout2.asUInt) |
+              MuxOR(sel === 2.U, sout3.asUInt) |
+              MuxOR(sel === 3.U, sout4.asUInt)
+    assert(out.getWidth == a.getWidth)
+
+    out
+  }
+
+  // Per-size slide results; inputs are gated to zero when the op/size does
+  // not match so the size variants can simply be OR-combined below.
+  val slidenb0 = VSliden(0, f2(1,0), MuxOR(slidevn && sz(0), io.read(0).data), MuxOR(slidevn && sz(0), io.read(1).data))
+  val slidenh0 = VSliden(1, f2(1,0), MuxOR(slidevn && sz(1), io.read(0).data), MuxOR(slidevn && sz(1), io.read(1).data))
+  val slidenw0 = VSliden(2, f2(1,0), MuxOR(slidevn && sz(2), io.read(0).data), MuxOR(slidevn && sz(2), io.read(1).data))
+
+  val slidenb1 = VSliden(0, f2(1,0), MuxOR(slidehn2 && sz(0), io.read(1).data), MuxOR(slidehn2 && sz(0), io.read(2).data))
+  val slidenh1 = VSliden(1, f2(1,0), MuxOR(slidehn2 && sz(1), io.read(1).data), MuxOR(slidehn2 && sz(1), io.read(2).data))
+  val slidenw1 = VSliden(2, f2(1,0), MuxOR(slidehn2 && sz(2), io.read(1).data), MuxOR(slidehn2 && sz(2), io.read(2).data))
+
+  val slidepb0 = VSlidep(0, f2(1,0), MuxOR(slidevp && sz(0), io.read(0).data), MuxOR(slidevp && sz(0), io.read(1).data))
+  val slideph0 = VSlidep(1, f2(1,0), MuxOR(slidevp && sz(1), io.read(0).data), MuxOR(slidevp && sz(1), io.read(1).data))
+  val slidepw0 = VSlidep(2, f2(1,0), MuxOR(slidevp && sz(2), io.read(0).data), MuxOR(slidevp && sz(2), io.read(1).data))
+
+  val slidepb1 = VSlidep(0, f2(1,0), MuxOR(slidehp2 && sz(0), io.read(1).data), MuxOR(slidehp2 && sz(0), io.read(2).data))
+  val slideph1 = VSlidep(1, f2(1,0), MuxOR(slidehp2 && sz(1), io.read(1).data), MuxOR(slidehp2 && sz(1), io.read(2).data))
+  val slidepw1 = VSlidep(2, f2(1,0), MuxOR(slidehp2 && sz(2), io.read(1).data), MuxOR(slidehp2 && sz(2), io.read(2).data))
+
+  val slide0 = slidenb0 | slidenh0 | slidenw0 |
+               slidepb0 | slideph0 | slidepw0
+
+  val slide1 = slidenb1 | slidenh1 | slidenw1 |
+               slidepb1 | slideph1 | slidepw1
+
+  // ---------------------------------------------------------------------------
+  // Select.
+  // Per-element select: the lsb of each a element chooses c (1) over b (0).
+  def VSel(sz: Int, a: UInt, b: UInt, c: UInt): UInt = {
+    val size = 8 << sz
+    assert(sz == 0 || sz == 1 || sz == 2)
+    assert(size == 8 || size == 16 || size == 32)
+
+    val cnt = a.getWidth / size
+    val sout = Wire(Vec(cnt, UInt(size.W)))
+
+    for (i <- 0 until cnt) {
+      val l = i * size          // lsb
+      val m = l + size - 1      // msb
+      sout(i) := Mux(a(l), c(m,l), b(m,l))
+    }
+
+    val out = sout.asUInt
+    assert(out.getWidth == a.getWidth)
+
+    out
+  }
+
+  val selb0 = VSel(0, MuxOR(sel && sz(0), io.read(0).data), MuxOR(sel && sz(0), io.read(1).data), MuxOR(sel && sz(0), io.read(2).data))
+  val selh0 = VSel(1, MuxOR(sel && sz(1), io.read(0).data), MuxOR(sel && sz(1), io.read(1).data), MuxOR(sel && sz(1), io.read(2).data))
+  val selw0 = VSel(2, MuxOR(sel && sz(2), io.read(0).data), MuxOR(sel && sz(2), io.read(1).data), MuxOR(sel && sz(2), io.read(2).data))
+
+  val sel0 = selb0 | selh0 | selw0
+
+  // ---------------------------------------------------------------------------
+  // Even/Odd.
+  // Gather the even (sel=0) or odd (sel=1) indexed elements: low half of the
+  // result comes from a, high half from b.
+  def VEvnOdd(sel: Int, sz: Int, a: UInt, b: UInt): UInt = {
+    val size = 8 << sz
+    assert(sz == 0 || sz == 1 || sz == 2)
+    assert(size == 8 || size == 16 || size == 32)
+    assert(sel == 0 || sel == 1)
+
+    val cnt = a.getWidth / size
+    val h = a.getWidth / 2
+    val evnodd = Wire(Vec(cnt, UInt(size.W)))
+
+    for (i <- 0 until cnt / 2) {
+      val j = i * 2 + sel
+      val l = j * size          // lsb
+      val m = l + size - 1      // msb
+      evnodd(i) := a(m,l)
+    }
+
+    for (i <- cnt / 2 until cnt) {
+      val j = (i - cnt / 2) * 2 + sel
+      val l = j * size          // lsb
+      val m = l + size - 1      // msb
+      evnodd(i) := b(m,l)
+    }
+
+    val out = evnodd.asUInt
+    assert(out.getWidth == a.getWidth)
+
+    out
+  }
+
+  val evnb = VEvnOdd(0, 0, MuxOR(evn && sz(0), io.read(0).data), MuxOR(evn && sz(0), io.read(1).data))
+  val evnh = VEvnOdd(0, 1, MuxOR(evn && sz(1), io.read(0).data), MuxOR(evn && sz(1), io.read(1).data))
+  val evnw = VEvnOdd(0, 2, MuxOR(evn && sz(2), io.read(0).data), MuxOR(evn && sz(2), io.read(1).data))
+  val oddb = VEvnOdd(1, 0, MuxOR(odd && sz(0), io.read(0).data), MuxOR(odd && sz(0), io.read(1).data))
+  val oddh = VEvnOdd(1, 1, MuxOR(odd && sz(1), io.read(0).data), MuxOR(odd && sz(1), io.read(1).data))
+  val oddw = VEvnOdd(1, 2, MuxOR(odd && sz(2), io.read(0).data), MuxOR(odd && sz(2), io.read(1).data))
+
+  val evn0 = evnb | evnh | evnw
+  val odd1 = oddb | oddh | oddw
+
+  // ---------------------------------------------------------------------------
+  // VZip.
+  // Interleave a and b: out0 zips their low halves, out1 their high halves.
+  def VZip(sz: Int, a: UInt, b: UInt): (UInt, UInt) = {
+    val size = 8 << sz
+    assert(sz == 0 || sz == 1 || sz == 2)
+    assert(size == 8 || size == 16 || size == 32)
+
+    val cnt = a.getWidth / size
+    val h = a.getWidth / 2
+    val zip0 = Wire(Vec(cnt, UInt(size.W)))
+    val zip1 = Wire(Vec(cnt, UInt(size.W)))
+
+    for (i <- 0 until cnt) {
+      val j = i / 2
+      val l = j * size          // lsb
+      val m = l + size - 1      // msb
+      if ((i & 1) == 0) {
+        zip0(i) := a(m+0,l+0)
+        zip1(i) := a(m+h,l+h)
+      } else {
+        zip0(i) := b(m+0,l+0)
+        zip1(i) := b(m+h,l+h)
+      }
+    }
+
+    val out0 = zip0.asUInt
+    val out1 = zip1.asUInt
+    assert(out0.getWidth == a.getWidth)
+    assert(out1.getWidth == a.getWidth)
+
+    (out0, out1)
+  }
+
+  val (zipb0, zipb1) = VZip(0, MuxOR(zip && sz(0), io.read(0).data), MuxOR(zip && sz(0), io.read(1).data))
+  val (ziph0, ziph1) = VZip(1, MuxOR(zip && sz(1), io.read(0).data), MuxOR(zip && sz(1), io.read(1).data))
+  val (zipw0, zipw1) = VZip(2, MuxOR(zip && sz(2), io.read(0).data), MuxOR(zip && sz(2), io.read(1).data))
+
+  val zip0 = zipb0 | ziph0 | zipw0
+  val zip1 = zipb1 | ziph1 | zipw1
+
+  // ---------------------------------------------------------------------------
+  // Depthwise: VDot (defined elsewhere) computes the depthwise convolution;
+  // the two 3-register operand banks swap roles between alu0 and alu1.
+  val (dwconv0, dwconv1) =
+    if (aluid == 0) {
+      VDot(aluid, dwconv,
+           VecInit(io.read(0).data, io.read(1).data, io.read(2).data),
+           VecInit(io.read(3).data, io.read(4).data, io.read(5).data), sv)
+    } else {
+      VDot(aluid, dwconv,
+           VecInit(io.read(3).data, io.read(4).data, io.read(5).data),
+           VecInit(io.read(0).data, io.read(1).data, io.read(2).data), sv)
+    }
+
+  // ---------------------------------------------------------------------------
+  // Parallel Load registered VAluIntLane stage.
+  load(0) := evn0 | zip0 | slide0 | dwconv0 | sel0
+  load(1) := odd1 | zip1 | slide1 | dwconv1
+
+  // ---------------------------------------------------------------------------
+  // Outputs: reassemble the per-lane 32-bit results into full-width writes.
+  val vddata = Wire(Vec(lanes, UInt(32.W)))
+  val vedata = Wire(Vec(lanes, UInt(32.W)))
+
+  for (i <- 0 until lanes) {
+    vddata(i) := valu(i).io.write(0).data
+    vedata(i) := valu(i).io.write(1).data
+  }
+
+  io.write(0).valid := vdvalid1
+  io.write(0).addr := vdaddr1.addr
+  io.write(0).data := vddata.asUInt
+
+  io.write(1).valid := vevalid1
+  io.write(1).addr := veaddr1.addr
+  io.write(1).data := vedata.asUInt
+
+  // Write hints fire one cycle ahead of the corresponding io.write.
+  io.whint(0).valid := vdvalid0 && !wmask
+  io.whint(0).addr := vdaddr0.addr
+
+  io.whint(1).valid := vevalid0 && !wmask
+  io.whint(1).addr := veaddr0.addr
+}
+
+class VAluIntLane extends Module {
+ val e = new VEncodeOp()
+
+ val io = IO(new Bundle {
+ val in = Input(new Bundle {
+ val vdvalid = Bool()
+ val vevalid = Bool()
+ val sz = UInt(3.W)
+ val negative = Bool()
+ val round = Bool()
+ val signed = Bool()
+ })
+ val op = Input(new Bundle {
+ val absd = Bool()
+ val acc = Bool()
+ val dup = Bool()
+ val max = Bool()
+ val min = Bool()
+ val mv = Bool()
+ val mv2 = Bool()
+ val mvp = Bool()
+ val srans = Bool()
+ val sraqs = Bool()
+
+ val dwinit = Bool()
+ val dwconv = Bool()
+ val dwconvData = Bool()
+
+ val add = new Bundle {
+ val en = Bool()
+ val add = Bool()
+ val adds = Bool()
+ val addw = Bool()
+ val add3 = Bool()
+ val hadd = Bool()
+ }
+
+ val cmp = new Bundle {
+ val en = Bool()
+ val eq = Bool()
+ val ne = Bool()
+ val lt = Bool()
+ val le = Bool()
+ val gt = Bool()
+ val ge = Bool()
+ }
+
+ val log = new Bundle {
+ val en = Bool()
+ val and = Bool()
+ val or = Bool()
+ val xor = Bool()
+ val not = Bool()
+ val rev = Bool()
+ val ror = Bool()
+ val clb = Bool()
+ val clz = Bool()
+ val cpop = Bool()
+ }
+
+ val mul0 = new Bundle {
+ val en = Bool()
+ val dmulh = Bool()
+ val mul = Bool()
+ val mulh = Bool()
+ val muls = Bool()
+ val mulw = Bool()
+ val madd = Bool()
+ }
+
+ val mul1 = new Bundle {
+ val en = Bool()
+ val dmulh = Bool()
+ val mul = Bool()
+ val mulh = Bool()
+ val muls = Bool()
+ }
+
+ val padd = new Bundle {
+ val en = Bool()
+ val add = Bool()
+ val sub = Bool()
+ }
+
+ val rsub = new Bundle {
+ val en = Bool()
+ val rsub = Bool()
+ }
+
+ val shf = new Bundle {
+ val en = new Bundle {
+ val l = Bool() // left
+ val r = Bool() // right
+ }
+ val shl = Bool()
+ val shr = Bool()
+ val shf = Bool()
+ }
+
+ val sub = new Bundle {
+ val en = Bool()
+ val sub = Bool()
+ val subs = Bool()
+ val subw = Bool()
+ val hsub = Bool()
+ }
+ })
+ val read = Vec(7, Input(new Bundle {
+ val data = UInt(32.W)
+ }))
+ val write = Vec(2, Output(new Bundle {
+ val data = UInt(32.W)
+ }))
+ val load = Vec(2, Input(UInt(32.W))) // parallel load data
+ })
+
+ def VAlu(sz: Int, a: UInt, b: UInt, c: UInt, d: UInt, e: UInt, f: UInt): (UInt, UInt, UInt, UInt, UInt, UInt) = {
+ // Note: sz is source size, not destination as is ISA defined.
+ val size = 8 << sz
+ assert(sz == 0 || sz == 1 || sz == 2)
+ assert(size == 8 || size == 16 || size == 32)
+ assert(a.getWidth == b.getWidth)
+ assert(a.getWidth == c.getWidth)
+ assert(a.getWidth == 32)
+ val cnt = a.getWidth / size
+ val alu0 = Wire(Vec(cnt, UInt(size.W)))
+ val alu1 = Wire(Vec(cnt, UInt(size.W)))
+ val aluw0 = Wire(Vec(cnt / 2, UInt((2 * size).W)))
+ val aluw1 = Wire(Vec(cnt / 2, UInt((2 * size).W)))
+ val rnd0 = Wire(Vec(cnt, UInt(size.W)))
+ val rnd1 = Wire(Vec(cnt, UInt(size.W)))
+
+ // -------------------------------------------------------------------------
+ // Controls.
+ val negative = io.in.negative
+ val round = io.in.round
+ val signed = io.in.signed
+
+ // -------------------------------------------------------------------------
+ // Datapath.
+ val aw = a
+ val bw = b
+ val cw = c
+ val dw = d
+ val ew = e
+ val fw = f
+
+ val acc_a = MuxOR(io.op.acc, aw)
+ val acc_b = MuxOR(io.op.acc, bw)
+ val acc_c = MuxOR(io.op.acc, cw)
+
+ val add_a = MuxOR(io.op.add.en, aw)
+ val add_b = MuxOR(io.op.add.en, bw)
+ val add_r = io.op.add.hadd && round
+
+ val cmp_a = MuxOR(io.op.cmp.en, aw)
+ val cmp_b = MuxOR(io.op.cmp.en, bw)
+
+ val log_a = MuxOR(io.op.log.en, aw)
+ val log_b = MuxOR(io.op.log.en, bw)
+
+ val mul0_a = MuxOR(io.op.mul0.en, aw)
+ val mul0_b = MuxOR(io.op.mul0.en, bw)
+ val mul1_a = MuxOR(io.op.mul1.en, cw)
+ val mul1_b = MuxOR(io.op.mul1.en, bw)
+
+ val padd_a = MuxOR(io.op.padd.en, aw)
+
+ val rsub_a = MuxOR(io.op.rsub.en, aw)
+ val rsub_b = MuxOR(io.op.rsub.en, bw)
+
+ val shl_a = MuxOR(io.op.shf.en.l, aw)
+ val shl_b = MuxOR(io.op.shf.en.l, bw)
+ val shr_a = MuxOR(io.op.shf.en.r, aw)
+ val shr_b = MuxOR(io.op.shf.en.r, bw)
+
+ val srans_a = MuxOR(io.op.srans, aw)
+ val srans_b = MuxOR(io.op.srans, bw)
+ val srans_c = MuxOR(io.op.srans, cw)
+
+ val sraqs_a = MuxOR(io.op.sraqs, aw)
+ val sraqs_b = MuxOR(io.op.sraqs, bw)
+ val sraqs_c = MuxOR(io.op.sraqs, cw)
+ val sraqs_d = MuxOR(io.op.sraqs, dw)
+ val sraqs_f = MuxOR(io.op.sraqs, fw)
+
+ val sub_a = MuxOR(io.op.sub.en, aw)
+ val sub_b = MuxOR(io.op.sub.en, bw)
+ val sub_r = io.op.sub.hsub && round
+
+ // -------------------------------------------------------------------------
+ // Functions.
+ for (i <- 0 until cnt) {
+ val l = i * size // lsb
+ val m = l + size - 1 // msb
+ val ln = (i / 2) * 2 * size // lsb narrowing
+ val mn = ln + 2 * size - 1 // msb narrowing
+ val lq = (i / 4) * 4 * size // lsb narrowing
+ val mq = lq + 4 * size - 1 // msb narrowing
+ val mshamt = l + log2Ceil(size) - 1
+
+ // -----------------------------------------------------------------------
+ // Arithmetic.
+ val add_sa = add_a(m) && signed
+ val add_sb = add_b(m) && signed
+ val adder = (Cat(add_sa, add_a(m,l)).asSInt +& Cat(add_sb, add_b(m,l)).asSInt).asUInt + add_r
+ val sataddmsb = adder(size, size - 1)
+ val sataddsel =
+ Cat( signed && sataddmsb === 2.U, // vadd.s -ve
+ signed && sataddmsb === 1.U, // vadd.s +ve
+ !signed && sataddmsb(1)) // vadd.su +ve
+ assert(PopCount(sataddsel) <= 1.U)
+
+ val sub_sa = sub_a(m) && signed
+ val sub_sb = sub_b(m) && signed
+ val subtr = (Cat(sub_sa, sub_a(m,l)).asSInt -& Cat(sub_sb, sub_b(m,l)).asSInt).asUInt + sub_r
+ val satsubmsb = subtr(size, size - 1)
+ val satsubsel =
+ Cat( signed && satsubmsb === 2.U, // vsub.s -ve
+ signed && satsubmsb === 1.U, // vsub.s +ve
+ !signed && satsubmsb(1)) // vsub.su +ve
+ assert(PopCount(satsubsel) <= 1.U)
+
+ val rsubtr = rsub_b(m,l) - rsub_a(m,l)
+
+ val xeq = cmp_a(m,l) === cmp_b(m,l)
+ val xne = cmp_a(m,l) =/= cmp_b(m,l)
+ val slt = cmp_a(m,l).asSInt() < cmp_b(m,l).asSInt()
+ val ult = cmp_a(m,l) < cmp_b(m,l)
+ val sle = slt || xeq
+ val ule = ult || xeq
+
+ val sult = Mux(signed, slt, ult)
+
+ def Shift(a: UInt, b: UInt, sln: UInt, sra: UInt, srl: UInt): UInt = {
+ assert(a.getWidth == size)
+ assert(b.getWidth == size)
+ assert(sln.getWidth == (2 * size - 1))
+ assert(sra.getWidth == size)
+ assert(srl.getWidth == size)
+ val slnsz = sln(size - 1, 0)
+ val input_neg = a(size - 1)
+ val input_zero = a === 0.U
+ val shamt_neg = b(size - 1)
+ val rs = Wire(UInt(size.W))
+ val ru = Wire(UInt(size.W))
+ if (true) {
+ val shamt_negsat = b.asSInt <= (-(size - 1)).S
+ val shamt_possat = b.asSInt >= (size - 1).S
+ val signb = ~0.U(size.W) >> (b(log2Ceil(size) - 1, 0) - 1.U)
+ val possat = shamt_neg && !input_neg && (shamt_negsat || (sln(2 * size - 2, size - 1) =/= 0.U )) && !input_zero
+ val negsat = shamt_neg && input_neg && (shamt_negsat || (sln(2 * size - 2, size - 1) =/= signb))
+ assert(!(possat && negsat))
+ val posmax = Cat(0.U(1.W), ~0.U((size - 1).W))
+ val negmin = Cat(1.U(1.W), 0.U((size - 1).W))
+ assert(posmax.getWidth == size)
+ assert(negmin.getWidth == size)
+
+ rs := MuxOR(!shamt_neg && !shamt_possat, sra) |
+ MuxOR(!shamt_neg && shamt_possat && input_neg, ~0.U(size.W)) |
+ MuxOR( shamt_neg && !possat && !negsat, slnsz) |
+ MuxOR( shamt_neg && possat, posmax) |
+ MuxOR( shamt_neg && negsat, negmin)
+ }
+ if (true) {
+ val shamt_negsat = b.asSInt <= -size.S
+ val shamt_possat = b.asSInt >= size.S
+ val possat = shamt_neg && (shamt_negsat || (sln(2 * size - 2, size) =/= 0.U)) && !input_zero
+ val posmax = ~0.U(size.W)
+ assert(posmax.getWidth == size)
+
+ ru := MuxOR(!shamt_neg && !shamt_possat, srl) |
+ MuxOR( shamt_neg && !possat, slnsz) |
+ MuxOR( shamt_neg && possat, posmax)
+ }
+ Mux(signed, rs, ru)
+ }
+
+ // Compute the 1-bit round increment for the combined shift (shf) path:
+ // the bit shifted out just below the result LSB, qualified by the rounding
+ // mode and by shift-amount saturation.
+ def Round(a: UInt, b: UInt): UInt = {
+ assert(a.getWidth == size)
+ assert(b.getWidth == size)
+ val input_neg = a(size - 1)
+ val shamt_neg = b(size - 1)
+ val shamt_zero = b === 0.U
+ // Cat places the MSB at index 0 so a shift amount of `size` wraps to the sign bit.
+ val rbit = Cat(a(size - 2, 0), a(size - 1))(b(log2Ceil(size) - 1, 0)) // shf: idx[8] == idx[0]
+ val shamt_possat = Mux(signed, b.asSInt >= size.S, b.asSInt > size.S)
+ val r = MuxOR(round && !shamt_possat && !shamt_neg && !shamt_zero, rbit) |
+ MuxOR(round && shamt_possat && input_neg && signed, 1.U)
+ assert(r.getWidth == 1)
+ r
+ }
+
+ // Shared shifter datapath terms for this lane:
+ //   shl     left shift truncated to lane width
+ //   sln     left shift by (size - shamt), kept (2*size-1) wide for Shift()'s
+ //           saturation checks
+ //   srl/sra logical / arithmetic right shift (srs supplies the sign-fill mask)
+ val shl = (shl_a(m,l) << shl_b(mshamt, l))(size - 1, 0)
+ val sln = (shl_a(m,l) << (size.U - shl_b(mshamt, l)))(2 * size - 2, 0)
+ val srl = shr_a(m,l) >> shr_b(mshamt, l)
+ val srs = MuxOR(shr_a(m), ((~0.U(size.W)) << ((size - 1).U - shr_b(mshamt, l)))(size - 1, 0))
+ val sra = srs | srl
+ val shf = Shift(shl_a(m,l), shl_b(m,l), sln, sra, srl)
+ val shr = Mux(signed, sra, srl)
+ assert(shl.getWidth == size)
+ assert(sln.getWidth == (2 * size - 1))
+ assert(sra.getWidth == size)
+ assert(srl.getWidth == size)
+ assert(srs.getWidth == size)
+ assert(shf.getWidth == size)
+
+ val shf_rnd = Round(shl_a(m,l), shl_b(m,l))
+ assert(shf_rnd.getWidth == 1)
+
+ // Saturating arithmetic right shift with optional rounding over an s-times
+ // wide accumulator (s = 2 or 4), narrowed to one `size`-bit lane. The result
+ // is clamped to the unsigned or signed lane range depending on `signed`.
+ def Srans(s: Int, a: UInt, b: UInt): UInt = {
+ assert(s == 2 || s == 4)
+ assert(a.getWidth == size * s)
+ assert(b.getWidth == size)
+
+ val shamt = b(log2Ceil(s * size) - 1, 0)
+ val srl = a >> shamt
+ val srs = MuxOR(a(s * size - 1), ((~0.U((s * size).W)) << ((s * size - 1).U - shamt))(s * size - 1, 0))
+ val sra = srs | srl
+ assert(srl.getWidth == (s * size))
+ assert(srs.getWidth == (s * size))
+ // Bit shifted out just below the result LSB, used for round-to-nearest.
+ val rbit = Cat(a(s * size - 2, 0), 0.U(1.W))(shamt)
+ assert(rbit.getWidth == 1)
+
+ // Lane-range saturation bounds, widened to s*size bits for comparison.
+ // (1 << size) is only elaborated for size 8/16; callers gate out the
+ // 32-bit word path, which would overflow a Scala Int here.
+ val umax = ((1 << size) - 1).S((s * size).W)
+ val umin = 0.S((s * size).W)
+ val smax = ((1 << (size - 1)) - 1).S((s * size).W)
+ val smin = -(1 << (size - 1)).S((s * size).W)
+ val rshf = Mux(round && rbit, sra + 1.U, sra)
+
+ // Classify the rounded result against the representable lane range.
+ val is_umax = !signed && (rshf.asSInt > umax)
+ val is_umin = !signed && (rshf.asSInt < umin)
+ val is_smax = signed && (rshf.asSInt > smax)
+ val is_smin = signed && (rshf.asSInt < smin)
+ val is_norm = !(is_umax || is_umin || is_smax || is_smin)
+ assert(PopCount(Cat(is_umax, is_umin, is_smax, is_smin, is_norm)) <= 1.U)
+
+ val r = MuxOR(is_umax, umax.asUInt(size - 1, 0)) |
+ MuxOR(is_umin, umin.asUInt(size - 1, 0)) |
+ MuxOR(is_smax, smax.asUInt(size - 1, 0)) |
+ MuxOR(is_smin, smin.asUInt(size - 1, 0)) |
+ MuxOR(is_norm, rshf(size - 1, 0))
+ assert(r.getWidth == size)
+ r
+ }
+
+ // Bit-reverse within a lane, built from log2(size) conditional swap stages:
+ // s(k) enables the stage that swaps adjacent 2^k-bit groups. Enabling all
+ // stages yields a full bit reversal; subsets give partial reversals.
+ def Rev(a: UInt, s: UInt): UInt = {
+ if (size == 32) {
+ val b = Mux(!s(0), a, Cat(a(30), a(31), a(28), a(29), a(26), a(27), a(24), a(25),
+ a(22), a(23), a(20), a(21), a(18), a(19), a(16), a(17),
+ a(14), a(15), a(12), a(13), a(10), a(11), a( 8), a( 9),
+ a( 6), a( 7), a( 4), a( 5), a( 2), a( 3), a( 0), a( 1)))
+ val c = Mux(!s(1), b, Cat(b(29,28), b(31,30), b(25,24), b(27,26),
+ b(21,20), b(23,22), b(17,16), b(19,18),
+ b(13,12), b(15,14), b( 9, 8), b(11,10),
+ b( 5, 4), b( 7, 6), b( 1, 0), b( 3, 2)))
+ val d = Mux(!s(2), c, Cat(c(27,24), c(31,28), c(19,16), c(23,20),
+ c(11, 8), c(15,12), c( 3, 0), c( 7, 4)))
+ val e = Mux(!s(3), d, Cat(d(23,16), d(31,24), d( 7, 0), d(15, 8)))
+ val f = Mux(!s(4), e, Cat(e(15, 0), e(31,16)))
+ assert(a.getWidth == 32)
+ assert(b.getWidth == 32)
+ assert(c.getWidth == 32)
+ assert(d.getWidth == 32)
+ assert(e.getWidth == 32)
+ assert(f.getWidth == 32)
+ f
+ } else if (size == 16) {
+ val b = Mux(!s(0), a, Cat(a(14), a(15), a(12), a(13), a(10), a(11), a( 8), a( 9),
+ a( 6), a( 7), a( 4), a( 5), a( 2), a( 3), a( 0), a( 1)))
+ val c = Mux(!s(1), b, Cat(b(13,12), b(15,14), b( 9, 8), b(11,10),
+ b( 5, 4), b( 7, 6), b( 1, 0), b( 3, 2)))
+ val d = Mux(!s(2), c, Cat(c(11, 8), c(15,12), c( 3, 0), c( 7, 4)))
+ val e = Mux(!s(3), d, Cat(d( 7, 0), d(15, 8)))
+ assert(a.getWidth == 16)
+ assert(b.getWidth == 16)
+ assert(c.getWidth == 16)
+ assert(d.getWidth == 16)
+ assert(e.getWidth == 16)
+ e
+ } else {
+ // size == 8 (only 8/16/32-bit lanes are elaborated).
+ val b = Mux(!s(0), a, Cat(a(6), a(7), a(4), a(5), a(2), a(3), a(0), a(1)))
+ val c = Mux(!s(1), b, Cat(b(5, 4), b(7, 6), b(1, 0), b( 3, 2)))
+ val d = Mux(!s(2), c, Cat(c(3, 0), c(7, 4)))
+ assert(a.getWidth == 8)
+ assert(b.getWidth == 8)
+ assert(c.getWidth == 8)
+ assert(d.getWidth == 8)
+ d
+ }
+ }
+
+ // Rotate right by s, decomposed into power-of-two rotate stages: s(k)
+ // enables a rotate-right by 2^k, so the composed rotation equals s.
+ def Ror(a: UInt, s: UInt): UInt = {
+ if (size == 32) {
+ val b = Mux(!s(0), a, Cat(a(0), a(31,1)))
+ val c = Mux(!s(1), b, Cat(b(1,0), b(31,2)))
+ val d = Mux(!s(2), c, Cat(c(3,0), c(31,4)))
+ val e = Mux(!s(3), d, Cat(d(7,0), d(31,8)))
+ val f = Mux(!s(4), e, Cat(e(15,0), e(31,16)))
+ assert(a.getWidth == 32)
+ assert(b.getWidth == 32)
+ assert(c.getWidth == 32)
+ assert(d.getWidth == 32)
+ assert(e.getWidth == 32)
+ assert(f.getWidth == 32)
+ f
+ } else if (size == 16) {
+ val b = Mux(!s(0), a, Cat(a(0), a(15,1)))
+ val c = Mux(!s(1), b, Cat(b(1,0), b(15,2)))
+ val d = Mux(!s(2), c, Cat(c(3,0), c(15,4)))
+ val e = Mux(!s(3), d, Cat(d(7,0), d(15,8)))
+ assert(a.getWidth == 16)
+ assert(b.getWidth == 16)
+ assert(c.getWidth == 16)
+ assert(d.getWidth == 16)
+ assert(e.getWidth == 16)
+ e
+ } else {
+ // size == 8.
+ val b = Mux(!s(0), a, Cat(a(0), a(7,1)))
+ val c = Mux(!s(1), b, Cat(b(1,0), b(7,2)))
+ val d = Mux(!s(2), c, Cat(c(3,0), c(7,4)))
+ assert(a.getWidth == 8)
+ assert(b.getWidth == 8)
+ assert(c.getWidth == 8)
+ assert(d.getWidth == 8)
+ d
+ }
+ }
+
+ // Multiplier operands, widened by one sign bit when `signed` so a single
+ // signed multiply serves both signed and unsigned modes.
+ val mul0_as = Cat(signed && mul0_a(m), mul0_a(m,l))
+ val mul0_bs = Cat(signed && mul0_b(m), mul0_b(m,l))
+ // True sign of the product (negative), ignoring zero operands.
+ val mul0_sign = mul0_a(m) =/= mul0_b(m) && mul0_a(m,l) =/= 0.U && mul0_b(m,l) =/= 0.U
+ val prod0 = (mul0_as.asSInt * mul0_bs.asSInt).asUInt
+ val prodh0 = prod0(2 * size - 1, size)
+ val proddh0 = prod0(2 * size - 2, size - 1)
+
+ val mul1_as = Cat(signed && mul1_a(m), mul1_a(m,l))
+ val mul1_bs = Cat(signed && mul1_b(m), mul1_b(m,l))
+ val mul1_sign = mul1_a(m) =/= mul1_b(m) && mul1_a(m,l) =/= 0.U && mul1_b(m,l) =/= 0.U
+ val prod1 = (mul1_as.asSInt * mul1_bs.asSInt).asUInt
+ val prodh1 = prod1(2 * size - 1, size)
+ val proddh1 = prod1(2 * size - 2, size - 1)
+
+ // Saturation classification for the narrowing saturating multiply (muls):
+ // the result overflows the lane when the discarded high half (plus sign bit
+ // for signed) is not a pure sign extension.
+ val muls0_umax = !signed && prodh0 =/= 0.U
+ val muls0_smax = signed && !mul0_sign && ( prod0(size - 1) || prodh0 =/= 0.U(size.W))
+ val muls0_smin = signed && mul0_sign && (!prod0(size - 1) || prodh0 =/= ~0.U(size.W))
+ val muls0_base = !(muls0_umax || muls0_smax || muls0_smin)
+ assert(PopCount(Cat(muls0_umax, muls0_smax, muls0_smin, muls0_base)) <= 1.U)
+
+ val muls1_umax = !signed && prodh1 =/= 0.U
+ val muls1_smax = signed && !mul1_sign && ( prod1(size - 1) || prodh1 =/= 0.U(size.W))
+ val muls1_smin = signed && mul1_sign && (!prod1(size - 1) || prodh1 =/= ~0.U(size.W))
+ val muls1_base = !(muls1_umax || muls1_smax || muls1_smin)
+ assert(PopCount(Cat(muls1_umax, muls1_smax, muls1_smin, muls1_base)) <= 1.U)
+
+ val maxneg = Cat(1.U(1.W), 0.U((size - 1).W)) // 0x80...
+
+ // dmulh (doubling multiply-high) overflows only for maxneg * maxneg.
+ val dmulh0_possat = mul0_a(m,l) === maxneg && mul0_b(m,l) === maxneg
+
+ val dmulh1_possat = mul1_a(m,l) === maxneg && mul1_b(m,l) === maxneg
+
+ val dmulh0 = MuxOR(!dmulh0_possat, proddh0) |
+ MuxOR(dmulh0_possat, Cat(0.U(1.W), ~0.U((size - 1).W))) // 0x7f...
+
+ val dmulh1 = MuxOR(!dmulh1_possat, proddh1) |
+ MuxOR(dmulh1_possat, Cat(0.U(1.W), ~0.U((size - 1).W))) // 0x7f...
+
+ val mulh0 = prodh0
+ val mulh1 = prodh1
+
+ val muls0 = MuxOR(muls0_umax, ~0.U(size.W)) |
+ MuxOR(muls0_smax, ~0.U((size - 1).W)) |
+ MuxOR(muls0_smin, Cat(1.U(1.W), 0.U((size - 1).W))) |
+ MuxOR(muls0_base, prod0(size - 1, 0))
+
+ val muls1 = MuxOR(muls1_umax, ~0.U(size.W)) |
+ MuxOR(muls1_smax, ~0.U((size - 1).W)) |
+ MuxOR(muls1_smin, Cat(1.U(1.W), 0.U((size - 1).W))) |
+ MuxOR(muls1_base, prod1(size - 1, 0))
+
+ // Rounding terms, accumulated separately (rnd0/rnd1) and folded in later.
+ val dmulh0_rnd = MuxOR(round && io.op.mul0.dmulh && io.in.sz(sz) && !dmulh0_possat,
+ Mux(negative && mul0_sign,
+ MuxOR(!prod0(size - 2), ~0.U(size.W)), // -1
+ MuxOR( prod0(size - 2), 1.U(size.W)))) // +1
+
+ val dmulh1_rnd = MuxOR(round && io.op.mul1.dmulh && io.in.sz(sz) && !dmulh1_possat,
+ Mux(negative && mul1_sign,
+ MuxOR(!prod1(size - 2), ~0.U(size.W)), // -1
+ MuxOR( prod1(size - 2), 1.U(size.W)))) // +1
+
+ val mulh0_rnd = round && io.op.mul0.mulh && prod0(size - 1)
+ val mulh1_rnd = round && io.op.mul1.mulh && prod1(size - 1)
+
+ // -----------------------------------------------------------------------
+ // Operations. Each op produces zero when its enable is low (MuxOR), so all
+ // results can be OR-merged into the lane outputs below.
+ val absd = MuxOR(io.op.absd, Mux(sult, rsubtr, subtr(size - 1, 0)))
+ assert(absd.getWidth == size)
+
+ // Widening accumulate: byte/halfword lanes only; even/odd lanes draw from
+ // different accumulator source registers.
+ val acc = if (sz == 0 || sz == 1) { // size / 2
+ if ((i & 1) == 0) {
+ acc_a(mn,ln) + SignExt(Cat(signed & acc_b(m), acc_b(m,l)), 2 * size)
+ } else {
+ acc_c(mn,ln) + SignExt(Cat(signed & acc_b(m), acc_b(m,l)), 2 * size)
+ }
+ } else {
+ 0.U((2 * size).W)
+ }
+ assert(acc.getWidth == (2 * size))
+
+ // Saturating/plain/halving add: sataddsel one-hot picks the clamp value.
+ val add = MuxOR(sataddsel(2) && io.op.add.adds, Cat(1.U(1.W), 0.U((size - 1).W))) |
+ MuxOR(sataddsel(1) && io.op.add.adds, ~0.U((size - 1).W)) |
+ MuxOR(sataddsel(0) && io.op.add.adds, ~0.U(size.W)) |
+ MuxOR(sataddsel === 0.U && io.op.add.adds || io.op.add.add || io.op.add.add3, adder(size - 1, 0)) |
+ MuxOR(io.op.add.hadd, adder(size, 1))
+
+ val addw = MuxOR(io.op.add.addw, SignExt(adder, 2 * size))
+ assert(addw.getWidth == (2 * size))
+
+ val dup = MuxOR(io.op.dup, io.read(1).data(m,l))
+
+ val max = MuxOR(io.op.max, Mux(sult, cmp_b(m,l), cmp_a(m,l)))
+ val min = MuxOR(io.op.min, Mux(sult, cmp_a(m,l), cmp_b(m,l)))
+
+ val mul0 = MuxOR(io.op.mul0.mul || io.op.mul0.madd, prod0(size - 1, 0)) |
+ MuxOR(io.op.mul0.dmulh, dmulh0) |
+ MuxOR(io.op.mul0.mulh, mulh0) |
+ MuxOR(io.op.mul0.muls, muls0)
+
+ val mul1 = MuxOR(io.op.mul1.mul, prod1(size - 1, 0)) |
+ MuxOR(io.op.mul1.dmulh, dmulh1) |
+ MuxOR(io.op.mul1.mulh, mulh1) |
+ MuxOR(io.op.mul1.muls, muls1)
+
+ val mulw = MuxOR(io.op.mul0.mulw, prod0(2 * size - 1, 0))
+
+ // Pairwise add/sub of the two half-width elements within this lane
+ // (halfword/word lanes only).
+ val padd =
+ if (sz == 1 || sz == 2) {
+ val p0 = i * size
+ val p1 = p0 + size / 2 - 1
+ val p2 = p1 + 1
+ val p3 = p0 + size - 1
+ val a = Cat(signed && padd_a(p1), padd_a(p1,p0))
+ val b = Cat(signed && padd_a(p3), padd_a(p3,p2))
+ val add = MuxOR(io.op.padd.add, SignExt((a.asSInt +& b.asSInt).asUInt, size))
+ val sub = MuxOR(io.op.padd.sub, SignExt((a.asSInt -& b.asSInt).asUInt, size))
+ assert(add.getWidth == size)
+ assert(sub.getWidth == size)
+ add | sub
+ } else {
+ 0.U(size.W)
+ }
+
+ val rsub = MuxOR(io.op.rsub.rsub, rsubtr)
+
+ // Narrowing saturating shifts: srans reads a double-wide source (lane
+ // parity selects the register), sraqs a quad-wide source (lane quad
+ // selects among four registers). Elaborated only for sub-word lanes.
+ val srans = if (sz == 0 || sz == 1) { // size / 2
+ if ((i & 1) == 0) {
+ Srans(2, srans_a(mn,ln), srans_b(m,l))
+ } else {
+ Srans(2, srans_c(mn,ln), srans_b(m,l))
+ }
+ } else {
+ 0.U(size.W)
+ }
+
+ val sraqs = if (sz == 0) { // size / 4
+ if ((i & 3) == 0) {
+ Srans(4, sraqs_a(mq,lq), sraqs_b(m,l))
+ } else if ((i & 3) == 1) {
+ Srans(4, sraqs_d(mq,lq), sraqs_b(m,l))
+ } else if ((i & 3) == 2) {
+ Srans(4, sraqs_c(mq,lq), sraqs_b(m,l))
+ } else {
+ Srans(4, sraqs_f(mq,lq), sraqs_b(m,l))
+ }
+ } else {
+ 0.U(size.W)
+ }
+
+ // Saturating/plain/halving subtract, mirroring `add` above.
+ val sub = MuxOR(satsubsel(2) && io.op.sub.subs, Cat(1.U(1.W), 0.U((size - 1).W))) |
+ MuxOR(satsubsel(1) && io.op.sub.subs, ~0.U((size - 1).W)) |
+ MuxOR(satsubsel(0) && io.op.sub.subs, ~0.U(size.W)) |
+ MuxOR(satsubsel === 0.U && io.op.sub.subs || io.op.sub.sub, subtr(size - 1, 0)) |
+ MuxOR(io.op.sub.hsub, subtr(size, 1))
+
+ val subw = MuxOR(io.op.sub.subw, SignExt(subtr, 2 * size))
+ assert(subw.getWidth == (2 * size))
+
+ // One comparison flag per lane, gated by the active element size.
+ val cmp = io.in.sz(sz) &&
+ (MuxOR(io.op.cmp.eq, xeq) |
+ MuxOR(io.op.cmp.ne, xne) |
+ MuxOR(io.op.cmp.lt && signed, slt) |
+ MuxOR(io.op.cmp.lt && !signed, ult) |
+ MuxOR(io.op.cmp.le && signed, sle) |
+ MuxOR(io.op.cmp.le && !signed, ule) |
+ MuxOR(io.op.cmp.gt && signed, !sle) |
+ MuxOR(io.op.cmp.gt && !signed, !ule) |
+ MuxOR(io.op.cmp.ge && signed, !slt) |
+ MuxOR(io.op.cmp.ge && !signed, !ult))
+ assert(cmp.getWidth == 1)
+
+ // Logical / bit-manipulation ops, OR-merged under one-hot op enables.
+ val log =
+ MuxOR(io.op.log.and, log_a(m,l) & log_b(m,l)) |
+ MuxOR(io.op.log.or, log_a(m,l) | log_b(m,l)) |
+ MuxOR(io.op.log.xor, log_a(m,l) ^ log_b(m,l)) |
+ MuxOR(io.op.log.not, MuxOR(io.in.sz(sz), ~log_a(m,l))) |
+ MuxOR(io.op.log.rev, Rev(log_a(m,l), log_b(m,l))) |
+ MuxOR(io.op.log.ror, MuxOR(io.in.sz(sz), Ror(log_a(m,l), log_b(m,l)))) |
+ MuxOR(io.op.log.clb, MuxOR(io.in.sz(sz), Clb(log_a(m,l)))) |
+ MuxOR(io.op.log.clz, MuxOR(io.in.sz(sz), Clz(log_a(m,l)))) |
+ MuxOR(io.op.log.cpop, PopCount(log_a(m,l)))
+ assert(log.getWidth == size)
+
+ val shift =
+ MuxOR(io.op.shf.shl, shl) |
+ MuxOR(io.op.shf.shr, shr) |
+ MuxOR(io.op.shf.shf, shf)
+ assert(shf.getWidth == size) // NOTE(review): re-asserts shf; likely intended shift.getWidth == size — confirm.
+
+ // At most one op class may produce a nonzero lane result, which makes the
+ // OR-merge below equivalent to a mux.
+ val alu_oh = Cat(absd =/= 0.U,
+ add =/= 0.U,
+ cmp =/= 0.U,
+ dup =/= 0.U,
+ log =/= 0.U,
+ max =/= 0.U,
+ min =/= 0.U,
+ mul0 =/= 0.U,
+ padd =/= 0.U,
+ rsub =/= 0.U,
+ shift =/= 0.U,
+ srans =/= 0.U,
+ sraqs =/= 0.U,
+ sub =/= 0.U)
+
+ assert(PopCount(alu_oh) <= 1.U)
+
+ alu0(i) := mul0 | absd | add | cmp | dup | log | max | min | padd | rsub | shift | srans | sraqs | sub |
+ MuxOR(io.op.mv, aw(m,l))
+
+ alu1(i) := mul1 |
+ MuxOR(io.op.mvp, bw(m,l)) |
+ MuxOR(io.op.mv2, cw(m,l))
+
+ rnd0(i) := dmulh0_rnd | mulh0_rnd | shf_rnd
+ rnd1(i) := dmulh1_rnd | mulh1_rnd
+
+ // Double-wide results pair adjacent lanes into even/odd result banks.
+ if (sz < 2) {
+ if ((i & 1) == 0) {
+ aluw0(i / 2) := acc | addw | mulw | subw
+ } else {
+ aluw1(i / 2) := acc | addw | mulw | subw
+ }
+ }
+ }
+
+ // Flatten the per-lane vectors back to full vector width and return all
+ // six result buses (alu0/alu1, rounding terms, and the double-wide banks).
+ val out_alu0 = alu0.asUInt
+ val out_alu1 = alu1.asUInt
+ val out_rnd0 = rnd0.asUInt
+ val out_rnd1 = rnd1.asUInt
+ val out_aluw0 = aluw0.asUInt
+ val out_aluw1 = aluw1.asUInt
+ assert(out_alu0.getWidth == a.getWidth)
+ assert(out_alu1.getWidth == a.getWidth)
+ assert(out_rnd0.getWidth == a.getWidth)
+ if (sz < 2) {
+ assert(out_aluw0.getWidth == a.getWidth)
+ assert(out_aluw1.getWidth == a.getWidth)
+ }
+
+ (out_alu0, out_alu1, out_rnd0, out_rnd1, out_aluw0, out_aluw1)
+
+ // ---------------------------------------------------------------------------
+ // Data mux. Gate each read port by element size so exactly one of the
+ // byte/halfword/word VAlu instances sees nonzero operands; their outputs
+ // can then be OR-merged.
+ val ina_b = MuxOR(io.in.sz(0), io.read(0).data)
+ val inb_b = MuxOR(io.in.sz(0), io.read(1).data)
+ val inc_b = MuxOR(io.in.sz(0), io.read(2).data)
+ val ind_b = MuxOR(io.in.sz(0), io.read(3).data)
+ val ine_b = MuxOR(io.in.sz(0), io.read(4).data)
+ val inf_b = MuxOR(io.in.sz(0), io.read(5).data)
+
+ // NOTE(review): the halfword group maps read ports 0,1,2,4,5,6 while the
+ // byte and word groups map 0..5 — confirm the read(3)/read(6) skip is
+ // intentional and not a copy-paste slip.
+ val ina_h = MuxOR(io.in.sz(1), io.read(0).data)
+ val inb_h = MuxOR(io.in.sz(1), io.read(1).data)
+ val inc_h = MuxOR(io.in.sz(1), io.read(2).data)
+ val ind_h = MuxOR(io.in.sz(1), io.read(4).data)
+ val ine_h = MuxOR(io.in.sz(1), io.read(5).data)
+ val inf_h = MuxOR(io.in.sz(1), io.read(6).data)
+
+ val ina_w = MuxOR(io.in.sz(2), io.read(0).data)
+ val inb_w = MuxOR(io.in.sz(2), io.read(1).data)
+ val inc_w = MuxOR(io.in.sz(2), io.read(2).data)
+ val ind_w = MuxOR(io.in.sz(2), io.read(3).data)
+ val ine_w = MuxOR(io.in.sz(2), io.read(4).data)
+ val inf_w = MuxOR(io.in.sz(2), io.read(5).data)
+
+ val (outb0, outb1, rndb0, rndb1, outwb0, outwb1) = VAlu(0, ina_b, inb_b, inc_b, ind_b, ine_b, inf_b)
+ val (outh0, outh1, rndh0, rndh1, outwh0, outwh1) = VAlu(1, ina_h, inb_h, inc_h, ind_h, ine_h, inf_h)
+ val (outw0, outw1, rndw0, rndw1, _, _) = VAlu(2, ina_w, inb_w, inc_w, ind_w, ine_w, inf_w)
+
+ // Merge the three size-specific instances (inactive ones contribute zero).
+ val out0 = outb0 | outh0 | outw0 | outwb0 | outwh0
+ val out1 = outb1 | outh1 | outw1 | outwb1 | outwh1
+ val rnd0 = rndb0 | rndh0 | rndw0
+ val rnd1 = rndb1 | rndh1 | rndw1
+
+ // ---------------------------------------------------------------------------
+ // Accumulator second input: select what gets folded into each write port's
+ // accumulator (third operand, rounding term, or dwinit preload).
+ val accvalid0 = io.op.dwinit || io.op.mul0.dmulh || io.op.mul0.mulh || io.op.add.add3 || io.op.mul0.madd || io.op.shf.shf
+ val accvalid1 = io.op.dwinit || io.op.mul1.dmulh || io.op.mul1.mulh
+
+ val accum0 = MuxOR(io.op.add.add3 ||
+ io.op.mul0.madd, io.read(2).data) |
+ MuxOR(io.op.mul0.dmulh ||
+ io.op.mul0.mulh ||
+ io.op.shf.shf, rnd0) |
+ MuxOR(io.op.dwinit, io.read(0).data)
+
+ val accum1 = MuxOR(io.op.mul1.dmulh ||
+ io.op.mul1.mulh, rnd1) |
+ MuxOR(io.op.dwinit, io.read(1).data)
+
+ // ---------------------------------------------------------------------------
+ // Registration. Results and accumulator inputs are registered one cycle,
+ // then combined lane-wise in Accum() on the way to the write ports.
+ val wsz = RegInit(0.U(3.W)) // element size latched with the result
+ val waccvalid0 = RegInit(false.B) // accumulate enable for write port 0
+ val waccvalid1 = RegInit(false.B) // accumulate enable for write port 1
+ val wdata0 = Reg(UInt(32.W))
+ val waccm0 = Reg(UInt(32.W))
+ val wdata1 = Reg(UInt(32.W))
+ val waccm1 = Reg(UInt(32.W))
+
+ wsz := MuxOR(io.in.vdvalid || io.in.vevalid, io.in.sz)
+ waccvalid0 := accvalid0 || io.op.dwconv
+ waccvalid1 := accvalid1 || io.op.dwconv
+
+ when (io.in.vdvalid) {
+ wdata0 := out0 | io.load(0)
+ }
+
+ // dwconv recirculates the current write-port data as the accumulator input.
+ when (accvalid0) {
+ waccm0 := accum0
+ } .elsewhen (io.op.dwconvData) {
+ waccm0 := io.write(0).data
+ }
+
+ when (io.in.vevalid) {
+ wdata1 := out1 | io.load(1)
+ }
+
+ when (accvalid1) {
+ waccm1 := accum1
+ } .elsewhen (io.op.dwconvData) {
+ waccm1 := io.write(1).data
+ }
+
+ // Lane-wise add of registered data and accumulator, split per the latched
+ // element size (wsz). Passes d through unchanged when disabled.
+ def Accum(en: Bool, d: UInt, a: UInt): UInt = {
+ val dm = MuxOR(en, d)
+ val am = MuxOR(en, a)
+ val rm = MuxOR(en && wsz(0), Cat(dm(31,24) + am(31,24),
+ dm(23,16) + am(23,16),
+ dm(15, 8) + am(15, 8),
+ dm( 7, 0) + am( 7, 0))) |
+ MuxOR(en && wsz(1), Cat(dm(31,16) + am(31,16),
+ dm(15, 0) + am(15, 0))) |
+ MuxOR(en && wsz(2), dm(31, 0) + am(31, 0))
+ val rn = MuxOR(!en, d)
+ assert(rm.getWidth == 32)
+ assert(rn.getWidth == 32)
+ rm | rn
+ }
+
+ io.write(0).data := Accum(waccvalid0, wdata0, waccm0)
+ io.write(1).data := Accum(waccvalid1, wdata1, waccm1)
+}
+
+// Standalone Verilog emitter entry point for VAluInt (bank index 0).
+object EmitVAluInt extends App {
+ val p = new Parameters
+ (new chisel3.stage.ChiselStage).emitVerilog(new VAluInt(p, 0), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VCmdq.scala b/hdl/chisel/src/kelvin/vector/VCmdq.scala
new file mode 100644
index 0000000..41845ad
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VCmdq.scala
@@ -0,0 +1,167 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// A queue of commands, reducing VDecodeBits to just the necessary fields.
+// <fin> retains just the needed fields or modifications.
+// <fout> accepts the current stripmine bank step.
+// <factive> returns the activation status for decode dependencies.
+
+object VCmdq {
+ // Factory for a VCmdq module. The return type is stated explicitly so the
+ // public API does not depend on type inference.
+ def apply[T <: Data](n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt): VCmdq[T] = {
+ Module(new VCmdq(n, t, fin, fout, factive))
+ }
+}
+
+// Command queue: buffers up to four decoded commands per cycle and replays
+// each at one stripmine step per output beat. fin/fout/factive are supplied
+// by the instantiating unit (see comment above the companion object).
+class VCmdq[T <: Data](n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) extends Module {
+ val io = IO(new Bundle {
+ val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+ val out = Decoupled(t)
+ val active = Output(UInt(64.W)) // onehot of vector registers still to be read
+ val nempty = Output(Bool()) // queue or output register holds a command
+ })
+
+ class VCmdqWrapper extends Bundle {
+ val tin = Output(t) // type input
+ val m = Output(Bool()) // stripmine
+ }
+
+ // Backing FIFO: accepts up to four commands per cycle, depth n.
+ val f = Fifo4e(new VCmdqWrapper, n)
+
+ val active = RegInit(0.U(64.W))
+
+ // Output register holding the command currently being stepped out.
+ val valid = RegInit(false.B)
+ val ready = io.out.ready
+ val value = Reg(new VCmdqWrapper)
+
+ // ---------------------------------------------------------------------------
+ // Step controls. `step` counts stripmine beats of the current command.
+ val step0 = 0.U(5.W)
+ val step = RegInit(step0)
+
+ // Next-step transform of the head command; `last` marks its final beat.
+ val (tin, last) = fout(value.tin, value.m, step, valid)
+
+ // ---------------------------------------------------------------------------
+ // Fifo.
+ f.io.in.valid := io.in.valid
+ io.in.ready := f.io.in.ready
+
+ for (i <- 0 until 4) {
+ f.io.in.bits(i).valid := io.in.bits(i).valid
+ f.io.in.bits(i).bits.tin := fin(io.in.bits(i).bits)
+ f.io.in.bits(i).bits.m := io.in.bits(i).bits.m
+ }
+
+ // Pop when the output register is free, or is completing its last beat.
+ f.io.out.ready := !valid || ready && last
+
+ // ---------------------------------------------------------------------------
+ // Output register.
+ when (f.io.out.valid && f.io.out.ready) {
+ valid := true.B
+ value := f.io.out.bits
+ step := 0.U
+ } .elsewhen (io.out.valid && io.out.ready) {
+ when (!last) {
+ valid := true.B
+ value.tin := tin
+ value.m := value.m // NOTE(review): self-assignment; redundant but harmless (m is held)
+ step := step + 1.U
+ } .otherwise {
+ // Output value.tin == 0 when not active (eg. do not drive vreg reads).
+ valid := false.B
+ value.tin := 0.U.asTypeOf(t)
+ value.m := false.B
+ step := 0.U
+ }
+ }
+
+ // `value` is a plain Reg; zero it explicitly at reset so `factive` sees an
+ // inert payload before the first command arrives.
+ when (reset.asBool) {
+ value.tin := 0.U.asTypeOf(t)
+ value.m := false.B
+ }
+
+ // ---------------------------------------------------------------------------
+ // Active. Recursively ORs the active masks of all FIFO entries (at step0,
+ // not yet started) plus the in-flight output register, looking one step
+ // ahead when the consumer is accepting the current beat.
+ def ValueActive(data: UInt = 0.U(64.W), i: Int = 0): UInt = {
+ assert(data.getWidth == 64)
+ if (i < n) {
+ val active = MuxOR(f.io.entry(i).valid, factive(f.io.entry(i).bits.tin, f.io.entry(i).bits.m, step0))
+ ValueActive(data | active, i + 1)
+ } else {
+ val m = value.m
+ val active0 = factive(value.tin, m, step + 0.U)
+ val active1 = factive(value.tin, m, step + 1.U)
+ val active = MuxOR(valid && (!ready || !last),
+ Mux(!ready, active0, active1))
+ data | active
+ }
+ }
+
+ // Recompute the active mask on any push or pop.
+ when (io.in.valid && io.in.ready || io.out.valid && io.out.ready) {
+ val fvalid = MuxOR(f.io.in.valid && f.io.in.ready,
+ Cat(f.io.in.bits(3).valid, f.io.in.bits(2).valid,
+ f.io.in.bits(1).valid, f.io.in.bits(0).valid))
+
+ active :=
+ MuxOR(fvalid(0), factive(f.io.in.bits(0).bits.tin, f.io.in.bits(0).bits.m, step0)) |
+ MuxOR(fvalid(1), factive(f.io.in.bits(1).bits.tin, f.io.in.bits(1).bits.m, step0)) |
+ MuxOR(fvalid(2), factive(f.io.in.bits(2).bits.tin, f.io.in.bits(2).bits.m, step0)) |
+ MuxOR(fvalid(3), factive(f.io.in.bits(3).bits.tin, f.io.in.bits(3).bits.m, step0)) |
+ ValueActive()
+ }
+
+ // ---------------------------------------------------------------------------
+ // Outputs.
+ io.out.valid := valid
+ io.out.bits := value.tin
+
+ io.active := active
+
+ io.nempty := f.io.nempty || valid
+}
+
+// Minimal command payload used by the standalone emit harness below.
+class VCmdqTestBundle extends Bundle {
+ val op = UInt(new VEncodeOp().bits.W) // encoded opcode
+ val sz = UInt(3.W) // element size (onehot b/h/w)
+ val vd = new VAddr() // destination register
+ val vs = new VAddrTag() // source register with scoreboard tag
+ val data = UInt(32.W) // scalar immediate/data
+}
+
+// Standalone Verilog emitter for VCmdq with example fin/fout/factive hooks.
+object EmitVCmdq extends App {
+ // Retain only the fields VCmdq needs from the full decode bundle.
+ def VCmdqTestFin(in: VDecodeBits): VCmdqTestBundle = {
+ val out = Wire(new VCmdqTestBundle)
+ out.op := in.op
+ out.sz := in.sz
+ out.vd := in.vd
+ out.vs := in.vs
+ out.data := in.sv.data
+ out
+ }
+
+ // Advance register addresses by one per stripmine beat; a stripmined
+ // command runs four beats (step 0..3), otherwise one.
+ def VCmdqTestFout(in: VCmdqTestBundle, m: Bool, step: UInt, valid: Bool): (VCmdqTestBundle, Bool) = {
+ val out = Wire(new VCmdqTestBundle)
+ val last = !m || step === 3.U
+ out.op := in.op
+ out.sz := in.sz
+ out.vd.valid := in.vd.valid
+ out.vs.valid := in.vs.valid
+ out.vd.addr := in.vd.addr + 1.U
+ out.vs.addr := in.vs.addr + 1.U
+ out.vs.tag := in.vs.tag
+ out.data := in.data
+ (out, last)
+ }
+
+ // Onehot of the vector registers this command still touches.
+ def VCmdqTestFactive(in: VCmdqTestBundle, m: Bool, step: UInt): UInt = {
+ assert(step.getWidth == 5)
+ val active = MuxOR(in.vd.valid, RegActive(m, step(2,0), in.vd.addr)) |
+ MuxOR(in.vs.valid, RegActive(m, step(2,0), in.vs.addr))
+ assert(active.getWidth == 64)
+ active
+ }
+
+ (new chisel3.stage.ChiselStage).emitVerilog(new VCmdq(8, new VCmdqTestBundle, VCmdqTestFin, VCmdqTestFout, VCmdqTestFactive), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VCommon.scala b/hdl/chisel/src/kelvin/vector/VCommon.scala
new file mode 100644
index 0000000..5db855e
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VCommon.scala
@@ -0,0 +1,124 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+// Convert register port into a onehot w/wo stripmining.
+object RegActive {
+ // Expand a 6-bit register number to a 64-bit active mask.
+ // regnum(5,2) selects one of 16 four-register groups; ohK below places the
+ // group's bit at slot K (of 4) within every group.
+ def apply(m: Bool, step: UInt, regnum: UInt): UInt = {
+ assert(step.getWidth == 3)
+ assert(regnum.getWidth == 6)
+ assert(step <= 4.U)
+
+ val oh = OneHot(regnum(5,2), 16)
+
+ val oh0 = Cat(0.U(3.W), oh(15),
+ 0.U(3.W), oh(14),
+ 0.U(3.W), oh(13),
+ 0.U(3.W), oh(12),
+ 0.U(3.W), oh(11),
+ 0.U(3.W), oh(10),
+ 0.U(3.W), oh(9),
+ 0.U(3.W), oh(8),
+ 0.U(3.W), oh(7),
+ 0.U(3.W), oh(6),
+ 0.U(3.W), oh(5),
+ 0.U(3.W), oh(4),
+ 0.U(3.W), oh(3),
+ 0.U(3.W), oh(2),
+ 0.U(3.W), oh(1),
+ 0.U(3.W), oh(0))
+
+ val oh1 = Cat(0.U(2.W), oh(15), 0.U(1.W),
+ 0.U(2.W), oh(14), 0.U(1.W),
+ 0.U(2.W), oh(13), 0.U(1.W),
+ 0.U(2.W), oh(12), 0.U(1.W),
+ 0.U(2.W), oh(11), 0.U(1.W),
+ 0.U(2.W), oh(10), 0.U(1.W),
+ 0.U(2.W), oh(9), 0.U(1.W),
+ 0.U(2.W), oh(8), 0.U(1.W),
+ 0.U(2.W), oh(7), 0.U(1.W),
+ 0.U(2.W), oh(6), 0.U(1.W),
+ 0.U(2.W), oh(5), 0.U(1.W),
+ 0.U(2.W), oh(4), 0.U(1.W),
+ 0.U(2.W), oh(3), 0.U(1.W),
+ 0.U(2.W), oh(2), 0.U(1.W),
+ 0.U(2.W), oh(1), 0.U(1.W),
+ 0.U(2.W), oh(0), 0.U(1.W))
+
+ val oh2 = Cat(0.U(1.W), oh(15), 0.U(2.W),
+ 0.U(1.W), oh(14), 0.U(2.W),
+ 0.U(1.W), oh(13), 0.U(2.W),
+ 0.U(1.W), oh(12), 0.U(2.W),
+ 0.U(1.W), oh(11), 0.U(2.W),
+ 0.U(1.W), oh(10), 0.U(2.W),
+ 0.U(1.W), oh(9), 0.U(2.W),
+ 0.U(1.W), oh(8), 0.U(2.W),
+ 0.U(1.W), oh(7), 0.U(2.W),
+ 0.U(1.W), oh(6), 0.U(2.W),
+ 0.U(1.W), oh(5), 0.U(2.W),
+ 0.U(1.W), oh(4), 0.U(2.W),
+ 0.U(1.W), oh(3), 0.U(2.W),
+ 0.U(1.W), oh(2), 0.U(2.W),
+ 0.U(1.W), oh(1), 0.U(2.W),
+ 0.U(1.W), oh(0), 0.U(2.W))
+
+ val oh3 = Cat(oh(15), 0.U(3.W),
+ oh(14), 0.U(3.W),
+ oh(13), 0.U(3.W),
+ oh(12), 0.U(3.W),
+ oh(11), 0.U(3.W),
+ oh(10), 0.U(3.W),
+ oh(9), 0.U(3.W),
+ oh(8), 0.U(3.W),
+ oh(7), 0.U(3.W),
+ oh(6), 0.U(3.W),
+ oh(5), 0.U(3.W),
+ oh(4), 0.U(3.W),
+ oh(3), 0.U(3.W),
+ oh(2), 0.U(3.W),
+ oh(1), 0.U(3.W),
+ oh(0), 0.U(3.W))
+
+ assert(oh.getWidth == 16)
+ assert(oh0.getWidth == 64)
+ assert(oh1.getWidth == 64)
+ assert(oh2.getWidth == 64)
+ assert(oh3.getWidth == 64)
+
+ val idx = regnum(1,0)
+
+ // Without stripmining (m low), idx selects the exact slot. When
+ // stripmining, the current and all remaining slots (k >= step) remain
+ // marked active.
+ val active = MuxOR(!m && idx === 0.U || m && step <= 0.U, oh0) |
+ MuxOR(!m && idx === 1.U || m && step <= 1.U, oh1) |
+ MuxOR(!m && idx === 2.U || m && step <= 2.U, oh2) |
+ MuxOR(!m && idx === 3.U || m && step <= 3.U, oh3)
+ assert(active.getWidth == 64)
+
+ active
+ }
+}
+
+// Convert tagged address into register file format.
+object OutTag {
+ def apply(v: VAddrTag): UInt = {
+ OutTag(v.addr, v.tag)
+ }
+
+ // `tag` carries one bit per register within the 4-register group; the low
+ // address bits select which bit applies to this register.
+ def apply(addr: UInt, tag: UInt): UInt = {
+ assert(addr.getWidth == 6)
+ assert(tag.getWidth == 4)
+ tag(addr(1,0))
+ }
+}
+
+// A source operand is ready when it is invalid (unused) or its scoreboard
+// bit — indexed by {tag bit, register address} — is clear.
+object ScoreboardReady {
+ def apply(a: VAddrTag, sb: UInt): Bool = {
+ assert(a.addr.getWidth == 6)
+ assert(a.tag.getWidth == 4)
+ assert(sb.getWidth == 128)
+ val tag = a.tag(a.addr(1,0))
+ val idx = Cat(tag, a.addr)
+ assert(idx.getWidth == 7)
+ (!a.valid || !sb(idx))
+ }
+}
diff --git a/hdl/chisel/src/kelvin/vector/VConvAlu.scala b/hdl/chisel/src/kelvin/vector/VConvAlu.scala
new file mode 100644
index 0000000..c469161
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VConvAlu.scala
@@ -0,0 +1,109 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+object VConvAlu {
+ // Factory for the convolution ALU. Idiomatic Scala: the last expression is
+ // the result, no `return` (consistent with VCmdq.apply).
+ def apply(p: Parameters): VConvAlu = {
+ Module(new VConvAlu(p))
+ }
+}
+
+// Convolution ALU: a tcnt x tcnt grid of 32-bit accumulators fed by per-cell
+// dot products (conv), with row load (init/tran) and clear support.
+class VConvAlu(p: Parameters) extends Module {
+ val tcnt = p.vectorBits / 32 // accumulator rows/columns (32-bit cells)
+
+ val io = IO(new Bundle {
+ val op = new Bundle {
+ val conv = Input(Bool()) // convolution
+ val init = Input(Bool()) // initialize
+ val tran = Input(Bool()) // transpose
+ val clear = Input(Bool()) // clear accumulator
+ }
+ val index = Input(UInt(log2Ceil(tcnt).W)) // row selected by init/tran loads
+ val adata = Input(UInt((tcnt * 32).W))
+ val bdata = Input(UInt((tcnt * 32).W))
+ val abias = Input(UInt(9.W))
+ val bbias = Input(UInt(9.W))
+ val asign = Input(Bool())
+ val bsign = Input(Bool())
+ val out = Output(Vec(tcnt, UInt((tcnt * 32).W)))
+ })
+
+ // MatMul
+ // B B B B
+ // A . . . .
+ // A . . . .
+ // A . . . .
+ // A . . . .
+
+ val acc = Reg(Vec(tcnt, Vec(tcnt, UInt(32.W))))
+
+ assert(PopCount(Cat(io.op.conv, io.op.tran, io.op.clear)) <= 1.U)
+
+ // ---------------------------------------------------------------------------
+ // Output interleave to match shift reductions.
+ // Maps accumulator cell (i,j) to output lane (si,sj): words are swizzled
+ // within each quad of rows so results line up with the reduction order.
+ def Interleave(i: Int, j: Int): (Int, Int) = {
+ val interleave = Seq(0, 2, 1, 3);
+ val rbase = i & ~3;
+ val rquad = i & 3;
+ val word = j;
+ val si = rbase + interleave(word & 3);
+ val sj = rquad * (tcnt / 4) + (word / 4);
+ (si, sj)
+ }
+
+ // ---------------------------------------------------------------------------
+ // Matrix Multiply. One dot-product-accumulate per cell; the accumulator
+ // feedback term is gated so it only contributes during conv.
+ val dpa = Wire(Vec(tcnt, Vec(tcnt, UInt(32.W)))) // dot product accumulate
+
+ for (i <- 0 until tcnt) {
+ for (j <- 0 until tcnt) {
+ val accum = MuxOR(io.op.conv, acc(i)(j))
+ dpa(i)(j) := accum + VDot(io.op.conv,
+ io.adata(i * 32 + 31, i * 32), io.bdata(j * 32 + 31, j * 32),
+ io.abias, io.bbias, io.asign, io.bsign)
+ }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Parallel load. tran loads from adata, init from bdata.
+ val pload = MuxOR(io.op.tran, io.adata) |
+ MuxOR(io.op.init, io.bdata)
+
+ // ---------------------------------------------------------------------------
+ // Accumulators: clear at reset/clear, update with dpa during conv, or load
+ // one (interleaved) row selected by io.index during init/tran.
+ for (i <- 0 until tcnt) {
+ for (j <- 0 until tcnt) {
+ val (si, sj) = Interleave(i, j)
+
+ val aclr = io.op.clear || reset.asBool
+ val conv = io.op.conv
+ val load = (io.op.init || io.op.tran) && si.U === io.index
+
+ when (aclr || conv || load) {
+ acc(i)(j) := Mux(conv, dpa(i)(j),
+ pload(sj * 32 + 31, sj * 32))
+ }
+ }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Outputs: present the accumulator through the same interleave mapping.
+ val out = Wire(Vec(tcnt, Vec(tcnt, UInt(32.W))))
+
+ for (i <- 0 until tcnt) {
+ for (j <- 0 until tcnt) {
+ val (si, sj) = Interleave(i, j)
+ out(si)(sj) := acc(i)(j)
+ }
+ }
+
+ for (i <- 0 until tcnt) {
+ io.out(i) := out(i).asUInt
+ }
+}
+
+// Standalone Verilog emitter entry point for VConvAlu.
+object EmitVConvAlu extends App {
+ val p = new Parameters
+ (new chisel3.stage.ChiselStage).emitVerilog(new VConvAlu(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VConvCtrl.scala b/hdl/chisel/src/kelvin/vector/VConvCtrl.scala
new file mode 100644
index 0000000..78fa0a4
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VConvCtrl.scala
@@ -0,0 +1,197 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VConvCtrl {
+ // Factory for the convolution control unit. Idiomatic Scala: the last
+ // expression is the result, no `return` (consistent with VCmdq.apply).
+ def apply(p: Parameters): VConvCtrl = {
+ Module(new VConvCtrl(p))
+ }
+}
+
+// Convolution control: queues decoded accumulator commands (acset/actr/
+// aconv/vcget), sequences them one accumulator row per cycle, and drives
+// the register-file convolution port once scoreboard hazards clear.
+class VConvCtrl(p: Parameters) extends Module {
+ val io = IO(new Bundle {
+ // Instructions.
+ val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+ val active = Output(UInt(64.W))
+
+ // RegisterFile.
+ val vrfsb = Input(UInt(128.W))
+ val out = new VRegfileConvIO(p)
+ })
+
+ // A usable depth of outstanding commands.
+ val cmdqDepth = 4
+
+ val e = new VEncodeOp()
+
+ // ---------------------------------------------------------------------------
+ // Command Queue.
+ class VConvCtrlCmdq extends Bundle {
+ val conv = Bool() // convolution
+ val init = Bool() // initialize (set)
+ val tran = Bool() // transpose
+ val wclr = Bool() // get and clear (marks last cycle)
+ val addr1 = UInt(6.W)
+ val addr2 = UInt(6.W)
+ val base2 = UInt(6.W)
+ val mode = UInt(2.W)
+ val mark2 = UInt((p.vectorBits / 32).W)
+ val index = UInt(log2Ceil(p.vectorBits / 32).W)
+ val end = UInt(log2Ceil(p.vectorBits / 32).W)
+ val abias = UInt(9.W)
+ val bbias = UInt(9.W)
+ val asign = Bool()
+ val bsign = Bool()
+ }
+
+ // Decode a VDecodeBits command into the cmdq payload. vcget reads and
+ // clears the accumulator; acset/actr load it; aconv convolves.
+ def Fin(in: VDecodeBits): VConvCtrlCmdq = {
+ val out = Wire(new VConvCtrlCmdq)
+
+ val vcget = in.op === e.vcget.U
+ val acset = in.op === e.acset.U
+ val actr = in.op === e.actr.U
+ val aconv = in.op === e.aconv.U
+
+ val addr1 = in.vs.addr
+ val addr2 = Mux(acset, in.vs.addr, in.vu.addr)
+ val data = in.sv.data
+ val sp = (p.vectorBits / 32) - 1
+ val mark2 = Wire(UInt((p.vectorBits / 32).W))
+ // acset/actr cover the full accumulator; aconv's start/stop rows come
+ // from immediate fields.
+ val start = Mux(acset || actr, 0.U, data(6,2))
+ val stop = Mux(acset || actr, sp.U, data(11,7))
+
+ // mark2: contiguous mask of rows start..stop (width = lane count).
+ if (p.vectorBits == 128) {
+ mark2 := 0xf.U >> (3.U - (stop(1,0) - start(1,0)))
+ } else if (p.vectorBits == 256) {
+ mark2 := 0xff.U >> (7.U - (stop(2,0) - start(2,0)))
+ } else if (p.vectorBits == 512) {
+ mark2 := 0xffff.U >> (15.U - (stop(3,0) - start(3,0)))
+ } else {
+ assert(false) // Scala-level assert: elaboration fails for unsupported vectorBits
+ }
+
+ out.conv := aconv
+ out.init := acset
+ out.tran := actr
+ out.wclr := vcget
+ out.addr1 := addr1
+ out.addr2 := addr2
+ out.base2 := addr2
+ out.mode := data(1,0)
+ out.mark2 := mark2
+ out.index := start
+ out.end := stop
+ out.abias := data(20,12)
+ out.asign := data(21)
+ out.bbias := data(30,22)
+ out.bsign := data(31)
+
+ out
+ }
+
+ // Advance one accumulator row per step; vcget (wclr) completes in a
+ // single step. Stripmining (m) is never used on this queue.
+ def Fout(in: VConvCtrlCmdq, m: Bool, step: UInt, valid: Bool): (VConvCtrlCmdq, Bool) = {
+ when (valid) {
+ assert(m === false.B)
+ assert(in.index <= in.end)
+
+ // addr1 must be aligned to a full vector-register group.
+ if (p.vectorBits == 128) {
+ assert(in.addr1(1,0) === 0.U)
+ } else if (p.vectorBits == 256) {
+ assert(in.addr1(2,0) === 0.U)
+ } else if (p.vectorBits == 512) {
+ assert(in.addr1(3,0) === 0.U)
+ }
+ }
+
+ val out = Wire(new VConvCtrlCmdq)
+ val last = in.index === in.end || in.wclr
+
+ out := in
+ out.index := in.index + 1.U
+ out.addr2 := in.addr2 + 1.U
+
+ (out, last)
+ }
+
+ // Onehot of the vector registers this command still reads.
+ def Factive(in: VConvCtrlCmdq, m: Bool, step: UInt): UInt = {
+ val active1 = Wire(UInt(64.W))
+ val active2 = Wire(UInt(64.W))
+
+ val addr1 = in.addr1
+ val addr2 = in.addr2
+
+ // (mark2 & (mark2 << step)) clears the lsb bits.
+ if (p.vectorBits == 128) {
+ active1 := 0xf.U << Cat(addr1(5,2), 0.U(2.W))
+ active2 := ((in.mark2 & (in.mark2 << step(1,0))) << in.base2)(63,0)
+ } else if (p.vectorBits == 256) {
+ active1 := 0xff.U << Cat(addr1(5,3), 0.U(3.W))
+ active2 := ((in.mark2 & (in.mark2 << step(2,0))) << in.base2)(63,0)
+ } else if (p.vectorBits == 512) {
+ active1 := 0xffff.U << Cat(addr1(5,4), 0.U(4.W))
+ active2 := ((in.mark2 & (in.mark2 << step(3,0))) << in.base2)(63,0)
+ } else {
+ assert(false) // Scala-level assert: elaboration fails for unsupported vectorBits
+ }
+
+ // Only reads are reported in active, vrfsb tracks writes.
+ val active = MuxOR(in.conv || in.tran, active1) |
+ MuxOR(in.conv || in.init, active2)
+
+ active
+ }
+
+ val q = VCmdq(cmdqDepth, new VConvCtrlCmdq, Fin, Fout, Factive)
+
+ q.io.in <> io.in
+
+ // ---------------------------------------------------------------------------
+ // VRegfile Conv.
+ val active = Factive(q.io.out.bits, false.B, 0.U)
+
+ // Write ports take 2 cycles to commit to register store, but 3 cycles need
+ // to be factored due to ALU-to-ALU scoreboard forwarding.
+ val vrfsb0 = io.vrfsb(63,0) | io.vrfsb(127,64)
+ val vrfsb1 = RegInit(0.U(64.W))
+ val vrfsb2 = RegInit(0.U(64.W))
+ val vrfsb = vrfsb0 | vrfsb1 | vrfsb2
+ vrfsb1 := vrfsb0
+ vrfsb2 := vrfsb1
+
+ // Stall while any source register has a write still in flight.
+ val ready = (active & vrfsb) === 0.U
+
+ q.io.out.ready := ready
+
+ io.out.valid := q.io.out.valid
+ io.out.ready := ready
+
+ io.out.op.conv := q.io.out.bits.conv
+ io.out.op.init := q.io.out.bits.init
+ io.out.op.tran := q.io.out.bits.tran
+ io.out.op.wclr := q.io.out.bits.wclr
+
+ io.out.mode := q.io.out.bits.mode
+ io.out.index := q.io.out.bits.index
+ io.out.addr1 := q.io.out.bits.addr1
+ io.out.addr2 := q.io.out.bits.addr2
+ io.out.abias := q.io.out.bits.abias
+ io.out.asign := q.io.out.bits.asign
+ io.out.bbias := q.io.out.bits.bbias
+ io.out.bsign := q.io.out.bits.bsign
+
+ // wclr (vcget) must be accepted the cycle it reaches the queue head.
+ assert(!(q.io.out.bits.wclr && !q.io.out.ready))
+
+ assert(!(io.out.valid && io.out.ready) ||
+ PopCount(Cat(io.out.op.conv, io.out.op.init, io.out.op.tran, io.out.op.wclr)) === 1.U)
+
+ // ---------------------------------------------------------------------------
+ // Active.
+ io.active := q.io.active
+}
+
+// Standalone Verilog emitter entry point for VConvCtrl.
+object EmitVConvCtrl extends App {
+ val p = new Parameters
+ (new chisel3.stage.ChiselStage).emitVerilog(new VConvCtrl(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VCore.scala b/hdl/chisel/src/kelvin/vector/VCore.scala
new file mode 100644
index 0000000..0ce01f4
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VCore.scala
@@ -0,0 +1,344 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Factory: constructs the VCore inside Module(...) so callers cannot forget
+// the Chisel module wrapper. (Idiomatic Scala: expression body, no `return`.)
+object VCore {
+  def apply(p: Parameters): VCore = Module(new VCore(p))
+}
+
+// Alternative factory kept for bring-up/debug: swap this in (and comment out
+// the factory above) to build the chip with the VCoreEmpty tie-off stub
+// instead of the full vector pipeline.
+// object VCore {
+//   def apply(p: Parameters): VCoreEmpty = {
+//     return Module(new VCoreEmpty(p))
+//   }
+// }
+
+// Interface between the scalar core ("score") and the vector core.
+class VCoreIO(p: Parameters) extends Bundle {
+  // Decode cycle: up to four vector instruction lanes per cycle.
+  val vinst = Vec(4, new VInstIO)
+
+  // Execute cycle: scalar register file read/write ports used by the
+  // vector core (Flipped: driven from the scalar side's perspective).
+  val rs = Vec(8, Flipped(new RegfileReadDataIO))
+  val rd = Vec(4, Flipped(new RegfileWriteDataIO))
+
+  // Status: vector memory activity (used by the scalar core, e.g. for fences).
+  val mactive = Output(Bool())
+
+  // Faults: undefined/unsupported vector instruction encountered.
+  val undef = Output(Bool())
+}
+
+// Vector core top level: instantiates the vector pipeline units (instruction
+// capture, decode, ALU, convolution control, cached and uncached load/store,
+// register file) and wires them together.
+//
+// NOTE(review): several vrf ports receive default (inactive) connections and
+// are overridden further down. Chisel last-connect semantics make the final
+// assignment win, so the statement order in this module is load-bearing.
+class VCore(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Score <> VCore
+    val score = new VCoreIO(p)
+
+    // Data bus interface.
+    val dbus = new DBusIO(p)
+    val last = Output(Bool())
+
+    // AXI interface.
+    val ld = new AxiMasterReadIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+    val st = new AxiMasterWriteIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+  })
+
+  // Pipeline timeline of a vector instruction through the units:
+  // Decode     : VInst.in
+  // Execute+0  : VInst.slice
+  // Execute+1  : VInst.out <> VDec::Fifo.in
+  // Execute+2  : VDec::Fifo.out <> VDec::Shuffle.in
+  // Execute+3  : VDec::Shuffle.out <> VCmdq::Fifo.in
+  // Execute+4  : VCmdq::Fifo.out <> VCmdq::Reg.in
+  // Execute+5  : VCmdq::Reg.out <> {VLdSt, VAlu, ...}
+
+  val vinst = VInst(p)
+  val vdec = VDecode(p)
+  val valu = VAlu(p)
+  val vconv = VConvCtrl(p)
+  val vldst = VLdSt(p)
+  val vld = VLd(p)
+  val vst = VSt(p)
+  val vrf = VRegfile(p)
+
+  vinst.io.in <> io.score.vinst
+  vinst.io.rs <> io.score.rs
+  vinst.io.rd <> io.score.rd
+
+  // Register-file read port 6 is shared between VSt and VLdSt: at most one of
+  // them may perform an accepted read in any cycle.
+  assert(PopCount(Cat(vst.io.read.valid && vst.io.read.ready,
+                      vldst.io.read.valid && vldst.io.read.ready)) <= 1.U)
+
+  // ---------------------------------------------------------------------------
+  // VDecode.
+  vdec.io.vrfsb <> vrf.io.vrfsb
+
+  // Registers still in use by any downstream execution unit.
+  vdec.io.active := valu.io.active | vconv.io.active | vldst.io.active | vst.io.active
+
+  vdec.io.in.valid := vinst.io.out.valid
+  vinst.io.out.ready := vdec.io.in.ready
+  assert(!(vdec.io.in.valid && !vdec.io.in.ready))
+
+  vinst.io.out.stall := vdec.io.stall  // decode backpressure
+
+  for (i <- 0 until 4) {
+    vdec.io.in.bits(i) := vinst.io.out.lane(i)
+  }
+
+  io.score.undef := vdec.io.undef
+
+  // ---------------------------------------------------------------------------
+  // VRegfile.
+  // Default (inactive) connections for all ports; the real drivers below
+  // override them via last-connect.
+  for (i <- 0 until 7) {
+    vrf.io.read(i).valid := false.B
+    vrf.io.read(i).addr := 0.U
+    vrf.io.read(i).tag := 0.U
+  }
+
+  for (i <- 0 until 6) {
+    vrf.io.write(i).valid := false.B
+    vrf.io.write(i).addr := 0.U
+    vrf.io.write(i).data := 0.U
+  }
+
+  for (i <- 0 until 4) {
+    vrf.io.whint(i).valid := false.B
+    vrf.io.whint(i).addr := 0.U
+  }
+
+  for (i <- 0 until 2) {
+    vrf.io.scalar(i).valid := false.B
+    vrf.io.scalar(i).data := 0.U
+  }
+
+  vrf.io.transpose.valid := false.B
+  vrf.io.transpose.index := 0.U
+  vrf.io.transpose.addr := 0.U
+
+  // ---------------------------------------------------------------------------
+  // VALU.
+  // Per-lane valid/ready: a lane is routed to the ALU only when its cmdq
+  // decode selects the ALU; ready mirrors the same qualification.
+  val aluvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).alu,
+                     vdec.io.out(2).valid && vdec.io.cmdq(2).alu,
+                     vdec.io.out(1).valid && vdec.io.cmdq(1).alu,
+                     vdec.io.out(0).valid && vdec.io.cmdq(0).alu)
+
+  val aluready = Cat(valu.io.in.ready && vdec.io.cmdq(3).alu,
+                     valu.io.in.ready && vdec.io.cmdq(2).alu,
+                     valu.io.in.ready && vdec.io.cmdq(1).alu,
+                     valu.io.in.ready && vdec.io.cmdq(0).alu)
+
+  valu.io.in.valid := aluvalid =/= 0.U
+
+  for (i <- 0 until 4) {
+    valu.io.in.bits(i).valid := aluvalid(i)
+    valu.io.in.bits(i).bits := vdec.io.out(i).bits
+  }
+
+  // Overrides the inactive defaults above (read(6) is overridden again by the
+  // store-read arbitration at the bottom of this module).
+  for (i <- 0 until 7) {
+    vrf.io.read(i).valid := valu.io.read(i).valid
+    vrf.io.read(i).addr := valu.io.read(i).addr
+    vrf.io.read(i).tag := valu.io.read(i).tag
+  }
+
+  for (i <- 0 until 7) {
+    valu.io.read(i).data := vrf.io.read(i).data
+  }
+
+  for (i <- 0 until 4) {
+    vrf.io.write(i).valid := valu.io.write(i).valid
+    vrf.io.write(i).addr := valu.io.write(i).addr
+    vrf.io.write(i).data := valu.io.write(i).data
+
+    vrf.io.whint(i).valid := valu.io.whint(i).valid
+    vrf.io.whint(i).addr := valu.io.whint(i).addr
+  }
+
+  for (i <- 0 until 2) {
+    vrf.io.scalar(i).valid := valu.io.scalar(i).valid
+    vrf.io.scalar(i).data := valu.io.scalar(i).data
+  }
+
+  valu.io.vrfsb := vrf.io.vrfsb.data
+
+  // ---------------------------------------------------------------------------
+  // VCONV.
+  val convvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).conv,
+                      vdec.io.out(2).valid && vdec.io.cmdq(2).conv,
+                      vdec.io.out(1).valid && vdec.io.cmdq(1).conv,
+                      vdec.io.out(0).valid && vdec.io.cmdq(0).conv)
+
+  val convready = Cat(vconv.io.in.ready && vdec.io.cmdq(3).conv,
+                      vconv.io.in.ready && vdec.io.cmdq(2).conv,
+                      vconv.io.in.ready && vdec.io.cmdq(1).conv,
+                      vconv.io.in.ready && vdec.io.cmdq(0).conv)
+
+  vconv.io.in.valid := convvalid =/= 0.U
+
+  for (i <- 0 until 4) {
+    vconv.io.in.bits(i).valid := convvalid(i)
+    vconv.io.in.bits(i).bits := vdec.io.out(i).bits
+  }
+
+  vrf.io.conv := vconv.io.out
+
+  vconv.io.vrfsb := vrf.io.vrfsb.data
+
+  // ---------------------------------------------------------------------------
+  // VLdSt.
+  val ldstvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).ldst,
+                      vdec.io.out(2).valid && vdec.io.cmdq(2).ldst,
+                      vdec.io.out(1).valid && vdec.io.cmdq(1).ldst,
+                      vdec.io.out(0).valid && vdec.io.cmdq(0).ldst)
+
+  val ldstready = Cat(vldst.io.in.ready && vdec.io.cmdq(3).ldst,
+                      vldst.io.in.ready && vdec.io.cmdq(2).ldst,
+                      vldst.io.in.ready && vdec.io.cmdq(1).ldst,
+                      vldst.io.in.ready && vdec.io.cmdq(0).ldst)
+
+  vldst.io.in.valid := ldstvalid =/= 0.U
+
+  for (i <- 0 until 4) {
+    vldst.io.in.bits(i).valid := ldstvalid(i)
+    vldst.io.in.bits(i).bits := vdec.io.out(i).bits
+  }
+
+  // VSt has priority on shared read port 6 (see store-read arbitration below).
+  vldst.io.read.ready := !vst.io.read.valid
+  vldst.io.read.data := vrf.io.read(6).data
+
+  vldst.io.vrfsb := vrf.io.vrfsb.data
+
+  io.dbus <> vldst.io.dbus
+  io.last := vldst.io.last
+
+  // ---------------------------------------------------------------------------
+  // VLd.
+  val ldvalid = Wire(UInt(4.W))
+  val ldready = Wire(UInt(4.W))
+
+  ldvalid := Cat(vdec.io.cmdq(3).ld && vdec.io.out(3).valid,
+                 vdec.io.cmdq(2).ld && vdec.io.out(2).valid,
+                 vdec.io.cmdq(1).ld && vdec.io.out(1).valid,
+                 vdec.io.cmdq(0).ld && vdec.io.out(0).valid)
+
+  ldready := Cat(vdec.io.cmdq(3).ld && vld.io.in.ready,
+                 vdec.io.cmdq(2).ld && vld.io.in.ready,
+                 vdec.io.cmdq(1).ld && vld.io.in.ready,
+                 vdec.io.cmdq(0).ld && vld.io.in.ready)
+
+  vld.io.in.valid := ldvalid =/= 0.U
+
+  for (i <- 0 until 4) {
+    vld.io.in.bits(i).valid := ldvalid(i)
+    vld.io.in.bits(i).bits := vdec.io.out(i).bits
+  }
+
+  io.ld <> vld.io.axi
+
+  // ---------------------------------------------------------------------------
+  // VSt.
+  val stvalid = Wire(UInt(4.W))
+  val stready = Wire(UInt(4.W))
+
+  stvalid := Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).st,
+                 vdec.io.out(2).valid && vdec.io.cmdq(2).st,
+                 vdec.io.out(1).valid && vdec.io.cmdq(1).st,
+                 vdec.io.out(0).valid && vdec.io.cmdq(0).st)
+
+  stready := Cat(vst.io.in.ready && vdec.io.cmdq(3).st,
+                 vst.io.in.ready && vdec.io.cmdq(2).st,
+                 vst.io.in.ready && vdec.io.cmdq(1).st,
+                 vst.io.in.ready && vdec.io.cmdq(0).st)
+
+  vst.io.in.valid := stvalid =/= 0.U
+
+  for (i <- 0 until 4) {
+    vst.io.in.bits(i).valid := stvalid(i)
+    vst.io.in.bits(i).bits := vdec.io.out(i).bits
+  }
+
+  io.st <> vst.io.axi
+
+  vst.io.vrfsb := vrf.io.vrfsb.data
+
+  // Always ready: VSt wins the port-6 arbitration whenever it asserts valid.
+  vst.io.read.ready := true.B
+  vst.io.read.data := vrf.io.read(6).data
+
+  // ---------------------------------------------------------------------------
+  // Load write.
+  vrf.io.write(4).valid := vldst.io.write.valid
+  vrf.io.write(4).addr := vldst.io.write.addr
+  vrf.io.write(4).data := vldst.io.write.data
+
+  vrf.io.write(5).valid := vld.io.write.valid
+  vrf.io.write(5).addr := vld.io.write.addr
+  vrf.io.write(5).data := vld.io.write.data
+
+  // ---------------------------------------------------------------------------
+  // Store read.
+  // Priority mux on shared read port 6: VSt first, otherwise VLdSt. This is
+  // the final (winning) connection to read(6) per last-connect semantics.
+  vrf.io.read(6).valid := vst.io.read.valid || vldst.io.read.valid
+  vrf.io.read(6).addr := Mux(vst.io.read.valid, vst.io.read.addr,
+                             vldst.io.read.addr)
+  vrf.io.read(6).tag := Mux(vst.io.read.valid, vst.io.read.tag,
+                            vldst.io.read.tag)
+
+  // ---------------------------------------------------------------------------
+  // VDecode.
+  // A decode slot retires when whichever unit it was routed to accepts it.
+  for (i <- 0 until 4) {
+    vdec.io.out(i).ready := aluready(i) || convready(i) || ldstready(i) ||
+                            ldready(i) || stready(i)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Memory active status.
+  io.score.mactive := vinst.io.nempty || vdec.io.nempty ||
+                      vld.io.nempty || vst.io.nempty
+}
+
+// Tie-off stub with the same IO shape as VCore: reports any incoming vector
+// instruction as undef and drives all outputs inactive. Intended for
+// bring-up/debug builds without the vector pipeline (see the commented-out
+// factory near the top of this file).
+class VCoreEmpty(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Score <> VCore
+    val score = new VCoreIO(p)
+
+    // Data bus interface.
+    val dbus = new DBusIO(p)
+    val last = Output(Bool())
+
+    // AXI interface.
+    val ld = new AxiMasterReadIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+    val st = new AxiMasterWriteIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+  })
+
+  // Every vector instruction is undefined in the empty core.
+  io.score.undef := io.score.vinst(0).valid || io.score.vinst(1).valid ||
+                    io.score.vinst(2).valid || io.score.vinst(3).valid
+
+  io.score.mactive := false.B
+
+  io.dbus.valid := false.B
+  io.dbus.write := false.B
+  io.dbus.size := 0.U
+  io.dbus.addr := 0.U
+  io.dbus.adrx := 0.U
+  io.dbus.wdata := 0.U
+  io.dbus.wmask := 0.U
+  io.last := false.B
+
+  // Accept (and discard) all lanes; never write the scalar register file.
+  for (i <- 0 until 4) {
+    io.score.vinst(i).ready := true.B
+    io.score.rd(i).valid := false.B
+    io.score.rd(i).addr := 0.U
+    io.score.rd(i).data := 0.U
+  }
+
+  io.ld.addr.valid := false.B
+  io.ld.addr.bits.addr := 0.U
+  io.ld.addr.bits.id := 0.U
+  io.ld.data.ready := false.B
+
+  io.st.addr.valid := false.B
+  io.st.addr.bits.addr := 0.U
+  io.st.addr.bits.id := 0.U
+  io.st.data.valid := false.B
+  io.st.data.bits.data := 0.U
+  io.st.data.bits.strb := 0.U
+  io.st.resp.ready := false.B
+}
diff --git a/hdl/chisel/src/kelvin/vector/VDecode.scala b/hdl/chisel/src/kelvin/vector/VDecode.scala
new file mode 100644
index 0000000..3451adc
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VDecode.scala
@@ -0,0 +1,440 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common.Fifo4x4
+
+// Factory: constructs a VDecode inside Module(...).
+// (Idiomatic Scala: expression body, no `return`.)
+object VDecode {
+  def apply(p: Parameters): VDecode = Module(new VDecode(p))
+}
+
+// Four-wide vector instruction decoder. Lanes are buffered in a 4x4 fifo,
+// decoded (up to four per cycle), given even/odd scoreboard write tags, and
+// held in a 4-slot output register whose undispatched entries are compacted
+// toward slot 0 each cycle.
+// NOTE(review): the compaction network (marked0/1/2 below) is deeply
+// order-dependent; edit with care.
+class VDecode(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val in = Flipped(Decoupled(Vec(4, Valid(new VectorInstructionLane))))
+    val out = Vec(4, Decoupled(new VDecodeBits))
+    val cmdq = Vec(4, Output(new VDecodeCmdq))
+    val actv = Vec(4, Output(new VDecodeActive))  // used in testbench
+    val stall = Output(Bool())
+    val active = Input(UInt(64.W))
+    val vrfsb = new VRegfileScoreboardIO
+    val undef = Output(Bool())
+    val nempty = Output(Bool())
+  })
+
+  val guard = 8  // two cycles of 4-way dispatch
+  val depth = 16 + guard
+
+  // NOTE(review): 'enc' appears unused in this class — confirm before removing.
+  val enc = new VEncodeOp()
+
+  val f = Fifo4x4(new VectorInstructionLane, depth)
+
+  // One combinational decoder per lane.
+  val d = Seq(Module(new VDecodeInstruction(p)),
+              Module(new VDecodeInstruction(p)),
+              Module(new VDecodeInstruction(p)),
+              Module(new VDecodeInstruction(p)))
+
+  val e = Wire(Vec(4, new VDecodeBits))
+
+  val valid = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val data = Reg(Vec(4, new VDecodeBits))
+  val cmdq = Reg(Vec(4, new VDecodeCmdq))
+  val actv = Wire(Vec(4, new VDecodeActive))
+  val actv2 = Reg(Vec(4, new VDecodeActive2))
+  val dataNxt = Wire(Vec(4, new VDecodeBits))
+  val cmdqNxt = Wire(Vec(4, new VDecodeCmdq))
+  val actvNxt = Wire(Vec(4, new VDecodeActive2))
+
+  // ---------------------------------------------------------------------------
+  // Decode.
+  for (i <- 0 until 4) {
+    d(i).io.in := f.io.out(i).bits
+  }
+
+  // ---------------------------------------------------------------------------
+  // Apply "out-of-order" tags to read/write registers.
+  // Since only one write may be outstanding, track using 1bit which side of
+  // write the read usage is occurring on.
+  val tagReg = RegInit(0.U(64.W))
+
+  // Per-register tag flips at each write in program order across the 4 lanes.
+  val tag0 = tagReg
+  val tag1 = tag0 ^ d(0).io.actv.wactive
+  val tag2 = tag1 ^ d(1).io.actv.wactive
+  val tag3 = tag2 ^ d(2).io.actv.wactive
+  val tag4 = tag3 ^ d(3).io.actv.wactive
+
+  val tags = Seq(tag0, tag1, tag2, tag3, tag4)
+
+  // f.io.out is ordered, so can use a priority tree.
+  when(f.io.out(3).valid && f.io.out(3).ready) {
+    tagReg := tag4
+  } .elsewhen(f.io.out(2).valid && f.io.out(2).ready) {
+    tagReg := tag3
+  } .elsewhen(f.io.out(1).valid && f.io.out(1).ready) {
+    tagReg := tag2
+  } .elsewhen(f.io.out(0).valid && f.io.out(0).ready) {
+    tagReg := tag1
+  }
+
+  // Attach the 4-bit tag nibble for the addressed register group to a
+  // read-address descriptor (tag must be zero on entry).
+  def TagAddr(tag: UInt, v: VAddrTag): VAddrTag = {
+    assert(tag.getWidth == 64)
+    assert(v.addr.getWidth == 6)
+    assert(v.tag === 0.U)
+    val addr = v.addr
+    val addrm = addr(5,2)
+    val tagm = Wire(Vec(16, UInt(4.W)))
+    for (i <- 0 until 16) {
+      tagm(i) := tag(4 * i + 3, 4 * i)
+    }
+    val r = Wire(new VAddrTag())
+    r.valid := v.valid
+    r.addr := v.addr
+    r.tag := VecAt(tagm, addrm)
+    r
+  }
+
+  for (i <- 0 until 4) {
+    e(i) := d(i).io.out
+    e(i).vs := TagAddr(tags(i), d(i).io.out.vs)
+    e(i).vt := TagAddr(tags(i), d(i).io.out.vt)
+    e(i).vu := TagAddr(tags(i), d(i).io.out.vu)
+    e(i).vx := TagAddr(tags(i), d(i).io.out.vx)
+    e(i).vy := TagAddr(tags(i), d(i).io.out.vy)
+    e(i).vz := TagAddr(tags(i), d(i).io.out.vz)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Undef. (io.in.ready ignored to signal as early as possible)
+  io.undef := io.in.valid && (d(0).io.undef || d(1).io.undef || d(2).io.undef || d(3).io.undef)
+
+  // ---------------------------------------------------------------------------
+  // Fifo.
+  f.io.in <> io.in
+
+  // Count of lanes arriving this cycle.
+  val icount = MuxOR(io.in.valid, PopCount(Cat(io.in.bits(0).valid, io.in.bits(1).valid, io.in.bits(2).valid, io.in.bits(3).valid)))
+  assert(icount.getWidth == 3)
+
+  // Count of output-register entries retained (valid and not dispatching).
+  val ocount = PopCount(Cat(valid(0) && !(io.out(0).valid && io.out(0).ready),
+                            valid(1) && !(io.out(1).valid && io.out(1).ready),
+                            valid(2) && !(io.out(2).valid && io.out(2).ready),
+                            valid(3) && !(io.out(3).valid && io.out(3).ready)))
+  assert(ocount.getWidth == 3)
+
+  // Pop only as many fifo entries as free output slots after retention.
+  for (i <- 0 until 4) {
+    f.io.out(i).ready := (i.U + ocount) < 4.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Valid.
+  val fcount = PopCount(Cat(f.io.out(0).valid && f.io.out(0).ready,
+                            f.io.out(1).valid && f.io.out(1).ready,
+                            f.io.out(2).valid && f.io.out(2).ready,
+                            f.io.out(3).valid && f.io.out(3).ready))
+  assert(fcount.getWidth == 3)
+
+  // valid(i): retained plus newly-accepted entries exceed slot index i.
+  for (i <- 0 until 4) {
+    valid(i) := (ocount + fcount) > i.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Stall.
+  io.stall := (f.io.count + icount) > (depth - guard).U
+
+  // ---------------------------------------------------------------------------
+  // Dependencies.
+  val depends = Wire(Vec(4, Bool()))
+
+  // Writes must not proceed past any outstanding reads or writes,
+  // or past any dispatching writes.
+  val wactive0 = io.vrfsb.data(63, 0) | io.vrfsb.data(127, 64) | io.active
+  val wactive1 = actv(0).ractive | actv(0).wactive | wactive0
+  val wactive2 = actv(1).ractive | actv(1).wactive | wactive1
+  val wactive3 = actv(2).ractive | actv(2).wactive | wactive2
+  val wactive = VecInit(wactive0, wactive1, wactive2, wactive3)
+
+  // Reads must not proceed past any dispatching writes.
+  val ractive0 = 0.U(64.W)
+  val ractive1 = actv(0).wactive | ractive0
+  val ractive2 = actv(1).wactive | ractive1
+  val ractive3 = actv(2).wactive | ractive2
+  val ractive = VecInit(ractive0, ractive1, ractive2, ractive3)
+
+  for (i <- 0 until 4) {
+    depends(i) := (wactive(i) & actv(i).wactive) =/= 0.U ||
+                  (ractive(i) & actv(i).ractive) =/= 0.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Data.
+  // Fifo outputs must be packed from slot 0 (no holes).
+  val fvalid = VecInit(f.io.out(0).valid, f.io.out(1).valid,
+                       f.io.out(2).valid, f.io.out(3).valid).asUInt
+  assert(!(fvalid(1) && fvalid(0,0) =/= 1.U))
+  assert(!(fvalid(2) && fvalid(1,0) =/= 3.U))
+  assert(!(fvalid(3) && fvalid(2,0) =/= 7.U))
+
+  // Register is updated when fifo has state or contents are active.
+  val dataEn = fvalid(0) || valid.asUInt =/= 0.U
+
+  for (i <- 0 until 4) {
+    when (dataEn) {
+      data(i) := dataNxt(i)
+      cmdq(i) := cmdqNxt(i)
+      actv2(i) := actvNxt(i)
+    }
+  }
+
+  // Collapse the 128-bit even/odd tagged write mask back to 64 bits.
+  for (i <- 0 until 4) {
+    actv(i).ractive := actv2(i).ractive
+    actv(i).wactive := actv2(i).wactive(63, 0) | actv2(i).wactive(127, 64)
+  }
+
+  // Tag the decode wactive.
+  val dactv = Wire(Vec(4, new VDecodeActive2))
+  for (i <- 0 until 4) {
+    val w0 = d(i).io.actv.wactive & ~tags(i + 1)
+    val w1 = d(i).io.actv.wactive & tags(i + 1)
+    dactv(i).ractive := d(i).io.actv.ractive
+    dactv(i).wactive := Cat(w1, w0)
+  }
+
+  // Data multiplexor of current values and fifo+decode output.
+  // Sources 0-3 are the held registers, 4-7 the freshly decoded entries.
+  val dataMux = VecInit(data(0), data(1), data(2), data(3),
+                        e(0), e(1), e(2), e(3))
+
+  val cmdqMux = VecInit(cmdq(0), cmdq(1), cmdq(2), cmdq(3),
+                        d(0).io.cmdq, d(1).io.cmdq, d(2).io.cmdq, d(3).io.cmdq)
+
+  val actvMux = VecInit(actv2(0), actv2(1), actv2(2), actv2(3),
+                        dactv(0), dactv(1), dactv(2), dactv(3))
+
+  // Mark the multiplexor entries that need to be kept.
+  // markedN is a one-hot-prefix mask of mux sources consumed by slots <= N,
+  // so each later slot selects the next not-yet-consumed source (compaction).
+  val marked0 = Wire(UInt(5.W))
+  val marked1 = Wire(UInt(6.W))
+  val marked2 = Wire(UInt(7.W))
+
+  // Masks must be monotonic: each stage consumes a superset prefix.
+  assert((marked1 & marked0) === marked0)
+  assert((marked2 & marked0) === marked0)
+  assert((marked2 & marked1) === marked1)
+
+  val output = Cat(io.out(3).valid && io.out(3).ready,
+                   io.out(2).valid && io.out(2).ready,
+                   io.out(1).valid && io.out(1).ready,
+                   io.out(0).valid && io.out(0).ready)
+
+  // Slot 0: first retained entry, else first decoded entry.
+  when (valid(0) && !output(0)) {
+    dataNxt(0) := dataMux(0)
+    cmdqNxt(0) := cmdqMux(0)
+    actvNxt(0) := actvMux(0)
+    marked0 := 0x01.U
+  } .elsewhen (valid(1) && !output(1)) {
+    dataNxt(0) := dataMux(1)
+    cmdqNxt(0) := cmdqMux(1)
+    actvNxt(0) := actvMux(1)
+    marked0 := 0x03.U
+  } .elsewhen (valid(2) && !output(2)) {
+    dataNxt(0) := dataMux(2)
+    cmdqNxt(0) := cmdqMux(2)
+    actvNxt(0) := actvMux(2)
+    marked0 := 0x07.U
+  } .elsewhen (valid(3) && !output(3)) {
+    dataNxt(0) := dataMux(3)
+    cmdqNxt(0) := cmdqMux(3)
+    actvNxt(0) := actvMux(3)
+    marked0 := 0x0f.U
+  } .otherwise {
+    dataNxt(0) := dataMux(4)
+    cmdqNxt(0) := cmdqMux(4)
+    actvNxt(0) := actvMux(4)
+    marked0 := 0x1f.U
+  }
+
+  // Slot 1: next unconsumed source after slot 0's pick.
+  when (!marked0(1) && valid(1) && !output(1)) {
+    dataNxt(1) := dataMux(1)
+    cmdqNxt(1) := cmdqMux(1)
+    actvNxt(1) := actvMux(1)
+    marked1 := 0x03.U
+  } .elsewhen (!marked0(2) && valid(2) && !output(2)) {
+    dataNxt(1) := dataMux(2)
+    cmdqNxt(1) := cmdqMux(2)
+    actvNxt(1) := actvMux(2)
+    marked1 := 0x07.U
+  } .elsewhen (!marked0(3) && valid(3) && !output(3)) {
+    dataNxt(1) := dataMux(3)
+    cmdqNxt(1) := cmdqMux(3)
+    actvNxt(1) := actvMux(3)
+    marked1 := 0x0f.U
+  } .elsewhen (!marked0(4)) {
+    dataNxt(1) := dataMux(4)
+    cmdqNxt(1) := cmdqMux(4)
+    actvNxt(1) := actvMux(4)
+    marked1 := 0x1f.U
+  } .otherwise {
+    dataNxt(1) := dataMux(5)
+    cmdqNxt(1) := cmdqMux(5)
+    actvNxt(1) := actvMux(5)
+    marked1 := 0x3f.U
+  }
+
+  // Slot 2.
+  when (!marked1(2) && valid(2) && !output(2)) {
+    dataNxt(2) := dataMux(2)
+    cmdqNxt(2) := cmdqMux(2)
+    actvNxt(2) := actvMux(2)
+    marked2 := 0x07.U
+  } .elsewhen (!marked1(3) && valid(3) && !output(3)) {
+    dataNxt(2) := dataMux(3)
+    cmdqNxt(2) := cmdqMux(3)
+    actvNxt(2) := actvMux(3)
+    marked2 := 0x0f.U
+  } .elsewhen (!marked1(4)) {
+    dataNxt(2) := dataMux(4)
+    cmdqNxt(2) := cmdqMux(4)
+    actvNxt(2) := actvMux(4)
+    marked2 := 0x1f.U
+  } .elsewhen (!marked1(5)) {
+    dataNxt(2) := dataMux(5)
+    cmdqNxt(2) := cmdqMux(5)
+    actvNxt(2) := actvMux(5)
+    marked2 := 0x3f.U
+  } .otherwise {
+    dataNxt(2) := dataMux(6)
+    cmdqNxt(2) := cmdqMux(6)
+    actvNxt(2) := actvMux(6)
+    marked2 := 0x7f.U
+  }
+
+  // Slot 3 (no mask needed past the last slot).
+  when (!marked2(3) && valid(3) && !output(3)) {
+    dataNxt(3) := dataMux(3)
+    cmdqNxt(3) := cmdqMux(3)
+    actvNxt(3) := actvMux(3)
+  } .elsewhen (!marked2(4)) {
+    dataNxt(3) := dataMux(4)
+    cmdqNxt(3) := cmdqMux(4)
+    actvNxt(3) := actvMux(4)
+  } .elsewhen (!marked2(5)) {
+    dataNxt(3) := dataMux(5)
+    cmdqNxt(3) := cmdqMux(5)
+    actvNxt(3) := actvMux(5)
+  } .elsewhen (!marked2(6)) {
+    dataNxt(3) := dataMux(6)
+    cmdqNxt(3) := cmdqMux(6)
+    actvNxt(3) := actvMux(6)
+  } .otherwise {
+    dataNxt(3) := dataMux(7)
+    cmdqNxt(3) := cmdqMux(7)
+    actvNxt(3) := actvMux(7)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Scoreboard.
+  // Mark write registers of all dispatching slots as busy.
+  io.vrfsb.set.valid := output(0) || output(1) || output(2) || output(3)
+
+  io.vrfsb.set.bits := (MuxOR(output(0), actv2(0).wactive) |
+                        MuxOR(output(1), actv2(1).wactive) |
+                        MuxOR(output(2), actv2(2).wactive) |
+                        MuxOR(output(3), actv2(3).wactive))
+
+  // A register may be busy on only one tag side, and must not already be busy.
+  assert((io.vrfsb.set.bits(63, 0) & io.vrfsb.set.bits(127, 64)) === 0.U)
+  assert(((io.vrfsb.data(63, 0) | io.vrfsb.data(127, 64)) & (io.vrfsb.set.bits(63, 0) | io.vrfsb.set.bits(127, 64))) === 0.U)
+
+  // ---------------------------------------------------------------------------
+  // Outputs.
+  val outvalid = Wire(Vec(4, Bool()))
+  val cmdsync = Wire(Vec(4, Bool()))
+
+  for (i <- 0 until 4) {
+    outvalid(i) := valid(i) && !depends(i)
+    cmdsync(i) := data(i).cmdsync
+  }
+
+  for (i <- 0 until 4) {
+    // Synchronize commands at cmdsync instance or if found in history.
+    // Note: {vdwinit, vdwconv, vdmulh}, vdmulh must not issue before vdwconv.
+    val synchronize = cmdsync.asUInt(i,0) =/= 0.U
+    val ordered = (~outvalid.asUInt(i,0)) === 0.U
+    val unorder = outvalid(i)
+    // NOTE(review): the unordered path is compiled out; all slots currently
+    // issue strictly in order.
+    if (false) {
+      io.out(i).valid := Mux(synchronize, ordered, unorder)
+    } else {
+      io.out(i).valid := ordered
+    }
+    io.out(i).bits := data(i)
+    io.cmdq(i) := cmdq(i)
+    io.actv(i) := actv(i)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Status.
+  val nempty = RegInit(false.B)
+
+  // Simple implementation, will overlap downstream units redundantly.
+  nempty := io.in.valid || f.io.nempty || valid.asUInt =/= 0.U
+
+  io.nempty := nempty
+}
+
+// Decoded vector instruction payload handed to the execution command queues.
+class VDecodeBits extends Bundle {
+  val op = UInt(new VEncodeOp().bits.W)
+  val f2 = UInt(3.W)  // func2
+  val sz = UInt(3.W)  // onehot size
+  val m = Bool()      // stripmine
+
+  // Write-side register addresses (no scoreboard tag needed).
+  val vd = new VAddr()
+  val ve = new VAddr()
+  val vf = new VAddr()
+  val vg = new VAddr()
+  // Read-side register addresses, tagged by VDecode for scoreboarding.
+  val vs = new VAddrTag()
+  val vt = new VAddrTag()
+  val vu = new VAddrTag()
+  val vx = new VAddrTag()
+  val vy = new VAddrTag()
+  val vz = new VAddrTag()
+  // Scalar operand (address/data pair).
+  val sv = new SAddrData()
+
+  val cmdsync = Bool()  // Dual command queues synchronize.
+}
+
+// Routing flags: which execution command queue(s) the instruction targets.
+class VDecodeCmdq extends Bundle {
+  val alu  = Bool()  // ALU
+  val conv = Bool()  // Convolution vregfile
+  val ldst = Bool()  // L1Dcache load/store
+  val ld   = Bool()  // Uncached load
+  val st   = Bool()  // Uncached store
+}
+
+// Register-usage masks: one bit per vector register (64 total).
+class VDecodeActive extends Bundle {
+  val ractive = UInt(64.W)  // registers read by the instruction
+  val wactive = UInt(64.W)  // registers written by the instruction
+}
+
+// As VDecodeActive, but the write mask is split across even/odd scoreboard
+// tag halves (low 64 = tag 0, high 64 = tag 1).
+class VDecodeActive2 extends Bundle {
+  val ractive = UInt(64.W)
+  val wactive = UInt(128.W)  // even/odd tags
+}
+
+// Vector register address: valid flag plus 6-bit register number.
+class VAddr extends Bundle {
+  val valid = Bool()
+  val addr = UInt(6.W)
+}
+
+// Vector register address plus the scoreboard tag nibble attached by
+// VDecode.TagAddr (zero until tagged).
+class VAddrTag extends Bundle {
+  val valid = Bool()
+  val addr = UInt(6.W)
+  val tag = UInt(4.W)
+}
+
+// Scalar operand: 32-bit address/data pair with valid flag.
+class SAddrData extends Bundle {
+  val valid = Bool()
+  val addr = UInt(32.W)
+  val data = UInt(32.W)
+}
+
+// Scalar data word with valid flag.
+class SData extends Bundle {
+  val valid = Bool()
+  val data = UInt(32.W)
+}
+
+// Standalone elaboration entry point: emits Verilog for VDecode.
+// ChiselStage arguments (e.g. --target-dir) are forwarded from the command line.
+object EmitVDecode extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VDecode(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VDecodeInstruction.scala b/hdl/chisel/src/kelvin/vector/VDecodeInstruction.scala
new file mode 100644
index 0000000..33dc5f2
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VDecodeInstruction.scala
@@ -0,0 +1,623 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+class VDecodeInstruction(p: Parameters) extends Module {
+ val dec = new VDecodeOp()
+ val enc = new VEncodeOp()
+
+ val io = IO(new Bundle {
+ val in = new Bundle {
+ val inst = Input(UInt(32.W))
+ val addr = Input(UInt(32.W))
+ val data = Input(UInt(32.W))
+ }
+ val out = Output(new VDecodeBits)
+ val cmdq = Output(new VDecodeCmdq)
+ val actv = Output(new VDecodeActive)
+ val undef = Output(Bool())
+ })
+
+ val inst = io.in.inst
+ val addr = io.in.addr
+ val data = io.in.data
+
+ val v = inst(0) // .vv .vx
+ val x = inst(1) // .vx
+ val x3 = inst(2) // .vxv
+ val func1 = inst(4,2)
+ val m = inst(5)
+ val sz = inst(13,12)
+ val func2 = inst(31,26)
+
+ val vdbits = inst(11,6)
+ val vsbits = inst(19,14)
+ val vtbits = inst(25,20)
+ val vubits = inst(31,26)
+
+ val quad = m && x // dual issue across ALUs
+
+ val uncached = addr(31)
+
+ def DecodeFmt(f1: Int, f2: Int, mask: Int = 0): Bool = {
+ assert(inst.getWidth == 32)
+ val m2 = ~mask.U(6.W) // unsigned, rounding, ...
+ v === 0.U && func1 === f1.U && (func2 & m2) === (f2.U & m2) && sz < 3.U
+ }
+
+ def ToM(a: UInt): UInt = {
+ val bbits = Wire(Vec(16, UInt(4.W)))
+ for (i <- 0 until 16) {
+ val v = a(i)
+ bbits(i) := Cat(v, v, v, v)
+ }
+ val b = bbits.asUInt
+ assert(a.getWidth == 16)
+ assert(b.getWidth == 64)
+ b
+ }
+
+ def RActiveVsVt(i: Int): UInt = {
+ assert(i == 2 || i == 3)
+ val vs = OneHot(vsbits, 64)
+ val vsm = MuxOR(m, ToM(OneHot(vsbits(5,2), 16)))
+ val vt =
+ if (i == 2) {
+ MuxOR(!x, OneHot(vtbits, 64))
+ } else {
+ MuxOR(!x3, OneHot(vtbits, 64))
+ }
+ val vtm =
+ if (i == 2) {
+ MuxOR(m && !x, ToM(OneHot(vtbits(5,2), 16)))
+ } else {
+ MuxOR(m && !x3, ToM(OneHot(vtbits(5,2), 16)))
+ }
+ assert(vs.getWidth == 64)
+ assert(vt.getWidth == 64)
+ assert(vsm.getWidth == 64)
+ assert(vtm.getWidth == 64)
+ vs | vsm | vt | vtm
+ }
+
+ def RActiveVs1(): UInt = {
+ // {vs+1} or {vsm+4}
+ val vs = Cat(OneHot(vsbits, 64), 0.U(1.W))(63,0)
+ val vsm = MuxOR(m, Cat(ToM(OneHot(vsbits(5,2), 16)), 0.U(4.W))(63,0))
+ assert(vs.getWidth == 64)
+ assert(vsm.getWidth == 64)
+ vs | vsm
+ }
+
+ def RActiveVs2(): UInt = {
+ // {vs+2} or {vsm+8}
+ val vs = Cat(OneHot(vsbits, 64), 0.U(2.W))(63,0)
+ val vsm = MuxOR(m, Cat(ToM(OneHot(vsbits(5,2), 16)), 0.U(8.W))(63,0))
+ assert(vs.getWidth == 64)
+ assert(vsm.getWidth == 64)
+ vs | vsm
+ }
+
+ def RActiveVs3(): UInt = {
+ // {vs+3} or {vsm+12}
+ val vs = Cat(OneHot(vsbits, 64), 0.U(3.W))(63,0)
+ val vsm = MuxOR(m, Cat(ToM(OneHot(vsbits(5,2), 16)), 0.U(12.W))(63,0))
+ assert(vs.getWidth == 64)
+ assert(vsm.getWidth == 64)
+ vs | vsm
+ }
+
+ def RActiveVd(): UInt = {
+ val vd = OneHot(vdbits, 64)
+ val vdm = MuxOR(m, ToM(OneHot(vdbits(5,2), 16)))
+ assert(vd.getWidth == 64)
+ assert(vdm.getWidth == 64)
+ vd | vdm
+ }
+
+ def RActiveVu(): UInt = {
+ val vu = OneHot(vubits, 64)
+ val vum = MuxOR(m, ToM(OneHot(vubits(5,2), 16)))
+ assert(vu.getWidth == 64)
+ assert(vum.getWidth == 64)
+ vu | vum
+ }
+
+ def WActiveVd(): UInt = {
+ val vd = OneHot(vdbits, 64)
+ val vdm = MuxOR(m, ToM(OneHot(vdbits(5,2), 16)))
+ assert(vd.getWidth == 64)
+ assert(vdm.getWidth == 64)
+ vd | vdm
+ }
+
+ def WActiveVd1(): UInt = {
+ // {vd+1} or {vdm+4}
+ val vd = Cat(OneHot(vdbits, 64), 0.U(1.W))(63,0)
+ val vdm = MuxOR(m, Cat(ToM(OneHot(vdbits(5,2), 16)), 0.U(4.W))(63,0))
+ assert(vd.getWidth == 64)
+ assert(vdm.getWidth == 64)
+ vd | vdm
+ }
+
+ def DepthwiseRead(): (UInt, UInt, UInt, UInt, UInt, UInt, UInt) = {
+ val vstbl = VecInit(0.U, 1.U, 2.U, 3.U, 4.U, 5.U, 6.U, 1.U, 1.U, 3.U, 5.U, 7.U, 2.U, 4.U, 6.U, 8.U)
+ val vttbl = VecInit(1.U, 2.U, 3.U, 4.U, 5.U, 6.U, 7.U, 0.U, 2.U, 4.U, 6.U, 8.U, 0.U, 0.U, 0.U, 0.U)
+ val vutbl = VecInit(2.U, 3.U, 4.U, 5.U, 6.U, 7.U, 8.U, 2.U, 0.U, 0.U, 0.U, 0.U, 1.U, 1.U, 1.U, 1.U)
+
+ val regbase = data(7,4)
+
+ val vs = vsbits + vstbl(regbase)
+ val vt = vsbits + vttbl(regbase)
+ val vu = vsbits + vutbl(regbase)
+ assert(vs.getWidth == 6)
+ assert(vt.getWidth == 6)
+ assert(vu.getWidth == 6)
+
+ val vx = vubits
+ val vy = vubits + Mux(m, 4.U, 1.U)
+ val vz = vubits + Mux(m, 8.U, 2.U)
+ assert(vx.getWidth == 6)
+ assert(vy.getWidth == 6)
+ assert(vz.getWidth == 6)
+
+ val ra_vs = OneHot(vs, 64)
+ val ra_vt = OneHot(vt, 64)
+ val ra_vu = OneHot(vu, 64)
+ val ra_vx = OneHot(vx, 64)
+ val ra_vy = OneHot(vy, 64)
+ val ra_vz = OneHot(vz, 64)
+ val ra_vxm = MuxOR(m, ToM(OneHot(vx(5,2), 16)))
+ val ra_vym = MuxOR(m, ToM(OneHot(vy(5,2), 16)))
+ val ra_vzm = MuxOR(m, ToM(OneHot(vz(5,2), 16)))
+ assert(ra_vs.getWidth == 64)
+ assert(ra_vt.getWidth == 64)
+ assert(ra_vu.getWidth == 64)
+ assert(ra_vx.getWidth == 64)
+ assert(ra_vy.getWidth == 64)
+ assert(ra_vz.getWidth == 64)
+ assert(ra_vxm.getWidth == 64)
+ assert(ra_vym.getWidth == 64)
+ assert(ra_vzm.getWidth == 64)
+
+ val ractive = ra_vs | ra_vt | ra_vu | ra_vx | ra_vy | ra_vz | ra_vxm | ra_vym | ra_vzm
+ assert(ractive.getWidth == 64)
+
+ (vs, vt, vu, vx, vy, vz, ractive)
+ }
+
+ def SlideRead(): (UInt, UInt, UInt, UInt, UInt, UInt, UInt) = {
+ val s = func2(3) // next(0) previous(1)
+ val vs = Mux(s, vsbits + 3.U, vsbits + 0.U)
+ val vt = Mux(s, vtbits + 0.U, vsbits + 1.U)
+ val vu = Mux(s, vtbits + 1.U, vsbits + 2.U)
+ val vx = Mux(s, vtbits + 1.U, vsbits + 2.U)
+ val vy = Mux(s, vtbits + 2.U, vsbits + 3.U)
+ val vz = Mux(s, vtbits + 3.U, vtbits + 0.U)
+ assert(vs.getWidth == 6)
+ assert(vt.getWidth == 6)
+ assert(vu.getWidth == 6)
+ assert(vx.getWidth == 6)
+ assert(vy.getWidth == 6)
+ assert(vz.getWidth == 6)
+
+ val ra_vs = OneHot(vs, 64)
+ val ra_vt = MuxOR(!x || !s, OneHot(vt, 64))
+ val ra_vu = MuxOR(!x || !s, OneHot(vu, 64))
+ val ra_vx = MuxOR(!x || !s, OneHot(vx, 64))
+ val ra_vy = MuxOR(!x || !s, OneHot(vy, 64))
+ val ra_vz = MuxOR(!x, OneHot(vz, 64))
+ assert(ra_vs.getWidth == 64)
+ assert(ra_vt.getWidth == 64)
+ assert(ra_vu.getWidth == 64)
+ assert(ra_vx.getWidth == 64)
+ assert(ra_vy.getWidth == 64)
+ assert(ra_vz.getWidth == 64)
+
+ val ractive = ra_vs | ra_vt | ra_vu | ra_vx | ra_vy | ra_vz
+ assert(ractive.getWidth == 64)
+
+ (vs, vt, vu, vx, vy, vz, ractive)
+ }
+
+ // ---------------------------------------------------------------------------
+ // Decode the instruction bits.
+
+ // Duplicate
+ val vdup = DecodeBits(inst, "01000x_0xxxxx_000000_xx_xxxxxx_x_111_11") && sz < 3.U
+ val vdupf2 = inst(31,27) === 8.U // used to prevent vdup and vldst op collision only
+
+ // Load/Store
+ val vldstdec = DecodeBits(inst, "xxxxxx_0xxxxx_xxxxx0_xx_xxxxxx_x_111_11") && sz < 3.U && !vdupf2
+ assert(!(vdup && vldstdec))
+
+ val vld = vldstdec && (func2 === 0.U || func2 === 1.U || func2 === 2.U ||
+ func2 === 4.U || func2 === 5.U || func2 === 6.U ||
+ func2 === 7.U)
+
+ val vst = vldstdec && (func2 === 8.U || func2 === 9.U || func2 === 10.U ||
+ func2 === 12.U || func2 === 13.U || func2 === 14.U ||
+ func2 === 15.U)
+
+ val vstq = vldstdec && (func2 === 26.U || func2 === 30.U)
+
+ val vldst = vld || vst || vstq
+
+ // Format0
+ val vadd = DecodeFmt(0, dec.vadd)
+ val vsub = DecodeFmt(0, dec.vsub)
+ val vrsub = DecodeFmt(0, dec.vrsub)
+ val veq = DecodeFmt(0, dec.veq)
+ val vne = DecodeFmt(0, dec.vne)
+ val vlt = DecodeFmt(0, dec.vlt, 1)
+ val vle = DecodeFmt(0, dec.vle, 1)
+ val vgt = DecodeFmt(0, dec.vgt, 1)
+ val vge = DecodeFmt(0, dec.vge, 1)
+ val vabsd = DecodeFmt(0, dec.vabsd, 1)
+ val vmax = DecodeFmt(0, dec.vmax, 1)
+ val vmin = DecodeFmt(0, dec.vmin, 1)
+ val vadd3 = DecodeFmt(0, dec.vadd3)
+
+ val vfmt0 = vadd || vsub || vrsub || veq || vne || vlt || vle || vgt || vge || vabsd || vmax || vmin || vadd3
+
+ // Format1
+ val vand = DecodeFmt(1, dec.vand)
+ val vor = DecodeFmt(1, dec.vor)
+ val vxor = DecodeFmt(1, dec.vxor)
+ val vnot = DecodeFmt(1, dec.vnot)
+ val vrev = DecodeFmt(1, dec.vrev)
+ val vror = DecodeFmt(1, dec.vror)
+ val vclb = DecodeFmt(1, dec.vclb)
+ val vclz = DecodeFmt(1, dec.vclz)
+ val vcpop = DecodeFmt(1, dec.vcpop)
+ val vmv = DecodeFmt(1, dec.vmv) && !quad
+ val vmv2 = DecodeFmt(1, dec.vmv) && quad
+ val vmvp = DecodeFmt(1, dec.vmvp)
+
+ val vfmt1 = vand || vor || vxor || vnot || vrev || vror || vclb || vclz || vcpop || vmv || vmv2 || vmvp
+
+ // do not include in 'vfmt1'
+ val acset = DecodeFmt(1, dec.acset) && x && !m && vtbits === 0.U
+ val actr = DecodeFmt(1, dec.actr) && x && !m && vtbits === 0.U
+ val adwinit = DecodeFmt(1, dec.adwinit)
+
+ // Format2
+ val vsll = DecodeFmt(2, dec.vsll)
+ val vsra = DecodeFmt(2, dec.vsra)
+ val vsrl = DecodeFmt(2, dec.vsrl)
+ val vsha = DecodeFmt(2, dec.vsha, 2)
+ val vshl = DecodeFmt(2, dec.vshl, 2)
+ val vsrans = DecodeFmt(2, dec.vsrans, 3)
+ val vsraqs = DecodeFmt(2, dec.vsraqs, 3)
+
+ val vfmt2 = vsll || vsra || vsrl || vsha || vshl || vsrans || vsraqs
+
+ // Format3
+ val vmul = DecodeFmt(3, dec.vmul) && !quad
+ val vmul2 = DecodeFmt(3, dec.vmul) && quad
+ val vmuls = DecodeFmt(3, dec.vmuls, 1) && !quad
+ val vmuls2 = DecodeFmt(3, dec.vmuls, 1) && quad
+ val vmulh = DecodeFmt(3, dec.vmulh, 2) && !quad
+ val vmulh2 = DecodeFmt(3, dec.vmulh, 2) && quad
+ val vmulhu = DecodeFmt(3, dec.vmulhu, 2) && !quad
+ val vmulhu2 = DecodeFmt(3, dec.vmulhu, 2) && quad
+ val vdmulh = DecodeFmt(3, dec.vdmulh, 3) && !quad
+ val vdmulh2 = DecodeFmt(3, dec.vdmulh, 3) && quad
+ val vmulw = DecodeFmt(3, dec.vmulw, 1)
+ val vmacc = DecodeFmt(3, dec.vmacc)
+ val vmadd = DecodeFmt(3, dec.vmadd)
+
+ val vfmt3 = vmul || vmul2 || vmuls || vmuls2 || vmulh || vmulh2 || vmulhu || vmulhu2 || vdmulh || vdmulh2 || vmulw || vmacc || vmadd
+
+ // Format4
+ val vadds = DecodeFmt(4, dec.vadds, 1)
+ val vsubs = DecodeFmt(4, dec.vsubs, 1)
+ val vaddw = DecodeFmt(4, dec.vaddw, 1)
+ val vsubw = DecodeFmt(4, dec.vsubw, 1)
+ val vacc = DecodeFmt(4, dec.vacc, 1)
+ val vpadd = DecodeFmt(4, dec.vpadd, 1)
+ val vpsub = DecodeFmt(4, dec.vpsub, 1)
+ val vhadd = DecodeFmt(4, dec.vhadd, 3)
+ val vhsub = DecodeFmt(4, dec.vhsub, 3)
+
+ val vfmt4 = vadds || vsubs || vaddw || vsubw || vacc || vpadd || vpsub || vhadd || vhsub
+
+ // Format6
+ val vslidevn = DecodeFmt(6, dec.vslidevn, 3)
+ val vslidehn = DecodeFmt(6, dec.vslidehn, 3) && !m
+ val vslidehn2 = DecodeFmt(6, dec.vslidehn, 3) && m
+ val vslidevp = DecodeFmt(6, dec.vslidevp, 3)
+ val vslidehp = DecodeFmt(6, dec.vslidehp, 3) && !m
+ val vslidehp2 = DecodeFmt(6, dec.vslidehp, 3) && m
+ val vsel = DecodeFmt(6, dec.vsel)
+ val vevn = DecodeFmt(6, dec.vevn)
+ val vodd = DecodeFmt(6, dec.vodd)
+ val vevnodd = DecodeFmt(6, dec.vevnodd)
+ val vzip = DecodeFmt(6, dec.vzip)
+
+ val vslideh2 = vslidehn2 || vslidehp2
+ val vevn3 = vevn || vevnodd || vodd
+
+ val vfmt6 = vslidevn | vslidehn | vslidehn2 | vslidevp | vslidehp | vslidehp2 | vsel | vevn | vodd | vevnodd | vzip
+
+ // FormatVVV
+ val aconv = DecodeBits(inst, "xxxxxx_1xxxxx_xxxxxx_10_xxxxxx_0_00_101")
+ val vcget = DecodeBits(inst, "010100_000000_000000_xx_xxxxxx_x_111_11")
+
+ val vdwconv = DecodeBits(inst, "xxxxxx_0xxxxx_xxxxxx_10_xxxxxx_x_10_101")
+ val adwconv = DecodeBits(inst, "xxxxxx_1xxxxx_xxxxxx_10_xxxxxx_x_10_101")
+ val vadwconv = vdwconv || adwconv
+
+ // Undef
+ val vopbits = Cat(
+ // Duplicate
+ vdup,
+ // Load/Store
+ vld, vst, vstq,
+ // Misc
+ vcget,
+ // Format0
+ vadd, vsub, vrsub, veq, vne, vlt, vle, vgt, vge, vabsd, vmax, vmin, vadd3,
+ // Format1
+ vand, vor, vxor, vnot, vrev, vror, vclb, vclz, vcpop, vmv, vmv2, vmvp, acset, actr, adwinit,
+ // Format2
+ vsll, vsra, vsrl, vsha, vshl, vsrans, vsraqs,
+ // Format3
+ vmul, vmul2, vmuls, vmuls2, vmulh, vmulh2, vmulhu, vmulhu2, vdmulh, vdmulh2, vmulw, vmacc, vmadd,
+ // Format4
+ vadds, vsubs, vaddw, vsubw, vacc, vpadd, vpsub, vhadd, vhsub,
+ // Format6
+ vslidevn, vslidehn, vslidehn2, vslidevp, vslidehp, vslidehp2, vsel, vevn, vodd, vevnodd, vzip,
+ // FormatVVV
+ aconv, vdwconv, adwconv)
+
+ val undef = !WiredOR(vopbits)
+ assert(PopCount(Cat(vopbits, undef)) === 1.U)
+
+ // Encode the opcode.
+ val op =
+ // Duplicate
+ MuxOR(vdup, enc.vdup.U) |
+ // Load/Store
+ MuxOR(vld, enc.vld.U) |
+ MuxOR(vst, enc.vst.U) |
+ MuxOR(vstq, enc.vstq.U) |
+ // Misc
+ MuxOR(vcget, enc.vcget.U) |
+ // Format0
+ MuxOR(vadd, enc.vadd.U) |
+ MuxOR(vsub, enc.vsub.U) |
+ MuxOR(vrsub, enc.vrsub.U) |
+ MuxOR(veq, enc.veq.U) |
+ MuxOR(vne, enc.vne.U) |
+ MuxOR(vlt, enc.vlt.U) |
+ MuxOR(vle, enc.vle.U) |
+ MuxOR(vgt, enc.vgt.U) |
+ MuxOR(vge, enc.vge.U) |
+ MuxOR(vabsd, enc.vabsd.U) |
+ MuxOR(vmax, enc.vmax.U) |
+ MuxOR(vmin, enc.vmin.U) |
+ MuxOR(vadd3, enc.vadd3.U) |
+ // Format1
+ MuxOR(vand, enc.vand.U) |
+ MuxOR(vor, enc.vor.U) |
+ MuxOR(vxor, enc.vxor.U) |
+ MuxOR(vnot, enc.vnot.U) |
+ MuxOR(vrev, enc.vrev.U) |
+ MuxOR(vror, enc.vror.U) |
+ MuxOR(vclb, enc.vclb.U) |
+ MuxOR(vclz, enc.vclz.U) |
+ MuxOR(vcpop, enc.vcpop.U) |
+ MuxOR(vmv, enc.vmv.U) |
+ MuxOR(vmv2, enc.vmv2.U) |
+ MuxOR(vmvp, enc.vmvp.U) |
+ MuxOR(acset, enc.acset.U) |
+ MuxOR(actr, enc.actr.U) |
+ MuxOR(adwinit, enc.adwinit.U) |
+ // Format2
+ MuxOR(vsll, enc.vshl.U) |
+ MuxOR(vsra, enc.vshr.U) |
+ MuxOR(vsrl, enc.vshr.U) |
+ MuxOR(vsha, enc.vshf.U) |
+ MuxOR(vshl, enc.vshf.U) |
+ MuxOR(vsrans, enc.vsrans.U) |
+ MuxOR(vsraqs, enc.vsraqs.U) |
+ // Format3
+ MuxOR(vmul, enc.vmul.U) |
+ MuxOR(vmul2, enc.vmul2.U) |
+ MuxOR(vmuls, enc.vmuls.U) |
+ MuxOR(vmuls2, enc.vmuls2.U) |
+ MuxOR(vmulh, enc.vmulh.U) |
+ MuxOR(vmulh2, enc.vmulh2.U) |
+ MuxOR(vmulhu, enc.vmulh.U) |
+ MuxOR(vmulhu2, enc.vmulh2.U) |
+ MuxOR(vdmulh, enc.vdmulh.U) |
+ MuxOR(vdmulh2, enc.vdmulh2.U) |
+ MuxOR(vmulw, enc.vmulw.U) |
+ MuxOR(vmacc, enc.vmadd.U) |
+ MuxOR(vmadd, enc.vmadd.U) |
+ // Format4
+ MuxOR(vadds, enc.vadds.U) |
+ MuxOR(vsubs, enc.vsubs.U) |
+ MuxOR(vaddw, enc.vaddw.U) |
+ MuxOR(vsubw, enc.vsubw.U) |
+ MuxOR(vacc, enc.vacc.U) |
+ MuxOR(vpadd, enc.vpadd.U) |
+ MuxOR(vpsub, enc.vpsub.U) |
+ MuxOR(vhadd, enc.vhadd.U) |
+ MuxOR(vhsub, enc.vhsub.U) |
+ // Format6
+ MuxOR(vslidevn, enc.vslidevn.U) |
+ MuxOR(vslidehn, enc.vslidehn.U) |
+ MuxOR(vslidehn2, enc.vslidehn2.U) |
+ MuxOR(vslidevp, enc.vslidevp.U) |
+ MuxOR(vslidehp, enc.vslidehp.U) |
+ MuxOR(vslidehp2, enc.vslidehp2.U) |
+ MuxOR(vsel, enc.vsel.U) |
+ MuxOR(vevn, enc.vevn.U) |
+ MuxOR(vodd, enc.vodd.U) |
+ MuxOR(vevnodd, enc.vevnodd.U) |
+ MuxOR(vzip, enc.vzip.U) |
+ // FormatVVV
+ MuxOR(aconv, enc.aconv.U) |
+ MuxOR(vdwconv, enc.vdwconv.U) |
+ MuxOR(adwconv, enc.adwconv.U)
+
+ // Scalar.
+  // Replicate the scalar operand across a 32-bit word by element size:
+  // sz=0 broadcasts data[7:0] x4, sz=1 broadcasts data[15:0] x2,
+  // sz=2 passes data[31:0] through. Exactly one MuxOR term is non-zero.
+  def ScalarData(sz: UInt, data: UInt): UInt = {
+    assert(sz.getWidth == 2)
+    assert(data.getWidth == 32)
+    MuxOR(sz === 0.U, Cat(data(7,0), data(7,0), data(7,0), data(7,0))) |
+    MuxOR(sz === 1.U, Cat(data(15,0), data(15,0))) |
+    MuxOR(sz === 2.U, data(31,0))
+  }
+
+ // Depthwise read.
+ val (vsdw, vtdw, vudw, vxdw, vydw, vzdw, ractivedw) = DepthwiseRead()
+
+ val ractivedi = ToM(OneHot(vsbits(5,2), 16))
+ val wactivedw = ToM(OneHot(vdbits(5,2), 16))
+
+ // Slide composite read.
+ val (vssl, vtsl, vusl, vxsl, vysl, vzsl, ractivesl) = SlideRead()
+
+ // Convolution read/write.
+ val ractiveconv1 = Wire(UInt(64.W))
+ val ractiveconv2 = Wire(UInt(64.W))
+ val ractiveaset = Wire(UInt(64.W))
+ val wactiveconv = Wire(UInt(64.W))
+
+ // Narrow reads (vs) are aligned to 16 register base (v0, v16, v32, v48).
+ // Wide reads (vu) are aligned to SIMD width(4,8,16), assumes scalar control
+ // field does not access beyond this bounds.
+ if (p.vectorBits == 128) {
+ ractiveconv1 := 0x000f.U << Cat(vsbits(5,4), 0.U(4.W))
+ ractiveconv2 := 0x000f.U << Cat(vubits(5,2), 0.U(2.W))
+ ractiveaset := 0x000f.U << Cat(vsbits(5,2), 0.U(2.W))
+ wactiveconv := 0x000f.U << Cat(vdbits(5,4), 0.U(4.W))
+ } else if (p.vectorBits == 256) {
+ ractiveconv1 := 0x00ff.U << Cat(vsbits(5,4), 0.U(4.W))
+ ractiveconv2 := 0x00ff.U << Cat(vubits(5,3), 0.U(3.W))
+ ractiveaset := 0x00ff.U << Cat(vsbits(5,3), 0.U(3.W))
+ wactiveconv := 0x00ff.U << Cat(vdbits(5,4), 0.U(4.W))
+ } else if (p.vectorBits == 512) {
+ ractiveconv1 := 0xffff.U << Cat(vsbits(5,4), 0.U(4.W))
+ ractiveconv2 := 0xffff.U << Cat(vubits(5,4), 0.U(4.W))
+ ractiveaset := 0xffff.U << Cat(vsbits(5,4), 0.U(4.W))
+ wactiveconv := 0xffff.U << Cat(vdbits(5,4), 0.U(4.W))
+ } else {
+ assert(false);
+ }
+
+ // Outputs.
+ io.undef := undef
+
+ io.out.op := op
+ io.out.f2 := func2(2,0)
+ io.out.sz := Cat(sz === 2.U, sz === 1.U, sz === 0.U)
+ io.out.m := m && !vdmulh2 && !vmul2 && !vmulh2 && !vmulhu2 && !vmuls2 && !vmv2 && !vslidehn2 && !vslidehp2
+ io.out.cmdsync := adwinit || vadwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vslideh2 || vsraqs
+
+ io.out.vd.valid := vdwconv || vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 || vfmt6 || vld || vdup || vcget
+ io.out.ve.valid := vdwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vacc || vmv2 || vmvp || vmulw || vaddw || vsubw || vevnodd || vslideh2 || vzip
+ io.out.vf.valid := vdwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vslideh2
+ io.out.vg.valid := vdwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vslideh2
+ io.out.vs.valid := vadwconv || adwinit || vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 || vfmt6 || vst || vstq || aconv
+ io.out.vt.valid := vadwconv || adwinit || !x && (vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 || vfmt6)
+ io.out.vu.valid := vadwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vacc || vadd3 || vmacc || vmadd || aconv || vsrans || vsraqs || vsel || vslideh2 || m && vevn3
+ io.out.vx.valid := vadwconv || adwinit || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vslideh2 || vsraqs
+ io.out.vy.valid := vadwconv || adwinit || vslideh2 || !x && (vsraqs)
+ io.out.vz.valid := vadwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vslideh2 || vsraqs
+ io.out.sv.valid := x && (vdup || vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 || vfmt6)
+
+ io.out.vd.addr := vdbits
+ io.out.ve.addr := Mux(vodd, vdbits,
+ Mux(vadwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vslideh2 || vzip, vdbits + 1.U,
+ Mux(m, vdbits + 4.U, vdbits + 1.U)))
+ io.out.vf.addr := vdbits + 2.U
+ io.out.vg.addr := vdbits + 3.U
+ io.out.vs.addr := Mux(vadwconv, vsdw,
+ Mux(vslideh2, vssl,
+ Mux(vmadd || vst || vstq, vdbits,
+ vsbits)))
+ io.out.vt.addr := Mux(vadwconv, vtdw,
+ Mux(adwinit, vsbits + 1.U,
+ Mux(vslideh2, vtsl,
+ Mux(m && vevn3, vsbits + 1.U,
+ vtbits))))
+ io.out.vu.addr := Mux(vadwconv, vudw,
+ Mux(vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2, vsbits + 1.U,
+ Mux(vslideh2, vusl,
+ Mux(vacc || vsrans, Mux(m, vsbits + 4.U, vsbits + 1.U),
+ Mux(vsraqs, Mux(m, vsbits + 4.U, vsbits + 1.U),
+ Mux(vmacc || vadd3 || vsel, vdbits,
+ Mux(vmadd, vsbits,
+ Mux(vevn3, vtbits,
+ vubits))))))))
+ io.out.vx.addr := Mux(vadwconv, vxdw,
+ Mux(adwinit || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2, vsbits + 2.U,
+ Mux(vsraqs, Mux(m, vsbits + 8.U, vsbits + 2.U),
+ vxsl)))
+ io.out.vy.addr := Mux(vadwconv, vydw,
+ Mux(adwinit, vsbits + 3.U,
+ Mux(vsraqs, vtbits,
+ vysl)))
+ io.out.vz.addr := Mux(vadwconv, vzdw,
+ Mux(vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2, vsbits + 3.U,
+ Mux(vsraqs, Mux(m, vsbits + 12.U, vsbits + 3.U),
+ vzsl)))
+
+ io.out.vs.tag := 0.U
+ io.out.vt.tag := 0.U
+ io.out.vu.tag := 0.U
+ io.out.vx.tag := 0.U
+ io.out.vy.tag := 0.U
+ io.out.vz.tag := 0.U
+
+ io.out.sv.addr := addr
+ io.out.sv.data := Mux(vldstdec, data,
+ Mux(vaddw || vmulw || vsubw, ScalarData(sz - 1.U, data),
+ ScalarData(sz, data)))
+
+ assert(PopCount(io.out.sz) <= 1.U)
+ assert(!(io.out.vx.valid && !io.out.cmdsync))
+ assert(!(io.out.vy.valid && !io.out.cmdsync))
+ assert(!(io.out.vz.valid && !io.out.cmdsync))
+
+ io.cmdq.alu := vdup || vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 || vfmt6 || vadwconv || adwinit
+ io.cmdq.conv := aconv || vcget || acset || actr
+ io.cmdq.ldst := vldst && !uncached
+ io.cmdq.ld := vld && uncached
+ io.cmdq.st := (vst || vstq) && uncached
+
+ val cmdqchk = Cat(io.undef, io.cmdq.alu, io.cmdq.conv, io.cmdq.ldst, io.cmdq.ld, io.cmdq.st)
+ assert(PopCount(cmdqchk) === 1.U)
+
+ io.actv.ractive :=
+ MuxOR(vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 ||
+ vfmt6 && !vslideh2, RActiveVsVt(2)) |
+ MuxOR(vsraqs || vsrans, RActiveVs1()) |
+ MuxOR(vsraqs, RActiveVs2()) |
+ MuxOR(vsraqs, RActiveVs3()) |
+ MuxOR(vmacc || vmadd || vst || vstq, RActiveVd()) |
+ MuxOR(vadwconv, ractivedw) |
+ MuxOR(adwinit, ractivedi) |
+ MuxOR(vslideh2, ractivesl) |
+ MuxOR(aconv || actr, ractiveconv1) |
+ MuxOR(aconv, ractiveconv2) |
+ MuxOR(acset, ractiveaset)
+
+ io.actv.wactive :=
+ MuxOR(vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 || vfmt6 ||
+ vdup || vld, WActiveVd()) |
+ MuxOR(vmvp || vmulw || vacc || vaddw || vsubw || vevnodd || vzip,
+ WActiveVd1()) |
+ MuxOR(vdwconv, wactivedw) |
+ MuxOR(vcget, wactiveconv)
+}
+
+// Standalone Verilog elaboration entry point for VDecodeInstruction.
+// Forward command-line args (e.g. --target-dir) to ChiselStage.
+object EmitVDecodeInstruction extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VDecodeInstruction(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VDecodeOp.scala b/hdl/chisel/src/kelvin/vector/VDecodeOp.scala
new file mode 100644
index 0000000..553af1b
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VDecodeOp.scala
@@ -0,0 +1,84 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+// Sub-opcode values as encoded in the vector instruction word, grouped by
+// instruction format and consumed by VDecodeInstruction. Values are only
+// unique within a format; the same number may reappear in another format.
+case class VDecodeOp() {
+  // Format0
+  val vadd = 0
+  val vsub = 1
+  val vrsub = 2
+  val veq = 6
+  val vne = 7
+  val vlt = 8
+  val vle = 10
+  val vgt = 12
+  val vge = 14
+  val vabsd = 16
+  val vmax = 18
+  val vmin = 20
+  val vadd3 = 24
+
+  // Format1
+  val vand = 0
+  val vor = 1
+  val vxor = 2
+  val vnot = 3
+  val vrev = 4
+  val vror = 5
+  val vclb = 8
+  val vclz = 9
+  val vcpop = 10
+  val vmv = 12
+  val vmvp = 13
+  // Accumulator / depthwise controls share the Format1 encoding space.
+  val acset = 16
+  val actr = 17
+  val adwinit = 18
+
+  // Format2
+  val vsll = 1
+  val vsra = 2
+  val vsrl = 3
+  val vsha = 8
+  val vshl = 9
+  val vsrans = 16
+  val vsraqs = 24
+
+  // Format3
+  val vmul = 0
+  val vmuls = 2
+  val vmulw = 4
+  val vmulh = 8
+  val vmulhu = 9
+  val vdmulh = 16
+  val vmacc = 20
+  val vmadd = 21
+
+  // Format4
+  val vadds = 0
+  val vsubs = 2
+  val vaddw = 4
+  val vsubw = 6
+  val vacc = 10
+  val vpadd = 12
+  val vpsub = 14
+  val vhadd = 16
+  val vhsub = 20
+
+  // Format6
+  // vsliden/vslidep are aliases of the vertical (v) slide variants.
+  val vsliden = 0
+  val vslidevn = 0
+  val vslidehn = 4
+  val vslidep = 8
+  val vslidevp = 8
+  val vslidehp = 12
+  val vsel = 16
+  val vevn = 24
+  val vodd = 25
+  val vevnodd = 26
+  val vzip = 28
+
+  // FormatVVV
+  val aconv = 8
+  val vdwconv = 10
+}
diff --git a/hdl/chisel/src/kelvin/vector/VDot.scala b/hdl/chisel/src/kelvin/vector/VDot.scala
new file mode 100644
index 0000000..50d978e
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VDot.scala
@@ -0,0 +1,174 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+/** Dot-product datapaths for the convolution and depthwise units.
+ *
+ *  Operands are packed 8-bit lanes; each lane gets a 9-bit signed bias
+ *  added before multiplication, and sums are sign-extended to 32 bits.
+ */
+object VDot {
+  // Conv2D: 4-way dot product of two 32-bit words (4 x 8-bit lanes each).
+  // asign/bsign select signed (vs unsigned) interpretation of the lanes.
+  def apply(en: Bool, adata: UInt, bdata: UInt,
+            abias: UInt, bbias: UInt, asign: Bool, bsign: Bool): UInt = {
+    assert(abias.getWidth == 9)
+    assert(bbias.getWidth == 9)
+    assert(adata.getWidth == 32)
+    assert(bdata.getWidth == 32)
+
+    val mul = Wire(Vec(4, SInt(20.W)))
+
+    // input clamps: operands are forced to zero when 'en' is low.
+    val adatac = MuxOR(en, adata)
+    val bdatac = MuxOR(en, bdata)
+    val abiasc = MuxOR(en, abias)
+    val bbiasc = MuxOR(en, bbias)
+
+    for (i <- 0 until 4) {
+      // Lane i: prepend the sign bit (only in signed mode), then add the
+      // 9-bit bias with width growth (+&) to a 10-bit value.
+      val as = adatac(8 * i + 7) & asign
+      val bs = bdatac(8 * i + 7) & bsign
+      val aval = Cat(as, adatac(8 * i + 7, 8 * i)).asSInt +& abiasc.asSInt
+      val bval = Cat(bs, bdatac(8 * i + 7, 8 * i)).asSInt +& bbiasc.asSInt
+      val mval = aval * bval
+      mul(i) := mval
+
+      assert(aval.getWidth == 10)
+      assert(bval.getWidth == 10)
+      assert(mval.getWidth == 20)
+    }
+
+    // Balanced adder tree, then sign-extend the 22-bit sum to 32 bits
+    // (replicate bit 21 into the top 10 bits).
+    val dotp = (mul(0) +& mul(1)) +& (mul(2) +& mul(3))
+    val sdotp = Cat(MuxOR(dotp(21), ~0.U(10.W)), dotp)
+
+    assert(dotp.getWidth == 22)
+    assert(sdotp.getWidth == 32)
+
+    sdotp
+  }
+
+  // Depthwise: 3-tap dot products across the three operand registers, with
+  // dense (sparse=0) and two sparse input-swizzle modes packed in 'scalar'.
+  // 'alu' (0 or 1) selects which pair of byte positions this instance covers.
+  def apply(alu: Int, en: Bool, adata: Vec[UInt], bdata: Vec[UInt],
+            scalar: UInt): (UInt, UInt) = {
+    assert(adata.length == 3)
+    assert(bdata.length == 3)
+    assert(scalar.getWidth == 32)
+    // Control fields packed into the scalar operand.
+    val sparse = scalar(3,2)
+    val abias = scalar(20,12)
+    val asign = scalar(21)
+    val bbias = scalar(30,22)
+    val bsign = scalar(31)
+
+    val sparse0 = sparse === 0.U
+    val sparse1 = sparse === 1.U
+    val sparse2 = sparse === 2.U
+
+    val w = adata(0).getWidth
+    val cnt = w / 32          // number of 32-bit lanes per register
+    val dout0 = Wire(Vec(cnt, UInt(32.W)))
+    val dout1 = Wire(Vec(cnt, UInt(32.W)))
+
+    // Input clamps and dense/sparse swizzle.
+    val adatac = Wire(Vec(3, Vec(cnt, UInt(32.W))))
+    val bdatac = Wire(Vec(3, Vec(cnt, UInt(32.W))))
+
+    val abiasc = MuxOR(en, abias)
+    val bbiasc = MuxOR(en, bbias)
+
+    // Sparse 1 [n-1,n,n+1]: taps drawn from the last lane of adata(0),
+    // all of adata(1), and the first lane of adata(2).
+    val adata1 = Wire(Vec(cnt + 2, UInt(32.W)))
+    if (true) {
+      val lsb = (cnt - 1) * 32
+      val msb = lsb + 32 - 1
+      adata1(0) := MuxOR(en && sparse1, adata(0)(msb,lsb))
+    }
+    for (i <- 0 until cnt) {
+      val lsb = i * 32
+      val msb = lsb + 32 - 1
+      adata1(i + 1) := MuxOR(en && sparse1, adata(1)(msb,lsb))
+    }
+    if (true) {
+      val lsb = 0
+      val msb = 31
+      adata1(cnt + 1) := MuxOR(en && sparse1, adata(2)(msb,lsb))
+    }
+
+    // Sparse 2 [n,n+1,n+2]: taps drawn from adata(0) and the first two
+    // lanes of adata(1).
+    val adata2 = Wire(Vec(cnt + 2, UInt(32.W)))
+    for (i <- 0 until cnt) {
+      val lsb = i * 32
+      val msb = lsb + 32 - 1
+      adata2(i) := MuxOR(en && sparse2, adata(0)(msb,lsb))
+    }
+    for (i <- 0 until 2) {
+      val lsb = i * 32
+      val msb = lsb + 32 - 1
+      adata2(cnt + i) := MuxOR(en && sparse2, adata(1)(msb,lsb))
+    }
+
+    // vdot(a,b) for sparse[0,1,2]. At most one of adata0/adata1/adata2 is
+    // non-zero for a given mode, so a plain OR merges the three swizzles.
+    for (j <- 0 until 3) {
+      for (i <- 0 until cnt) {
+        val lsb = i * 32
+        val msb = lsb + 32 - 1
+        val k = i + j
+
+        val adata0 = MuxOR(en && sparse0, adata(j)(msb,lsb))
+
+        adatac(j)(i) := adata0 | adata1(k) | adata2(k)
+        bdatac(j)(i) := MuxOR(en, bdata(j)(msb,lsb))
+      }
+    }
+
+    for (i <- 0 until cnt) {
+      val ad = VecInit(adatac(0)(i), adatac(1)(i), adatac(2)(i))
+      val bd = VecInit(bdatac(0)(i), bdatac(1)(i), bdatac(2)(i))
+      val (o0, o1) = dwlane(alu, en, ad, bd, abiasc, bbiasc, asign, bsign)
+      dout0(i) := o0
+      dout1(i) := o1
+    }
+
+    val out0 = dout0.asUInt
+    val out1 = dout1.asUInt
+    assert(out0.getWidth == w)
+    assert(out1.getWidth == w)
+    (out0, out1)
+  }
+
+  // One 32-bit lane of the depthwise datapath: two 3-tap dot products on
+  // byte positions {alu, alu+2} of each word.
+  private def dwlane(alu: Int, en: Bool, adata: Vec[UInt], bdata: Vec[UInt],
+                     abias: UInt, bbias: UInt, asign: Bool, bsign: Bool):
+      (UInt, UInt) = {
+    assert(adata.length == 3)
+    assert(bdata.length == 3)
+    assert(abias.getWidth == 9)
+    assert(bbias.getWidth == 9)
+    for (i <- 0 until 3) {
+      assert(adata(i).getWidth == 32)
+      assert(bdata(i).getWidth == 32)
+    }
+
+    val out = Wire(Vec(2, UInt(32.W)))
+
+    for (j <- 0 until 2) {
+      val m = 2 * j + alu // alu[0]: {0, 2}; alu[1]: {1, 3}
+      val mul = Wire(Vec(3, SInt(20.W)))
+
+      for (i <- 0 until 3) {
+        // Same sign/bias lane treatment as the Conv2D path above.
+        val as = adata(i)(8 * m + 7) & asign
+        val bs = bdata(i)(8 * m + 7) & bsign
+        val aval = Cat(as, adata(i)(8 * m + 7, 8 * m)).asSInt +& abias.asSInt
+        val bval = Cat(bs, bdata(i)(8 * m + 7, 8 * m)).asSInt +& bbias.asSInt
+        val mval = aval * bval
+        mul(i) := mval
+
+        assert(aval.getWidth == 10)
+        assert(bval.getWidth == 10)
+        assert(mval.getWidth == 20)
+      }
+
+      // Sum the three taps and sign-extend 22 -> 32 bits.
+      val dotp = (mul(0) +& mul(1)) +& mul(2)
+      val sdotp = Cat(MuxOR(dotp(21), ~0.U(10.W)), dotp)
+      assert(dotp.getWidth == 22)
+      assert(sdotp.getWidth == 32)
+
+      out(j) := sdotp
+    }
+
+    (out(0), out(1))
+  }
+}
diff --git a/hdl/chisel/src/kelvin/vector/VEncodeOp.scala b/hdl/chisel/src/kelvin/vector/VEncodeOp.scala
new file mode 100644
index 0000000..754dd69
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VEncodeOp.scala
@@ -0,0 +1,104 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+// Opcode list will maintain unique IDs even if not populated in command queue.
+// Opcode list will maintain unique IDs even if not populated in command queue.
+// Internal (post-decode) opcode numbering carried on the command queues; see
+// VDecodeOp for the raw instruction-word sub-opcodes. A '2' suffix marks the
+// quad/stripmine variant of an op (e.g. vmul2 is the quad form of vmul).
+case class VEncodeOp() {
+  val undef = 0
+
+  // Duplicate
+  val vdup = 1
+
+  // Load/Store
+  val vld = 2
+  val vst = 3
+  val vstq = 4
+
+  // Misc
+  val vcget = 5
+
+  // Format0
+  val vadd = 6
+  val vsub = 7
+  val vrsub = 8
+  val veq = 9
+  val vne = 10
+  val vlt = 11
+  val vle = 12
+  val vgt = 13
+  val vge = 14
+  val vabsd = 15
+  val vmax = 16
+  val vmin = 17
+  val vadd3 = 18
+
+  // Format1
+  val vand = 19
+  val vor = 20
+  val vxor = 21
+  val vnot = 22
+  val vrev = 23
+  val vror = 24
+  val vclb = 25
+  val vclz = 26
+  val vcpop = 27
+  val vmv = 28
+  val vmv2 = 29
+  val vmvp = 30
+  val acset = 31
+  val actr = 32
+  val adwinit = 33
+
+  // Format2
+  // Shifts are merged at encode time: vshr covers vsra/vsrl, vshf covers
+  // vsha/vshl (see the VDecodeInstruction op encoding).
+  val vshl = 34
+  val vshr = 35
+  val vshf = 36
+  val vsrans = 37
+  val vsraqs = 38
+
+  // Format3
+  val vmul = 39
+  val vmul2 = 40
+  val vmuls = 41
+  val vmuls2 = 42
+  val vmulh = 43
+  val vmulh2 = 44
+  val vdmulh = 45
+  val vdmulh2 = 46
+  val vmulw = 47
+  val vmadd = 48
+
+  // Format4
+  val vadds = 49
+  val vsubs = 50
+  val vaddw = 51
+  val vsubw = 52
+  val vacc = 53
+  val vpadd = 54
+  val vpsub = 55
+  val vhadd = 56
+  val vhsub = 57
+
+  // Format6
+  val vslidevn = 58
+  val vslidehn = 59
+  val vslidehn2 = 60
+  val vslidevp = 61
+  val vslidehp = 62
+  val vslidehp2 = 63
+  val vsel = 64
+  val vevn = 65
+  val vodd = 66
+  val vevnodd = 67
+  val vzip = 68
+
+  // FormatVVV
+  val aconv = 69
+  val vdwconv = 70
+  val adwconv = 71
+
+  // Entries
+  val entries = 72
+  val bits = log2Ceil(entries)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VInst.scala b/hdl/chisel/src/kelvin/vector/VInst.scala
new file mode 100644
index 0000000..2da7fe6
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VInst.scala
@@ -0,0 +1,281 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VInst {
+  // Factory: instantiate the VInst module (idiomatic Scala, no 'return').
+  def apply(p: Parameters): VInst = Module(new VInst(p))
+}
+
+// Bit positions for the VInstIO.op bitmask (decoded vector front-end ops).
+case class VInstOp() {
+  val GETVL = 0     // compute the active vector length
+  val GETMAXVL = 1  // report the maximum vector length
+  val VLD = 2       // vector load
+  val VST = 3       // vector store
+  val VIOP = 4      // any other vector instruction, forwarded to the core
+  val Entries = 5
+  val Bits = log2Ceil(Entries)
+}
+
+// Per-lane request from the scalar decode stage into VInst.
+class VInstIO extends Bundle {
+  val valid = Input(Bool())                      // request handshake
+  val ready = Output(Bool())                     // low while the vector core stalls
+  val addr = Input(UInt(5.W))                    // scalar destination register index
+  val inst = Input(UInt(32.W))                   // raw instruction word
+  val op = Input(UInt(new VInstOp().Entries.W))  // bitmask indexed by VInstOp values
+}
+
+// Instruction hand-off from VInst to the vector core, up to four lanes per
+// transfer. 'stall' back-pressures the decode-side ready; 'valid'/'ready'
+// move the lane payloads.
+class VectorInstructionIO extends Bundle {
+  val valid = Output(Bool())
+  val ready = Input(Bool())
+  val stall = Input(Bool())
+  val lane = Vec(4, Valid(new VectorInstructionLane))
+}
+
+// One instruction lane: the instruction word plus its two scalar operands
+// ('addr' carries the rs1 value, 'data' the rs2 value -- see VInst wiring).
+class VectorInstructionLane extends Bundle {
+  val inst = UInt(32.W)
+  val addr = UInt(32.W)
+  val data = UInt(32.W)
+}
+
+// Eight-entry table of memory-operation status: each entry publishes a valid
+// flag, a store flag, and an address. NOTE(review): no consumer is visible in
+// this file -- presumably tracks in-flight vector memory ops; confirm at the
+// use site.
+class VAddressActive extends Bundle {
+  val entry = Vec(8, new Bundle {
+    val valid = Output(Bool())
+    val store = Output(Bool())
+    val addr = Output(UInt(32.W))
+  })
+}
+
+/** Scalar-side front end of the vector unit.
+ *
+ *  Accepts up to four decoded vector instructions per cycle, computes their
+ *  scalar results (getvl/getmaxvl values and post-increment load/store
+ *  addresses) onto the scalar register write ports, and forwards the
+ *  instruction lanes to the vector core through a double-buffered Slice.
+ *
+ *  Fixes vs. the original: removed the unused 'reqaddr' vector, and renamed
+ *  the loop locals that shadowed the module parameter 'p'.
+ */
+class VInst(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val in = Vec(4, new VInstIO)
+
+    // Execute cycle.
+    val rs = Vec(8, Flipped(new RegfileReadDataIO))
+    val rd = Vec(4, Flipped(new RegfileWriteDataIO))
+
+    // Vector interface.
+    val out = new VectorInstructionIO
+
+    // Status.
+    val nempty = Output(Bool())
+  })
+
+  val vinst = new VInstOp()
+
+  // Maximum vector lengths in elements for byte/half/word element sizes,
+  // single-register and stripmine (x4) variants.
+  val maxvlb = (p.vectorBits / 8).U(p.vectorCountBits.W)
+  val maxvlh = (p.vectorBits / 16).U(p.vectorCountBits.W)
+  val maxvlw = (p.vectorBits / 32).U(p.vectorCountBits.W)
+  val maxvlbm = (p.vectorBits * 4 / 8).U(p.vectorCountBits.W)
+  val maxvlhm = (p.vectorBits * 4 / 16).U(p.vectorCountBits.W)
+  val maxvlwm = (p.vectorBits * 4 / 32).U(p.vectorCountBits.W)
+  assert(maxvlw >= 4.U)
+
+  // Double-buffered stage towards the vector core.
+  val slice = Slice(Vec(4, new Bundle {
+    val vld = Output(Bool())
+    val vst = Output(Bool())
+    val lane = Valid(new VectorInstructionLane)
+  }), true)
+
+  val reqvalid = VecInit(io.in(0).valid && io.in(0).ready,
+                         io.in(1).valid && io.in(1).ready,
+                         io.in(2).valid && io.in(2).ready,
+                         io.in(3).valid && io.in(3).ready)
+
+  // ---------------------------------------------------------------------------
+  // Response to Decode.
+  for (i <- 0 until 4) {
+    io.in(i).ready := !io.out.stall
+  }
+
+  // ---------------------------------------------------------------------------
+  // Controls. One bit per lane, registered for the execute cycle.
+  val vld_o = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val vld_u = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val vst_o = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val vst_u = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val vst_q = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val getvl = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val getmaxvl = RegInit(VecInit(Seq.fill(4)(false.B)))
+
+  val rdAddr = Reg(Vec(4, UInt(5.W)))
+
+  for (i <- 0 until 4) {
+    when (reqvalid(i)) {
+      rdAddr(i) := io.in(i).addr
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Vector Interface.
+  val vvalid = RegInit(false.B)
+  val vinstValid = Reg(Vec(4, Bool()))
+  val vinstInst = Reg(Vec(4, UInt(32.W)))
+  val nxtVinstValid = Wire(Vec(4, Bool()))
+
+  vvalid := nxtVinstValid.asUInt =/= 0.U
+
+  for (i <- 0 until 4) {
+    nxtVinstValid(i) := reqvalid(i) && (io.in(i).op(vinst.VLD) ||
+                                        io.in(i).op(vinst.VST) ||
+                                        io.in(i).op(vinst.VIOP))
+    vinstValid(i) := nxtVinstValid(i)
+    vinstInst(i) := io.in(i).inst
+  }
+
+  for (i <- 0 until 4) {
+    // func2 bits 28/30. Named f2p/f2q (not p/q as before) so they do not
+    // shadow the module parameter 'p'.
+    val f2p = io.in(i).inst(28)
+    val f2q = io.in(i).inst(30)
+    vld_o(i) := reqvalid(i) && io.in(i).op(vinst.VLD) && !f2p
+    vld_u(i) := reqvalid(i) && io.in(i).op(vinst.VLD) && f2p
+    vst_o(i) := reqvalid(i) && io.in(i).op(vinst.VST) && !f2p
+    vst_u(i) := reqvalid(i) && io.in(i).op(vinst.VST) && f2p && !f2q
+    vst_q(i) := reqvalid(i) && io.in(i).op(vinst.VST) && f2p && f2q
+    getvl(i) := reqvalid(i) && io.in(i).op(vinst.GETVL)
+    getmaxvl(i) := reqvalid(i) && io.in(i).op(vinst.GETMAXVL)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Register write port.
+  val lsuAdder = Wire(Vec(4, UInt(32.W)))
+  val getvlValue = Wire(Vec(4, UInt(p.vectorCountBits.W)))  // bytes
+  val getmaxvlValue = Wire(Vec(4, UInt(p.vectorCountBits.W)))  // bytes
+
+  // Post-increment address generation for the register-update load/store
+  // forms: rs1 + offset, where offset depends on addressing mode (sl),
+  // element size (sz) and stripmine (m).
+  for (i <- 0 until 4) {
+    val rs1 = io.rs(2 * i + 0).data
+    val rs2 = io.rs(2 * i + 1).data
+    val m = vinstInst(i)(5)
+    val sz = vinstInst(i)(13,12)
+    val sl = vinstInst(i)(27,26)  // func2
+    val q = vinstInst(i)(30)
+    val count = rs2(31,0)
+    val xs2zero = vinstInst(i)(24,20) === 0.U
+
+    val max = MuxOR(sz === 0.U && !m, maxvlb) |
+              MuxOR(sz === 1.U && !m, maxvlh) |
+              MuxOR(sz === 2.U && !m, maxvlw) |
+              MuxOR(sz === 0.U && m, maxvlbm) |
+              MuxOR(sz === 1.U && m, maxvlhm) |
+              MuxOR(sz === 2.U && m, maxvlwm)
+
+    val cmp = Mux(count < max, count, max)
+
+    // Length-limited byte count (element count scaled by element size).
+    val bytes = (MuxOR(sz === 0.U && sl(0), cmp) |
+                 MuxOR(sz === 1.U && sl(0), Cat(cmp, 0.U(1.W))) |
+                 MuxOR(sz === 2.U && sl(0), Cat(cmp, 0.U(2.W))) |
+                 MuxOR(!sl(0) && !m, maxvlb) |
+                 MuxOR(!sl(0) && m, maxvlbm)
+                )(31,0)
+    assert(bytes.getWidth == 32)
+
+    // rs2 scaled to bytes, plus x4/x16 variants for stripmine/quad strides.
+    val rt = (MuxOR(sz === 0.U, rs2) |
+              MuxOR(sz === 1.U, Cat(rs2, 0.U(1.W))) |
+              MuxOR(sz === 2.U, Cat(rs2, 0.U(2.W)))
+             )(31,0)
+
+    val rtm = (Cat(rt, 0.U(2.W)))(31,0)
+    val rtq = (Cat(rt, 0.U(4.W)))(31,0)
+
+    // Addressing-mode selects; at most one may be active.
+    val p_x = sl === 0.U && xs2zero
+    val p_xx = sl === 0.U && !xs2zero
+    val lp_xx = sl === 1.U
+    val sp_xx = sl === 2.U && !q
+    val qp_xx = sl === 2.U && q  // vstq.sp
+    val tp_xx = sl === 3.U
+    assert(PopCount(Cat(p_x, p_xx, lp_xx, sp_xx, qp_xx, tp_xx)) <= 1.U)
+
+    val offset = MuxOR(p_x, Mux(m, maxvlbm, maxvlb)) |
+                 MuxOR(p_xx, rt) |
+                 MuxOR(lp_xx, bytes) |
+                 MuxOR(sp_xx, Mux(m, rtm, rt)) |
+                 MuxOR(tp_xx, maxvlb) |
+                 MuxOR(qp_xx, Mux(m, rtq, rtm))
+    assert(offset.getWidth == 32)
+
+    lsuAdder(i) := rs1 + offset
+  }
+
+  // getvl/getmaxvl: clamp the requested count against rs1 and the hardware
+  // maximum for the selected element size.
+  for (i <- 0 until 4) {
+    val len = Wire(UInt(p.vectorCountBits.W))  // bytes
+    val rs1 = io.rs(2 * i + 0).data
+    val rs2 = io.rs(2 * i + 1).data
+    val getvlsz = vinstInst(i)(26,25)
+    val getvlm = vinstInst(i)(27)
+    val maxvl = MuxOR(getvlsz === 0.U && !getvlm, maxvlb) |
+                MuxOR(getvlsz === 1.U && !getvlm, maxvlh) |
+                MuxOR(getvlsz === 2.U && !getvlm, maxvlw) |
+                MuxOR(getvlsz === 0.U && getvlm, maxvlbm) |
+                MuxOR(getvlsz === 1.U && getvlm, maxvlhm) |
+                MuxOR(getvlsz === 2.U && getvlm, maxvlwm)
+
+    val rs2nonzero = vinstInst(i)(24,20) =/= 0.U
+
+    when (rs2 < maxvl && rs2 < rs1 && rs2nonzero) {
+      len := rs2
+    } .elsewhen (rs1 < maxvl) {
+      len := rs1
+    } .otherwise {
+      len := maxvl
+    }
+
+    getvlValue(i) := len
+    getmaxvlValue(i) := maxvl
+  }
+
+  // Scalar write-back: result selects are one-hot so results can be OR-ed.
+  for (i <- 0 until 4) {
+    io.rd(i).valid := getvl(i) || getmaxvl(i) || vld_u(i) || vst_u(i) || vst_q(i)
+    io.rd(i).addr := rdAddr(i)
+
+    io.rd(i).data :=
+        MuxOR(getvl(i), getvlValue(i)) |
+        MuxOR(getmaxvl(i), getmaxvlValue(i)) |
+        MuxOR(vld_u(i) || vst_u(i) || vst_q(i), lsuAdder(i))
+  }
+
+  // ---------------------------------------------------------------------------
+  // Vector Extension Opcodes.
+  slice.io.in.valid := vvalid
+  slice.io.out.ready := io.out.ready
+  io.out.valid := slice.io.out.valid
+
+  // Instruction in execute should always succeed.
+  // Resolve back-pressure with stall to io.in in decode.
+  assert(!(slice.io.in.valid && !slice.io.in.ready))
+
+  for (i <- 0 until 4) {
+    slice.io.in.bits(i).vld := vld_o(i) || vld_u(i)
+    slice.io.in.bits(i).vst := vst_o(i) || vst_u(i) || vst_q(i)
+    slice.io.in.bits(i).lane.valid := vinstValid(i)
+    slice.io.in.bits(i).lane.bits.inst := vinstInst(i)
+    slice.io.in.bits(i).lane.bits.addr := io.rs(2 * i + 0).data
+    slice.io.in.bits(i).lane.bits.data := io.rs(2 * i + 1).data
+  }
+
+  for (i <- 0 until 4) {
+    io.out.lane(i) := slice.io.out.bits(i).lane
+  }
+
+  // Note: slice.io.in.ready is not used in the flow control.
+  // Require the vector core to signal a stall signal into decode,
+  // such that the double buffered slice never overruns.
+  assert(!(vvalid && !slice.io.in.ready))
+
+  // ---------------------------------------------------------------------------
+  // Status.
+  val nempty = RegInit(false.B)
+
+  // Simple implementation, will overlap downstream units redundantly.
+  nempty := io.in(0).valid || io.in(1).valid || io.in(2).valid ||
+            io.in(3).valid || vvalid || io.out.valid
+
+  io.nempty := nempty
+}
diff --git a/hdl/chisel/src/kelvin/vector/VLd.scala b/hdl/chisel/src/kelvin/vector/VLd.scala
new file mode 100644
index 0000000..85a342e
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VLd.scala
@@ -0,0 +1,159 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VLd {
+  // Factory: instantiate the VLd module (idiomatic Scala, no 'return').
+  def apply(p: Parameters): VLd = Module(new VLd(p))
+}
+
+/** Vector load unit (uncached AXI read path).
+ *
+ *  Expands a decoded vld into a sequence of bus-width AXI read requests via
+ *  a command queue; the AXI read id carries the destination vector register,
+ *  so returned data is written straight into the vector register file.
+ *
+ *  Fix vs. the original: removed the unused 'stride' local in Fout.
+ */
+class VLd(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Instructions.
+    val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+
+    // VRegfile.
+    val write = new VRegfileWriteIO(p)
+
+    // Bus.
+    val axi = new AxiMasterReadIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+
+    // Status.
+    val nempty = Output(Bool())
+  })
+
+  // Loads do not zero out-of-size lanes, all ALU lanes will be populated.
+  // Memory may be initially zeroed so that one half of operation is zero.
+  // Writes are masked so there is no harm to non-zero entries.
+
+  // A usable depth of outstanding commands.
+  val cmdqDepth = 8
+
+  // Vector length in bytes: single-register and stripmine (x4) variants.
+  val maxvlb = (p.vectorBits / 8).U(p.vectorCountBits.W)
+  val maxvlbm = (p.vectorBits * 4 / 8).U(p.vectorCountBits.W)
+
+  val bytes = p.lsuDataBits / 8
+
+  val e = new VEncodeOp()
+
+  // ---------------------------------------------------------------------------
+  // Command Queue.
+  class VLdCmdq extends Bundle {
+    val op = UInt(new VEncodeOp().bits.W)
+    val f2 = UInt(3.W)
+    val sz = UInt(3.W)
+    val addr = UInt(32.W)
+    val offset = UInt(32.W)
+    val remain = UInt(p.vectorCountBits.W)
+    val vd = new VAddr()
+    val last = Bool()
+  }
+
+  // Translate a decoded vld into its initial command-queue entry.
+  // f2(1) selects strided addressing; f2(0) selects a length-limited load.
+  def Fin(in: VDecodeBits): VLdCmdq = {
+    val out = Wire(new VLdCmdq)
+    val stride = in.f2(1)
+    val length = in.f2(0)
+    assert(PopCount(in.sz) <= 1.U)
+    assert(!(in.op === e.vld.U && (!in.vd.valid || in.vs.valid)))
+
+    val limit = Mux(in.m, maxvlbm, maxvlb)
+
+    // Scale the scalar operand by the element size into a byte count.
+    val data = MuxOR(in.sz(0), in.sv.data) |
+               MuxOR(in.sz(1), Cat(in.sv.data, 0.U(1.W))) |
+               MuxOR(in.sz(2), Cat(in.sv.data, 0.U(2.W)))
+
+    val remain0 = maxvlbm
+    val remain1 = Mux(data > limit, limit, data)(p.vectorCountBits - 1, 0)
+    assert(remain0.getWidth == p.vectorCountBits)
+    assert(remain1.getWidth == p.vectorCountBits)
+
+    out.op := in.op
+    out.f2 := in.f2
+    out.sz := in.sz
+    out.addr := in.sv.addr
+    out.offset := Mux(stride, data(31,0), maxvlb)
+    out.remain := Mux(length, remain1, remain0)
+    out.vd := in.vd
+    out.last := !in.m
+
+    out
+  }
+
+  // Step the command by one bus transaction: advance the destination
+  // register and address, decrement the remaining byte count, and report
+  // whether this step retires the command (4 steps when stripmining).
+  def Fout(in: VLdCmdq, m: Bool, step: UInt, valid: Bool): (VLdCmdq, Bool) = {
+    val msb = log2Ceil(bytes) - 1
+    val addrAlign = in.addr(msb, 0)
+    val offsAlign = in.offset(msb, 0)
+    assert(addrAlign === 0.U)
+    assert(offsAlign === 0.U)
+    assert(!valid || in.op === e.vld.U)
+
+    val out = Wire(new VLdCmdq)
+
+    val outlast = !m || step === 2.U  // registered a cycle before 'last' usage
+
+    val last = !m || step === 3.U
+
+    out := in
+
+    out.vd.addr := in.vd.addr + 1.U
+
+    out.addr := in.addr + in.offset
+    out.remain := Mux(in.remain <= maxvlb, 0.U, in.remain - maxvlb)
+
+    out.last := outlast
+
+    (out, last)
+  }
+
+  // Loads have no vector-register read footprint: nothing to mark active.
+  def Factive(in: VLdCmdq, m: Bool, step: UInt): UInt = {
+    assert(step.getWidth == 5)
+    0.U
+  }
+
+  val q = VCmdq(cmdqDepth, new VLdCmdq, Fin, Fout, Factive)
+
+  q.io.in <> io.in
+
+  // ---------------------------------------------------------------------------
+  // Axi. Commands must carry bit 31 set; it is stripped from the bus address.
+  io.axi.addr.valid := q.io.out.valid
+  io.axi.addr.bits.addr := Cat(0.U(1.W), q.io.out.bits.addr(30,0))
+  io.axi.addr.bits.id := q.io.out.bits.vd.addr
+  assert(!(q.io.out.valid && !q.io.out.bits.addr(31)))
+  assert(!(io.axi.addr.valid && io.axi.addr.bits.addr(31)))
+
+  q.io.out.ready := io.axi.addr.ready
+
+  // ---------------------------------------------------------------------------
+  // Write interface: the read id routes data to its destination register.
+  io.write.valid := io.axi.data.valid
+  io.write.data := io.axi.data.bits.data
+  io.write.addr := io.axi.data.bits.id
+
+  io.axi.data.ready := true.B
+
+  // ---------------------------------------------------------------------------
+  // Memory active status: count outstanding AXI reads.
+  val nempty = RegInit(false.B)
+  val count = RegInit(0.U(7.W))
+  val inc = io.axi.addr.valid && io.axi.addr.ready
+  val dec = io.axi.data.valid && io.axi.data.ready
+
+  when (inc || dec) {
+    val nxtcount = count + inc - dec
+    count := nxtcount
+    nempty := nxtcount =/= 0.U
+    assert(count <= 64.U)
+  }
+
+  io.nempty := q.io.nempty || nempty
+}
+
+// Standalone Verilog elaboration entry point for VLd.
+object EmitVLd extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VLd(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VLdSt.scala b/hdl/chisel/src/kelvin/vector/VLdSt.scala
new file mode 100644
index 0000000..256ecd8
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VLdSt.scala
@@ -0,0 +1,302 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VLdSt {
+ def apply(p: Parameters): VLdSt = {
+ return Module(new VLdSt(p))
+ }
+}
+
+class VLdSt(p: Parameters) extends Module {
+ val io = IO(new Bundle {
+ // Instructions.
+ val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+ val active = Output(UInt(64.W))
+
+ // VRegfile.
+ val vrfsb = Input(UInt(128.W))
+ val read = new VRegfileReadHsIO(p)
+ val write = new VRegfileWriteIO(p)
+
+ // Bus.
+ val dbus = new DBusIO(p)
+ val last = Output(Bool())
+ })
+
+ // A usable amount of outstanding transactions.
+ val cmdqDepth = 8
+
+ // The minimum depth to cover pipeline delays in this unit.
+ val dbusDepth = 3
+
+ val maxvlb = (p.vectorBits / 8).U(p.vectorCountBits.W)
+ val maxvlbm = (p.vectorBits * 4 / 8).U(p.vectorCountBits.W)
+
+ val bytes = p.lsuDataBits / 8
+
+ val e = new VEncodeOp()
+
+ // ---------------------------------------------------------------------------
+ // Swizzle datapath.
+ def Swizzle(positive: Boolean, size: Int, addr: UInt, data: UInt): UInt = {
+ val msb = log2Ceil(bytes) - 1
+ val datain = Wire(Vec(bytes, UInt(size.W)))
+ val dataout = Wire(Vec(bytes, UInt(size.W)))
+
+ for (i <- 0 until bytes) {
+ datain(i) := data(size * i + (size - 1), size * i)
+ }
+
+ val index = addr(msb, 0)
+ for (i <- 0 until bytes) {
+ val idx = if (positive) i.U + index else i.U - index
+ dataout(i) := VecAt(datain, idx)
+ assert(idx.getWidth == (msb + 1))
+ }
+
+ dataout.asUInt
+ }
+
+ // ---------------------------------------------------------------------------
+ // Command Queue.
+ class VLdStCmdq extends Bundle {
+ val op = UInt(new VEncodeOp().bits.W)
+ val f2 = UInt(3.W)
+ val sz = UInt(3.W)
+ val addr = UInt(32.W)
+ val offset = UInt(32.W)
+ val remain = UInt(p.vectorCountBits.W)
+ val vd = new VAddr()
+ val vs = new VAddrTag()
+ val quad = UInt(2.W) // vstq position
+ val last = Bool()
+
+ def IsLoad(): Bool = {
+ op === e.vld.U
+ }
+
+ def IsStore(): Bool = {
+ op === e.vst.U || op === e.vstq.U
+ }
+ }
+
+ def Fin(in: VDecodeBits): VLdStCmdq = {
+ val out = Wire(new VLdStCmdq)
+ val stride = in.f2(1)
+ val length = in.f2(0)
+ assert(PopCount(in.sz) <= 1.U)
+ assert(!(in.op === e.vst.U && ( in.vd.valid || !in.vs.valid)))
+ assert(!(in.op === e.vstq.U && ( in.vd.valid || !in.vs.valid)))
+ assert(!(in.op === e.vld.U && (!in.vd.valid || in.vs.valid)))
+
+ val limit = Mux(in.m, maxvlbm, maxvlb)
+
+ val data = MuxOR(in.sz(0), in.sv.data) |
+ MuxOR(in.sz(1), Cat(in.sv.data, 0.U(1.W))) |
+ MuxOR(in.sz(2), Cat(in.sv.data, 0.U(2.W)))
+
+ val remain0 = maxvlbm
+ val remain1 = Mux(data > limit, limit, data)(p.vectorCountBits - 1, 0)
+ assert(remain0.getWidth == p.vectorCountBits)
+ assert(remain1.getWidth == p.vectorCountBits)
+
+ out.op := in.op
+ out.f2 := in.f2
+ out.sz := in.sz
+ out.addr := in.sv.addr
+ out.offset := Mux(stride, data(31,0), Mux(in.op === e.vstq.U, maxvlb >> 2, maxvlb))
+ out.remain := Mux(length, remain1, remain0)
+ out.vd := in.vd
+ out.vs := in.vs
+ out.last := !in.m && in.op =/= e.vstq.U
+
+ out.quad := 0.U
+
+ out
+ }
+
+ def Fout(in: VLdStCmdq, m: Bool, step: UInt, valid: Bool): (VLdStCmdq, Bool) = {
+ assert(!valid || in.op === e.vld.U || in.op === e.vst.U || in.op === e.vstq.U)
+
+ val out = Wire(new VLdStCmdq)
+ val vstq = in.op === e.vstq.U
+ val stride = in.f2(1)
+
+ val fmaxvlb = Mux(in.op === e.vstq.U, maxvlb >> 2, maxvlb)
+
+ val outlast1 = !m || step === 2.U // registered a cycle before 'last' usage
+ val outlast2 = Mux(m, step === 14.U, step === 2.U)
+ val outlast = Mux(vstq, outlast2, outlast1)
+
+ val last1 = !m || step === 3.U
+ val last2 = Mux(m, step === 15.U, step === 3.U)
+ val last = Mux(vstq, last2, last1)
+
+ out := in
+
+ out.vd.addr := Mux(vstq && step(1,0) =/= 3.U, in.vd.addr, in.vd.addr + 1.U)
+ out.vs.addr := Mux(vstq && step(1,0) =/= 3.U, in.vs.addr, in.vs.addr + 1.U)
+
+ out.addr := in.addr + in.offset
+ out.remain := Mux(in.remain <= fmaxvlb, 0.U, in.remain - fmaxvlb)
+
+ out.last := outlast
+
+ out.quad := Mux(in.op === e.vstq.U, step + 1.U, 0.U)
+
+ (out, last)
+ }
+
+ def Factive(in: VLdStCmdq, m: Bool, step: UInt): UInt = {
+ assert(step.getWidth == 5)
+ val vstq = in.op === e.vstq.U
+ val stepq = Mux(vstq, step(4,2), step(2,0))
+ // Only reads are reported in active, vrfsb tracks writes.
+ val active = MuxOR(in.vs.valid, RegActive(m, stepq, in.vs.addr))
+ assert(active.getWidth == 64)
+ active
+ }
+
+ val q = VCmdq(cmdqDepth, new VLdStCmdq, Fin, Fout, Factive)
+
+ q.io.in <> io.in
+
+ val ctrlready = Wire(Bool())
+ q.io.out.ready := ScoreboardReady(q.io.out.bits.vs, io.vrfsb) && ctrlready
+
+ // ---------------------------------------------------------------------------
+ // Read register.
+ io.read.valid := q.io.out.valid && q.io.out.bits.vs.valid
+ io.read.stall := !q.io.out.ready // testbench signal
+ io.read.addr := q.io.out.bits.vs.addr
+ io.read.tag := OutTag(q.io.out.bits.vs)
+
+ // ---------------------------------------------------------------------------
+ // DBus.
+ class DBusCtrl extends Bundle {
+ val last = Bool()
+ val write = Bool()
+ val addr = UInt(p.lsuAddrBits.W)
+ val adrx = UInt(p.lsuAddrBits.W)
+ val size = UInt((log2Ceil(p.lsuDataBits / 8) + 1).W)
+ val widx = UInt(6.W)
+ }
+
+ class DBusWData extends Bundle {
+ val wdata = UInt(p.lsuDataBits.W)
+ val wmask = UInt((p.lsuDataBits / 8).W)
+ }
+
+ class RegWrite extends Bundle {
+ val widx = UInt(6.W)
+ val addr = UInt(log2Ceil(bytes).W) // bus address
+ val size = UInt((log2Ceil(p.lsuDataBits / 8) + 1).W)
+ }
+
+ val lineoffset = (p.lsuDataBits / 8)
+
+ // Combinatorial paths back to command queue are to be avoided.
+ val ctrl = Fifo(new DBusCtrl, dbusDepth)
+ val data = Fifo(new DBusWData, dbusDepth)
+ val rdataEn = RegInit(false.B)
+ val rdataSize = Reg(UInt(p.vectorCountBits.W))
+ val rdataAddr = Reg(UInt(log2Ceil(bytes).W))
+ val rdataAshf = Reg(UInt(log2Ceil(bytes).W))
+
+ ctrlready := ctrl.io.in.ready && (io.read.ready || !ctrl.io.in.bits.write)
+
+ val qoutEn = q.io.out.valid && q.io.out.ready
+ val rdataEnNxt = qoutEn && ctrl.io.in.bits.write
+
+ val qmaxvlb = Mux(q.io.out.bits.op === e.vstq.U, maxvlb >> 2.U, maxvlb)
+ val qsize = Mux(q.io.out.bits.remain > qmaxvlb, qmaxvlb, q.io.out.bits.remain)
+ val rdataWmask = Wire(Vec(p.lsuDataBits / 8, Bool()))
+
+ when (rdataEnNxt) {
+ val quad = q.io.out.bits.quad(1,0)
+ rdataSize := qsize
+ rdataAddr := q.io.out.bits.addr
+ rdataAshf := q.io.out.bits.addr - (quad * (maxvlb >> 2.U))
+ }
+
+ for (i <- 0 until p.lsuDataBits / 8) {
+ rdataWmask(i) := rdataSize > i.U
+ }
+
+ rdataEn := rdataEnNxt
+ ctrl.io.in.valid := qoutEn
+
+ ctrl.io.in.bits.addr := q.io.out.bits.addr
+ ctrl.io.in.bits.adrx := q.io.out.bits.addr + lineoffset.U
+ ctrl.io.in.bits.size := qsize
+ ctrl.io.in.bits.last := q.io.out.bits.last
+ ctrl.io.in.bits.write := q.io.out.bits.IsStore()
+ ctrl.io.in.bits.widx := q.io.out.bits.vd.addr
+ assert(!(ctrl.io.in.valid && !ctrl.io.in.ready))
+
+ data.io.in.valid := rdataEn
+ data.io.in.bits.wdata := Swizzle(false, 8, rdataAshf, io.read.data)
+ data.io.in.bits.wmask := Swizzle(false, 1, rdataAddr, rdataWmask.asUInt)
+ assert(!(data.io.in.valid && !data.io.in.ready))
+
+ ctrl.io.out.ready := io.dbus.ready && (data.io.out.valid || !ctrl.io.out.bits.write)
+ data.io.out.ready := io.dbus.ready && (ctrl.io.out.valid && ctrl.io.out.bits.write)
+ assert(!(data.io.out.valid && !ctrl.io.out.valid))
+
+ io.dbus.valid := ctrl.io.out.valid && (data.io.out.valid || !ctrl.io.out.bits.write)
+ io.dbus.write := ctrl.io.out.bits.write
+ io.dbus.addr := Cat(0.U(1.W), ctrl.io.out.bits.addr(30,0))
+ io.dbus.adrx := Cat(0.U(1.W), ctrl.io.out.bits.adrx(30,0))
+ io.dbus.size := ctrl.io.out.bits.size
+ io.dbus.wdata := data.io.out.bits.wdata
+ io.dbus.wmask := data.io.out.bits.wmask
+ assert(!(ctrl.io.out.valid && ctrl.io.out.bits.addr(31)))
+ assert(!(ctrl.io.out.valid && ctrl.io.out.bits.adrx(31)))
+ assert(!(io.dbus.valid && io.dbus.addr(31)))
+ assert(!(io.dbus.valid && io.dbus.adrx(31)))
+
+ io.last := ctrl.io.out.bits.last
+
+ // ---------------------------------------------------------------------------
+ // Write register.
+ val wrega = Slice(new RegWrite, true, true)
+ val wregd = Slice(UInt(p.vectorBits.W), false, true)
+ val wdataEn = RegInit(false.B)
+
+ wdataEn := io.dbus.valid && io.dbus.ready && !io.dbus.write
+
+ wrega.io.in.valid := ctrl.io.out.valid && io.dbus.ready && !ctrl.io.out.bits.write
+ wrega.io.in.bits.widx := ctrl.io.out.bits.widx
+ wrega.io.in.bits.addr := ctrl.io.out.bits.addr
+ wrega.io.in.bits.size := ctrl.io.out.bits.size
+ wrega.io.out.ready := wregd.io.out.valid
+ assert(!(wrega.io.in.valid && !wrega.io.in.ready))
+
+ wregd.io.in.valid := wdataEn
+ wregd.io.in.bits := io.dbus.rdata
+ wregd.io.out.ready := wrega.io.out.valid
+ assert(!(wregd.io.in.valid && !wregd.io.in.ready))
+
+ val maskb = Wire(Vec(p.vectorBits / 8, UInt(8.W)))
+ val mask = maskb.asUInt
+
+ for (i <- 0 until p.vectorBits / 8) {
+ maskb(i) := MuxOR(i.U < wrega.io.out.bits.size, 0xff.U)
+ }
+
+ io.write.valid := wrega.io.out.valid && wregd.io.out.valid
+ io.write.addr := wrega.io.out.bits.widx
+ io.write.data := Swizzle(true, 8, wrega.io.out.bits.addr, wregd.io.out.bits) & mask
+
+ // ---------------------------------------------------------------------------
+ // Active.
+ io.active := q.io.active
+}
+
+object EmitVLdSt extends App {
+ val p = new Parameters
+ (new chisel3.stage.ChiselStage).emitVerilog(new VLdSt(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VRegfile.scala b/hdl/chisel/src/kelvin/vector/VRegfile.scala
new file mode 100644
index 0000000..8f3b1a4
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VRegfile.scala
@@ -0,0 +1,430 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VRegfile {
+ def apply(p: Parameters): VRegfile = {
+ return Module(new VRegfile(p))
+ }
+}
+
+class VRegfileReadIO(p: Parameters) extends Bundle {
+ val valid = Output(Bool())
+ val addr = Output(UInt(6.W))
+ val tag = Output(UInt(1.W))
+ val data = Input(UInt(p.vectorBits.W))
+}
+
+class VRegfileReadHsIO(p: Parameters) extends Bundle {
+ val valid = Output(Bool())
+ val ready = Input(Bool()) // handshake
+ val stall = Output(Bool()) // Testbench signal.
+ val addr = Output(UInt(6.W))
+ val tag = Output(UInt(1.W))
+ val data = Input(UInt(p.vectorBits.W))
+}
+
+class VRegfileScalarIO(p: Parameters) extends Bundle {
+ val valid = Output(Bool())
+ val data = Output(UInt(32.W))
+}
+
+class VRegfileTransposeIO(p: Parameters) extends Bundle {
+ val valid = Output(Bool())
+ val tcnt = 16.min(p.vectorBits / 32)
+ val addr = Output(UInt(6.W))
+ val index = Output(UInt(log2Ceil(tcnt).W))
+ val data = Input(UInt((tcnt * 32).W))
+}
+
+class VRegfileWrite(p: Parameters) extends Bundle {
+ val addr = UInt(6.W)
+ val data = UInt(p.vectorBits.W)
+}
+
+class VRegfileWriteIO(p: Parameters) extends Bundle {
+ val valid = Output(Bool())
+ val addr = Output(UInt(6.W))
+ val data = Output(UInt(p.vectorBits.W))
+}
+
+class VRegfileWriteHsIO(p: Parameters) extends Bundle {
+ val valid = Output(Bool())
+ val ready = Input(Bool()) // handshake used in arbitration logic
+ val addr = Output(UInt(6.W))
+ val data = Output(UInt(p.vectorBits.W))
+}
+
+// Write internal.
+class VRegfileWrintIO(p: Parameters) extends Bundle {
+ val valid = Output(Bool())
+ val addr = Output(UInt(6.W))
+ val data = Output(UInt(p.vectorBits.W))
+}
+
+// Write internal.
+class VRegfileWhintIO(p: Parameters) extends Bundle {
+ val valid = Output(Bool())
+ val addr = Output(UInt(6.W))
+}
+
+class VRegfileConvIO(p: Parameters) extends Bundle {
+ val valid = Output(Bool()) // registered signal suitable for mux control
+  val ready = Output(Bool()) // combinatorial from scheduling logic
+ val op = new Bundle {
+ val conv = Output(Bool()) // convolution to accum
+ val init = Output(Bool()) // set accum
+ val tran = Output(Bool()) // transpose to accum
+ val wclr = Output(Bool()) // write accum to vreg and clear accum
+ }
+ val addr1 = Output(UInt(6.W)) // narrow: transpose
+ val addr2 = Output(UInt(6.W)) // wide: internal
+ val mode = Output(UInt(2.W))
+ val index = Output(UInt(log2Ceil(p.vectorBits / 32).W))
+ val abias = Output(UInt(9.W))
+ val bbias = Output(UInt(9.W))
+ val asign = Output(Bool())
+ val bsign = Output(Bool())
+}
+
+class VRegfileScoreboardIO extends Bundle {
+ // 64 registers sequenced from even/odd tags.
+ val set = Valid(UInt(128.W))
+ val data = Input(UInt(128.W))
+}
+
+class VRegfile(p: Parameters) extends Module {
+ val readPorts = 7
+ val writePorts = 6
+ val whintPorts = 4
+
+ val io = IO(new Bundle {
+ val read = Vec(readPorts, Flipped(new VRegfileReadIO(p)))
+ val scalar = Vec(readPorts / 3, Flipped(new VRegfileScalarIO(p)))
+ val write = Vec(writePorts, Flipped(new VRegfileWrintIO(p)))
+ val whint = Vec(whintPorts, Flipped(new VRegfileWhintIO(p)))
+ val conv = Flipped(new VRegfileConvIO(p))
+ val transpose = Flipped(new VRegfileTransposeIO(p))
+ val vrfsb = Flipped(new VRegfileScoreboardIO)
+ })
+
+ val segcnt = p.vectorBits / 32
+ val segcntBits = log2Ceil(segcnt)
+
+ // ---------------------------------------------------------------------------
+ // Register file storage.
+ val vreg = for (i <- 0 until segcnt) yield {
+ Module(new VRegfileSegment(p))
+ }
+
+ // ---------------------------------------------------------------------------
+ // Convolution unit.
+ val vconv = VConvAlu(p)
+
+ // ---------------------------------------------------------------------------
+ // Assert state.
+ val writeCurr = Wire(UInt(64.W))
+ val writePrev = RegInit(0.U(64.W))
+ val writeSet = Wire(Vec(writePorts, UInt(64.W)))
+
+ for (i <- 0 until writePorts) {
+ writeSet(i) := MuxOR(io.write(i).valid, 1.U << io.write(i).addr)
+ }
+
+ writeCurr := VecOR(writeSet)
+ writePrev := writeCurr
+
+ // ---------------------------------------------------------------------------
+ // Write port interface and registration.
+ val writevalidBool = Wire(Vec(writePorts, Bool()))
+ val writevalid = writevalidBool.asUInt
+ val writebits = Wire(Vec(writePorts, new VRegfileWrite(p)))
+ val writevalidReg = RegInit(0.U(writePorts.W))
+ val writebitsReg = Reg(Vec(writePorts, new VRegfileWrite(p)))
+
+ for (i <- 0 until writePorts) {
+ writevalidBool(i) := io.write(i).valid
+ writebits(i).addr := io.write(i).addr
+ writebits(i).data := io.write(i).data
+ }
+
+ writevalidReg := writevalid
+
+ for (i <- 0 until writePorts) {
+ when (io.write(i).valid) {
+ writebitsReg(i).addr := io.write(i).addr
+ writebitsReg(i).data := io.write(i).data
+ }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Write ports.
+ for (i <- 0 until writePorts) {
+ for (j <- 0 until segcnt) {
+ vreg(j).io.write(i).valid := writevalidReg(i)
+ vreg(j).io.write(i).addr := writebitsReg(i).addr
+ vreg(j).io.write(i).data := writebitsReg(i).data(32 * j + 31, 32 * j)
+ }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Read ports.
+ val readData = Reg(Vec(readPorts, UInt(p.vectorBits.W)))
+
+ def ReadScalar(i: Int): (Bool, UInt) = {
+ val valid = Wire(Bool())
+ val scalar = Wire(UInt(32.W))
+
+ if (i == 1 || i == 4) {
+ valid := io.scalar(i / 3).valid
+ scalar := io.scalar(i / 3).data
+ } else {
+ valid := false.B
+ scalar := 0.U
+ }
+
+ val lanes = p.vectorBits / 32
+ val values = Wire(Vec(lanes, UInt(32.W)))
+ for (i <- 0 until lanes) {
+ values(i) := scalar
+ }
+
+ val result = values.asUInt
+ assert(result.getWidth == p.vectorBits)
+ (valid, result)
+ }
+
+ val rdata = Wire(Vec(readPorts, Vec(segcnt, UInt(32.W))))
+
+ for (i <- 0 until readPorts) {
+ for (j <- 0 until segcnt) {
+ vreg(j).io.read(i).addr := io.read(i).addr
+ rdata(i)(j) := vreg(j).io.read(i).data
+ }
+ }
+
+ for (i <- 0 until readPorts) {
+ // Forwarding of internal write-staging registers.
+ val f1validBits = Wire(Vec(writePorts, Bool()))
+ val f1valid = f1validBits.asUInt
+ assert(PopCount(f1valid) <= 1.U)
+
+ val f2validBits = Wire(Vec(writePorts, Bool()))
+ val f2valid = f2validBits.asUInt
+ assert(PopCount(f2valid) <= 1.U)
+
+ for (j <- 0 until writePorts) {
+ f1validBits(j) := writevalid(j) &&
+ writebits(j).addr === io.read(i).addr
+ }
+
+ for (j <- 0 until writePorts) {
+ f2validBits(j) := writevalidReg(j) &&
+ writebitsReg(j).addr === io.read(i).addr
+ }
+
+ val f1dataBits = Wire(Vec(writePorts, UInt(p.vectorBits.W)))
+ val f1data = VecOR(f1dataBits, writePorts)
+
+ for (j <- 0 until writePorts) {
+ f1dataBits(j) := MuxOR(f1valid(j), writebits(j).data)
+ }
+
+ val f2dataBits = Wire(Vec(writePorts, UInt(p.vectorBits.W)))
+ val f2data = VecOR(f2dataBits, writePorts)
+
+ for (j <- 0 until writePorts) {
+ f2dataBits(j) := MuxOR(f2valid(j), writebitsReg(j).data)
+ }
+
+ val (scalarValid, scalarData) = ReadScalar(i)
+
+ val sel = Cat(scalarValid,
+ !scalarValid && f1valid =/= 0.U,
+ !scalarValid && f1valid === 0.U && f2valid =/= 0.U,
+ !scalarValid && f1valid === 0.U && f2valid === 0.U)
+ assert(PopCount(sel) <= 1.U)
+
+ val data = MuxOR(sel(3), scalarData) |
+ MuxOR(sel(2), f1data) |
+ MuxOR(sel(1), f2data) |
+ MuxOR(sel(0), rdata(i).asUInt)
+
+ val rvalid =
+ if (i == 1 || i == 4) {
+ assert(!(io.read(i).valid && io.scalar(i / 3).valid))
+ io.read(i).valid || io.scalar(i / 3).valid
+ } else {
+ io.read(i).valid
+ }
+
+ when (rvalid) {
+ readData(i) := data
+ }
+ }
+
+ for (i <- 0 until readPorts) {
+ io.read(i).data := readData(i)
+ }
+
+ // ---------------------------------------------------------------------------
+ // Conv port.
+ val convConv = RegInit(false.B)
+ val convInit = RegInit(false.B)
+ val convTran = RegInit(false.B)
+ val convClear = RegInit(false.B)
+ val convIndex = Reg(UInt(log2Ceil(p.vectorBits / 32).W))
+ val convAbias = Reg(UInt(9.W))
+ val convBbias = Reg(UInt(9.W))
+ val convAsign = Reg(Bool())
+ val convBsign = Reg(Bool())
+ val internalData = Reg(UInt(p.vectorBits.W))
+
+ // io.conv.valid controls read multiplexors
+ // io.conv.ready frames data phase readiness
+ convConv := io.conv.valid && io.conv.ready && io.conv.op.conv
+ convInit := io.conv.valid && io.conv.ready && io.conv.op.init
+ convTran := io.conv.valid && io.conv.ready && io.conv.op.tran
+ convClear := io.conv.valid && io.conv.ready && io.conv.op.wclr
+ convIndex := io.conv.index
+
+ assert(!(io.conv.valid && io.conv.ready) ||
+ PopCount(Cat(io.conv.op.conv, io.conv.op.wclr, io.conv.op.init, io.conv.op.tran)) === 1.U)
+
+ val idata = Wire(Vec(segcnt, UInt(32.W)))
+ for (i <- 0 until segcnt) {
+ idata(i) := vreg(i).io.internal.data
+ }
+
+ for (i <- 0 until segcnt) {
+ vreg(i).io.internal.addr := io.conv.addr2
+ }
+
+ when (io.conv.valid) {
+ internalData := idata.asUInt
+ }
+
+ when (io.conv.valid) {
+ convAbias := io.conv.abias
+ convBbias := io.conv.bbias
+ convAsign := io.conv.asign
+ convBsign := io.conv.bsign
+ }
+
+ for (i <- 0 until segcnt) {
+ vreg(i).io.conv.valid := convClear
+ for (j <- 0 until segcnt) {
+      vreg(i).io.conv.data(j) := vconv.io.out(j)(32 * i + 31, 32 * i) // note: indices are reversed
+ }
+ }
+
+ // Note: do not assert if read touches any of the conv read/write registers.
+ // Other scheduling mechanisms are used to not advance the opcode.
+ val convRead0 = io.conv.valid && io.conv.ready && io.conv.op.conv
+ val convClear0 = io.conv.valid && io.conv.ready && io.conv.op.wclr
+
+ assert(!(convRead0 && io.conv.mode =/= 0.U))
+ // assert(!(convRead0 && io.conv.addr1(5,4) === 3.U))
+ // assert(!(convRead0 && io.conv.addr2(5,4) === 3.U))
+ assert(!(convRead0 && io.conv.addr1(3,0) =/= 0.U))
+ assert(!(convRead0 && io.conv.addr1(5,2) === io.conv.addr2(5,2) && (p.vectorBits == 128).B))
+ assert(!(convRead0 && io.conv.addr1(5,3) === io.conv.addr2(5,3) && (p.vectorBits == 256).B))
+ assert(!(convRead0 && io.conv.addr1(5,4) === io.conv.addr2(5,4) && (p.vectorBits == 512).B))
+
+ // Convolution reads must not be under pipelined writes.
+ assert(!(convRead0 && writeCurr(io.conv.addr2)))
+ assert(!(convRead0 && writePrev(io.conv.addr2)))
+
+ val convmaska = 0xffff.U << 48.U
+ assert(!(convClear0 && (writeCurr & convmaska) =/= 0.U))
+ assert(!(convClear0 && (writePrev & convmaska) =/= 0.U))
+ // // Note: writePrev check not needed since accumulator is a cycle after reads.
+ // // assert(!(convClear0 && (writePrev & convmaska) =/= 0.U))
+
+ for (i <- 0 until writePorts) {
+ assert(!((convClear0 || convClear) && io.write(i).valid && io.write(i).addr >= 48.U))
+ }
+
+ // ---------------------------------------------------------------------------
+ // Convolution.
+ vconv.io.op.conv := convConv
+ vconv.io.op.init := convInit
+ vconv.io.op.tran := convTran
+ vconv.io.op.clear := convClear
+ vconv.io.index := convIndex
+ vconv.io.adata := io.transpose.data
+ vconv.io.bdata := internalData
+ vconv.io.abias := convAbias
+ vconv.io.bbias := convBbias
+ vconv.io.asign := convAsign
+ vconv.io.bsign := convBsign
+
+ // ---------------------------------------------------------------------------
+ // Transpose port.
+ val transposeData = Reg(UInt(io.transpose.data.getWidth.W))
+ val transposeDataMux = Wire(Vec(segcnt, UInt(io.transpose.data.getWidth.W)))
+
+ for (i <- 0 until segcnt) {
+ vreg(i).io.transpose.addr := Mux(io.conv.valid, io.conv.addr1, io.transpose.addr)
+ transposeDataMux(i) := vreg(i).io.transpose.data
+ }
+
+ when (io.conv.valid || io.transpose.valid) {
+ val index = Mux(io.conv.valid, io.conv.index, io.transpose.index)
+ transposeData := VecAt(transposeDataMux, index)
+ }
+
+ io.transpose.data := transposeData
+
+ // Transpose reads must not be under pipelined writes.
+ for (i <- 0 until segcnt) {
+ assert(!(io.transpose.valid && writeCurr(io.transpose.addr + i.U)))
+ assert(!(io.transpose.valid && writePrev(io.transpose.addr + i.U)))
+ }
+
+ assert(!(io.transpose.valid && io.conv.valid))
+ assert(!(io.transpose.valid && convConv))
+
+ // ---------------------------------------------------------------------------
+ // Scoreboard.
+ def SbClr(valid: Bool = false.B, data: UInt = 0.U(128.W), i: Int = 0): (Bool, UInt) = {
+ if (i < writePorts) {
+ val wvalid = io.write(i).valid
+ val hvalid = if (i < whintPorts) io.whint(i).valid else false.B
+ val woh = MuxOR(io.write(i).valid, OneHot(io.write(i).addr, 64))
+ val hoh = if (i < whintPorts) MuxOR(io.whint(i).valid, OneHot(io.whint(i).addr, 64)) else 0.U
+ val whoh = woh | hoh
+ val whdata = Cat(whoh, whoh)
+ assert(whdata.getWidth == 128)
+ SbClr(valid || wvalid || hvalid, data | whdata, i + 1)
+ } else {
+ val cvalid = convClear // delayed one cycle beyond io.conv.wclr, no forwarding to read ports
+ val cdataH = Wire(UInt(16.W))
+ val cdata = MuxOR(cvalid, Cat(cdataH, 0.U(48.W), cdataH, 0.U(48.W)))
+ assert(cdata.getWidth == 128)
+ if (p.vectorBits == 128) cdataH := 0x000f.U
+ if (p.vectorBits == 256) cdataH := 0x00ff.U
+ if (p.vectorBits == 512) cdataH := 0xffff.U
+
+ (valid || cvalid, data | cdata)
+ }
+ }
+
+ val vrfsb = RegInit(0.U(128.W))
+ val vrfsbSetEn = io.vrfsb.set.valid
+ val vrfsbSet = MuxOR(io.vrfsb.set.valid, io.vrfsb.set.bits)
+ val (vrfsbClrEn, vrfsbClr) = SbClr()
+
+ when (vrfsbSetEn || vrfsbClrEn) {
+ vrfsb := (vrfsb & ~vrfsbClr) | vrfsbSet
+ }
+
+ io.vrfsb.data := vrfsb
+}
+
+object EmitVRegfile extends App {
+ val p = new Parameters
+ (new chisel3.stage.ChiselStage).emitVerilog(new VRegfile(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala b/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala
new file mode 100644
index 0000000..5290d45
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala
@@ -0,0 +1,103 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+class VRegfileSegment(p: Parameters) extends Module {
+ val readPorts = 7
+ val writePorts = 6
+ val tcnt = 16.min(p.vectorBits / 32)
+
+ val io = IO(new Bundle {
+ val read = Vec(readPorts, new Bundle {
+ val addr = Input(UInt(6.W))
+ val data = Output(UInt(32.W))
+ })
+
+ val transpose = new Bundle {
+ val addr = Input(UInt(6.W))
+ val data = Output(UInt((tcnt * 32).W))
+ }
+
+ val internal = new Bundle {
+ val addr = Input(UInt(6.W))
+ val data = Output(UInt(32.W))
+ }
+
+ val write = Vec(writePorts, new Bundle {
+ val valid = Input(Bool())
+ val addr = Input(UInt(6.W))
+ val data = Input(UInt(32.W))
+ })
+
+ val conv = new Bundle {
+ val valid = Input(Bool())
+ val data = Input(Vec(tcnt, UInt(32.W)))
+ }
+ })
+
+ // Do not use a memory object, this breaks the synthesis.
+ // eg. val vreg = Mem(64, UInt(32.W))
+ val vreg = Reg(Vec(64, UInt(32.W)))
+
+ // ---------------------------------------------------------------------------
+ // Read.
+ for (i <- 0 until readPorts) {
+ val ridx = io.read(i).addr
+ io.read(i).data := VecAt(vreg, ridx)
+ }
+
+ // ---------------------------------------------------------------------------
+ // Transpose.
+ val tdata = Wire(Vec(tcnt, UInt(32.W)))
+ for (i <- 0 until tcnt) {
+ val tidx = Cat(io.transpose.addr(5,4), i.U(4.W)) // only supports [v0, v16, v32, v48].
+ assert(tidx.getWidth == 6)
+ tdata(i) := VecAt(vreg, tidx)
+ }
+ io.transpose.data := tdata.asUInt
+ assert(io.transpose.addr(3,0) === 0.U)
+
+ // ---------------------------------------------------------------------------
+ // Internal.
+ io.internal.data := VecAt(vreg, io.internal.addr)
+
+ // ---------------------------------------------------------------------------
+ // Write.
+ for (i <- 0 until 64) {
+ val wvalidBits = Wire(Vec(writePorts, Bool()))
+ val wdataBits = Wire(Vec(writePorts, UInt(32.W)))
+ assert(PopCount(wvalidBits.asUInt) <= 1.U)
+
+ for (j <- 0 until writePorts) {
+ wvalidBits(j) := io.write(j).valid && io.write(j).addr === i.U
+ wdataBits(j) := MuxOR(wvalidBits(j), io.write(j).data)
+ }
+
+ val wvalid = VecOR(wvalidBits, writePorts)
+ val wdata = VecOR(wdataBits, writePorts)
+
+ when (wvalid) {
+ vreg(i) := wdata
+ }
+ }
+
+ // ---------------------------------------------------------------------------
+ // Convolution parallel load interface.
+ // Data has been transposed in VRegfile.
+ // [48, 49, 50, ...] = data
+ when (io.conv.valid) {
+ for (i <- 0 until tcnt) {
+ vreg(i + 48) := io.conv.data(i)
+ }
+ }
+
+ for (i <- 0 until writePorts) {
+ assert(!(io.conv.valid && io.write(i).valid && io.write(i).addr >= 48.U))
+ }
+}
+
+object EmitVRegfileSegment extends App {
+ val p = new Parameters
+ (new chisel3.stage.ChiselStage).emitVerilog(new VRegfileSegment(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VSt.scala b/hdl/chisel/src/kelvin/vector/VSt.scala
new file mode 100644
index 0000000..d9892c7
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VSt.scala
@@ -0,0 +1,309 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VSt {
+ def apply(p: Parameters): VSt = {
+ return Module(new VSt(p))
+ }
+}
+
+class VSt(p: Parameters) extends Module {
+ val io = IO(new Bundle {
+ // Instructions.
+ val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+ val active = Output(UInt(64.W))
+
+ // VRegfile.
+ val vrfsb = Input(UInt(128.W))
+ val read = new VRegfileReadHsIO(p)
+
+ // Bus.
+ val axi = new AxiMasterWriteIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+
+ // Status.
+ val nempty = Output(Bool())
+ })
+
+ // A usable depth of outstanding commands.
+ val cmdqDepth = 8
+
+ val maxvlb = (p.vectorBits / 8).U(p.vectorCountBits.W)
+ val maxvlbm = (p.vectorBits * 4 / 8).U(p.vectorCountBits.W)
+
+ val bytes = p.lsuDataBits / 8
+ val msb = log2Ceil(bytes)
+
+ val e = new VEncodeOp()
+
+ // ---------------------------------------------------------------------------
+ // Tie-offs
+ io.active := 0.U
+
+ io.in.ready := false.B
+
+ io.read.valid := false.B
+ io.read.stall := false.B
+ io.read.addr := 0.U
+ io.read.tag := 0.U
+
+ io.axi.addr.valid := false.B
+ io.axi.addr.bits.addr := 0.U
+ io.axi.addr.bits.id := 0.U
+
+ io.axi.data.valid := false.B
+ io.axi.data.bits.strb := 0.U
+ io.axi.data.bits.data := 0.U
+
+ io.axi.resp.ready := false.B
+
+ // ---------------------------------------------------------------------------
+ // Command Queue.
+ class VStCmdq extends Bundle {
+ val op = UInt(new VEncodeOp().bits.W)
+ val f2 = UInt(3.W)
+ val sz = UInt(3.W)
+ val addr = UInt(32.W)
+ val offset = UInt(32.W)
+ val remain = UInt(p.vectorCountBits.W)
+ val vs = new VAddrTag()
+ val quad = UInt(2.W) // vstq position
+ val last = Bool()
+ }
+
+ def Fin(in: VDecodeBits): VStCmdq = {
+ val out = Wire(new VStCmdq)
+ val stride = in.f2(1)
+ val length = in.f2(0)
+ assert(PopCount(in.sz) <= 1.U)
+ assert(!(in.op === e.vst.U && !in.vs.valid))
+ assert(!(in.op === e.vstq.U && !in.vs.valid))
+
+ val limit = Mux(in.m, maxvlbm, maxvlb)
+
+ val data = MuxOR(in.sz(0), in.sv.data) |
+ MuxOR(in.sz(1), Cat(in.sv.data, 0.U(1.W))) |
+ MuxOR(in.sz(2), Cat(in.sv.data, 0.U(2.W)))
+
+ val remain0 = maxvlbm
+ val remain1 = Mux(data > limit, limit, data)(p.vectorCountBits - 1, 0)
+ assert(remain0.getWidth == p.vectorCountBits)
+ assert(remain1.getWidth == p.vectorCountBits)
+
+ out.op := in.op
+ out.f2 := in.f2
+ out.sz := in.sz
+ out.addr := in.sv.addr
+ out.offset := Mux(stride, data(31,0), Mux(in.op === e.vstq.U, maxvlb >> 2, maxvlb))
+ out.remain := Mux(length, remain1, remain0)
+ out.vs := in.vs
+ out.last := !in.m && in.op =/= e.vstq.U
+
+ out.quad := 0.U
+
+ out
+ }
+
+ def Fout(in: VStCmdq, m: Bool, step: UInt, valid: Bool): (VStCmdq, Bool) = {
+ val addrAlign = Mux(in.op === e.vstq.U, in.addr(msb - 3, 0), in.addr(msb - 1, 0))
+ val offsAlign = Mux(in.op === e.vstq.U, in.offset(msb - 3, 0), in.offset(msb - 1, 0))
+ assert(addrAlign === 0.U)
+ assert(offsAlign === 0.U)
+ assert(!valid || in.op === e.vst.U || in.op === e.vstq.U)
+
+ val out = Wire(new VStCmdq)
+ val vstq = in.op === e.vstq.U
+ val stride = in.f2(1)
+
+ val fmaxvlb = Mux(in.op === e.vstq.U, maxvlb >> 2, maxvlb)
+
+ val outlast1 = !m || step === 2.U // registered a cycle before 'last' usage
+ val outlast2 = Mux(m, step === 14.U, step === 2.U)
+ val outlast = Mux(vstq, outlast2, outlast1)
+
+ val last1 = !m || step === 3.U
+ val last2 = Mux(m, step === 15.U, step === 3.U)
+ val last = Mux(vstq, last2, last1)
+
+ out := in
+
+ out.vs.addr := Mux(vstq && step(1,0) =/= 3.U, in.vs.addr, in.vs.addr + 1.U)
+
+ out.addr := in.addr + in.offset
+ out.remain := Mux(in.remain <= fmaxvlb, 0.U, in.remain - fmaxvlb)
+
+ out.last := outlast
+
+ out.quad := Mux(in.op === e.vstq.U, step + 1.U, 0.U)
+
+ (out, last)
+ }
+
+ def Factive(in: VStCmdq, m: Bool, step: UInt): UInt = {
+ assert(step.getWidth == 5)
+ val vstq = in.op === e.vstq.U
+ val stepq = Mux(vstq, step(4,2), step(2,0))
+ val active = MuxOR(in.vs.valid, RegActive(m, stepq, in.vs.addr))
+ assert(active.getWidth == 64)
+ active
+ }
+
+ class Ctrl extends Bundle {
+ val addr = UInt(p.lsuAddrBits.W)
+ val id = UInt(6.W)
+ val size = UInt((log2Ceil(p.lsuDataBits / 8) + 1).W)
+ val vstq = Bool()
+ val quad = UInt(2.W)
+ }
+
+ class Data extends Bundle {
+ val data = UInt(p.lsuDataBits.W)
+ val strb = UInt((p.lsuDataBits / 8).W)
+ }
+
+ val q = VCmdq(cmdqDepth, new VStCmdq, Fin, Fout, Factive)
+
+ val ctrl = Slice(new Ctrl, false, true)
+ val data = Slice(new Data, false, true, true)
+
+ val dataEn = RegInit(false.B)
+
+ // ---------------------------------------------------------------------------
+ // Swizzle.
+ def SwizzleData(): UInt = {
+ val dsb = p.vectorBits / 4
+
+ val addr = ctrl.io.out.bits.addr
+ val vstq = ctrl.io.out.bits.vstq
+ val quad = ctrl.io.out.bits.quad
+ val data = io.read.data
+
+ val d0 = data(1 * dsb - 1, 0 * dsb)
+ val d1 = data(2 * dsb - 1, 1 * dsb)
+ val d2 = data(3 * dsb - 1, 2 * dsb)
+ val d3 = data(4 * dsb - 1, 3 * dsb)
+
+ val dataout = MuxOR(!vstq, data) |
+ MuxOR(vstq && quad === 0.U, Cat(d0, d0, d0, d0)) |
+ MuxOR(vstq && quad === 1.U, Cat(d1, d1, d1, d1)) |
+ MuxOR(vstq && quad === 2.U, Cat(d2, d2, d2, d2)) |
+ MuxOR(vstq && quad === 3.U, Cat(d3, d3, d3, d3))
+ assert(dataout.getWidth == p.vectorBits)
+
+ dataout
+ }
+
+ def SwizzleStrb(): UInt = {
+ val n4 = bytes / 4
+ val n = bytes
+
+ val strbB = Wire(Vec(n, Bool()))
+ val strb = strbB.asUInt
+ val strbq = strb(n4 - 1, 0)
+ val addr = ctrl.io.out.bits.addr
+ val size = ctrl.io.out.bits.size
+ val vstq = ctrl.io.out.bits.vstq
+ val quad = addr(msb - 1, msb - 2)
+ val zeroq = Cat(0.U(n4.W))
+
+ for (i <- 0 until p.lsuDataBits / 8) {
+ strbB(i) := size > i.U
+ }
+
+ val strbout = MuxOR(!vstq, strb) |
+ MuxOR(vstq && quad === 0.U, Cat(zeroq, zeroq, zeroq, strbq)) |
+ MuxOR(vstq && quad === 1.U, Cat(zeroq, zeroq, strbq, zeroq)) |
+ MuxOR(vstq && quad === 2.U, Cat(zeroq, strbq, zeroq, zeroq)) |
+ MuxOR(vstq && quad === 3.U, Cat(strbq, zeroq, zeroq, zeroq))
+ assert(strbout.getWidth == p.vectorBits / 8)
+
+ strbout
+ }
+
+ // ---------------------------------------------------------------------------
+ // Instruction queue.
+ q.io.in <> io.in
+
+ val ctrlready = Wire(Bool())
+ q.io.out.ready := ScoreboardReady(q.io.out.bits.vs, io.vrfsb) && ctrlready
+
+ val qmaxvlb = Mux(q.io.out.bits.op === e.vstq.U, maxvlb >> 2.U, maxvlb)
+ val qsize = Mux(q.io.out.bits.remain > qmaxvlb, qmaxvlb, q.io.out.bits.remain)
+
+ val qoutEn = q.io.out.valid && q.io.out.ready
+
+ // ---------------------------------------------------------------------------
+ // Register read.
+ io.read.valid := q.io.out.valid && q.io.out.bits.vs.valid
+ io.read.stall := !q.io.out.ready
+ io.read.addr := q.io.out.bits.vs.addr
+ io.read.tag := OutTag(q.io.out.bits.vs)
+
+ dataEn := qoutEn
+
+ data.io.in.valid := dataEn
+ assert(!(data.io.in.valid && !data.io.in.ready))
+
+ data.io.out.ready := io.axi.addr.ready
+
+ data.io.in.bits.data := SwizzleData()
+ data.io.in.bits.strb := SwizzleStrb()
+
+ // ---------------------------------------------------------------------------
+ // Control.
+ ctrl.io.in.valid := qoutEn
+
+ ctrl.io.in.bits.addr := q.io.out.bits.addr
+ ctrl.io.in.bits.id := q.io.out.bits.vs.addr
+ ctrl.io.in.bits.size := qsize
+ ctrl.io.in.bits.vstq := q.io.out.bits.op === e.vstq.U
+ ctrl.io.in.bits.quad := q.io.out.bits.quad
+
+ ctrl.io.out.ready := io.axi.addr.ready
+
+ ctrlready := io.read.ready && ctrl.io.in.ready && data.io.in.ready
+
+ // ---------------------------------------------------------------------------
+ // Axi.
+ io.axi.addr.valid := ctrl.io.out.valid
+ io.axi.addr.bits.addr := Cat(0.U(1.W), ctrl.io.out.bits.addr(30, msb), 0.U(msb.W))
+ io.axi.addr.bits.id := ctrl.io.out.bits.id
+ assert(!(ctrl.io.out.valid && !ctrl.io.out.bits.addr(31)))
+ assert(!(io.axi.addr.valid && io.axi.addr.bits.addr(31)))
+
+ io.axi.data.valid := ctrl.io.out.valid
+ io.axi.data.bits.data := data.io.out.bits.data
+ io.axi.data.bits.strb := data.io.out.bits.strb
+
+ io.axi.resp.ready := true.B
+
+ assert(io.axi.addr.valid === io.axi.data.valid)
+ assert(io.axi.addr.ready === io.axi.data.ready)
+
+ // ---------------------------------------------------------------------------
+ // Active.
+ io.active := q.io.active
+
+ // ---------------------------------------------------------------------------
+ // Memory active status.
+ val nempty = RegInit(false.B)
+ val count = RegInit(0.U(9.W))
+ val inc = io.axi.addr.valid && io.axi.addr.ready
+ val dec = io.axi.resp.valid && io.axi.resp.ready
+
+ when (inc || dec) {
+ val nxtcount = count + inc - dec
+ count := nxtcount
+ nempty := nxtcount =/= 0.U
+ assert(count <= 256.U)
+ }
+
+ io.nempty := q.io.nempty || ctrl.io.out.valid || nempty
+}
+
+object EmitVSt extends App {
+ val p = new Parameters
+ (new chisel3.stage.ChiselStage).emitVerilog(new VSt(p), args)
+}
diff --git a/hdl/verilog/BUILD b/hdl/verilog/BUILD
new file mode 100644
index 0000000..81d5af0
--- /dev/null
+++ b/hdl/verilog/BUILD
@@ -0,0 +1,19 @@
load("@rules_hdl//verilog:providers.bzl", "VerilogInfo", "verilog_library")

# Hand-written Verilog primitives (clock gate and SRAM models) consumed by the
# Chisel-generated cores.

verilog_library(
    name = "clock_gate",
    srcs = ["ClockGate.v"],
    visibility = ["//visibility:public"],
)

verilog_library(
    name = "sram_1rw_256x256",
    srcs = ["Sram_1rw_256x256.v"],
    visibility = ["//visibility:public"],
)

# NOTE(review): the target is named "sram_1rw_256x288" but the source file
# implements the write-masked "1rwm" variant — confirm the name mismatch is
# intentional before renaming (downstream deps reference this label).
verilog_library(
    name = "sram_1rw_256x288",
    srcs = ["Sram_1rwm_256x288.v"],
    visibility = ["//visibility:public"],
)
\ No newline at end of file
diff --git a/hdl/verilog/ClockGate.v b/hdl/verilog/ClockGate.v
new file mode 100644
index 0000000..e170ae7
--- /dev/null
+++ b/hdl/verilog/ClockGate.v
@@ -0,0 +1,26 @@
// Clock gate.  When CLOCKGATE_ENABLE is not defined the gate is compiled out
// and the input clock passes straight through.
module ClockGate(
  input clk_i,
  input enable, // '1' passthrough, '0' disable.
  output clk_o
);

`ifndef CLOCKGATE_ENABLE

// Gating disabled at compile time: clock is passed through unmodified.
assign clk_o = clk_i;

`else

reg clk_en;

// Capture 'enable' during low phase of the clock.
// Latching while clk_i is low means clk_en cannot change while the clock is
// high, so the gated clock below is free of enable-induced glitches.
always_latch begin
  if (~clk_i) begin
    clk_en <= enable;
  end
end

assign clk_o = clk_i & clk_en;

`endif

endmodule // ClockGate
diff --git a/hdl/verilog/Sram_1rw_256x256.v b/hdl/verilog/Sram_1rw_256x256.v
new file mode 100644
index 0000000..82ebd75
--- /dev/null
+++ b/hdl/verilog/Sram_1rw_256x256.v
@@ -0,0 +1,23 @@
// 256 x 256-bit single-port SRAM behavioral model.
//
// One access per cycle: when (valid & write) the word at addr is written; when
// (valid & ~write) the read address is registered and rdata presents that word
// from the following cycle (one-cycle read latency).
module Sram_1rw_256x256(
  input clock,
  input valid,
  input write,
  input [7:0] addr,
  input [255:0] wdata,
  output [255:0] rdata
);

  reg [255:0] mem [0:255];
  reg [7:0] raddr;  // Registered read address.

  assign rdata = mem[raddr];

  always @(posedge clock) begin
    if (valid & write) begin
      mem[addr] <= wdata;
    end
    if (valid & ~write) begin
      raddr <= addr;
    end
  end
endmodule
diff --git a/hdl/verilog/Sram_1rwm_256x288.v b/hdl/verilog/Sram_1rwm_256x288.v
new file mode 100644
index 0000000..fd76452
--- /dev/null
+++ b/hdl/verilog/Sram_1rwm_256x288.v
@@ -0,0 +1,92 @@
// 256 x 288-bit single-port SRAM with 9-bit write-mask granularity.
//
// FPGA builds use a behavioral masked memory with a one-cycle read latency.
// Non-FPGA builds compose the macro from thirty-two Sram_1rw_256x9 banks; a
// bank is enabled for every read (each bank registers the read address) and
// for a write only when its wmask bit is set.
//
// Fix vs. original: the behavioral mem/raddr declarations and the
// `assign rdata = mem[raddr]` were outside the `ifdef FPGA`, so in non-FPGA
// builds the never-written behavioral array also drove rdata, contending with
// the bank outputs (multiple drivers on one net -> X).  The behavioral model
// is now confined to the FPGA branch.
module Sram_1rwm_256x288(
  input clock,
  input valid,
  input write,
  input [7:0] addr,
  input [287:0] wdata,
  input [31:0] wmask,   // One bit per 9-bit lane of wdata.
  output [287:0] rdata
);

`ifdef FPGA

reg [287:0] mem [0:255];
reg [7:0] raddr;  // Registered read address.

assign rdata = mem[raddr];

always @(posedge clock) begin
  for (int i = 0; i < 32; i++) begin
    if (valid & write & wmask[i]) begin
      mem[addr][i*9 +: 9] <= wdata[i*9 +: 9];
    end
  end
  if (valid & ~write) begin
    raddr <= addr;
  end
end

endmodule // Sram_1rwm_256x288

`else // !FPGA

// Bank i holds lane wdata[i*9 +: 9].
Sram_1rw_256x9 u_bl00(clock, valid & (~write | wmask[0]), write, addr, wdata[  0 +: 9], rdata[  0 +: 9]);
Sram_1rw_256x9 u_bl01(clock, valid & (~write | wmask[1]), write, addr, wdata[  9 +: 9], rdata[  9 +: 9]);
Sram_1rw_256x9 u_bl02(clock, valid & (~write | wmask[2]), write, addr, wdata[ 18 +: 9], rdata[ 18 +: 9]);
Sram_1rw_256x9 u_bl03(clock, valid & (~write | wmask[3]), write, addr, wdata[ 27 +: 9], rdata[ 27 +: 9]);
Sram_1rw_256x9 u_bl04(clock, valid & (~write | wmask[4]), write, addr, wdata[ 36 +: 9], rdata[ 36 +: 9]);
Sram_1rw_256x9 u_bl05(clock, valid & (~write | wmask[5]), write, addr, wdata[ 45 +: 9], rdata[ 45 +: 9]);
Sram_1rw_256x9 u_bl06(clock, valid & (~write | wmask[6]), write, addr, wdata[ 54 +: 9], rdata[ 54 +: 9]);
Sram_1rw_256x9 u_bl07(clock, valid & (~write | wmask[7]), write, addr, wdata[ 63 +: 9], rdata[ 63 +: 9]);
Sram_1rw_256x9 u_bl08(clock, valid & (~write | wmask[8]), write, addr, wdata[ 72 +: 9], rdata[ 72 +: 9]);
Sram_1rw_256x9 u_bl09(clock, valid & (~write | wmask[9]), write, addr, wdata[ 81 +: 9], rdata[ 81 +: 9]);
Sram_1rw_256x9 u_bl10(clock, valid & (~write | wmask[10]), write, addr, wdata[ 90 +: 9], rdata[ 90 +: 9]);
Sram_1rw_256x9 u_bl11(clock, valid & (~write | wmask[11]), write, addr, wdata[ 99 +: 9], rdata[ 99 +: 9]);
Sram_1rw_256x9 u_bl12(clock, valid & (~write | wmask[12]), write, addr, wdata[108 +: 9], rdata[108 +: 9]);
Sram_1rw_256x9 u_bl13(clock, valid & (~write | wmask[13]), write, addr, wdata[117 +: 9], rdata[117 +: 9]);
Sram_1rw_256x9 u_bl14(clock, valid & (~write | wmask[14]), write, addr, wdata[126 +: 9], rdata[126 +: 9]);
Sram_1rw_256x9 u_bl15(clock, valid & (~write | wmask[15]), write, addr, wdata[135 +: 9], rdata[135 +: 9]);
Sram_1rw_256x9 u_bl16(clock, valid & (~write | wmask[16]), write, addr, wdata[144 +: 9], rdata[144 +: 9]);
Sram_1rw_256x9 u_bl17(clock, valid & (~write | wmask[17]), write, addr, wdata[153 +: 9], rdata[153 +: 9]);
Sram_1rw_256x9 u_bl18(clock, valid & (~write | wmask[18]), write, addr, wdata[162 +: 9], rdata[162 +: 9]);
Sram_1rw_256x9 u_bl19(clock, valid & (~write | wmask[19]), write, addr, wdata[171 +: 9], rdata[171 +: 9]);
Sram_1rw_256x9 u_bl20(clock, valid & (~write | wmask[20]), write, addr, wdata[180 +: 9], rdata[180 +: 9]);
Sram_1rw_256x9 u_bl21(clock, valid & (~write | wmask[21]), write, addr, wdata[189 +: 9], rdata[189 +: 9]);
Sram_1rw_256x9 u_bl22(clock, valid & (~write | wmask[22]), write, addr, wdata[198 +: 9], rdata[198 +: 9]);
Sram_1rw_256x9 u_bl23(clock, valid & (~write | wmask[23]), write, addr, wdata[207 +: 9], rdata[207 +: 9]);
Sram_1rw_256x9 u_bl24(clock, valid & (~write | wmask[24]), write, addr, wdata[216 +: 9], rdata[216 +: 9]);
Sram_1rw_256x9 u_bl25(clock, valid & (~write | wmask[25]), write, addr, wdata[225 +: 9], rdata[225 +: 9]);
Sram_1rw_256x9 u_bl26(clock, valid & (~write | wmask[26]), write, addr, wdata[234 +: 9], rdata[234 +: 9]);
Sram_1rw_256x9 u_bl27(clock, valid & (~write | wmask[27]), write, addr, wdata[243 +: 9], rdata[243 +: 9]);
Sram_1rw_256x9 u_bl28(clock, valid & (~write | wmask[28]), write, addr, wdata[252 +: 9], rdata[252 +: 9]);
Sram_1rw_256x9 u_bl29(clock, valid & (~write | wmask[29]), write, addr, wdata[261 +: 9], rdata[261 +: 9]);
Sram_1rw_256x9 u_bl30(clock, valid & (~write | wmask[30]), write, addr, wdata[270 +: 9], rdata[270 +: 9]);
Sram_1rw_256x9 u_bl31(clock, valid & (~write | wmask[31]), write, addr, wdata[279 +: 9], rdata[279 +: 9]);

endmodule // Sram_1rwm_256x288

// 256 x 9-bit single-port SRAM bank used to compose the masked macro above.
module Sram_1rw_256x9(
  input clock,
  input valid,
  input write,
  input [7:0] addr,
  input [8:0] wdata,
  output [8:0] rdata
);

  reg [8:0] mem [0:255];
  reg [7:0] raddr;  // Registered read address.

  assign rdata = mem[raddr];

  always @(posedge clock) begin
    if (valid & write) begin
      mem[addr] <= wdata;
    end
    if (valid & ~write) begin
      raddr <= addr;
    end
  end
endmodule // Sram_1rw_256x9

`endif // FPGA
diff --git a/lib/BUILD b/lib/BUILD
new file mode 100644
index 0000000..79ff16e
--- /dev/null
+++ b/lib/BUILD
@@ -0,0 +1,46 @@
load("@io_bazel_rules_scala//scala:scala.bzl", "scala_library")

# Jars making up the Chisel3 toolchain classpath.  The same set is both
# compiled against (deps) and re-exported (exports) so chisel_library targets
# depending on :chisel_lib see the full Chisel classpath transitively.
# Previously this list was duplicated verbatim in deps and exports; keeping a
# single constant prevents the two from drifting apart.
CHISEL_CLASSPATH_JARS = [
    "@com_thoughtworks_paranamer//jar",
    "@org_json4s_json4s_scalap//jar",
    "@org_json4s_json4s_ast//jar",
    "@org_json4s_json4s_core//jar",
    "@org_json4s_json4s_native//jar",
    "@org_apache_commons_commons_lang3//jar",
    "@org_apache_commons_commons_text//jar",
    "@edu_berkeley_cs_chisel3_plugin//jar",
    "@com_github_scopt//jar",
    "@net_jcazevedo_moultingyaml//jar",
    "@edu_berkeley_cs_firrtl//jar",
    "@edu_berkeley_cs_chisel3_core//jar",
    "@edu_berkeley_cs_chisel3_macros//jar",
    "@edu_berkeley_cs_chisel3//jar",
]

scala_library(
    name = "chisel_lib",
    deps = CHISEL_CLASSPATH_JARS,
    exports = CHISEL_CLASSPATH_JARS,
    visibility = ["//visibility:public"],
    scalacopts = [
        "-Xplugin:$(execpath @edu_berkeley_cs_chisel3_plugin//jar)",
        "-P:chiselplugin:genBundleElements",
    ],
)
\ No newline at end of file
diff --git a/rules/BUILD b/rules/BUILD
new file mode 100644
index 0000000..ffd0fb0
--- /dev/null
+++ b/rules/BUILD
@@ -0,0 +1 @@
# Rule definitions under //rules are loadable from anywhere in the workspace.
package(default_visibility = ["//visibility:public"])
diff --git a/rules/chisel.bzl b/rules/chisel.bzl
new file mode 100644
index 0000000..5ce9315
--- /dev/null
+++ b/rules/chisel.bzl
@@ -0,0 +1,76 @@
+load("@io_bazel_rules_scala//scala:scala.bzl", "scala_binary", "scala_library")
+load("@rules_hdl//verilog:providers.bzl", "VerilogInfo", "verilog_library")
+load("@kelvin_hw//rules:verilator.bzl", "verilator_cc_library")
+
def chisel_library(name,
                   srcs = [],
                   deps = [],
                   visibility = None):
    """Declares a scala_library compiled against the Chisel3 toolchain.

    The Chisel3 compiler plugin is added to the compile (-Xplugin) with the
    -P:chiselplugin:genBundleElements option.

    Args:
      name: Name of the library target.
      srcs: Scala/Chisel source files.
      deps: Additional dependencies beyond the implicit Chisel jars.
      visibility: Standard Bazel visibility.
    """
    scala_library(
        name = name,
        srcs = srcs,
        deps = [
            "@kelvin_hw//lib:chisel_lib",
            "@edu_berkeley_cs_chisel3_plugin//jar",
        ] + deps,
        scalacopts = [
            "-Xplugin:$(execpath @edu_berkeley_cs_chisel3_plugin//jar)",
            "-P:chiselplugin:genBundleElements",
        ],
        visibility = visibility,
    )
+
def chisel_binary(name,
                  main_class,
                  srcs = [],
                  deps = [],
                  visibility = None):
    """Declares a runnable scala_binary compiled against the Chisel3 toolchain.

    Same compiler-plugin setup as chisel_library; typically wraps a Chisel
    elaboration entry point.

    Args:
      name: Name of the binary target.
      main_class: Fully-qualified name of the Scala main class to run.
      srcs: Scala/Chisel source files.
      deps: Additional dependencies beyond the implicit Chisel jars.
      visibility: Standard Bazel visibility.
    """
    scala_binary(
        name = name,
        srcs = srcs,
        main_class = main_class,
        deps = [
            "@kelvin_hw//lib:chisel_lib",
            "@edu_berkeley_cs_chisel3_plugin//jar",
        ] + deps,
        scalacopts = [
            "-Xplugin:$(execpath @edu_berkeley_cs_chisel3_plugin//jar)",
            "-P:chiselplugin:genBundleElements",
        ],
        visibility = visibility,
    )
+
def chisel_cc_library(name,
                      chisel_lib,
                      emit_class,
                      module_name,
                      verilog_deps=[]):
    """Verilates a Chisel module into a C++ (SystemC) simulation library.

    Expands into four targets:
      <name>_emit_verilog_binary: scala_binary wrapping `emit_class`.
      <name>_emit_verilog: genrule that runs the binary to produce
          <module_name>.v in the rule directory.
      <name>_verilog: verilog_library over the generated source.
      <name>: verilator_cc_library compiled from the generated Verilog.

    Args:
      name: Name of the resulting C++ library target.
      chisel_lib: chisel_library target containing the module.
      emit_class: Fully-qualified Scala class passed as main_class; it is
          invoked with --target-dir to emit the Verilog.
      module_name: Name of the emitted top-level Verilog module.
      verilog_deps: Extra verilog_library deps of the generated source.
    """
    gen_binary_name = name + "_emit_verilog_binary"
    chisel_binary(
        name = gen_binary_name,
        deps = [ chisel_lib ],
        main_class = emit_class,
    )

    native.genrule(
        name = name + "_emit_verilog",
        srcs = [],
        outs = [module_name + ".v"],
        cmd = "./$(location " + gen_binary_name + ") --target-dir $(RULEDIR)",
        tools = [":{}".format(gen_binary_name)],
    )

    verilog_library(
        name = name + "_verilog",
        srcs = [module_name + ".v"],
        deps = verilog_deps,
    )

    verilator_cc_library(
        name = name,
        module = ":{}_verilog".format(name),
        module_top = module_name,
        visibility = ["//visibility:public"],
        # TODO(derekjchow): Re-enable the default -Wall?
        vopts = [],
    )
diff --git a/rules/deps.bzl b/rules/deps.bzl
new file mode 100644
index 0000000..9e359b7
--- /dev/null
+++ b/rules/deps.bzl
@@ -0,0 +1,122 @@
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@io_bazel_rules_scala//scala:scala_maven_import_external.bzl",
+ "scala_maven_import_external")
+load("@io_bazel_rules_scala//scala:scala_cross_version.bzl",
+ "default_maven_server_urls")
+load("@rules_foreign_cc//foreign_cc:repositories.bzl",
+ "rules_foreign_cc_dependencies")
+load("@rules_hdl//dependency_support:dependency_support.bzl",
+ rules_hdl_dependency_support = "dependency_support")
+
+
def kelvin_deps():
    """Fetches Kelvin's third-party dependencies.

    Call from the WORKSPACE after kelvin_repos(): sets up rules_foreign_cc and
    rules_hdl support repositories, fetches SystemC, and imports the Maven jars
    that make up the Chisel/Scala toolchain.
    """
    rules_foreign_cc_dependencies()
    rules_hdl_dependency_support()

    http_archive(
        name = "accellera_systemc",
        build_file = "systemc.BUILD",
        sha256 = "bfb309485a8ad35a08ee78827d1647a451ec5455767b25136e74522a6f41e0ea",
        strip_prefix = "systemc-2.3.4",
        urls = [
            "https://github.com/accellera-official/systemc/archive/refs/tags/2.3.4.tar.gz",
        ],
    )

    # (repository name, Maven coordinate) for every jar on the Chisel
    # classpath.  All are fetched from the default Maven mirrors under a
    # "notice" license.
    maven_jars = [
        # paranamer
        ("com_thoughtworks_paranamer", "com.thoughtworks.paranamer:paranamer:2.8"),
        # json4s
        ("org_json4s_json4s_ast", "org.json4s:json4s-ast_2.13:3.6.12"),
        ("org_json4s_json4s_scalap", "org.json4s:json4s-scalap_2.13:3.6.12"),
        ("org_json4s_json4s_core", "org.json4s:json4s-core_2.13:3.6.12"),
        ("org_json4s_json4s_native", "org.json4s:json4s-native_2.13:3.6.12"),
        # org.apache.commons
        ("org_apache_commons_commons_lang3", "org.apache.commons:commons-lang3:3.11"),
        ("org_apache_commons_commons_text", "org.apache.commons:commons-text:1.9"),
        # scopt
        ("com_github_scopt", "com.github.scopt:scopt_2.13:3.7.1"),
        # moultingyaml
        ("net_jcazevedo_moultingyaml", "net.jcazevedo:moultingyaml_2.13:0.4.2"),
        # FIRRTL
        ("edu_berkeley_cs_firrtl", "edu.berkeley.cs:firrtl_2.13:1.5.1"),
        # Chisel3.  Note the compiler plugin is published against the full
        # Scala version (2.13.6), unlike the binary-compatible _2.13 suffixes.
        ("edu_berkeley_cs_chisel3", "edu.berkeley.cs:chisel3_2.13:3.5.1"),
        ("edu_berkeley_cs_chisel3_core", "edu.berkeley.cs:chisel3-core_2.13:3.5.1"),
        ("edu_berkeley_cs_chisel3_macros", "edu.berkeley.cs:chisel3-macros_2.13:3.5.1"),
        ("edu_berkeley_cs_chisel3_plugin", "edu.berkeley.cs:chisel3-plugin_2.13.6:3.5.1"),
    ]
    for repo_name, coordinate in maven_jars:
        scala_maven_import_external(
            name = repo_name,
            artifact = coordinate,
            server_urls = default_maven_server_urls(),
            licenses = ["notice"],
        )
diff --git a/rules/repos.bzl b/rules/repos.bzl
new file mode 100644
index 0000000..dcf8658
--- /dev/null
+++ b/rules/repos.bzl
@@ -0,0 +1,44 @@
+# Kelvin repositories
+#
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
def kelvin_repos():
    """Fetches the Bazel rule repositories Kelvin builds on.

    Call from the WORKSPACE before loading //rules:deps.bzl.
    """
    http_archive(
        name = "bazel_skylib",
        sha256 = "b8a1527901774180afc798aeb28c4634bdccf19c4d98e7bdd1ce79d1fe9aaad7",
        urls = [
            "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.4.1/bazel-skylib-1.4.1.tar.gz",
            "https://github.com/bazelbuild/bazel-skylib/releases/download/1.4.1/bazel-skylib-1.4.1.tar.gz",
        ],
    )

    http_archive(
        name = "rules_hdl",
        sha256 = "223bce01f8375b29073a1475591c0c7e0d86c0d0b2ed73cbdb85f9e9dfa0dda3",
        strip_prefix = "bazel_rules_hdl-b58d34add60108ae20d273ee480193b25e96d000",
        urls = [
            "https://github.com/hdl/bazel_rules_hdl/archive/b58d34add60108ae20d273ee480193b25e96d000.tar.gz",
        ],
        # The patch files live in //external (see external/*.patch); bare
        # filenames do not resolve to that package, so spell the labels in full.
        patches = [
            "//external:0001-Update-version-of-Googletest-for-bazel-compatitibili.patch",
            "//external:0002-SystemC-support-for-verilator.patch",
        ],
    )

    # See https://github.com/bazelbuild/rules_scala/releases for up to date version information.
    rules_scala_version = "c711b4d1f0d1cc386c63ef748c9df14d2f3a187e"
    http_archive(
        name = "io_bazel_rules_scala",
        sha256 = "556677f505634da64efc41912d280895e61f5da109d82bdee41cde4120a190a1",
        strip_prefix = "rules_scala-%s" % rules_scala_version,
        type = "zip",
        url = "https://github.com/bazelbuild/rules_scala/archive/%s.zip" % rules_scala_version,
    )

    http_archive(
        name = "rules_foreign_cc",
        sha256 = "2a4d07cd64b0719b39a7c12218a3e507672b82a97b98c6a89d38565894cf7c51",
        strip_prefix = "rules_foreign_cc-0.9.0",
        url = "https://github.com/bazelbuild/rules_foreign_cc/archive/refs/tags/0.9.0.tar.gz",
    )
diff --git a/rules/verilator.bzl b/rules/verilator.bzl
new file mode 100644
index 0000000..834cd68
--- /dev/null
+++ b/rules/verilator.bzl
@@ -0,0 +1,223 @@
+# Modified from bazel_rules_hdl to use SystemC
+"""Functions for verilator."""
+
+load("@rules_hdl//verilog:providers.bzl", "VerilogInfo")
+load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
+
def cc_compile_and_link_static_library(ctx, srcs, hdrs, deps, runfiles, includes = [], defines = []):
    """Compile and link C++ source into a static library.

    Args:
        ctx: Context for rule
        srcs: The cpp sources generated by verilator.
        hdrs: The headers generated by verilator.
        deps: Library dependencies to build with.
        runfiles: Data dependencies that are read at runtime.
        includes: The includes for the verilator module to build.
        defines: Cpp defines to build with.

    Returns:
        A list of [DefaultInfo, CcInfo]: DefaultInfo carries the produced
        library file(s) and runfiles; CcInfo carries the compilation and
        linking contexts for downstream cc rules.
    """
    cc_toolchain = find_cpp_toolchain(ctx)
    feature_configuration = cc_common.configure_features(
        ctx = ctx,
        cc_toolchain = cc_toolchain,
        requested_features = ctx.features,
        unsupported_features = ctx.disabled_features,
    )

    # Compile against the compilation contexts of all dependencies.
    compilation_contexts = [dep[CcInfo].compilation_context for dep in deps]
    compilation_context, compilation_outputs = cc_common.compile(
        name = ctx.label.name,
        actions = ctx.actions,
        feature_configuration = feature_configuration,
        cc_toolchain = cc_toolchain,
        srcs = srcs,
        includes = includes,
        defines = defines,
        public_hdrs = hdrs,
        compilation_contexts = compilation_contexts,
    )

    # Link statically only (disallow_dynamic_library = True below).
    linking_contexts = [dep[CcInfo].linking_context for dep in deps]
    linking_context, linking_output = cc_common.create_linking_context_from_compilation_outputs(
        actions = ctx.actions,
        feature_configuration = feature_configuration,
        cc_toolchain = cc_toolchain,
        compilation_outputs = compilation_outputs,
        linking_contexts = linking_contexts,
        name = ctx.label.name,
        disallow_dynamic_library = True,
    )

    # Surface whichever library artifacts were actually produced.
    output_files = []
    if linking_output.library_to_link.static_library != None:
        output_files.append(linking_output.library_to_link.static_library)
    if linking_output.library_to_link.dynamic_library != None:
        output_files.append(linking_output.library_to_link.dynamic_library)

    return [
        DefaultInfo(files = depset(output_files), runfiles = ctx.runfiles(files = runfiles)),
        CcInfo(
            compilation_context = compilation_context,
            linking_context = linking_context,
        ),
    ]
+
# File-extension groups used to sort verilator output files.
_CPP_SRC = ["cc", "cpp", "cxx", "c++"]
_HPP_SRC = ["h", "hh", "hpp"]
_RUNFILES = ["dat", "mem"]

def _only_cpp(f):
    """Maps a file to its path when it is C++ source or a header; else None."""
    ext = f.extension
    return f.path if (ext in _CPP_SRC or ext in _HPP_SRC) else None

def _only_hpp(f):
    """Maps a file to its path when it is a C++ header; else None."""
    return f.path if f.extension in _HPP_SRC else None
+
# Shell snippet: first argument is the output directory, remaining arguments
# are the files to copy into it.
_COPY_TREE_SH = """
OUT=$1; shift && mkdir -p "$OUT" && cp $* "$OUT"
"""

def _copy_tree(ctx, idir, odir, map_each = None, progress_message = None):
    """Copy files from a TreeArtifact to a new directory.

    Args:
        ctx: Rule context used to register the copy action.
        idir: Input directory (TreeArtifact).
        odir: Declared output directory artifact.
        map_each: Optional per-file mapping applied when expanding idir; files
            mapped to None are omitted from the copy.
        progress_message: Message displayed while the action runs.

    Returns:
        odir, for convenient chaining at the call site.
    """
    args = ctx.actions.args()
    args.add(odir.path)
    args.add_all([idir], map_each = map_each)
    ctx.actions.run_shell(
        arguments = [args],
        command = _COPY_TREE_SH,
        inputs = [idir],
        outputs = [odir],
        progress_message = progress_message,
    )

    return odir
+
def _verilator_cc_library(ctx):
    """Rule implementation: verilate `module` and compile the output as C++.

    Runs verilator in SystemC mode (--sc) over the transitive Verilog sources,
    splits the generated tree into source/header directories, then compiles
    and statically links the result.
    """
    # Flatten the transitive Verilog source DAG into a single file list.
    transitive_srcs = depset([], transitive = [ctx.attr.module[VerilogInfo].dag])
    all_srcs = [verilog_info_struct.srcs for verilog_info_struct in transitive_srcs.to_list()]
    all_files = [src for sub_tuple in all_srcs for src in sub_tuple]

    # Filter out .dat files.
    # (.dat/.mem are runtime data: kept as runfiles, not fed to verilator.)
    runfiles = []
    verilog_files = []
    for file in all_files:
        if file.extension in _RUNFILES:
            runfiles.append(file)
        else:
            verilog_files.append(file)

    # Verilator emits a mixed tree; the .cpp/.h copies below give the C++
    # compile separate source and include directories.
    verilator_output = ctx.actions.declare_directory(ctx.label.name + "-gen")
    verilator_output_cpp = ctx.actions.declare_directory(ctx.label.name + ".cpp")
    verilator_output_hpp = ctx.actions.declare_directory(ctx.label.name + ".h")

    prefix = "V" + ctx.attr.module_top

    args = ctx.actions.args()
    args.add("--sc")  # Generate SystemC (rather than plain C++) output.
    args.add("--pins-bv", "2")
    args.add("--Mdir", verilator_output.path)
    args.add("--top-module", ctx.attr.module_top)
    args.add("--prefix", prefix)
    if ctx.attr.trace:
        args.add("--trace")
    for verilog_file in verilog_files:
        args.add(verilog_file.path)
    args.add_all(ctx.attr.vopts, expand_directories = False)

    ctx.actions.run(
        arguments = [args],
        executable = ctx.executable._verilator,
        inputs = verilog_files,
        outputs = [verilator_output],
        progress_message = "[Verilator] Compiling {}".format(ctx.label),
    )

    _copy_tree(
        ctx,
        verilator_output,
        verilator_output_cpp,
        map_each = _only_cpp,
        progress_message = "[Verilator] Extracting C++ source files",
    )
    _copy_tree(
        ctx,
        verilator_output,
        verilator_output_hpp,
        map_each = _only_hpp,
        progress_message = "[Verilator] Extracting C++ header files",
    )

    # Do actual compile
    defines = ["VM_TRACE"] if ctx.attr.trace else []
    deps = [ctx.attr._verilator_lib, ctx.attr._zlib, ctx.attr._verilator_svdpi]

    return cc_compile_and_link_static_library(
        ctx,
        srcs = [verilator_output_cpp],
        hdrs = [verilator_output_hpp],
        defines = defines,
        runfiles = runfiles,
        includes = [verilator_output_hpp.path],
        deps = deps,
    )
+
# Verilates a verilog_library into a statically linked C++/SystemC library.
# Modified from bazel_rules_hdl's rule to generate SystemC output (--sc).
verilator_cc_library = rule(
    _verilator_cc_library,
    attrs = {
        "module": attr.label(
            doc = "The top level module target to verilate.",
            providers = [VerilogInfo],
            mandatory = True,
        ),
        "module_top": attr.string(
            doc = "The name of the verilog module to verilate.",
            mandatory = True,
        ),
        "trace": attr.bool(
            doc = "Enable tracing for Verilator",
            default = True,
        ),
        "vopts": attr.string_list(
            doc = "Additional command line options to pass to Verilator",
            default = ["-Wall"],
        ),
        "_cc_toolchain": attr.label(
            doc = "CC compiler.",
            default = Label("@bazel_tools//tools/cpp:current_cc_toolchain"),
        ),
        "_verilator": attr.label(
            doc = "Verilator binary.",
            executable = True,
            cfg = "exec",
            default = Label("@verilator//:verilator_executable"),
        ),
        "_verilator_lib": attr.label(
            doc = "Verilator library",
            default = Label("@verilator//:libverilator"),
        ),
        "_verilator_svdpi": attr.label(
            doc = "Verilator svdpi lib",
            default = Label("@verilator//:svdpi"),
        ),
        "_zlib": attr.label(
            doc = "zlib dependency",
            default = Label("@net_zlib//:zlib"),
        ),
    },
    provides = [
        CcInfo,
        DefaultInfo,
    ],
    toolchains = [
        "@bazel_tools//tools/cpp:toolchain_type",
    ],
    fragments = ["cpp"],
)
\ No newline at end of file
diff --git a/tests/verilator_sim/BUILD b/tests/verilator_sim/BUILD
new file mode 100644
index 0000000..409f100
--- /dev/null
+++ b/tests/verilator_sim/BUILD
@@ -0,0 +1,237 @@
# SystemC/Verilator test benches for the Kelvin hardware blocks.

# Shared testbench support headers.
cc_library(
    name = "sim_libs",
    hdrs = [
        "fifo.h",
        "sysc_module.h",
        "sysc_tb.h",
    ],
)

# Kelvin interface models (core/debug/memory) configured for a 256-bit SIMD
# datapath via KELVIN_SIMD.
cc_library(
    name = "kelvin_if",
    hdrs = [
        "kelvin/core_if.h",
        "kelvin/debug_if.h",
        "kelvin/kelvin_cfg.h",
        "kelvin/memory_if.h",
    ],
    defines = ["KELVIN_SIMD=256"],
)

# Full-core simulator.
cc_binary(
    name = "core_sim",
    srcs = [
        "kelvin/core_tb.cc",
    ],
    deps = [
        ":sim_libs",
        ":kelvin_if",
        "//hdl/chisel:core_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

cc_binary(
    name = "dbus2axi_tb",
    srcs = [
        "kelvin/dbus2axi_tb.cc",
    ],
    deps = [
        ":sim_libs",
        "//hdl/chisel:dbus2axi_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

cc_binary(
    name = "l1dcache_tb",
    srcs = [
        "kelvin/l1dcache_tb.cc",
    ],
    deps = [
        ":kelvin_if",
        ":sim_libs",
        "//hdl/chisel:l1dcache_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

cc_binary(
    name = "l1icache_tb",
    srcs = [
        "kelvin/l1icache_tb.cc",
    ],
    deps = [
        ":kelvin_if",
        ":sim_libs",
        "//hdl/chisel:l1icache_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

# TODO(derekjchow): Add valu and valuint test benches

# Vector instruction decode helpers shared by the decode test benches.
cc_library(
    name = "vdecode",
    hdrs = [
        "kelvin/vdecode.h",
    ],
    deps = [
        ":vdecodeop",
        ":vencodeop",
    ],
)

cc_library(
    name = "vdecodeop",
    hdrs = [
        "kelvin/vdecodeop.h",
    ],
)

cc_library(
    name = "vencodeop",
    hdrs = [
        "kelvin/vencodeop.h",
    ],
)

cc_binary(
    name = "vcmdq_tb",
    srcs = [
        "kelvin/vcmdq_tb.cc",
    ],
    deps = [
        ":kelvin_if",
        ":sim_libs",
        ":vencodeop",
        "//hdl/chisel:vcmdq_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

cc_binary(
    name = "vconvalu_tb",
    srcs = [
        "kelvin/vconvalu_tb.cc",
    ],
    deps = [
        ":kelvin_if",
        ":sim_libs",
        "//hdl/chisel:vconvalu_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

# TODO(derekjchow): Fix broken test
cc_binary(
    name = "vconvctrl_tb",
    srcs = [
        "kelvin/vconvctrl_tb.cc",
    ],
    deps = [
        ":kelvin_if",
        ":sim_libs",
        ":vencodeop",
        "//hdl/chisel:vconvctrl_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

# TODO(derekjchow): Fix broken test
cc_binary(
    name = "vdecodeinstruction_tb",
    srcs = [
        "kelvin/vdecodeinstruction_tb.cc",
    ],
    deps = [
        ":kelvin_if",
        ":sim_libs",
        ":vdecode",
        "//hdl/chisel:vdecodeinstruction_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

# TODO(derekjchow): Fix broken test
cc_binary(
    name = "vdecode_tb",
    srcs = [
        "kelvin/vdecode_tb.cc",
    ],
    deps = [
        ":kelvin_if",
        ":sim_libs",
        ":vdecode",
        "//hdl/chisel:vdecode_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

cc_binary(
    name = "vldst_tb",
    srcs = [
        "kelvin/vldst_tb.cc",
    ],
    deps = [
        ":kelvin_if",
        ":sim_libs",
        ":vencodeop",
        "//hdl/chisel:vldst_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

cc_binary(
    name = "vld_tb",
    srcs = [
        "kelvin/vld_tb.cc",
    ],
    deps = [
        ":kelvin_if",
        ":sim_libs",
        ":vencodeop",
        "//hdl/chisel:vld_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

cc_binary(
    name = "vregfilesegment_tb",
    srcs = [
        "kelvin/vregfilesegment_tb.cc",
    ],
    deps = [
        ":kelvin_if",
        ":sim_libs",
        "//hdl/chisel:vregfilesegment_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

cc_binary(
    name = "vregfile_tb",
    srcs = [
        "kelvin/vregfile_tb.cc",
    ],
    deps = [
        ":kelvin_if",
        ":sim_libs",
        "//hdl/chisel:vregfile_cc_library",
        "@accellera_systemc//:systemc",
    ],
)

cc_binary(
    name = "vst_tb",
    srcs = [
        "kelvin/vst_tb.cc",
    ],
    deps = [
        ":kelvin_if",
        ":sim_libs",
        ":vencodeop",
        "//hdl/chisel:vst_cc_library",
        "@accellera_systemc//:systemc",
    ],
)
+)
diff --git a/tests/verilator_sim/fifo.h b/tests/verilator_sim/fifo.h
new file mode 100644
index 0000000..4e492ec
--- /dev/null
+++ b/tests/verilator_sim/fifo.h
@@ -0,0 +1,58 @@
#ifndef TESTS_VERILATOR_SIM_FIFO_H_
#define TESTS_VERILATOR_SIM_FIFO_H_

// Fix: the header used std::vector and ::rand without including their
// headers, relying on includers to provide them; it is now self-contained.
#include <cstdlib>
#include <vector>

// A SystemC CRT transaction queue.
//
// Simple FIFO used by test benches to queue expected transactions.  Random
// selection uses ::rand(), so sequences are reproducible under a fixed seed.
// Not thread-safe.
template <typename T>
class fifo_t {
 public:
  // Returns true when the queue holds no entries.
  bool empty() { return entries_.empty(); }

  // Appends a value at the back of the queue.
  void write(T v) { entries_.emplace_back(v); }

  // Pops the front entry into `v`.  Returns false when empty.
  bool read(T& v) {
    if (entries_.empty()) return false;
    v = entries_.at(0);
    entries_.erase(entries_.begin());
    return true;
  }

  // Peeks the entry at `index` without removing it.  Returns false when
  // `index` is out of range.
  bool next(T& v, int index = 0) {
    if (index >= count()) return false;
    v = entries_.at(index);
    return true;
  }

  // Peeks a randomly chosen entry without removing it.  Returns false when
  // empty.
  bool rand(T& v) {
    if (entries_.empty()) return false;
    int index = ::rand() % count();
    v = entries_.at(index);
    return true;
  }

  // Removes all entries.
  void clear() { entries_.clear(); }

  // Removes the entry at `index`.  Returns false when out of range.
  bool remove(int index = 0) {
    if (index >= count()) return false;
    entries_.erase(entries_.begin() + index);
    return true;
  }

  // Randomly permutes the queue by repeatedly moving a random entry to the
  // back.  Note: not a uniform (Fisher-Yates) shuffle, but adequate for
  // randomizing transaction order in tests.
  void shuffle() {
    const int count = entries_.size();
    if (count < 2) return;
    for (int i = 0; i < count; ++i) {
      const int index = ::rand() % count;
      T v = entries_.at(index);
      entries_.erase(entries_.begin() + index);
      entries_.emplace_back(v);
    }
  }

  // Number of entries currently queued.
  int count() { return entries_.size(); }

 private:
  std::vector<T> entries_;
};

#endif  // TESTS_VERILATOR_SIM_FIFO_H_
diff --git a/tests/verilator_sim/kelvin/core_if.h b/tests/verilator_sim/kelvin/core_if.h
new file mode 100644
index 0000000..9ec703c
--- /dev/null
+++ b/tests/verilator_sim/kelvin/core_if.h
@@ -0,0 +1,300 @@
+#ifndef TESTS_VERILATOR_SIM_KELVIN_CORE_IF_H_
+#define TESTS_VERILATOR_SIM_KELVIN_CORE_IF_H_
+
+#include "tests/verilator_sim/fifo.h"
+#include "tests/verilator_sim/kelvin/kelvin_cfg.h"
+#include "tests/verilator_sim/kelvin/kelvin_cfg.h"
+#include "tests/verilator_sim/kelvin/memory_if.h"
+
+// Minimum number of cycles a queued AXI read waits before its response is
+// presented, modeling slave latency.
+constexpr int kAxiWaitState = 3;
+
+// Coin flip used to randomize ready/valid backpressure each cycle.
+static bool rand_bool() {
+  return rand() & 1;
+}
+
+// The per-interface helpers below wrap rand_bool() in "#if 1" blocks so an
+// individual bus can be pinned to always-ready/always-valid for debugging.
+static bool rand_bool_ibus() {
+#if 1
+  return rand_bool();
+#else
+  return true;
+#endif
+}
+
+static bool rand_bool_dbus() {
+#if 1
+  return rand_bool();
+#else
+  return true;
+#endif
+}
+
+static bool rand_bool_axi_w() {
+#if 1
+  return rand_bool();
+#else
+  return true;
+#endif
+}
+
+static bool rand_bool_axi_r() {
+#if 1
+  return rand_bool();
+#else
+  return true;
+#endif
+}
+
+// ScalarCore Memory Interface.
+struct Core_if : Memory_if {
+ sc_in<bool> io_ibus_valid;
+ sc_out<bool> io_ibus_ready;
+ sc_in<sc_bv<32> > io_ibus_addr;
+ sc_out<sc_bv<256> > io_ibus_rdata;
+
+ sc_in<bool> io_dbus_valid;
+ sc_out<bool> io_dbus_ready;
+ sc_in<bool> io_dbus_write;
+ sc_in<sc_bv<32> > io_dbus_addr;
+ sc_in<sc_bv<32> > io_dbus_adrx;
+ sc_in<sc_bv<kDbusBits> > io_dbus_size;
+ sc_in<sc_bv<kVector> > io_dbus_wdata;
+ sc_in<sc_bv<kVector / 8> > io_dbus_wmask;
+ sc_out<sc_bv<kVector> > io_dbus_rdata;
+
+ sc_out<bool> io_axi0_write_addr_ready;
+ sc_in<bool> io_axi0_write_addr_valid;
+ sc_in<sc_bv<32> > io_axi0_write_addr_bits_addr;
+ sc_in<sc_bv<kUncId> > io_axi0_write_addr_bits_id;
+ sc_out<bool> io_axi0_write_data_ready;
+ sc_in<bool> io_axi0_write_data_valid;
+ sc_in<sc_bv<kUncBits> > io_axi0_write_data_bits_data;
+ sc_in<sc_bv<kUncStrb> > io_axi0_write_data_bits_strb;
+ sc_in<bool> io_axi0_write_resp_ready;
+ sc_out<bool> io_axi0_write_resp_valid;
+ sc_out<sc_bv<kUncId> > io_axi0_write_resp_bits_id;
+ sc_out<sc_bv<2> > io_axi0_write_resp_bits_resp;
+ sc_out<bool> io_axi0_read_addr_ready;
+ sc_in<bool> io_axi0_read_addr_valid;
+ sc_in<sc_bv<32> > io_axi0_read_addr_bits_addr;
+ sc_in<sc_bv<kUncId> > io_axi0_read_addr_bits_id;
+ sc_in<bool> io_axi0_read_data_ready;
+ sc_out<bool> io_axi0_read_data_valid;
+ sc_out<sc_bv<2> > io_axi0_read_data_bits_resp;
+ sc_out<sc_bv<kUncId> > io_axi0_read_data_bits_id;
+ sc_out<sc_bv<kUncBits> > io_axi0_read_data_bits_data;
+ sc_out<bool> io_axi1_write_addr_ready;
+ sc_in<bool> io_axi1_write_addr_valid;
+ sc_in<sc_bv<32> > io_axi1_write_addr_bits_addr;
+ sc_in<sc_bv<kUncId> > io_axi1_write_addr_bits_id;
+ sc_out<bool> io_axi1_write_data_ready;
+ sc_in<bool> io_axi1_write_data_valid;
+ sc_in<sc_bv<kUncBits> > io_axi1_write_data_bits_data;
+ sc_in<sc_bv<kUncStrb> > io_axi1_write_data_bits_strb;
+ sc_in<bool> io_axi1_write_resp_ready;
+ sc_out<bool> io_axi1_write_resp_valid;
+ sc_out<sc_bv<kUncId> > io_axi1_write_resp_bits_id;
+ sc_out<sc_bv<2> > io_axi1_write_resp_bits_resp;
+ sc_out<bool> io_axi1_read_addr_ready;
+ sc_in<bool> io_axi1_read_addr_valid;
+ sc_in<sc_bv<32> > io_axi1_read_addr_bits_addr;
+ sc_in<sc_bv<kUncId> > io_axi1_read_addr_bits_id;
+ sc_in<bool> io_axi1_read_data_ready;
+ sc_out<bool> io_axi1_read_data_valid;
+ sc_out<sc_bv<2> > io_axi1_read_data_bits_resp;
+ sc_out<sc_bv<kUncId> > io_axi1_read_data_bits_id;
+ sc_out<sc_bv<kUncBits> > io_axi1_read_data_bits_data;
+
+ Core_if(sc_module_name n, const char* bin) : Memory_if(n, bin) {
+ for (int i = 0; i < kUncBits / 32; ++i) {
+ runused_.set_word(i, 0);
+ }
+ }
+
+ void eval() {
+ if (reset) {
+ io_ibus_ready = false;
+ io_axi0_read_addr_ready = false;
+ io_axi0_read_data_valid = false;
+ io_axi0_write_addr_ready = false;
+ io_axi0_write_data_ready = false;
+ io_axi0_write_resp_valid = false;
+ io_axi1_read_addr_ready = false;
+ io_axi1_read_data_valid = false;
+ io_axi1_write_addr_ready = false;
+ io_axi1_write_data_ready = false;
+ io_axi1_write_resp_valid = false;
+ } else if (clock->posedge()) {
+ cycle_++;
+
+ const bool axi0_write_ready = rand_bool_axi_w();
+ const bool axi1_write_ready = rand_bool_axi_w();
+
+ io_ibus_ready = rand_bool_ibus();
+ io_dbus_ready = rand_bool_dbus();
+ io_axi0_read_addr_ready = true;
+ io_axi0_write_addr_ready = axi0_write_ready;
+ io_axi0_write_data_ready = axi0_write_ready;
+ io_axi0_write_resp_valid = false;
+ io_axi1_read_addr_ready = true;
+ io_axi1_write_addr_ready = axi1_write_ready;
+ io_axi1_write_data_ready = axi1_write_ready;
+ io_axi1_write_resp_valid = false;
+
+ // Instruction bus read.
+ if (io_ibus_valid && io_ibus_ready) {
+ sc_bv<256> rdata;
+ uint32_t addr = io_ibus_addr.read().get_word(0);
+ uint32_t words[256 / 32];
+ Read(addr, 256 / 8, (uint8_t*) words);
+
+ for (int i = 0; i < 256 / 32; ++i) {
+ rdata.set_word(i, words[i]);
+ }
+
+ io_ibus_rdata = rdata;
+ }
+
+ // Data bus read.
+ if (io_dbus_valid && io_dbus_ready && !io_dbus_write) {
+ sc_bv<kVector> rdata;
+ uint32_t addr = io_dbus_addr.read().get_word(0);
+ uint32_t words[kVector / 32] = {0};
+ memset(words, 0xcc, sizeof(words));
+ int bytes = io_dbus_size.read().get_word(0);
+ Read(addr, bytes, (uint8_t*) words);
+ ReadSwizzle(addr, kVector / 8, (uint8_t*) words);
+ for (int i = 0; i < kVector / 32; ++i) {
+ rdata.set_word(i, words[i]);
+ }
+ io_dbus_rdata = rdata;
+ }
+
+ // Data bus write.
+ if (io_dbus_valid && io_dbus_ready && io_dbus_write) {
+ sc_bv<kVector> wdata = io_dbus_wdata;
+ uint32_t addr = io_dbus_addr.read().get_word(0);
+ uint32_t words[kVector / 32];
+ int bytes = io_dbus_size.read().get_word(0);
+ for (int i = 0; i < kVector / 32; ++i) {
+ words[i] = wdata.get_word(i);
+ }
+ WriteSwizzle(addr, kVector / 8, (uint8_t*) words);
+ Write(addr, bytes, (uint8_t*) words);
+ }
+
+ rtcm_t tcm_read;
+ sc_bv<kUncBits> rdata;
+
+ // axi0 read.
+ if (io_axi0_read_addr_valid && io_axi0_read_addr_ready) {
+ uint32_t addr = io_axi0_read_addr_bits_addr.read().get_word(0);
+ uint32_t words[kUncBits / 32];
+ Read(addr, kUncBits / 8, (uint8_t*) words);
+
+ tcm_read.cycle = cycle_;
+ tcm_read.id = io_axi0_read_addr_bits_id.read().get_word(0);
+ for (int i = 0; i < kUncBits / 32; ++i) {
+ tcm_read.data.set_word(i, words[i]);
+ }
+ rtcm_[0].write(tcm_read);
+ }
+
+ bool read0 = rand_bool_axi_r() && rtcm_[0].next(tcm_read);
+ if (read0 && (cycle_ - tcm_read.cycle) >= kAxiWaitState) {
+ assert(rtcm_[0].remove());
+ io_axi0_read_data_bits_id = tcm_read.id;
+ io_axi0_read_data_bits_data = tcm_read.data;
+ } else {
+ read0 = false;
+ io_axi0_read_data_bits_id = 0;
+ io_axi0_read_data_bits_data = runused_;
+ }
+ io_axi0_read_data_valid = read0;
+
+ // axi0 write.
+ if (io_axi0_write_addr_valid && io_axi0_write_addr_ready) {
+ assert(io_axi0_write_data_valid && io_axi0_write_data_valid);
+ uint8_t wdata[kUncBits / 8];
+ uint32_t addr = io_axi0_write_addr_bits_addr.read().get_word(0);
+ uint32_t* p_wdata = (uint32_t*) wdata;
+
+ for (int i = 0; i < kUncBits / 32; ++i) {
+ p_wdata[i] = io_axi0_write_data_bits_data.read().get_word(i);
+ }
+
+ for (int i = 0; i < kUncBits / 8; ++i) {
+ if (io_axi0_write_data_bits_strb.read().get_bit(i) != 0) {
+ Write(addr + i, 1, wdata + i);
+ }
+ }
+ }
+
+ if (io_axi0_write_addr_valid && io_axi0_write_addr_ready) {
+ io_axi0_write_resp_valid = true;
+ io_axi0_write_resp_bits_id = io_axi0_write_addr_bits_id;
+ }
+
+ // axi1 read.
+ if (io_axi1_read_addr_valid && io_axi1_read_addr_ready) {
+ uint32_t addr = io_axi1_read_addr_bits_addr.read().get_word(0);
+ uint32_t words[kUncBits / 32];
+ Read(addr, kUncBits / 8, (uint8_t*) words);
+
+ tcm_read.cycle = cycle_;
+ tcm_read.id = io_axi1_read_addr_bits_id.read().get_word(0);
+ for (int i = 0; i < kUncBits / 32; ++i) {
+ tcm_read.data.set_word(i, words[i]);
+ }
+ rtcm_[1].write(tcm_read);
+ }
+
+ bool read1 = rand_bool_axi_r() && rtcm_[1].next(tcm_read);
+ if (read1 && (cycle_ - tcm_read.cycle) >= kAxiWaitState) {
+ assert(rtcm_[1].remove());
+ io_axi1_read_data_bits_id = tcm_read.id;
+ io_axi1_read_data_bits_data = tcm_read.data;
+ } else {
+ read1 = false;
+ io_axi1_read_data_bits_id = 0;
+ io_axi1_read_data_bits_data = runused_;
+ }
+ io_axi1_read_data_valid = read1;
+
+ // axi1 write.
+ if (io_axi1_write_addr_valid && io_axi1_write_addr_ready) {
+ assert(io_axi1_write_data_valid && io_axi1_write_data_valid);
+ uint8_t wdata[kUncBits / 8];
+ uint32_t addr = io_axi1_write_addr_bits_addr.read().get_word(0);
+ uint32_t* p_wdata = (uint32_t*) wdata;
+
+ for (int i = 0; i < kUncBits / 32; ++i) {
+ p_wdata[i] = io_axi1_write_data_bits_data.read().get_word(i);
+ }
+
+ for (int i = 0; i < kUncBits / 8; ++i) {
+ if (io_axi1_write_data_bits_strb.read().get_bit(i) != 0) {
+ Write(addr + i, 1, wdata + i);
+ }
+ }
+ }
+
+ if (io_axi1_write_addr_valid && io_axi1_write_addr_ready) {
+ io_axi1_write_resp_valid = true;
+ io_axi1_write_resp_bits_id = io_axi1_write_addr_bits_id;
+ }
+ }
+ }
+
+private:
+ uint32_t cycle_ = 0;
+
+ struct rtcm_t {
+ uint32_t cycle;
+ uint32_t id : 7;
+ sc_bv<kUncBits> data;
+ };
+
+ fifo_t<rtcm_t> rtcm_[2];
+ sc_bv<kUncBits> runused_;
+};
+
+#endif // TESTS_VERILATOR_SIM_KELVIN_CORE_IF_H_
diff --git a/tests/verilator_sim/kelvin/core_tb.cc b/tests/verilator_sim/kelvin/core_tb.cc
new file mode 100644
index 0000000..e2a6e10
--- /dev/null
+++ b/tests/verilator_sim/kelvin/core_tb.cc
@@ -0,0 +1,270 @@
+#include "tests/verilator_sim/sysc_tb.h"
+
+#include "VCore.h"
+
+#include "tests/verilator_sim/kelvin/core_if.h"
+#include "tests/verilator_sim/kelvin/debug_if.h"
+#include "tests/verilator_sim/kelvin/kelvin_cfg.h"
+
+
+// Top-level testbench: any asserted fault fails the test; the simulation
+// stops cleanly when the core reports halted.
+struct Core_tb : Sysc_tb {
+  sc_in<bool> io_halted;
+  sc_in<bool> io_fault;
+
+  using Sysc_tb::Sysc_tb;  // constructor
+
+  void posedge() {
+    check(!io_fault, "io_fault");
+    if (io_halted) sc_stop();
+  }
+};
+
+// Instantiates the Verilated Core, the memory model, and the debug logger,
+// wires every port, and runs the simulation.  |bin| is the program image;
+// |trace| enables waveform dumping.
+static void Core_run(const char* name, const char* bin, const bool trace) {
+  VCore core(name);
+  // NOTE(review): the third constructor argument here is the libc random()
+  // function pointer, which implicitly converts to boolean true.
+  // Presumably randomized stimulus is intended — confirm, and prefer an
+  // explicit `true /*random*/` as used in the other testbenches.
+  Core_tb tb("Core_tb", 100000000, random);
+  Core_if mif("Core_if", bin);
+  Debug_if dbg("Debug_if", &mif);
+
+  // Signals connecting the core, memory model, and testbench.
+  sc_signal<bool> io_halted;
+  sc_signal<bool> io_fault;
+  sc_signal<bool> io_ibus_valid;
+  sc_signal<bool> io_ibus_ready;
+  sc_signal<bool> io_dbus_valid;
+  sc_signal<bool> io_dbus_ready;
+  sc_signal<bool> io_dbus_write;
+  sc_signal<bool> io_iflush_valid;
+  sc_signal<bool> io_iflush_ready;
+  sc_signal<bool> io_dflush_valid;
+  sc_signal<bool> io_dflush_ready;
+  sc_signal<bool> io_dflush_all;
+  sc_signal<bool> io_dflush_clean;
+  sc_signal<bool> io_slog_valid;
+  sc_signal<sc_bv<32> > io_csr_in_value_0;
+  sc_signal<sc_bv<32> > io_csr_in_value_1;
+  sc_signal<sc_bv<32> > io_csr_in_value_2;
+  sc_signal<sc_bv<32> > io_csr_in_value_3;
+  sc_signal<sc_bv<32> > io_csr_in_value_4;
+  sc_signal<sc_bv<32> > io_csr_in_value_5;
+  sc_signal<sc_bv<32> > io_csr_in_value_6;
+  sc_signal<sc_bv<32> > io_csr_in_value_7;
+  sc_signal<sc_bv<32> > io_csr_in_value_8;
+  sc_signal<sc_bv<32> > io_csr_in_value_9;
+  sc_signal<sc_bv<32> > io_csr_in_value_10;
+  sc_signal<sc_bv<32> > io_csr_in_value_11;
+  sc_signal<sc_bv<32> > io_csr_out_value_0;
+  sc_signal<sc_bv<32> > io_csr_out_value_1;
+  sc_signal<sc_bv<32> > io_csr_out_value_2;
+  sc_signal<sc_bv<32> > io_csr_out_value_3;
+  sc_signal<sc_bv<32> > io_csr_out_value_4;
+  sc_signal<sc_bv<32> > io_csr_out_value_5;
+  sc_signal<sc_bv<32> > io_csr_out_value_6;
+  sc_signal<sc_bv<32> > io_csr_out_value_7;
+  sc_signal<sc_bv<32> > io_ibus_addr;
+  sc_signal<sc_bv<256> > io_ibus_rdata;
+  sc_signal<sc_bv<32> > io_dbus_addr;
+  sc_signal<sc_bv<32> > io_dbus_adrx;
+  sc_signal<sc_bv<kDbusBits> > io_dbus_size;
+  sc_signal<sc_bv<kVector> > io_dbus_wdata;
+  sc_signal<sc_bv<kVector / 8> > io_dbus_wmask;
+  sc_signal<sc_bv<kVector> > io_dbus_rdata;
+  sc_signal<sc_bv<5> > io_slog_addr;
+  sc_signal<sc_bv<32> > io_slog_data;
+  sc_signal<sc_bv<4> > io_debug_en;
+  sc_signal<sc_bv<32> > io_debug_addr0;
+  sc_signal<sc_bv<32> > io_debug_addr1;
+  sc_signal<sc_bv<32> > io_debug_addr2;
+  sc_signal<sc_bv<32> > io_debug_addr3;
+  sc_signal<sc_bv<32> > io_debug_inst0;
+  sc_signal<sc_bv<32> > io_debug_inst1;
+  sc_signal<sc_bv<32> > io_debug_inst2;
+  sc_signal<sc_bv<32> > io_debug_inst3;
+  sc_signal<sc_bv<32> > io_debug_cycles;
+  sc_signal<bool> io_axi0_write_addr_ready;
+  sc_signal<bool> io_axi0_write_addr_valid;
+  sc_signal<sc_bv<32> > io_axi0_write_addr_bits_addr;
+  sc_signal<sc_bv<kUncId> > io_axi0_write_addr_bits_id;
+  sc_signal<bool> io_axi0_write_data_ready;
+  sc_signal<bool> io_axi0_write_data_valid;
+  sc_signal<sc_bv<kUncBits> > io_axi0_write_data_bits_data;
+  sc_signal<sc_bv<kUncStrb> > io_axi0_write_data_bits_strb;
+  sc_signal<bool> io_axi0_write_resp_ready;
+  sc_signal<bool> io_axi0_write_resp_valid;
+  sc_signal<sc_bv<kUncId> > io_axi0_write_resp_bits_id;
+  sc_signal<sc_bv<2> > io_axi0_write_resp_bits_resp;
+  sc_signal<bool> io_axi0_read_addr_ready;
+  sc_signal<bool> io_axi0_read_addr_valid;
+  sc_signal<sc_bv<32> > io_axi0_read_addr_bits_addr;
+  sc_signal<sc_bv<kUncId> > io_axi0_read_addr_bits_id;
+  sc_signal<bool> io_axi0_read_data_ready;
+  sc_signal<bool> io_axi0_read_data_valid;
+  sc_signal<sc_bv<2> > io_axi0_read_data_bits_resp;
+  sc_signal<sc_bv<kUncId> > io_axi0_read_data_bits_id;
+  sc_signal<sc_bv<kUncBits> > io_axi0_read_data_bits_data;
+  sc_signal<bool> io_axi1_write_addr_ready;
+  sc_signal<bool> io_axi1_write_addr_valid;
+  sc_signal<sc_bv<32> > io_axi1_write_addr_bits_addr;
+  sc_signal<sc_bv<kUncId> > io_axi1_write_addr_bits_id;
+  sc_signal<bool> io_axi1_write_data_ready;
+  sc_signal<bool> io_axi1_write_data_valid;
+  sc_signal<sc_bv<kUncBits> > io_axi1_write_data_bits_data;
+  sc_signal<sc_bv<kUncStrb> > io_axi1_write_data_bits_strb;
+  sc_signal<bool> io_axi1_write_resp_ready;
+  sc_signal<bool> io_axi1_write_resp_valid;
+  sc_signal<sc_bv<kUncId> > io_axi1_write_resp_bits_id;
+  sc_signal<sc_bv<2> > io_axi1_write_resp_bits_resp;
+  sc_signal<bool> io_axi1_read_addr_ready;
+  sc_signal<bool> io_axi1_read_addr_valid;
+  sc_signal<sc_bv<32> > io_axi1_read_addr_bits_addr;
+  sc_signal<sc_bv<kUncId> > io_axi1_read_addr_bits_id;
+  sc_signal<bool> io_axi1_read_data_ready;
+  sc_signal<bool> io_axi1_read_data_valid;
+  sc_signal<sc_bv<2> > io_axi1_read_data_bits_resp;
+  sc_signal<sc_bv<kUncId> > io_axi1_read_data_bits_id;
+  sc_signal<sc_bv<kUncBits> > io_axi1_read_data_bits_data;
+
+  // Cache flush interfaces are not modeled: always ready.
+  io_iflush_ready = 1;
+  io_dflush_ready = 1;
+
+  tb.io_halted(io_halted);
+  tb.io_fault(io_fault);
+
+  // Core port bindings.
+  core.clock(tb.clock);
+  core.reset(tb.reset);
+  core.io_halted(io_halted);
+  core.io_fault(io_fault);
+  core.io_ibus_valid(io_ibus_valid);
+  core.io_ibus_ready(io_ibus_ready);
+  core.io_dbus_valid(io_dbus_valid);
+  core.io_dbus_ready(io_dbus_ready);
+  core.io_dbus_write(io_dbus_write);
+  core.io_iflush_valid(io_iflush_valid);
+  core.io_iflush_ready(io_iflush_ready);
+  core.io_dflush_valid(io_dflush_valid);
+  core.io_dflush_ready(io_dflush_ready);
+  core.io_dflush_all(io_dflush_all);
+  core.io_dflush_clean(io_dflush_clean);
+  core.io_slog_valid(io_slog_valid);
+  core.io_csr_in_value_0(io_csr_in_value_0);
+  core.io_csr_in_value_1(io_csr_in_value_1);
+  core.io_csr_in_value_2(io_csr_in_value_2);
+  core.io_csr_in_value_3(io_csr_in_value_3);
+  core.io_csr_in_value_4(io_csr_in_value_4);
+  core.io_csr_in_value_5(io_csr_in_value_5);
+  core.io_csr_in_value_6(io_csr_in_value_6);
+  core.io_csr_in_value_7(io_csr_in_value_7);
+  core.io_csr_in_value_8(io_csr_in_value_8);
+  core.io_csr_in_value_9(io_csr_in_value_9);
+  core.io_csr_in_value_10(io_csr_in_value_10);
+  core.io_csr_in_value_11(io_csr_in_value_11);
+  core.io_csr_out_value_0(io_csr_out_value_0);
+  core.io_csr_out_value_1(io_csr_out_value_1);
+  core.io_csr_out_value_2(io_csr_out_value_2);
+  core.io_csr_out_value_3(io_csr_out_value_3);
+  core.io_csr_out_value_4(io_csr_out_value_4);
+  core.io_csr_out_value_5(io_csr_out_value_5);
+  core.io_csr_out_value_6(io_csr_out_value_6);
+  core.io_csr_out_value_7(io_csr_out_value_7);
+  core.io_ibus_addr(io_ibus_addr);
+  core.io_ibus_rdata(io_ibus_rdata);
+  core.io_dbus_addr(io_dbus_addr);
+  core.io_dbus_adrx(io_dbus_adrx);
+  core.io_dbus_size(io_dbus_size);
+  core.io_dbus_wdata(io_dbus_wdata);
+  core.io_dbus_wmask(io_dbus_wmask);
+  core.io_dbus_rdata(io_dbus_rdata);
+  core.io_slog_addr(io_slog_addr);
+  core.io_slog_data(io_slog_data);
+  core.io_debug_en(io_debug_en);
+  core.io_debug_addr0(io_debug_addr0);
+  core.io_debug_addr1(io_debug_addr1);
+  core.io_debug_addr2(io_debug_addr2);
+  core.io_debug_addr3(io_debug_addr3);
+  core.io_debug_inst0(io_debug_inst0);
+  core.io_debug_inst1(io_debug_inst1);
+  core.io_debug_inst2(io_debug_inst2);
+  core.io_debug_inst3(io_debug_inst3);
+  core.io_debug_cycles(io_debug_cycles);
+
+  // Memory model port bindings.
+  mif.clock(tb.clock);
+  mif.reset(tb.reset);
+  mif.io_ibus_valid(io_ibus_valid);
+  mif.io_ibus_ready(io_ibus_ready);
+  mif.io_ibus_addr(io_ibus_addr);
+  mif.io_ibus_rdata(io_ibus_rdata);
+  mif.io_dbus_valid(io_dbus_valid);
+  mif.io_dbus_ready(io_dbus_ready);
+  mif.io_dbus_write(io_dbus_write);
+  mif.io_dbus_addr(io_dbus_addr);
+  mif.io_dbus_adrx(io_dbus_adrx);
+  mif.io_dbus_size(io_dbus_size);
+  mif.io_dbus_wdata(io_dbus_wdata);
+  mif.io_dbus_wmask(io_dbus_wmask);
+  mif.io_dbus_rdata(io_dbus_rdata);
+
+  // Debug logger bindings.
+  dbg.clock(tb.clock);
+  dbg.reset(tb.reset);
+  dbg.io_slog_valid(io_slog_valid);
+  dbg.io_slog_addr(io_slog_addr);
+  dbg.io_slog_data(io_slog_data);
+
+// Binds one AXI signal to both the core and the memory model.
+#define BINDAXI(a) core.a(a); mif.a(a)
+  BINDAXI(io_axi0_write_addr_ready);
+  BINDAXI(io_axi0_write_addr_valid);
+  BINDAXI(io_axi0_write_addr_bits_addr);
+  BINDAXI(io_axi0_write_addr_bits_id);
+  BINDAXI(io_axi0_write_data_ready);
+  BINDAXI(io_axi0_write_data_valid);
+  BINDAXI(io_axi0_write_data_bits_data);
+  BINDAXI(io_axi0_write_data_bits_strb);
+  BINDAXI(io_axi0_write_resp_ready);
+  BINDAXI(io_axi0_write_resp_valid);
+  BINDAXI(io_axi0_write_resp_bits_id);
+  BINDAXI(io_axi0_write_resp_bits_resp);
+  BINDAXI(io_axi0_read_addr_ready);
+  BINDAXI(io_axi0_read_addr_valid);
+  BINDAXI(io_axi0_read_addr_bits_addr);
+  BINDAXI(io_axi0_read_addr_bits_id);
+  BINDAXI(io_axi0_read_data_ready);
+  BINDAXI(io_axi0_read_data_valid);
+  BINDAXI(io_axi0_read_data_bits_resp);
+  BINDAXI(io_axi0_read_data_bits_id);
+  BINDAXI(io_axi0_read_data_bits_data);
+  BINDAXI(io_axi1_write_addr_ready);
+  BINDAXI(io_axi1_write_addr_valid);
+  BINDAXI(io_axi1_write_addr_bits_addr);
+  BINDAXI(io_axi1_write_addr_bits_id);
+  BINDAXI(io_axi1_write_data_ready);
+  BINDAXI(io_axi1_write_data_valid);
+  BINDAXI(io_axi1_write_data_bits_data);
+  BINDAXI(io_axi1_write_data_bits_strb);
+  BINDAXI(io_axi1_write_resp_ready);
+  BINDAXI(io_axi1_write_resp_valid);
+  BINDAXI(io_axi1_write_resp_bits_id);
+  BINDAXI(io_axi1_write_resp_bits_resp);
+  BINDAXI(io_axi1_read_addr_ready);
+  BINDAXI(io_axi1_read_addr_valid);
+  BINDAXI(io_axi1_read_addr_bits_addr);
+  BINDAXI(io_axi1_read_addr_bits_id);
+  BINDAXI(io_axi1_read_data_ready);
+  BINDAXI(io_axi1_read_data_valid);
+  BINDAXI(io_axi1_read_data_bits_resp);
+  BINDAXI(io_axi1_read_data_bits_id);
+  BINDAXI(io_axi1_read_data_bits_data);
+
+  if (trace) {
+    tb.trace(core);
+  }
+
+  tb.start();
+}
+
+// Entry point: requires the path of the program image as the only argument.
+int sc_main(int argc, char *argv[]) {
+  // Guard clause: a binary argument is mandatory.
+  if (argc <= 1) {
+    printf("Expected binary file argument\n");
+    return -1;
+  }
+
+  Core_run(Sysc_tb::get_name(argv[0]), argv[1], /*trace=*/false);
+  return 0;
+}
diff --git a/tests/verilator_sim/kelvin/dbus2axi_tb.cc b/tests/verilator_sim/kelvin/dbus2axi_tb.cc
new file mode 100644
index 0000000..92855e7
--- /dev/null
+++ b/tests/verilator_sim/kelvin/dbus2axi_tb.cc
@@ -0,0 +1,343 @@
+#include "VDBus2Axi.h"
+#include "tests/verilator_sim/sysc_tb.h"
+
+// Randomized testbench for the DBus-to-AXI bridge.  Drives random dbus
+// transactions, mirrors the expected AXI traffic through reference FIFOs,
+// and checks every DUT beat against the reference.
+struct DBus2Axi_tb : Sysc_tb {
+  sc_out<bool> io_dbus_valid;
+  sc_in<bool> io_dbus_ready;
+  sc_out<bool> io_dbus_write;
+  sc_out<bool> io_axi_write_addr_ready;
+  sc_in<bool> io_axi_write_addr_valid;
+  sc_out<bool> io_axi_write_data_ready;
+  sc_in<bool> io_axi_write_data_valid;
+  sc_in<bool> io_axi_write_resp_ready;
+  sc_out<bool> io_axi_write_resp_valid;
+  sc_out<bool> io_axi_read_addr_ready;
+  sc_in<bool> io_axi_read_addr_valid;
+  sc_in<bool> io_axi_read_data_ready;
+  sc_out<bool> io_axi_read_data_valid;
+  sc_out<sc_bv<32> > io_dbus_addr;
+  sc_out<sc_bv<32> > io_dbus_adrx;
+  sc_out<sc_bv<6> > io_dbus_size;
+  sc_out<sc_bv<256> > io_dbus_wdata;
+  sc_out<sc_bv<32> > io_dbus_wmask;
+  sc_in<sc_bv<256> > io_dbus_rdata;
+  sc_in<sc_bv<32> > io_axi_write_addr_bits_addr;
+  sc_in<sc_bv<6> > io_axi_write_addr_bits_id;
+  sc_in<sc_bv<256> > io_axi_write_data_bits_data;
+  sc_in<sc_bv<32> > io_axi_write_data_bits_strb;
+  sc_out<sc_bv<6> > io_axi_write_resp_bits_id;
+  sc_out<sc_bv<2> > io_axi_write_resp_bits_resp;
+  sc_in<sc_bv<32> > io_axi_read_addr_bits_addr;
+  sc_in<sc_bv<6> > io_axi_read_addr_bits_id;
+  sc_out<sc_bv<2> > io_axi_read_data_bits_resp;
+  sc_out<sc_bv<6> > io_axi_read_data_bits_id;
+  sc_out<sc_bv<256> > io_axi_read_data_bits_data;
+
+  using Sysc_tb::Sysc_tb;
+
+  void posedge() {
+    // Fresh random payload for the next dbus command.
+    sc_bv<32> dbus_wmask;
+    sc_bv<256> dbus_wdata;
+    for (int i = 0; i < 8; ++i) dbus_wdata.set_word(i, rand_uint32());
+    dbus_wmask.set_word(0, rand_uint32());
+
+    // Hold the current command while an un-accepted read is outstanding.
+    if (!(io_dbus_valid && !io_dbus_ready && !io_dbus_write)) {
+      io_dbus_valid = rand_bool();
+      io_dbus_write = rand_bool();
+      io_dbus_addr = rand_uint32();
+      io_dbus_adrx = rand_uint32();
+      io_dbus_size = rand_uint32();
+      io_dbus_wdata = dbus_wdata;
+      io_dbus_wmask = dbus_wmask;
+    }
+
+    io_axi_read_addr_ready = rand_bool();
+
+    // Address and data channels accept together.
+    const bool write_ready = rand_bool();
+    io_axi_write_addr_ready = write_ready;
+    io_axi_write_data_ready = write_ready;
+
+    // *************************************************************************
+    // DBus Addr: queue the expected AXI transaction for each dbus command.
+    if (io_dbus_valid && !io_dbus_write && !dbus_read_ready_) {
+      dbus_read_ready_ = true;
+      axi_read_addr_t r;
+      r.addr = io_dbus_addr.read().get_word(0) & ~31;  // line aligned
+      r.id = 0x00;  // from RTL
+      axi_read_addr_.write(r);
+    }
+
+    if (io_dbus_valid && io_dbus_ready && io_dbus_write) {
+      axi_write_addr_t w;
+      w.addr = io_dbus_addr.read().get_word(0) & ~31;  // line aligned
+      w.id = 0x00;  // from RTL
+      w.strb = io_dbus_wmask;
+      w.data = io_dbus_wdata;
+      axi_write_addr_.write(w);
+    }
+
+    // *************************************************************************
+    // DBus Read Data: compare the data returned for last cycle's read.
+    if (dbus_read_active_) {
+      dbus_read_active_ = false;
+      dbus_read_data_t ref, dut;
+      check(dbus_read_data_.read(ref), "dbus read data");
+      dut.data = io_dbus_rdata;
+      if (ref != dut) {
+        ref.print("ref::dbus_read_addr");
+        dut.print("dut::dbus_read_addr");
+        check(false);
+      }
+    }
+
+    if (io_dbus_valid && io_dbus_ready && !io_dbus_write) {
+      dbus_read_ready_ = false;
+      dbus_read_active_ = true;
+    }
+
+    // *************************************************************************
+    // AXI Read Addr: check the DUT request and enqueue a random response.
+    if (io_axi_read_addr_valid && io_axi_read_addr_ready) {
+      axi_read_addr_t dut, ref;
+      check(axi_read_addr_.read(ref), "axi read addr");
+      dut.addr = io_axi_read_addr_bits_addr.read().get_word(0);
+      dut.id = io_axi_read_addr_bits_id.read().get_word(0);
+      if (ref != dut) {
+        ref.print("ref::axi_read_addr");
+        dut.print("dut::axi_read_addr");
+        check(false);
+      }
+
+      sc_bv<256> data;
+      for (int i = 0; i < 8; ++i) data.set_word(i, rand_uint32());
+      axi_read_data_t raxi;
+      raxi.id = dut.id;
+      raxi.data = data;
+      raxi.resp = rand_int();  // truncated to the 2-bit resp signal below
+      axi_read_data_.write(raxi);
+
+      dbus_read_data_t dbus;
+      dbus.data = data;
+      dbus_read_data_.write(dbus);
+    }
+
+    // *************************************************************************
+    // AXI Read Data: present queued responses with random valid timing.
+    if (io_axi_read_data_valid && io_axi_read_data_ready) {
+      check(axi_read_data_.remove(), "axi read data");
+    }
+
+    axi_read_data_t rdata;
+    bool read_data_valid = axi_read_data_.next(rdata);
+    io_axi_read_data_valid = read_data_valid && rand_bool();
+    io_axi_read_data_bits_id = rdata.id;
+    io_axi_read_data_bits_data = rdata.data;
+    io_axi_read_data_bits_resp = rdata.resp;
+
+    // *************************************************************************
+    // AXI Write Addr: check the DUT write beat and queue a response.
+    if (io_axi_write_addr_valid && io_axi_write_addr_ready) {
+      assert(io_axi_write_data_valid && io_axi_write_data_ready);
+      axi_write_addr_t dut, ref;
+      check(axi_write_addr_.read(ref), "axi write addr");
+      dut.addr = io_axi_write_addr_bits_addr.read().get_word(0);
+      dut.id = io_axi_write_addr_bits_id.read().get_word(0);
+      dut.data = io_axi_write_data_bits_data;
+      dut.strb = io_axi_write_data_bits_strb;
+      if (ref != dut) {
+        ref.print("ref::axi_write_addr");
+        dut.print("dut::axi_write_addr");
+        check(false);
+      }
+
+      axi_write_resp_t resp;
+      resp.id = dut.id;
+      resp.resp = rand_int();
+      axi_write_resp_.write(resp);
+    }
+
+    // *************************************************************************
+    // AXI Write Resp.
+    if (io_axi_write_resp_valid && io_axi_write_resp_ready) {
+      check(axi_write_resp_.remove(), "axi write resp");
+    }
+
+    axi_write_resp_t wresp;
+    bool write_resp_valid = axi_write_resp_.next(wresp);
+    io_axi_write_resp_valid = write_resp_valid;
+    io_axi_write_resp_bits_id = wresp.id;
+    io_axi_write_resp_bits_resp = wresp.resp;
+  }
+
+ private:
+  // Expected AXI read-address beat.
+  struct axi_read_addr_t {
+    uint32_t addr;
+    uint32_t id : 7;
+
+    bool operator!=(const axi_read_addr_t& rhs) const {
+      if (addr != rhs.addr) return true;
+      if (id != rhs.id) return true;
+      return false;
+    }
+
+    void print(const char* name) {
+      printf("[%s]: id=%x addr=%08x\n", name, id, addr);
+    }
+  };
+
+  // Queued AXI read-data beat (resp intentionally ignored by compare).
+  struct axi_read_data_t {
+    uint32_t id : 7;
+    uint32_t resp : 7;
+    sc_bv<256> data;
+
+    bool operator!=(const axi_read_data_t& rhs) const {
+      if (id != rhs.id) return true;
+      if (data != rhs.data) return true;
+      return false;
+    }
+
+    void print(const char* name) {
+      printf("[%s]: id=%x data=", name, id);
+      for (int i = 0; i < 256 / 32; ++i) {
+        printf("%08x ", data.get_word(i));
+      }
+      printf("\n");
+    }
+  };
+
+  // Expected AXI write beat (address + data + strobes).
+  struct axi_write_addr_t {
+    uint32_t addr;
+    uint32_t id : 7;
+    sc_bv<256> data;
+    sc_bv<32> strb;
+
+    bool operator!=(const axi_write_addr_t& rhs) const {
+      if (addr != rhs.addr) return true;
+      if (id != rhs.id) return true;
+      if (strb != rhs.strb) return true;
+      if (data != rhs.data) return true;
+      return false;
+    }
+
+    void print(const char* name) {
+      printf("[%s]: id=%x addr=%08x strb=%08x data=", name, id, addr, strb.get_word(0));
+      for (int i = 0; i < 256 / 32; ++i) {
+        // Was get_word(0): printed the first word eight times.
+        printf("%08x ", data.get_word(i));
+      }
+      printf("\n");
+    }
+  };
+
+  // Queued AXI write response.
+  struct axi_write_resp_t {
+    uint32_t id : 7;
+    uint32_t resp : 2;
+  };
+
+  // Expected dbus read-data beat.
+  struct dbus_read_data_t {
+    sc_bv<256> data;
+
+    bool operator!=(const dbus_read_data_t& rhs) const {
+      if (data != rhs.data) return true;
+      return false;
+    }
+
+    void print(const char* name) {
+      printf("[%s]: data=", name);
+      for (int i = 0; i < 256 / 32; ++i) {
+        printf("%08x ", data.get_word(i));
+      }
+      printf("\n");
+    }
+  };
+
+  bool dbus_read_ready_ = false;   // read command issued, not yet accepted
+  bool dbus_read_active_ = false;  // read accepted; data checked next cycle
+  fifo_t<axi_read_addr_t> axi_read_addr_;
+  fifo_t<axi_read_data_t> axi_read_data_;
+  fifo_t<axi_write_addr_t> axi_write_addr_;
+  fifo_t<axi_write_resp_t> axi_write_resp_;
+  fifo_t<dbus_read_data_t> dbus_read_data_;
+};
+
+// Instantiates the bridge DUT and the testbench, binds every port one-to-one,
+// and runs |loops| randomized cycles.
+static void DBus2Axi_test(char* name, int loops, bool trace) {
+  sc_signal<bool> io_dbus_valid;
+  sc_signal<bool> io_dbus_ready;
+  sc_signal<bool> io_dbus_write;
+  sc_signal<bool> io_axi_write_addr_ready;
+  sc_signal<bool> io_axi_write_addr_valid;
+  sc_signal<bool> io_axi_write_data_ready;
+  sc_signal<bool> io_axi_write_data_valid;
+  sc_signal<bool> io_axi_write_resp_ready;
+  sc_signal<bool> io_axi_write_resp_valid;
+  sc_signal<bool> io_axi_read_addr_ready;
+  sc_signal<bool> io_axi_read_addr_valid;
+  sc_signal<bool> io_axi_read_data_ready;
+  sc_signal<bool> io_axi_read_data_valid;
+  sc_signal<sc_bv<32> > io_dbus_addr;
+  sc_signal<sc_bv<32> > io_dbus_adrx;
+  sc_signal<sc_bv<6> > io_dbus_size;
+  sc_signal<sc_bv<256> > io_dbus_wdata;
+  sc_signal<sc_bv<32> > io_dbus_wmask;
+  sc_signal<sc_bv<256> > io_dbus_rdata;
+  sc_signal<sc_bv<32> > io_axi_write_addr_bits_addr;
+  sc_signal<sc_bv<6> > io_axi_write_addr_bits_id;
+  sc_signal<sc_bv<256> > io_axi_write_data_bits_data;
+  sc_signal<sc_bv<32> > io_axi_write_data_bits_strb;
+  sc_signal<sc_bv<6> > io_axi_write_resp_bits_id;
+  sc_signal<sc_bv<2> > io_axi_write_resp_bits_resp;
+  sc_signal<sc_bv<32> > io_axi_read_addr_bits_addr;
+  sc_signal<sc_bv<6> > io_axi_read_addr_bits_id;
+  sc_signal<sc_bv<2> > io_axi_read_data_bits_resp;
+  sc_signal<sc_bv<6> > io_axi_read_data_bits_id;
+  sc_signal<sc_bv<256> > io_axi_read_data_bits_data;
+
+  DBus2Axi_tb tb("DBus2Axi_tb", loops, true /*random*/);
+  VDBus2Axi d2a(name);
+
+  // BIND2 connects one signal to the same-named port on both modules.
+  d2a.clock(tb.clock);
+  d2a.reset(tb.reset);
+  BIND2(tb, d2a, io_dbus_valid);
+  BIND2(tb, d2a, io_dbus_ready);
+  BIND2(tb, d2a, io_dbus_write);
+  BIND2(tb, d2a, io_axi_write_addr_ready);
+  BIND2(tb, d2a, io_axi_write_addr_valid);
+  BIND2(tb, d2a, io_axi_write_data_ready);
+  BIND2(tb, d2a, io_axi_write_data_valid);
+  BIND2(tb, d2a, io_axi_write_resp_ready);
+  BIND2(tb, d2a, io_axi_write_resp_valid);
+  BIND2(tb, d2a, io_axi_read_addr_ready);
+  BIND2(tb, d2a, io_axi_read_addr_valid);
+  BIND2(tb, d2a, io_axi_read_data_ready);
+  BIND2(tb, d2a, io_axi_read_data_valid);
+  BIND2(tb, d2a, io_dbus_addr);
+  BIND2(tb, d2a, io_dbus_adrx);
+  BIND2(tb, d2a, io_dbus_size);
+  BIND2(tb, d2a, io_dbus_wdata);
+  BIND2(tb, d2a, io_dbus_wmask);
+  BIND2(tb, d2a, io_dbus_rdata);
+  BIND2(tb, d2a, io_axi_write_addr_bits_addr);
+  BIND2(tb, d2a, io_axi_write_addr_bits_id);
+  BIND2(tb, d2a, io_axi_write_data_bits_data);
+  BIND2(tb, d2a, io_axi_write_data_bits_strb);
+  BIND2(tb, d2a, io_axi_write_resp_bits_id);
+  BIND2(tb, d2a, io_axi_write_resp_bits_resp);
+  BIND2(tb, d2a, io_axi_read_addr_bits_addr);
+  BIND2(tb, d2a, io_axi_read_addr_bits_id);
+  BIND2(tb, d2a, io_axi_read_data_bits_resp);
+  BIND2(tb, d2a, io_axi_read_data_bits_id);
+  BIND2(tb, d2a, io_axi_read_data_bits_data);
+
+  if (trace) {
+    tb.trace(d2a);
+  }
+
+  tb.start();
+}
+
+// Entry point: one million randomized cycles, tracing disabled.
+int sc_main(int argc, char* argv[]) {
+  (void) argc;  // unused
+  DBus2Axi_test(Sysc_tb::get_name(argv[0]), 1000000, /*trace=*/false);
+  return 0;
+}
diff --git a/tests/verilator_sim/kelvin/debug_if.h b/tests/verilator_sim/kelvin/debug_if.h
new file mode 100644
index 0000000..e35a27a
--- /dev/null
+++ b/tests/verilator_sim/kelvin/debug_if.h
@@ -0,0 +1,155 @@
+#ifndef TESTS_VERILATOR_SIM_KELVIN_DEBUG_IF_H_
+#define TESTS_VERILATOR_SIM_KELVIN_DEBUG_IF_H_
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include <string>
+
+#include "tests/verilator_sim/sysc_module.h"
+#include "tests/verilator_sim/kelvin/memory_if.h"
+
+// A core debug model.
+//
+// Decodes "slog" beats emitted by the core into host-side printf logging,
+// fetching format and string data from the memory model.  Also reports the
+// simulation speed (cycles/second) when destroyed.
+struct Debug_if : Sysc_module {
+  sc_in<bool> io_slog_valid;
+  sc_in<sc_bv<5> > io_slog_addr;
+  sc_in<sc_bv<32> > io_slog_data;
+
+  // |mm| supplies format strings and string arguments; not owned.
+  Debug_if(sc_module_name n, Memory_if* mm) : Sysc_module(n), mm_(mm) {
+    gettimeofday(&start_, NULL);
+  }
+
+  ~Debug_if() {
+    gettimeofday(&stop_, NULL);
+    const float time_s =
+        static_cast<float>(stop_.tv_sec - start_.tv_sec) +
+        static_cast<float>(stop_.tv_usec - start_.tv_usec) / 1000000.0f;
+
+    // Integer with commas.
+    auto s = std::to_string(cycle_);
+    int n = s.length() - 3;
+    while (n > 0) {
+      s.insert(n, ",");
+      n -= 3;
+    }
+
+    printf("Info: %s cycles @%.2fK/s\n", s.c_str(), cycle_ / time_s / 1000.0f);
+  }
+
+  // Counts cycles and forwards each valid slog beat to the decoder.
+  void eval() {
+    if (reset) {
+      cycle_ = 0;
+    } else if (clock->posedge()) {
+      cycle_++;
+      if (io_slog_valid) {
+        Slog(io_slog_addr.read().get_word(0), io_slog_data.read().get_word(0));
+      }
+    }
+  }
+
+ private:
+#ifndef TIME_DISABLE
+  // ANSI color escapes for the cycle-stamped log output.
+  const char* KNRM = "\x1B[0m";
+  const char* KRED = "\x1B[31m";
+  const char* KGRN = "\x1B[32m";
+  const char* KYEL = "\x1B[33m";
+  const char* KBLU = "\x1B[34m";
+  const char* KMAG = "\x1B[35m";
+  const char* KCYN = "\x1B[36m";
+  const char* KWHT = "\x1B[37m";
+  const char* KRST = "\033[0m";
+#endif  // TIME_DISABLE
+
+  static const int ARGMAX = 16;       // max printf arguments per FLOG
+  static const int BUFFERLIMIT = 100; // max bytes per string argument
+  // Was uninitialized: a SLOG/CLOG/KLOG arriving before the first FLOG
+  // would index arg_/str_ with garbage.
+  int argpos_ = 0;
+  uint64_t arg_[ARGMAX];
+  uint8_t str_[ARGMAX][BUFFERLIMIT];
+  uint8_t pos_[ARGMAX] = {0};
+
+  struct timeval stop_, start_;
+
+  Memory_if* mm_;
+
+  bool newline_ = false;  // false when the next char starts a log line
+  int cycle_ = 0;
+
+  // Decodes one slog command.  SLOG/CLOG/KLOG accumulate arguments; FLOG
+  // supplies the format-string address and flushes the accumulated line.
+  void Slog(const uint8_t cmd, const uint32_t data) {
+    constexpr int FLOG = 0;  // format and print
+    constexpr int SLOG = 1;  // 32-bit scalar argument
+    constexpr int CLOG = 2;  // string argument, 4 chars per beat
+    constexpr int KLOG = 3;  // string argument fetched from memory
+
+    if (cmd == FLOG) {
+      char buf[BUFFERLIMIT];
+      char sbuf[ARGMAX * BUFFERLIMIT];
+
+      mm_->Read(data, BUFFERLIMIT, (uint8_t*) buf);
+      buf[sizeof(buf) - 1] = '\0';
+
+      // NOTE(review): |buf| comes from simulated memory and is used as a
+      // printf format string.  Acceptable for a trusted testbench; never
+      // do this with untrusted input.
+      sprintf(sbuf, buf, arg_[0], arg_[1], arg_[2], arg_[3],
+              arg_[4], arg_[5], arg_[6], arg_[7],
+              arg_[8], arg_[9], arg_[10], arg_[11],
+              arg_[12], arg_[13], arg_[14], arg_[15]);  // ARGMAX
+
+      int len = strlen(sbuf);
+#ifndef TIME_DISABLE
+      printf("%s", KGRN);
+#endif  // TIME_DISABLE
+      for (int i = 0; i < len; ++i) {
+        if (!newline_) {
+          newline_ = true;
+          // Prefix each log line with the current cycle count.
+#ifndef TIME_DISABLE
+          printf("%s[%7d] %s", KCYN, cycle_, KGRN);
+#endif  // TIME_DISABLE
+        }
+        const char ch = sbuf[i];
+        putc(ch, stdout);
+        if (ch == '\n') {
+          newline_ = false;
+          fflush(stdout);
+        }
+      }
+#ifndef TIME_DISABLE
+      printf("%s", KRST);
+#endif  // TIME_DISABLE
+
+      memset(pos_, 0, sizeof(pos_));
+      argpos_ = 0;
+      return;
+    }
+
+    assert(argpos_ < ARGMAX);
+
+    if (cmd == SLOG) {
+      arg_[argpos_] = data;
+      argpos_++;
+    } else if (cmd == CLOG) {
+      // Append up to four characters to the current string argument; a NUL
+      // terminates the argument and advances to the next slot.
+      arg_[argpos_] = (uint64_t) str_[argpos_];
+      const uint8_t *ptr = (const uint8_t*) &data;
+      uint8_t *buf = str_[argpos_];
+      for (int i = 0; i < 4; ++i) {
+        const int p = pos_[argpos_]++;
+        const char c = ptr[i];
+        assert(p + 1 < BUFFERLIMIT);
+        buf[p] = c;
+        buf[p + 1] = '\0';
+        if (!c) {
+          argpos_++;
+          break;
+        }
+      }
+    } else if (cmd == KLOG) {
+      // Fetch a NUL-terminated string argument directly from memory.
+      arg_[argpos_] = (uint64_t) str_[argpos_];
+      mm_->Read(data, BUFFERLIMIT, str_[argpos_]);
+      argpos_++;
+    } else {
+      printf("\n**error: RV32L SLOG unknown cmd=%d\n", cmd);
+      exit(-1);
+    }
+  }
+};
+
+#endif  // TESTS_VERILATOR_SIM_KELVIN_DEBUG_IF_H_
diff --git a/tests/verilator_sim/kelvin/kelvin_cfg.h b/tests/verilator_sim/kelvin/kelvin_cfg.h
new file mode 100644
index 0000000..ef2a2b5
--- /dev/null
+++ b/tests/verilator_sim/kelvin/kelvin_cfg.h
@@ -0,0 +1,66 @@
+#ifndef TESTS_VERILATOR_SIM_KELVIN_KELVIN_CFG_H_
+#define TESTS_VERILATOR_SIM_KELVIN_KELVIN_CFG_H_
+
+#ifndef KELVIN_SIMD
+#error KELVIN_SIMD must be defined in Environment or Makefile.
+#elif KELVIN_SIMD == 128
+constexpr int kVector = 128;
+#elif KELVIN_SIMD == 256
+constexpr int kVector = 256;
+#elif KELVIN_SIMD == 512
+constexpr int kVector = 512;
+#else
+#error KELVIN_SIMD unsupported configuration.
+#endif
+
+// Compile-time log2 for the power-of-two sizes used by the cache
+// configuration. Returns -1 for any value that is not a power of two in
+// [1, 256] (same domain as the original lookup table).
+constexpr int ctz(int a) {
+  for (int shift = 0; shift <= 8; ++shift) {
+    if (a == (1 << shift)) {
+      return shift;
+    }
+  }
+  return -1;
+}
+
+// ISS defines.
+constexpr uint32_t VLENB = kVector / 8;   // vector length in bytes
+constexpr uint32_t VLENH = kVector / 16;  // vector length in halfwords
+constexpr uint32_t VLENW = kVector / 32;  // vector length in words
+constexpr uint32_t SM = 4;  // NOTE(review): meaning not evident here -- confirm against ISS
+
+// Width of the dbus size field: enough bits to count up to a full line
+// (VLENB bytes) inclusive.
+constexpr int kDbusBits = ctz(kVector / 8) + 1;
+// NOTE(review): +2 presumably scales the byte count to bits -- confirm.
+constexpr int kVlenBits = ctz(kVector / 8) + 1 + 2;
+
+// [External] System AXI.
+constexpr int kAxiBits = 256;          // data width in bits
+constexpr int kAxiStrb = kAxiBits / 8; // write strobe: one bit per data byte
+constexpr int kAxiId = 7;              // id field width
+
+// [Internal] L1I AXI.
+constexpr int kL1IAxiBits = 256;
+constexpr int kL1IAxiStrb = kL1IAxiBits / 8;
+constexpr int kL1IAxiId = 4;
+
+// [Internal] L1D AXI.
+constexpr int kL1DAxiBits = 256;
+constexpr int kL1DAxiStrb = kL1DAxiBits / 8;
+constexpr int kL1DAxiId = 4;
+
+// [Internal] Uncached AXI[Vector,Scalar].
+constexpr int kUncBits = kVector;      // uncached port matches vector width
+constexpr int kUncStrb = kVector / 8;
+constexpr int kUncId = 6;
+
+// Transaction is uncached (and bus width aligned).
+static uint8_t is_uncached(const uint32_t addr) {
+  // Uncached space is the upper half of the address map: any address with
+  // bit31 set, i.e. at or above 0x80000000.
+  return addr >= 0x80000000u ? 1 : 0;
+}
+
+constexpr int kAlignedLsb = ctz(kVector / 8);  // address bits below this index are the within-line byte offset
+
+#endif // TESTS_VERILATOR_SIM_KELVIN_KELVIN_CFG_H_
diff --git a/tests/verilator_sim/kelvin/l1dcache_tb.cc b/tests/verilator_sim/kelvin/l1dcache_tb.cc
new file mode 100644
index 0000000..299aedf
--- /dev/null
+++ b/tests/verilator_sim/kelvin/l1dcache_tb.cc
@@ -0,0 +1,510 @@
+#include "tests/verilator_sim/sysc_tb.h"
+
+#ifndef L1DCACHEBANK
+#include "VL1DCache.h"
+constexpr int kDBusBankAdj = 0;
+#else
+constexpr int kDBusBankAdj = 1;
+#endif
+
+#include "tests/verilator_sim/kelvin/kelvin_cfg.h"
+
+// Cache line geometry: one line holds a full vector register (kVector bits).
+constexpr int kLineSize = kVector / 8;       // bytes per line
+constexpr int kLineBase = ~(kLineSize - 1);  // mask selecting the line base address
+constexpr int kLineOffset = kLineSize - 1;   // mask selecting the offset within a line
+
+// Randomized testbench for the L1 data cache (whole cache, or one bank
+// when L1DCACHEBANK is defined). Each posedge it:
+//  - checks dbus read data returned by the DUT against a mirror memory,
+//  - drives random dbus read/write transactions and occasional flushes,
+//  - services the DUT's AXI read/write channels from a second mirror,
+//    with random backpressure and out-of-order read responses.
+struct L1DCache_tb : Sysc_tb
+{
+  // Flush interface.
+  sc_out<bool> io_flush_valid;
+  sc_in<bool> io_flush_ready;
+  sc_out<bool> io_flush_all;
+  sc_out<bool> io_flush_clean;
+
+  // Core data bus.
+  sc_out<bool> io_dbus_valid;
+  sc_in<bool> io_dbus_ready;
+  sc_out<bool> io_dbus_write;
+  sc_out<sc_bv<kDbusBits> > io_dbus_size;
+  sc_out<sc_bv<32 - kDBusBankAdj> > io_dbus_addr;
+  sc_out<sc_bv<32 - kDBusBankAdj> > io_dbus_adrx;
+  sc_in<sc_bv<kVector> > io_dbus_rdata;
+  sc_out<sc_bv<kVector> > io_dbus_wdata;
+  sc_out<sc_bv<kVector / 8> > io_dbus_wmask;
+
+  // AXI read address/data channels (the DUT is the manager).
+  sc_in<bool> io_axi_read_addr_valid;
+  sc_out<bool> io_axi_read_addr_ready;
+  sc_in<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_read_addr_bits_id;
+  sc_in<sc_bv<32 - kDBusBankAdj> > io_axi_read_addr_bits_addr;
+
+  sc_out<bool> io_axi_read_data_valid;
+  sc_in<bool> io_axi_read_data_ready;
+  sc_out<sc_bv<2> > io_axi_read_data_bits_resp;
+  sc_out<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_read_data_bits_id;
+  sc_out<sc_bv<kL1DAxiBits> > io_axi_read_data_bits_data;
+
+  // AXI write address/data/response channels.
+  sc_in<bool> io_axi_write_addr_valid;
+  sc_out<bool> io_axi_write_addr_ready;
+  sc_in<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_write_addr_bits_id;
+  sc_in<sc_bv<32 - kDBusBankAdj> > io_axi_write_addr_bits_addr;
+
+  sc_in<bool> io_axi_write_data_valid;
+  sc_out<bool> io_axi_write_data_ready;
+  sc_in<sc_bv<kL1DAxiStrb> > io_axi_write_data_bits_strb;
+  sc_in<sc_bv<kL1DAxiBits> > io_axi_write_data_bits_data;
+
+  sc_out<bool> io_axi_write_resp_valid;
+  sc_in<bool> io_axi_write_resp_ready;
+  sc_out<sc_bv<2> > io_axi_write_resp_bits_resp;
+  sc_out<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_write_resp_bits_id;
+
+  using Sysc_tb::Sysc_tb;
+
+  void posedge() {
+    // dbus: check the read data for the transaction accepted last cycle.
+#ifdef L1DCACHEBANK
+    // Bank mode returns a whole (bank) line; compare every byte.
+    if (dbus_resp_pipeline_) {
+      dbus_resp_pipeline_ = false;
+      uint32_t addr = dbus_resp_addr_;
+      int size = dbus_resp_size_;
+      for (int i = 0; i < vlenb_ && size; ++i) {
+        uint8_t ref = dbus_resp_data_[i];
+        // Byte i lives in word i/4, byte lane i%4. (Was `>> (8 * i)`:
+        // shifting a 32-bit word by >=32 is undefined behavior.)
+        uint8_t dut = io_dbus_rdata.read().get_word(i / 4) >> (8 * (i % 4));
+        if (ref != dut) {
+          printf("DDD(%d) %08x : %02x %02x\n", i, (addr & ~(vlenb_ - 1)) + i, ref, dut);
+        }
+        check(ref == dut, "dbus read data");
+      }
+    }
+#else
+    if (dbus_resp_pipeline_) {
+      dbus_resp_pipeline_ = false;
+      uint32_t addr = dbus_resp_addr_;
+      int size = dbus_resp_size_;
+      for (int j = addr; j < addr + size; ++j) {
+        int i = j & (vlenb_ - 1);
+        uint8_t ref = dbus_resp_data_[i];
+        // Same shift-width fix as the bank branch above.
+        uint8_t dut = io_dbus_rdata.read().get_word(i / 4) >> (8 * (i % 4));
+        check(ref == dut, "dbus read data");
+      }
+    }
+#endif
+
+    // Accepted read: snapshot the reference data and remember the address
+    // in a small history so later transactions can revisit it.
+    if (io_dbus_valid && io_dbus_ready && !io_dbus_write) {
+      dbus_active_ = false;
+      dbus_resp_pipeline_ = true;
+      dbus_resp_addr_ = io_dbus_addr.read().get_word(0);
+      dbus_resp_size_ = io_dbus_size.read().get_word(0);
+#ifdef L1DCACHEBANK
+      ReadBus(dbus_resp_addr_ & kLineBase, vlenb_, dbus_resp_data_);
+#else
+      ReadBus(dbus_resp_addr_, vlenb_, dbus_resp_data_);
+#endif
+      history_t cmd({dbus_resp_addr_});
+      history_.write(cmd);
+      if (history_.count() > 16) {
+        history_.remove();
+      }
+    }
+
+    // Accepted write: mirror masked bytes into the reference bus memory.
+    if (io_dbus_valid && io_dbus_ready && io_dbus_write) {
+      dbus_active_ = false;
+
+      uint32_t addr = io_dbus_addr.read().get_word(0);
+      int size = io_dbus_size.read().get_word(0);
+      uint8_t wdata[kVector / 8];  // == vlenb_, as a constant expression
+      uint32_t* p_wdata = (uint32_t*) wdata;
+      for (int i = 0; i < vlenw_; ++i) {
+        p_wdata[i] = io_dbus_wdata.read().get_word(i);
+      }
+      const uint32_t linemask = vlenb_ - 1;
+      const uint32_t linebase = addr & ~linemask;
+      for (int i = 0; i < size; ++i, ++addr) {
+        const uint32_t lineoffset = addr & linemask;
+        if (io_dbus_wmask.read().get_bit(lineoffset)) {
+#ifdef L1DCACHEBANK
+          WriteBus(linebase + lineoffset, wdata[lineoffset]);
+#else
+          WriteBus(addr, wdata[lineoffset]);
+#endif
+        }
+      }
+    }
+
+    // Flush handshake completed: drop the request.
+    if (io_flush_valid && io_flush_ready) {
+      flush_valid_ = false;
+      flush_all_ = false;
+      flush_clean_ = false;
+    }
+
+    if (++flush_count_ > 5000 && !dbus_active_ && !flush_valid_) {
+      // Flush controls must not change during handshake.
+      flush_count_ = 0;
+      flush_valid_ = true;
+      flush_all_ = rand_bool();
+      flush_clean_ = rand_bool();
+    }
+
+    io_flush_valid = flush_valid_;
+    io_flush_all = flush_all_;
+    io_flush_clean = flush_clean_;
+
+    // Drive a new random dbus transaction when no transaction is pending.
+    history_t dbus;
+    if (!io_dbus_valid || !dbus_active_) {  // latch transaction
+      bool valid = rand_bool() && !flush_valid_;
+      bool write = rand_int(0, 3) == 0;
+      bool newaddr = rand_int(0, 3) == 0 || !history_.rand(dbus);
+      uint32_t addr = newaddr ? rand_uint32() : (dbus.addr + rand_int(-vlenb_, vlenb_));
+      addr = std::min(0xffffff00u, addr);  // TODO: avoids a raxi() crash.
+      if (kDBusBankAdj) {
+        addr &= 0x7fffffff;
+      }
+      if (rand_int(0, 7) == 0) {
+        addr &= 0x3fff;  // concentrate traffic to exercise hits/evictions
+      }
+#ifdef L1DCACHEBANK
+      int size = rand_int(1, vlenb_);
+#else
+      int size = rand_int(0, vlenb_);
+#endif
+      io_dbus_valid = valid;
+      io_dbus_write = write;
+      io_dbus_addr = addr;
+      io_dbus_adrx = addr + vlenb_;
+      io_dbus_size = size;
+      if (valid) {
+        dbus_active_ = true;
+        CheckAddr(addr, size);  // seed reference lines before the DUT reads
+      }
+
+      sc_bv<kVector> wdata = 0;
+      sc_bv<kVector / 8> wmask = 0;
+
+      if (write) {
+        for (int i = 0; i < vlenw_; ++i) {
+          wdata.set_word(i, rand_uint32());
+        }
+        const uint32_t linemask = vlenb_ - 1;
+        const uint32_t lineoffset = addr & linemask;
+        const bool all = rand_bool();
+        for (int i = 0; i < size; ++i) {
+          if (all || rand_bool()) {
+            wmask.set_bit((i + lineoffset) & linemask, sc_dt::Log_1);
+          }
+        }
+      }
+
+      io_dbus_wdata.write(wdata);
+      io_dbus_wmask.write(wmask);
+    }
+
+    timeout_ = io_dbus_ready ? 0 : timeout_ + io_dbus_valid;
+    check(timeout_ < 10000, "dbus timeout");
+
+    // axi_read_addr: accept requests with random backpressure.
+    io_axi_read_addr_ready = rand_bool();
+
+    if (io_axi_read_addr_valid && io_axi_read_addr_ready) {
+      uint32_t id = io_axi_read_addr_bits_id.read().get_word(0);
+      uint32_t addr = io_axi_read_addr_bits_addr.read().get_word(0);
+      response_t resp({id, addr});
+      resp_.write(resp);
+    }
+
+    // axi_read_data: the reads below still observe last cycle's values
+    // (sc_signal updates after the write), so the handshake check is valid
+    // even though the defaults were just assigned.
+    io_axi_read_data_valid = false;
+    io_axi_read_data_bits_id = 0;
+    io_axi_read_data_bits_data = 0;
+
+    if (io_axi_read_data_valid && io_axi_read_data_ready) {
+      check(resp_.remove(), "no response to erase");
+    }
+
+    response_t resp;
+    resp_.shuffle();  // return read data out of order
+    if (resp_.next(resp)) {
+      io_axi_read_data_valid = rand_bool();
+      io_axi_read_data_bits_id = resp.id;
+      uint32_t addr = resp.addr;
+      sc_bv<kL1DAxiBits> out;
+      for (int i = 0; i < axiw_; ++i) {
+        uint32_t data;
+        ReadAxi(addr, 4, (uint8_t*) &data);
+        out.set_word(i, data);
+        addr += 4;
+      }
+      io_axi_read_data_bits_data = out;
+    }
+
+    // axi_write_addr / axi_write_data: share one ready so address and data
+    // beats are consumed in lockstep.
+    bool writedataready = rand_bool();
+
+    io_axi_write_addr_ready = writedataready;
+
+    if (io_axi_write_addr_valid && io_axi_write_addr_ready) {
+      axiwaddr_t p;
+      p.id = io_axi_write_addr_bits_id.read().get_word(0);
+      p.addr = io_axi_write_addr_bits_addr.read().get_word(0);
+      waddr_.write(p);
+    }
+
+    io_axi_write_data_ready = writedataready;
+
+    if (io_axi_write_data_valid && io_axi_write_data_ready) {
+      axiwdata_t p;
+      uint32_t* ptr = (uint32_t*) p.data;
+      for (int i = 0; i < axiw_; ++i, ++ptr) {
+        ptr[0] = io_axi_write_data_bits_data.read().get_word(i);
+      }
+      for (int i = 0; i < axib_; ++i) {
+        p.mask[i] = io_axi_write_data_bits_strb.read().get_bit(i);
+      }
+      wdata_.write(p);
+    }
+
+    // axi_write_resp
+    if (io_axi_write_resp_valid && io_axi_write_resp_ready) {
+      wresp_.remove();
+    }
+
+    // Zero-init: wr.id is driven below even when next() is not called
+    // (the && short-circuits), which previously read an uninitialized id.
+    axiwaddr_t wr = {0, 0};
+    io_axi_write_resp_valid = rand_int(0, 4) == 0 && wresp_.next(wr);
+    io_axi_write_resp_bits_id = wr.id;
+
+    // Commit a write once both its address and data beats have arrived,
+    // then queue the response.
+    axiwaddr_t wa;
+    axiwdata_t wd;
+    if (waddr_.next(wa) && wdata_.next(wd)) {
+      waddr_.remove();
+      wdata_.remove();
+      wresp_.write(wa);
+
+      uint32_t addr = wa.addr;
+      for (int i = 0; i < axib_; ++i, ++addr) {
+        if (wd.mask[i]) {
+          WriteAxi(addr, wd.data[i]);
+        }
+      }
+    }
+  }
+
+private:
+  struct history_t {
+    uint32_t addr;
+  };
+
+  struct response_t {
+    uint32_t id;
+    uint32_t addr;
+  };
+
+  struct axiwaddr_t {
+    uint32_t id;
+    uint32_t addr;
+  };
+
+  struct axiwdata_t {
+    uint8_t data[kL1DAxiBits / 8];
+    bool mask[kL1DAxiBits / 8];
+  };
+
+  const int vlenb_ = kVector / 8;      // dbus width in bytes
+  const int vlenw_ = kVector / 32;     // dbus width in words
+  const int axib_ = kL1DAxiBits / 8;   // axi width in bytes
+  const int axiw_ = kL1DAxiBits / 32;  // axi width in words
+
+  int timeout_ = 0;
+  int flush_count_ = 0;
+  bool flush_valid_ = false;
+  bool flush_all_ = false;
+  bool flush_clean_ = false;
+
+  bool dbus_active_ = false;
+  bool dbus_resp_pipeline_ = false;
+  uint32_t dbus_resp_addr_;
+  uint32_t dbus_resp_size_;
+  uint8_t dbus_resp_data_[kVector / 8];
+  fifo_t<response_t> resp_;
+  fifo_t<history_t> history_;
+  fifo_t<axiwaddr_t> waddr_;
+  fifo_t<axiwdata_t> wdata_;
+  fifo_t<axiwaddr_t> wresp_;
+
+private:
+  // Reference memories: mem_bus_ mirrors what the core should observe on
+  // dbus, mem_axi_ mirrors the backing memory behind the AXI port. Both
+  // are seeded identically when a line is first touched.
+  std::map<uint32_t, uint8_t[kLineSize]> mem_bus_;
+  std::map<uint32_t, uint8_t[kLineSize]> mem_axi_;
+
+  // Seeds the line containing `addr` with random data in both mirrors.
+  void _CheckAddr(uint32_t addr, uint8_t size) {
+    const uint32_t paddr = addr & kLineBase;
+    if (mem_bus_.find(paddr) == mem_bus_.end()) {
+      uint8_t data[kLineSize];
+      uint32_t* p_data = (uint32_t*) data;
+      for (int i = 0; i < kLineSize / 4; ++i) {
+        p_data[i] = rand();
+        // p_data[i] = paddr + 4 * i;  // debug
+      }
+      memcpy(mem_bus_[paddr], data, kLineSize);
+      memcpy(mem_axi_[paddr], data, kLineSize);
+    }
+  }
+
+  // Seeds both lines a transaction at `addr` may touch.
+  void CheckAddr(uint32_t addr, uint8_t size) {
+    _CheckAddr(addr, size);
+    // if ((addr & kLineBase) == ((addr + size) & kLineBase)) return;
+    _CheckAddr(addr + kLineSize, size);
+  }
+
+  // Reads `size` bytes at `addr` from mirror `m` into the outsz-byte
+  // buffer `data`, placing each byte at its in-buffer offset.
+  template<int outsz>
+  void _Read(uint32_t addr, uint8_t size, uint8_t* data,
+             std::map<uint32_t, uint8_t[kLineSize]>& m) {
+    const uint32_t laddr = addr & kLineBase;
+    const uint32_t loffset = addr & kLineOffset;
+    const uint32_t doffset = addr & (outsz - 1);
+    uint32_t start = addr;
+    uint32_t end = std::min(addr + size, laddr + kLineSize);
+    int size0 = end - start;      // bytes within the first line
+    int size1 = size - size0;     // bytes spilling into the next line
+
+    memset(data, 0xCC, outsz);  // poison bytes that are not read
+#ifdef L1DCACHEBANK
+    assert(doffset == 0);
+    memcpy(data + doffset, m[laddr] + loffset, outsz);
+#else
+    memcpy(data + doffset, m[laddr] + loffset, size0);
+    if (!size1) return;
+    memcpy(data, m[laddr + kLineSize], size1);  // wrap into the next line
+#endif
+  }
+
+  void _Write(uint32_t addr, uint8_t data,
+              std::map<uint32_t, uint8_t[kLineSize]>& m) {
+    const uint32_t laddr = addr & kLineBase;
+    const uint32_t loffset = addr & kLineOffset;
+
+    m[laddr][loffset] = data;
+  }
+
+  void ReadBus(uint32_t addr, uint8_t size, uint8_t* data) {
+    _Read<kVector / 8>(addr, size, data, mem_bus_);
+  }
+
+  void ReadAxi(uint32_t addr, uint8_t size, uint8_t* data) {
+    _Read<4>(addr, size, data, mem_axi_);
+  }
+
+  void WriteBus(uint32_t addr, uint8_t data) {
+    _Write(addr, data, mem_bus_);
+  }
+
+  void WriteAxi(uint32_t addr, uint8_t data) {
+    _Write(addr, data, mem_axi_);
+  }
+};
+
+// Instantiates the DUT (VL1DCache, or VL1DCacheBank when L1DCACHEBANK is
+// defined) and the testbench, binds every port through a shared signal,
+// and runs `loops` cycles. `name` names the DUT instance; `trace` enables
+// waveform tracing.
+static void L1DCache_test(char* name, int loops, bool trace) {
+ // NOTE(review): these two locals appear unused; the DUT binds tb.clock
+ // and tb.reset below -- confirm and remove if so.
+ sc_signal<bool> clock;
+ sc_signal<bool> reset;
+
+ sc_signal<bool> io_flush_valid;
+ sc_signal<bool> io_flush_ready;
+ sc_signal<bool> io_flush_all;
+ sc_signal<bool> io_flush_clean;
+
+ sc_signal<bool> io_dbus_valid;
+ sc_signal<bool> io_dbus_ready;
+ sc_signal<bool> io_dbus_write;
+ sc_signal<sc_bv<kDbusBits> > io_dbus_size;
+ sc_signal<sc_bv<32 - kDBusBankAdj> > io_dbus_addr;
+ sc_signal<sc_bv<32 - kDBusBankAdj> > io_dbus_adrx;
+ sc_signal<sc_bv<kVector> > io_dbus_rdata;
+ sc_signal<sc_bv<kVector> > io_dbus_wdata;
+ sc_signal<sc_bv<kVector / 8> > io_dbus_wmask;
+
+ sc_signal<bool> io_axi_read_addr_valid;
+ sc_signal<bool> io_axi_read_addr_ready;
+ sc_signal<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_read_addr_bits_id;
+ sc_signal<sc_bv<32 - kDBusBankAdj> > io_axi_read_addr_bits_addr;
+
+ sc_signal<bool> io_axi_read_data_valid;
+ sc_signal<bool> io_axi_read_data_ready;
+ sc_signal<sc_bv<2> > io_axi_read_data_bits_resp;
+ sc_signal<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_read_data_bits_id;
+ sc_signal<sc_bv<kL1DAxiBits> > io_axi_read_data_bits_data;
+
+ sc_signal<bool> io_axi_write_addr_valid;
+ sc_signal<bool> io_axi_write_addr_ready;
+ sc_signal<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_write_addr_bits_id;
+ sc_signal<sc_bv<32 - kDBusBankAdj> > io_axi_write_addr_bits_addr;
+
+ sc_signal<bool> io_axi_write_data_valid;
+ sc_signal<bool> io_axi_write_data_ready;
+ sc_signal<sc_bv<kL1DAxiStrb> > io_axi_write_data_bits_strb;
+ sc_signal<sc_bv<kL1DAxiBits> > io_axi_write_data_bits_data;
+
+ sc_signal<bool> io_axi_write_resp_valid;
+ sc_signal<bool> io_axi_write_resp_ready;
+ sc_signal<sc_bv<2> > io_axi_write_resp_bits_resp;
+ sc_signal<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_write_resp_bits_id;
+
+ L1DCache_tb tb("L1DCache_tb", loops, true /*random*/);
+#ifdef L1DCACHEBANK
+ VL1DCacheBank l1dcache(name);
+#else
+ VL1DCache l1dcache(name);
+#endif
+
+ if (trace) {
+ tb.trace(l1dcache);
+ }
+
+ l1dcache.clock(tb.clock);
+ l1dcache.reset(tb.reset);
+
+ // BIND2 connects the identically-named testbench port and DUT pin
+ // through the local sc_signal declared above.
+ BIND2(tb, l1dcache, io_flush_valid);
+ BIND2(tb, l1dcache, io_flush_ready);
+ BIND2(tb, l1dcache, io_flush_all);
+ BIND2(tb, l1dcache, io_flush_clean);
+
+ BIND2(tb, l1dcache, io_dbus_valid);
+ BIND2(tb, l1dcache, io_dbus_ready);
+ BIND2(tb, l1dcache, io_dbus_write);
+ BIND2(tb, l1dcache, io_dbus_size);
+ BIND2(tb, l1dcache, io_dbus_addr);
+ BIND2(tb, l1dcache, io_dbus_adrx);
+ BIND2(tb, l1dcache, io_dbus_rdata);
+ BIND2(tb, l1dcache, io_dbus_wdata);
+ BIND2(tb, l1dcache, io_dbus_wmask);
+
+ BIND2(tb, l1dcache, io_axi_read_addr_valid);
+ BIND2(tb, l1dcache, io_axi_read_addr_ready);
+ BIND2(tb, l1dcache, io_axi_read_addr_bits_id);
+ BIND2(tb, l1dcache, io_axi_read_addr_bits_addr);
+
+ BIND2(tb, l1dcache, io_axi_read_data_valid);
+ BIND2(tb, l1dcache, io_axi_read_data_ready);
+ BIND2(tb, l1dcache, io_axi_read_data_bits_resp);
+ BIND2(tb, l1dcache, io_axi_read_data_bits_id);
+ BIND2(tb, l1dcache, io_axi_read_data_bits_data);
+
+ BIND2(tb, l1dcache, io_axi_write_addr_valid);
+ BIND2(tb, l1dcache, io_axi_write_addr_ready);
+ BIND2(tb, l1dcache, io_axi_write_addr_bits_id);
+ BIND2(tb, l1dcache, io_axi_write_addr_bits_addr);
+
+ BIND2(tb, l1dcache, io_axi_write_data_valid);
+ BIND2(tb, l1dcache, io_axi_write_data_ready);
+ BIND2(tb, l1dcache, io_axi_write_data_bits_strb);
+ BIND2(tb, l1dcache, io_axi_write_data_bits_data);
+
+ BIND2(tb, l1dcache, io_axi_write_resp_valid);
+ BIND2(tb, l1dcache, io_axi_write_resp_ready);
+ BIND2(tb, l1dcache, io_axi_write_resp_bits_resp);
+ BIND2(tb, l1dcache, io_axi_write_resp_bits_id);
+
+ tb.start();
+}
+
+// Entry point: run the randomized L1DCache testbench.
+int sc_main(int argc, char *argv[]) {
+  const int kLoops = 1000000;
+  const bool kTrace = false;
+  L1DCache_test(Sysc_tb::get_name(argv[0]), kLoops, kTrace);
+  return 0;
+}
diff --git a/tests/verilator_sim/kelvin/l1dcachebank_tb.cc b/tests/verilator_sim/kelvin/l1dcachebank_tb.cc
new file mode 100644
index 0000000..4328333
--- /dev/null
+++ b/tests/verilator_sim/kelvin/l1dcachebank_tb.cc
@@ -0,0 +1,5 @@
+#include "VL1DCacheBank.h"
+
+#define L1DCACHEBANK
+
+#include "l1dcache_tb.cc"
diff --git a/tests/verilator_sim/kelvin/l1icache_tb.cc b/tests/verilator_sim/kelvin/l1icache_tb.cc
new file mode 100644
index 0000000..ced4d78
--- /dev/null
+++ b/tests/verilator_sim/kelvin/l1icache_tb.cc
@@ -0,0 +1,171 @@
+#include "tests/verilator_sim/sysc_tb.h"
+
+#include "VL1ICache.h"
+
+#include "tests/verilator_sim/kelvin/kelvin_cfg.h"
+
+// Randomized testbench for the L1 instruction cache: drives random ibus
+// fetches and occasional flushes, services the cache's AXI read channel
+// with a synthetic data pattern, and checks the returned fetch data.
+struct L1ICache_tb : Sysc_tb
+{
+ sc_out<bool> io_flush_valid;
+ sc_in<bool> io_flush_ready;
+ sc_out<bool> io_ibus_valid;
+ sc_in<bool> io_ibus_ready;
+ sc_out<sc_bv<32> > io_ibus_addr;
+ sc_in<sc_bv<kL1IAxiBits> > io_ibus_rdata;
+ sc_in<bool> io_axi_read_addr_valid;
+ sc_out<bool> io_axi_read_addr_ready;
+ sc_in<sc_bv<kL1IAxiId> > io_axi_read_addr_bits_id;
+ sc_in<sc_bv<32> > io_axi_read_addr_bits_addr;
+ sc_out<bool> io_axi_read_data_valid;
+ sc_in<bool> io_axi_read_data_ready;
+ sc_out<sc_bv<2> > io_axi_read_data_bits_resp;
+ sc_out<sc_bv<kL1IAxiId> > io_axi_read_data_bits_id;
+ sc_out<sc_bv<kL1IAxiBits> > io_axi_read_data_bits_data;
+
+ using Sysc_tb::Sysc_tb;
+
+ void posedge() {
+ // flush: request a flush roughly once every 256 cycles.
+ io_flush_valid = rand_int(0, 255) == 0;
+
+ // ibus: check the fetch accepted last cycle. The AXI model below
+ // returns word k of a line as (request address + 4*k), so the expected
+ // pattern is recomputed from the line-aligned address alone.
+ if (ibus_resp_pipeline_) {
+ ibus_resp_pipeline_ = false;
+ for (int i = 0; i < ibusw_; ++i) {
+ uint32_t ref = ibus_resp_data_ + i * 4;
+ uint32_t dut = io_ibus_rdata.read().get_word(i);
+ check(ref == dut, "ibus read data");
+ }
+ }
+
+ if (io_ibus_valid && io_ibus_ready) {
+ ibus_resp_pipeline_ = true;
+ ibus_resp_data_ = io_ibus_addr.read().get_word(0) & ~(ibusb_ - 1);
+
+ // Remember the address so later fetches can revisit it (cache hits).
+ command_t cmd({io_ibus_addr.read().get_word(0)});
+ history_.write(cmd);
+ if (history_.count() > 16) {
+ history_.remove();
+ }
+ }
+
+ if (!io_ibus_valid || io_ibus_ready) { // latch transaction
+ command_t cmd;
+ bool newaddr = rand_int(0, 3) == 0 || !history_.rand(cmd);
+ uint32_t addr = newaddr ? rand_uint32() : cmd.addr;
+ if (rand_int(0, 7) == 0) {
+ addr &= 0x3fff; // concentrate traffic to exercise hits/evictions
+ }
+ io_ibus_valid = rand_bool();
+ io_ibus_addr = addr;
+ }
+
+ timeout_ = io_ibus_ready ? 0 : timeout_ + io_ibus_valid;
+ check(timeout_ < 100, "ibus timeout");
+
+ // axi_read_addr: accept requests with random backpressure.
+ io_axi_read_addr_ready = rand_bool();
+
+ if (io_axi_read_addr_valid && io_axi_read_addr_ready) {
+ uint32_t id = io_axi_read_addr_bits_id.read().get_word(0);
+ uint32_t addr = io_axi_read_addr_bits_addr.read().get_word(0);
+ response_t resp({id, addr});
+ resp_.write(resp);
+ }
+
+ // axi_read_data: the reads below still observe last cycle's values
+ // (sc_signal updates after the write), so the handshake check is valid
+ // even though the defaults were just assigned.
+ io_axi_read_data_valid = false;
+ io_axi_read_data_bits_id = 0;
+ io_axi_read_data_bits_data = 0;
+
+ if (io_axi_read_data_valid && io_axi_read_data_ready) {
+ check(resp_.remove(), "no response to erase");
+ resp_.shuffle(); // return read data out of order
+ }
+
+ response_t resp;
+ if (resp_.next(resp)) {
+ io_axi_read_data_valid = rand_bool();
+ io_axi_read_data_bits_id = resp.id;
+ // Synthesize the line: word k is (request address + 4*k).
+ uint32_t data = resp.data;
+ sc_bv<kL1IAxiBits> out;
+ for (int i = 0; i < axiw_; ++i) {
+ out.set_word(i, data);
+ data += 4;
+ }
+ io_axi_read_data_bits_data = out;
+ }
+ }
+
+private:
+ struct command_t {
+ uint32_t addr;
+ };
+
+ struct response_t {
+ uint32_t id;
+ uint32_t data; // holds the request address; read data derives from it
+ };
+
+ const int ibusb_ = kL1IAxiBits / 8; // ibus width in bytes
+ const int ibusw_ = kL1IAxiBits / 32; // ibus width in words
+ const int axib_ = kL1IAxiBits / 8; // axi width in bytes
+ const int axiw_ = kL1IAxiBits / 32; // axi width in words
+
+ int timeout_ = 0;
+
+ bool ibus_resp_pipeline_ = false;
+ uint32_t ibus_resp_data_;
+ fifo_t<command_t> history_;
+ fifo_t<response_t> resp_;
+};
+
+// Instantiates the VL1ICache DUT and the testbench, binds every port
+// through a shared signal, and runs `loops` cycles. `name` names the DUT
+// instance; `trace` enables waveform tracing.
+static void L1ICache_test(char* name, int loops, bool trace) {
+ sc_signal<bool> io_flush_valid;
+ sc_signal<bool> io_flush_ready;
+ sc_signal<bool> io_ibus_valid;
+ sc_signal<bool> io_ibus_ready;
+ sc_signal<sc_bv<32> > io_ibus_addr;
+ sc_signal<sc_bv<kL1IAxiBits> > io_ibus_rdata;
+ sc_signal<bool> io_axi_read_addr_valid;
+ sc_signal<bool> io_axi_read_addr_ready;
+ sc_signal<sc_bv<kL1IAxiId> > io_axi_read_addr_bits_id;
+ sc_signal<sc_bv<32> > io_axi_read_addr_bits_addr;
+ sc_signal<bool> io_axi_read_data_valid;
+ sc_signal<bool> io_axi_read_data_ready;
+ sc_signal<sc_bv<2> > io_axi_read_data_bits_resp;
+ sc_signal<sc_bv<kL1IAxiId> > io_axi_read_data_bits_id;
+ sc_signal<sc_bv<kL1IAxiBits> > io_axi_read_data_bits_data;
+
+ L1ICache_tb tb("L1ICache_tb", loops, true /*random*/);
+ VL1ICache l1icache(name);
+
+ if (trace) {
+ tb.trace(l1icache);
+ }
+
+ // BIND2 connects the identically-named testbench port and DUT pin
+ // through the local sc_signal declared above.
+ l1icache.clock(tb.clock);
+ l1icache.reset(tb.reset);
+ BIND2(tb, l1icache, io_flush_valid);
+ BIND2(tb, l1icache, io_flush_ready);
+ BIND2(tb, l1icache, io_ibus_valid);
+ BIND2(tb, l1icache, io_ibus_ready);
+ BIND2(tb, l1icache, io_ibus_addr);
+ BIND2(tb, l1icache, io_ibus_rdata);
+ BIND2(tb, l1icache, io_axi_read_addr_valid);
+ BIND2(tb, l1icache, io_axi_read_addr_ready);
+ BIND2(tb, l1icache, io_axi_read_addr_bits_id);
+ BIND2(tb, l1icache, io_axi_read_addr_bits_addr);
+ BIND2(tb, l1icache, io_axi_read_data_ready);
+ BIND2(tb, l1icache, io_axi_read_data_valid);
+ BIND2(tb, l1icache, io_axi_read_data_bits_data);
+ BIND2(tb, l1icache, io_axi_read_data_bits_id);
+ BIND2(tb, l1icache, io_axi_read_data_bits_resp);
+
+ tb.start();
+}
+
+// Entry point: run the randomized L1ICache testbench.
+int sc_main(int argc, char *argv[]) {
+  const int kLoops = 1000000;
+  const bool kTrace = false;
+  L1ICache_test(Sysc_tb::get_name(argv[0]), kLoops, kTrace);
+  return 0;
+}
diff --git a/tests/verilator_sim/kelvin/memory_if.h b/tests/verilator_sim/kelvin/memory_if.h
new file mode 100644
index 0000000..ebe7fc1
--- /dev/null
+++ b/tests/verilator_sim/kelvin/memory_if.h
@@ -0,0 +1,177 @@
+#ifndef TESTS_VERILATOR_SIM_KELVIN_MEMORY_IF_H_
+#define TESTS_VERILATOR_SIM_KELVIN_MEMORY_IF_H_
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "tests/verilator_sim/sysc_module.h"
+
+// A memory model base class
+// A memory model base class.
+//
+// Backs a sparse, 4 KiB-page memory with the contents of a binary file
+// (loaded at address 0) and provides byte-addressed Read/Write helpers
+// plus bus-alignment swizzles for subclasses.
+struct Memory_if : Sysc_module {
+  const int kPageSize = 4 * 1024;
+  const int kPageMask = ~(kPageSize - 1);
+
+  struct memory_page_t {
+    uint32_t addr;  // NOTE(review): never assigned in this file -- confirm use
+    uint8_t data[4096];
+  };
+
+  // Loads `bin` into memory starting at address 0. When `limit` is
+  // positive, the file must not exceed `limit` bytes.
+  Memory_if(sc_module_name n, const char* bin, int limit = -1) :
+      Sysc_module(n) {
+    FILE *f = fopen(bin, "rb");
+    if (f == nullptr) {  // was unchecked: a missing file crashed in fseek
+      printf("***ERROR Memory_if could not open \"%s\"\n", bin);
+      exit(-1);
+    }
+
+    fseek(f, 0, SEEK_END);
+    int64_t fsize = ftell(f);
+    fseek(f, 0, SEEK_SET);
+    uint8_t *fdata = new uint8_t[fsize];
+
+    // Check the read actually succeeded (the result was ignored before).
+    if (fsize > 0 && fread(fdata, fsize, 1, f) != 1) {
+      printf("***ERROR Memory_if short read of \"%s\"\n", bin);
+      exit(-1);
+    }
+    fclose(f);
+
+    if (limit > 0 && fsize > limit) {
+      // fsize is int64_t: print with %lld (was %d, a format mismatch).
+      printf("***ERROR Memory_if limit exceeded [%lld > %d]\n",
+             (long long) fsize, limit);
+      exit(-1);
+    }
+
+    for (int addr = 0; addr < fsize; addr += kPageSize) {
+      const int64_t size = std::min(fsize - addr, int64_t(kPageSize));
+      AddPage(addr, size, fdata + addr);
+    }
+
+    delete [] fdata;
+  }
+
+  // Copies `bytes` bytes from memory at `addr` into `data`. Unmapped pages
+  // are created (zero-filled) on demand; spans page boundaries.
+  void Read(uint32_t addr, int bytes, uint8_t* data) {
+    while (bytes > 0) {
+      const uint32_t maddr = addr & kPageMask;
+      const uint32_t offset = addr - maddr;
+      const int limit = kPageSize - offset;
+      const int len = std::min(bytes, limit);  // clamp to end of this page
+
+      if (!HasPage(maddr)) {
+#ifdef PRINT_ADD_PAGE
+        printf("MemoryModel::Read add_page %08x\n", addr);
+#endif
+        AddPage(maddr, kPageSize);
+      }
+
+      auto& p = page_[maddr];
+      uint8_t* d = p.data;
+      memcpy(data, d + offset, len);
+#if 0
+      printf("READ %08x", addr);
+      for (int i = 0; i < len; i++) {
+        printf(" %02x", data[i]);
+      }
+      printf("\n");
+#endif
+      addr += len;
+      data += len;
+      bytes -= len;
+      assert (bytes >= 0);
+    }
+  }
+
+  // Copies `bytes` bytes from `data` into memory at `addr`. Unmapped pages
+  // are created (zero-filled) on demand; spans page boundaries.
+  void Write(uint32_t addr, int bytes, const uint8_t* data) {
+    while (bytes > 0) {
+      const uint32_t maddr = addr & kPageMask;
+      const uint32_t offset = addr - maddr;
+      const int limit = kPageSize - offset;
+      const int len = std::min(bytes, limit);  // clamp to end of this page
+
+      if (!HasPage(maddr)) {
+#ifdef PRINT_ADD_PAGE
+        printf("MemoryModel::Write add_page %08x\n", addr);
+#endif
+        AddPage(maddr, kPageSize);
+      }
+
+      auto& p = page_[maddr];
+      uint8_t* d = p.data;
+      memcpy(d + offset, data, len);
+#if 0
+      printf("WRITE %08x", addr);
+      for (int i = 0; i < len; i++) {
+        printf(" %02x", data[i]);
+      }
+      printf("\n");
+#endif
+      addr += len;
+      data += len;
+      bytes -= len;
+      assert (bytes >= 0);
+    }
+  }
+
+protected:
+  // Rotates `data` left by the alignment of `addr` so each byte lands at
+  // its in-bus position. `bytes` must be a power of two no larger than 64.
+  void ReadSwizzle(const uint32_t addr, const int bytes, uint8_t* data) {
+    const int mask = bytes - 1;
+    const int alignment = (bytes - (addr & mask)) & mask;  // left shuffle
+    uint8_t tmp[512 / 8];
+    assert(bytes <= (int) sizeof(tmp));  // guard the fixed scratch buffer
+
+    if (!alignment) return;
+
+    for (int i = 0; i < bytes; ++i) {
+      tmp[i] = data[i];
+    }
+
+    for (int i = 0; i < bytes; ++i) {
+      data[i] = tmp[(i + alignment) & mask];
+    }
+  }
+
+  // Inverse of ReadSwizzle: rotates `data` right by the alignment of
+  // `addr`. Same `bytes` constraints.
+  void WriteSwizzle(const uint32_t addr, const int bytes, uint8_t* data) {
+    const int mask = bytes - 1;
+    const int alignment = addr & mask;  // right shuffle
+    uint8_t tmp[512 / 8];
+    assert(bytes <= (int) sizeof(tmp));  // guard the fixed scratch buffer
+
+    if (!alignment) return;
+
+    for (int i = 0; i < bytes; ++i) {
+      tmp[i] = data[i];
+    }
+
+    for (int i = 0; i < bytes; ++i) {
+      data[i] = tmp[(i + alignment) & mask];
+    }
+  }
+
+private:
+  // Sparse backing store, keyed by page-aligned address.
+  std::map<uint32_t, memory_page_t> page_;
+
+  bool HasPage(const uint32_t addr) {
+    return page_.find(addr) != page_.end();
+  }
+
+  // Maps the page at `addr` (must be page aligned and not yet mapped) and
+  // initializes it from `data` when provided, zero otherwise.
+  void AddPage(const uint32_t addr, const int bytes,
+               const uint8_t* data = nullptr) {
+    const uint32_t addrbase = addr & kPageMask;
+    if (addr != addrbase) {
+      printf("AddPage(%08x, %d)\n", addr, bytes);
+      assert(false && "AddPage: address not page aligned");
+    }
+
+    if (HasPage(addr)) {
+      printf("AddPage(%08x, %d)\n", addr, bytes);
+      assert(false && "AddPage: address already populated");
+    }
+
+    auto& p = page_[addr];
+    uint8_t* d = p.data;
+
+    if (bytes < kPageSize || data == nullptr) {
+#if 1
+      // remove need for .bss (hacky?)
+      memset(d, 0x00, kPageSize);
+#else
+      memset(d, 0xcc, kPageSize);
+#endif
+    }
+
+    if (data) {
+      memcpy(d, data, bytes);
+    }
+  }
+};
+
+#endif // TESTS_VERILATOR_SIM_KELVIN_MEMORY_IF_H_
diff --git a/tests/verilator_sim/kelvin/valu.h b/tests/verilator_sim/kelvin/valu.h
new file mode 100644
index 0000000..4e68a53
--- /dev/null
+++ b/tests/verilator_sim/kelvin/valu.h
@@ -0,0 +1,1108 @@
+#ifndef TESTS_VERILATOR_SIM_KELVIN_VALU_H_
+#define TESTS_VERILATOR_SIM_KELVIN_VALU_H_
+
+#include "tools/iss/alu.h" // Modified
+#include "tests/verilator_sim/kelvin/kelvin_cfg.h"
+#include "tests/verilator_sim/kelvin/vencodeop.h"
+
+constexpr int kLanes = kVector / 32;
+constexpr int kReadPorts = 7;
+constexpr int kWritePorts = 4;
+
+using namespace encode;
+
+// Golden-model record of one vector-ALU transaction: the decoded op fields,
+// per-read-port and per-write-port lane data, the scalar operand, and the
+// register-file control state.  Instances are compared against the RTL's
+// outputs via operator!= and dumped with print().
+struct valu_t {
+  uint8_t op : 7;   // opcode (see vencodeop.h)
+  uint8_t f2 : 3;   // function modifier
+  uint8_t sz : 3;   // element size in bytes (1, 2, or 4)
+  // Lane data captured from each register-file read port.
+  struct {
+    uint32_t data[kLanes];
+  } in[kReadPorts];
+  // Lane data produced for each register-file write port.
+  struct {
+    uint32_t data[kLanes];
+  } out[kWritePorts];
+  // Scalar operand value.
+  struct {
+    uint32_t data;
+  } sv;
+  // Tracking the read/write/scalar controls.
+  struct {
+    bool valid;
+    uint8_t addr : 6;
+    uint8_t tag : 1;
+  } r[kReadPorts];
+  struct {
+    bool valid;
+    uint8_t addr : 6;
+  } w[kWritePorts];
+  struct {
+    bool valid;
+  } scalar;
+
+  // Mismatch comparison used by the scoreboard: returns true when the two
+  // transactions differ in write-port validity, address, or lane data.
+  // NOTE(review): only w[0] and w[1] participate even though kWritePorts
+  // is 4 — confirm ports 2/3 are intentionally excluded from checking.
+  bool operator!=(const valu_t& rhs) const {
+    if (w[0].valid != rhs.w[0].valid) return true;
+    if (w[1].valid != rhs.w[1].valid) return true;
+    if (w[0].valid && w[0].addr != rhs.w[0].addr) return true;
+    if (w[1].valid && w[1].addr != rhs.w[1].addr) return true;
+    for (int i = 0; i < kLanes; ++i) {
+      if (w[0].valid && out[0].data[i] != rhs.out[0].data[i]) return true;
+      if (w[1].valid && out[1].data[i] != rhs.out[1].data[i]) return true;
+    }
+    return false;
+  }
+
+  // Dumps op fields and write-port data (ports 0/1) to stdout; when
+  // `inputs` is set, also dumps every read port's lane data.
+  void print(const char* name, const bool inputs = false) {
+    printf("[%s] op=%d f2=%d sz=%d valid=[%d,%d] waddr=%d", name, op, f2, sz,
+           w[0].valid, w[1].valid, w[0].valid ? w[0].addr : 0);
+    if (w[1].valid) {
+      printf(" {%d}", w[1].addr);
+    }
+    printf(" wdata =");
+    for (int i = 0; i < kLanes; ++i) {
+      printf(" %08x", w[0].valid ? out[0].data[i] : 0);
+    }
+    if (w[1].valid) {
+      printf(" : {");
+      for (int i = 0; i < kLanes; ++i) {
+        printf(" %08x", out[1].data[i]);
+      }
+      printf(" }");
+    }
+    printf("\n");
+    if (inputs) {
+      printf("\n");
+      for (int i = 0; i < kReadPorts; ++i) {
+        printf("  read%d =", i);
+        for (int j = 0; j < kLanes; ++j) {
+          printf(" %08x", in[i].data[j]);
+        }
+        printf("\n");
+      }
+    }
+  }
+};
+
+// Unary-op macro family.  Each macro expands in a scope that must provide
+// the element size `sz` (1/2/4 bytes), 32-bit packed operands `a`/`b`/`c`,
+// result words `x`/`y`, and valid flags `v`/`w`.  The named `func` is
+// applied to every lane of the operand at the selected width and the lane
+// results are re-packed into a 32-bit word.
+
+// VOP1U: x = func applied lane-wise (unsigned) to operand `a`; sets v.
+#define VOP1U(func)                                                    \
+  if (sz == 1) {                                                       \
+    v = 1;                                                             \
+    x = func(uint8_t(a)) | func(uint8_t(a >> 8)) << 8 |                \
+        func(uint8_t(a >> 16)) << 16 | func(uint8_t(a >> 24)) << 24;   \
+  }                                                                    \
+  if (sz == 2) {                                                       \
+    v = 1;                                                             \
+    x = func(uint16_t(a)) | func(uint16_t(a >> 16)) << 16;             \
+  }                                                                    \
+  if (sz == 4) {                                                       \
+    v = 1;                                                             \
+    x = func(uint32_t(a));                                             \
+  }
+
+// VOP1PU: paired variant — additionally computes y from operand `c` and
+// sets the second valid flag w.
+#define VOP1PU(func)                                                   \
+  if (sz == 1) {                                                       \
+    v = 1;                                                             \
+    w = 1;                                                             \
+    x = func(uint8_t(a)) | func(uint8_t(a >> 8)) << 8 |                \
+        func(uint8_t(a >> 16)) << 16 | func(uint8_t(a >> 24)) << 24;   \
+    y = func(uint8_t(c)) | func(uint8_t(c >> 8)) << 8 |                \
+        func(uint8_t(c >> 16)) << 16 | func(uint8_t(c >> 24)) << 24;   \
+  }                                                                    \
+  if (sz == 2) {                                                       \
+    v = 1;                                                             \
+    w = 1;                                                             \
+    x = func(uint16_t(a)) | func(uint16_t(a >> 16)) << 16;             \
+    y = func(uint16_t(c)) | func(uint16_t(c >> 16)) << 16;             \
+  }                                                                    \
+  if (sz == 4) {                                                       \
+    v = 1;                                                             \
+    w = 1;                                                             \
+    x = func(uint32_t(a));                                             \
+    y = func(uint32_t(c));                                             \
+  }
+
+// VOPXU: like VOP1U but operates on operand `b` instead of `a`.
+#define VOPXU(func)                                                    \
+  if (sz == 1) {                                                       \
+    v = 1;                                                             \
+    x = func(uint8_t(b)) | func(uint8_t(b >> 8)) << 8 |                \
+        func(uint8_t(b >> 16)) << 16 | func(uint8_t(b >> 24)) << 24;   \
+  }                                                                    \
+  if (sz == 2) {                                                       \
+    v = 1;                                                             \
+    x = func(uint16_t(b)) | func(uint16_t(b >> 16)) << 16;             \
+  }                                                                    \
+  if (sz == 4) {                                                       \
+    v = 1;                                                             \
+    x = func(uint32_t(b));                                             \
+  }
+
+// Binary-op macro family: x = func(a, b) lane-wise at width `sz`, with lane
+// results cast back to the lane width and re-packed.  VOP2S treats lanes as
+// signed, VOP2U as unsigned; VOP2 selects between them via `f2_signed`,
+// which must be defined at the expansion site.  The _R variants thread an
+// extra argument `r` (e.g. a rounding flag) through to `func`.
+
+// VOP2S: signed lane-wise binary op.
+#define VOP2S(func)                                                    \
+  if (sz == 1) {                                                       \
+    v = 1;                                                             \
+    x = uint8_t(func(int8_t(a), int8_t(b))) |                          \
+        uint8_t(func(int8_t(a >> 8), int8_t(b >> 8))) << 8 |           \
+        uint8_t(func(int8_t(a >> 16), int8_t(b >> 16))) << 16 |        \
+        uint8_t(func(int8_t(a >> 24), int8_t(b >> 24))) << 24;         \
+  } else if (sz == 2) {                                                \
+    v = 1;                                                             \
+    x = uint16_t(func(int16_t(a), int16_t(b))) |                       \
+        uint16_t(func(int16_t(a >> 16), int16_t(b >> 16))) << 16;      \
+  } else if (sz == 4) {                                                \
+    v = 1;                                                             \
+    x = uint32_t(func(int32_t(a), int32_t(b)));                        \
+  }
+
+// VOP2U: unsigned lane-wise binary op.
+#define VOP2U(func)                                                    \
+  if (sz == 1) {                                                       \
+    v = 1;                                                             \
+    x = func(uint8_t(a), uint8_t(b)) |                                 \
+        func(uint8_t(a >> 8), uint8_t(b >> 8)) << 8 |                  \
+        func(uint8_t(a >> 16), uint8_t(b >> 16)) << 16 |               \
+        func(uint8_t(a >> 24), uint8_t(b >> 24)) << 24;                \
+  } else if (sz == 2) {                                                \
+    v = 1;                                                             \
+    x = func(uint16_t(a), uint16_t(b)) |                               \
+        func(uint16_t(a >> 16), uint16_t(b >> 16)) << 16;              \
+  } else if (sz == 4) {                                                \
+    v = 1;                                                             \
+    x = func(uint32_t(a), uint32_t(b));                                \
+  }
+
+// VOP2: signed/unsigned dispatch on f2_signed.
+#define VOP2(func)  \
+  if (f2_signed) {  \
+    VOP2S(func)     \
+  } else {          \
+    VOP2U(func)     \
+  }
+
+// VOP2S_R: signed binary op with extra argument `r` passed per lane.
+#define VOP2S_R(func, r)                                               \
+  if (sz == 1) {                                                       \
+    v = 1;                                                             \
+    x = uint8_t(func(int8_t(a), int8_t(b), r)) |                       \
+        uint8_t(func(int8_t(a >> 8), int8_t(b >> 8), r)) << 8 |        \
+        uint8_t(func(int8_t(a >> 16), int8_t(b >> 16), r)) << 16 |     \
+        uint8_t(func(int8_t(a >> 24), int8_t(b >> 24), r)) << 24;      \
+  } else if (sz == 2) {                                                \
+    v = 1;                                                             \
+    x = uint16_t(func(int16_t(a), int16_t(b), r)) |                    \
+        uint16_t(func(int16_t(a >> 16), int16_t(b >> 16), r)) << 16;   \
+  } else if (sz == 4) {                                                \
+    v = 1;                                                             \
+    x = uint32_t(func(int32_t(a), int32_t(b), r));                     \
+  }
+
+// VOP2U_R: unsigned binary op with extra argument `r` passed per lane.
+#define VOP2U_R(func, r)                                               \
+  if (sz == 1) {                                                       \
+    v = 1;                                                             \
+    x = uint8_t(func(uint8_t(a), uint8_t(b), r)) |                     \
+        uint8_t(func(uint8_t(a >> 8), uint8_t(b >> 8), r)) << 8 |      \
+        uint8_t(func(uint8_t(a >> 16), uint8_t(b >> 16), r)) << 16 |   \
+        uint8_t(func(uint8_t(a >> 24), uint8_t(b >> 24), r)) << 24;    \
+  } else if (sz == 2) {                                                \
+    v = 1;                                                             \
+    x = uint16_t(func(uint16_t(a), uint16_t(b), r)) |                  \
+        uint16_t(func(uint16_t(a >> 16), uint16_t(b >> 16), r)) << 16; \
+  } else if (sz == 4) {                                                \
+    v = 1;                                                             \
+    x = uint32_t(func(uint32_t(a), uint32_t(b), r));                   \
+  }
+
+// VOP2_R: signed/unsigned dispatch on f2_signed for the _R variants.
+#define VOP2_R(func, r)  \
+  if (f2_signed) {       \
+    VOP2S_R(func, r)     \
+  } else {               \
+    VOP2U_R(func, r)     \
+  }
+
+// Paired binary-op macro family: computes two results sharing operand `b` —
+// x = func(a, b) and y = func(c, b) — lane-wise at width `sz`, setting both
+// valid flags v and w.  VOP2PS is the signed form, VOP2PU the unsigned
+// form; VOP2P dispatches on `f2_signed` from the expansion site.
+
+// VOP2PS: signed paired binary op.
+#define VOP2PS(func)                                                   \
+  if (sz == 1) {                                                       \
+    v = 1;                                                             \
+    w = 1;                                                             \
+    x = uint8_t(func(int8_t(a), int8_t(b))) |                          \
+        uint8_t(func(int8_t(a >> 8), int8_t(b >> 8))) << 8 |           \
+        uint8_t(func(int8_t(a >> 16), int8_t(b >> 16))) << 16 |        \
+        uint8_t(func(int8_t(a >> 24), int8_t(b >> 24))) << 24;         \
+    y = uint8_t(func(int8_t(c), int8_t(b))) |                          \
+        uint8_t(func(int8_t(c >> 8), int8_t(b >> 8))) << 8 |           \
+        uint8_t(func(int8_t(c >> 16), int8_t(b >> 16))) << 16 |        \
+        uint8_t(func(int8_t(c >> 24), int8_t(b >> 24))) << 24;         \
+  } else if (sz == 2) {                                                \
+    v = 1;                                                             \
+    w = 1;                                                             \
+    x = uint16_t(func(int16_t(a), int16_t(b))) |                       \
+        uint16_t(func(int16_t(a >> 16), int16_t(b >> 16))) << 16;      \
+    y = uint16_t(func(int16_t(c), int16_t(b))) |                       \
+        uint16_t(func(int16_t(c >> 16), int16_t(b >> 16))) << 16;      \
+  } else if (sz == 4) {                                                \
+    v = 1;                                                             \
+    w = 1;                                                             \
+    x = uint32_t(func(int32_t(a), int32_t(b)));                        \
+    y = uint32_t(func(int32_t(c), int32_t(b)));                        \
+  }
+
+// VOP2PU: unsigned paired binary op.
+#define VOP2PU(func)                                                   \
+  if (sz == 1) {                                                       \
+    v = 1;                                                             \
+    w = 1;                                                             \
+    x = func(uint8_t(a), uint8_t(b)) |                                 \
+        func(uint8_t(a >> 8), uint8_t(b >> 8)) << 8 |                  \
+        func(uint8_t(a >> 16), uint8_t(b >> 16)) << 16 |               \
+        func(uint8_t(a >> 24), uint8_t(b >> 24)) << 24;                \
+    y = func(uint8_t(c), uint8_t(b)) |                                 \
+        func(uint8_t(c >> 8), uint8_t(b >> 8)) << 8 |                  \
+        func(uint8_t(c >> 16), uint8_t(b >> 16)) << 16 |               \
+        func(uint8_t(c >> 24), uint8_t(b >> 24)) << 24;                \
+  } else if (sz == 2) {                                                \
+    v = 1;                                                             \
+    w = 1;                                                             \
+    x = func(uint16_t(a), uint16_t(b)) |                               \
+        func(uint16_t(a >> 16), uint16_t(b >> 16)) << 16;              \
+    y = func(uint16_t(c), uint16_t(b)) |                               \
+        func(uint16_t(c >> 16), uint16_t(b >> 16)) << 16;              \
+  } else if (sz == 4) {                                                \
+    v = 1;                                                             \
+    w = 1;                                                             \
+    x = func(uint32_t(a), uint32_t(b));                                \
+    y = func(uint32_t(c), uint32_t(b));                                \
+  }
+
+// VOP2P: signed/unsigned dispatch on f2_signed for the paired variants.
+#define VOP2P(func)  \
+  if (f2_signed) {   \
+    VOP2PS(func)     \
+  } else {           \
+    VOP2PU(func)     \
+  }
+
+#define VOP2PS_R(func, r) \
+ if (sz == 1) { \
+ v = 1; \
+ w = 1; \
+ x = uint8_t(func(int8_t(a), int8_t(b), r)) | \
+ uint8_t(func(int8_t(a >> 8), int8_t(b >> 8), r)) << 8 | \
+ uint8_t(func(int8_t(a >> 16), int8_t(b >> 16), r)) << 16 |