Kelvin core, with Bazel support.

Change-Id: I11ceb466009c1b2e01929327cb946a0f2ab80116
diff --git a/.bazelrc b/.bazelrc
new file mode 100644
index 0000000..5b3d13f
--- /dev/null
+++ b/.bazelrc
@@ -0,0 +1 @@
+build --cxxopt=-std=c++17
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ac51a05
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+bazel-*
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 0000000..fa5bbf9
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,20 @@
+workspace(name="kelvin_hw")
+
+load("//rules:repos.bzl", "kelvin_repos")
+kelvin_repos()
+
+# Scala setup
+load("@io_bazel_rules_scala//:scala_config.bzl", "scala_config")
+scala_config(scala_version = "2.13.6")
+load("@io_bazel_rules_scala//scala:scala.bzl", "rules_scala_setup", "rules_scala_toolchain_deps_repositories")
+rules_scala_setup()
+rules_scala_toolchain_deps_repositories(fetch_sources = True)
+load("@io_bazel_rules_scala//scala:toolchains.bzl", "scala_register_toolchains")
+scala_register_toolchains()
+load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
+rules_proto_dependencies()
+rules_proto_toolchains()
+
+load("//rules:deps.bzl", "kelvin_deps")
+kelvin_deps()
+
diff --git a/external/0001-Update-version-of-Googletest-for-bazel-compatitibili.patch b/external/0001-Update-version-of-Googletest-for-bazel-compatitibili.patch
new file mode 100644
index 0000000..6524ee1
--- /dev/null
+++ b/external/0001-Update-version-of-Googletest-for-bazel-compatitibili.patch
@@ -0,0 +1,27 @@
+From 17ec1b6631933d745a419835b1f88c4fffa5bc40 Mon Sep 17 00:00:00 2001
+From: Derek Chow <derekjchow@google.com>
+Date: Mon, 24 Jul 2023 13:44:30 -0700
+Subject: [PATCH] Update version of Googletest for bazel compatitibility.
+
+---
+ .../com_google_googletest/com_google_googletest.bzl         | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git dependency_support/com_google_googletest/com_google_googletest.bzl dependency_support/com_google_googletest/com_google_googletest.bzl
+index e57c403..6ba524c 100644
+--- dependency_support/com_google_googletest/com_google_googletest.bzl
++++ dependency_support/com_google_googletest/com_google_googletest.bzl
+@@ -21,7 +21,7 @@ def com_google_googletest():
+     maybe(
+         http_archive,
+         name = "com_google_googletest",
+-        urls = ["https://github.com/google/googletest/archive/0eea2e9fc63461761dea5f2f517bd6af2ca024fa.zip"],  # 2020-04-30
+-        strip_prefix = "googletest-0eea2e9fc63461761dea5f2f517bd6af2ca024fa",
+-        sha256 = "9463ff914d7c3db02de6bd40a3c412a74e979e3c76eaa89920a49ff8488d6d69",
++        urls = ["https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip"],
++        strip_prefix = "googletest-1.13.0",
++        sha256 = "ffa17fbc5953900994e2deec164bb8949879ea09b411e07f215bfbb1f87f4632",
+     )
+-- 
+2.41.0.487.g6d72f3e995-goog
+
diff --git a/external/0002-SystemC-support-for-verilator.patch b/external/0002-SystemC-support-for-verilator.patch
new file mode 100644
index 0000000..7e414a7
--- /dev/null
+++ b/external/0002-SystemC-support-for-verilator.patch
@@ -0,0 +1,32 @@
+From 123df7a8075ee82f5e8988c77bc5e17c06078506 Mon Sep 17 00:00:00 2001
+From: Derek Chow <derekjchow@google.com>
+Date: Mon, 24 Jul 2023 17:09:47 -0700
+Subject: [PATCH 2/2] SystemC support for verilator.
+
+---
+ dependency_support/verilator/verilator.BUILD | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git dependency_support/verilator/verilator.BUILD dependency_support/verilator/verilator.BUILD
+index 371a1dd..3fc5aa1 100644
+--- dependency_support/verilator/verilator.BUILD
++++ dependency_support/verilator/verilator.BUILD
+@@ -200,6 +200,7 @@ cc_library(
+         "include/verilated_imp.h",
+         "include/verilated_syms.h",
+         "include/verilated_vcd_c.cpp",
++        "include/verilated_vcd_sc.cpp",
+     ],
+     hdrs = [
+         "include/verilated.h",
+@@ -215,6 +216,7 @@ cc_library(
+         # Needed for verilated_vcd_c.cpp and verilated_fst_c.cpp
+         "include/verilated_trace_imp.h",
+         "include/verilated_vcd_c.h",
++        "include/verilated_vcd_sc.h",
+         "include/verilatedos.h",
+         "include/verilated_types.h",
+         "include/verilated_funcs.h",
+-- 
+2.41.0.487.g6d72f3e995-goog
+
diff --git a/external/systemc.BUILD b/external/systemc.BUILD
new file mode 100644
index 0000000..2a48cf8
--- /dev/null
+++ b/external/systemc.BUILD
@@ -0,0 +1,22 @@
+load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
+
+filegroup(
+    name = "all_srcs",
+    srcs = glob(["**"]),
+)
+
+# TODO(derekjchow): Set isystem for systemc headers.
+cmake(
+    name = "systemc",
+    cache_entries = {
+        "CMAKE_CXX_STANDARD": "17",
+        "BUILD_SHARED_LIBS": "False",
+    },
+    generate_args = [
+        "-G Ninja",
+    ],
+    install = True,
+    lib_source = "@accellera_systemc//:all_srcs",
+    out_static_libs = ["libsystemc.a"],
+    visibility = ["//visibility:public"],
+)
diff --git a/hdl/chisel/.scalafmt.conf b/hdl/chisel/.scalafmt.conf
new file mode 100644
index 0000000..3ccfeff
--- /dev/null
+++ b/hdl/chisel/.scalafmt.conf
@@ -0,0 +1,4 @@
+version = "3.6.1"
+maxColumn = 80
+runner.dialect = scala3
+project.git = true
diff --git a/hdl/chisel/BUILD b/hdl/chisel/BUILD
new file mode 100644
index 0000000..8f5a41f
--- /dev/null
+++ b/hdl/chisel/BUILD
@@ -0,0 +1,123 @@
+load("@io_bazel_rules_scala//scala:scala.bzl", "scala_library", "scala_binary")
+load("@kelvin_hw//rules:chisel.bzl", "chisel_binary",
+                                     "chisel_library",
+                                     "chisel_cc_library")
+load("@rules_hdl//verilog:providers.bzl", "verilog_library")
+load("@rules_hdl//verilator:defs.bzl", "verilator_cc_library")
+
+chisel_library(
+    name = "common",
+    srcs = glob(["src/common/*.scala"]),
+)
+
+chisel_library(
+    name = "kelvin",
+    srcs = glob(["src/kelvin/**/*.scala"]),
+    deps = [
+        ":common",
+    ]
+)
+
+chisel_cc_library(
+    name = "core_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitCore",
+    module_name = "Core",
+)
+
+chisel_cc_library(
+    name = "dbus2axi_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitDBus2Axi",
+    module_name = "DBus2Axi",
+)
+
+chisel_cc_library(
+    name = "l1dcache_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitL1DCache",
+    module_name = "L1DCache",
+    verilog_deps = [
+        "//hdl/verilog:sram_1rw_256x288",
+    ],
+)
+
+chisel_cc_library(
+    name = "l1icache_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitL1ICache",
+    module_name = "L1ICache",
+    verilog_deps = [
+        "//hdl/verilog:sram_1rw_256x256",
+    ],
+)
+
+chisel_cc_library(
+    name = "vcmdq_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitVCmdq",
+    module_name = "VCmdq",
+)
+
+chisel_cc_library(
+    name = "vconvalu_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitVConvAlu",
+    module_name = "VConvAlu",
+)
+
+chisel_cc_library(
+    name = "vconvctrl_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitVConvCtrl",
+    module_name = "VConvCtrl",
+)
+
+chisel_cc_library(
+    name = "vdecode_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitVDecode",
+    module_name = "VDecode",
+)
+
+chisel_cc_library(
+    name = "vdecodeinstruction_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitVDecodeInstruction",
+    module_name = "VDecodeInstruction",
+)
+
+chisel_cc_library(
+    name = "vldst_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitVLdSt",
+    module_name = "VLdSt",
+)
+
+chisel_cc_library(
+    name = "vld_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitVLd",
+    module_name = "VLd",
+)
+
+chisel_cc_library(
+    name = "vregfile_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitVRegfile",
+    module_name = "VRegfile",
+)
+
+chisel_cc_library(
+    name = "vregfilesegment_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitVRegfileSegment",
+    module_name = "VRegfileSegment",
+)
+
+chisel_cc_library(
+    name = "vst_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitVSt",
+    module_name = "VSt",
+)
\ No newline at end of file
diff --git a/hdl/chisel/src/common/Fifo.scala b/hdl/chisel/src/common/Fifo.scala
new file mode 100644
index 0000000..f1efd98
--- /dev/null
+++ b/hdl/chisel/src/common/Fifo.scala
@@ -0,0 +1,102 @@
+package common
+
+import chisel3._
+import chisel3.util._
+
+object Fifo {
+  def apply[T <: Data](t: T, n: Int, passReady: Boolean = false) = {
+    Module(new Fifo(t, n, passReady))
+  }
+}
+
+class Fifo[T <: Data](t: T, n: Int, passReady: Boolean) extends Module {
+  val io = IO(new Bundle {
+    val in  = Flipped(Decoupled(t))
+    val out = Decoupled(t)
+    val count = Output(UInt(log2Ceil(n+1).W))
+  })
+
+  // An (n-1)-entry queue with a registered output stage.
+  val m = n - 1  // n = Mem(n-1) + Out
+
+  val mem = Mem(m, t)
+  val rdata = Reg(t)
+
+  val rvalid = RegInit(false.B)
+  val wready = RegInit(false.B)
+  val raddr = RegInit(0.U(log2Ceil(m).W))
+  val waddr = RegInit(0.U(log2Ceil(m).W))
+  val count = RegInit(0.U(log2Ceil(n+1).W))
+
+  // ---------------------------------------------------------------------------
+  // Memory Addresses.
+  val winc = io.in.valid && io.in.ready
+  val rinc = (!rvalid || io.out.ready) && (winc || count > 1.U)
+
+  when (winc) {
+    waddr := Mux(waddr === (m - 1).U, 0.U, waddr + 1.U)
+  }
+
+  when (rinc) {
+    raddr := Mux(raddr === (m - 1).U, 0.U, raddr + 1.U)
+  }
+
+  val forward = rinc && winc && count <= 1.U
+
+  // ---------------------------------------------------------------------------
+  // FIFO Control.
+  val ien = io.in.valid && io.in.ready
+  val oen = io.out.valid && io.out.ready
+
+  when (ien && !oen) {
+    count := count + 1.U
+  } .elsewhen (!ien && oen) {
+    count := count - 1.U
+  }
+
+  when (ien) {
+    rvalid := true.B
+  } .elsewhen (io.out.ready && count === 1.U) {
+    rvalid := false.B
+  }
+
+  wready := count < (n - 1).U ||
+            count === (n - 1).U && !(ien && !oen) ||
+            (oen && !ien)
+
+  // ---------------------------------------------------------------------------
+  // Memory.
+  when (winc && !forward) {
+    mem(waddr) := io.in.bits
+  }
+
+  when (forward) {
+    rdata := io.in.bits
+  } .elsewhen (rinc) {
+    rdata := mem(raddr)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Interface.
+  io.out.valid := rvalid
+  io.out.bits := rdata
+
+  if (passReady) {
+    io.in.ready := wready || io.out.ready                       // pass-through
+  } else {
+    io.in.ready := wready
+  }
+
+  io.count := count
+
+  assert(count <= n.U)
+  assert(!(!passReady.B && io.in.ready && count === n.U))
+}
+
+object EmitFifo extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Fifo(UInt(8.W), 11, false), args)
+}
+
+object EmitFifo_1 extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Fifo(UInt(8.W), 11, true), args)
+}
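
For reference, a minimal usage sketch of the Fifo above, assuming the depth-11 configuration used by EmitFifo. The wrapper module and 8-bit payload below are illustrative assumptions, not part of this change.

    import chisel3._
    import chisel3.util._
    import common._

    // Hypothetical wrapper showing how the queue is typically wired; only
    // Fifo itself comes from Fifo.scala above, the rest is illustrative.
    class FifoExample extends Module {
      val io = IO(new Bundle {
        val in    = Flipped(Decoupled(UInt(8.W)))
        val out   = Decoupled(UInt(8.W))
        val count = Output(UInt(4.W))   // log2Ceil(11 + 1) bits
      })

      // Depth 11, passReady = false, matching the EmitFifo configuration.
      val fifo = Fifo(UInt(8.W), 11)

      fifo.io.in <> io.in        // producer side: valid/bits in, ready out
      io.out <> fifo.io.out      // consumer side: registered output stage
      io.count := fifo.io.count
    }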
diff --git a/hdl/chisel/src/common/Fifo4.scala b/hdl/chisel/src/common/Fifo4.scala
new file mode 100644
index 0000000..a434364
--- /dev/null
+++ b/hdl/chisel/src/common/Fifo4.scala
@@ -0,0 +1,189 @@
+package common
+
+import chisel3._
+import chisel3.util._
+
+object Fifo4 {
+  def apply[T <: Data](t: T, n: Int) = {
+    Module(new Fifo4(t, n))
+  }
+}
+
+// 4-way decode, used for Fifo4-style input controls.
+object Fifo4Valid {
+  def apply(in: UInt): (UInt, UInt, UInt, UInt) = {
+    assert(in.getWidth == 4)
+
+    val in0 = Cat(in(3,0) === 8.U,  // 8
+                  in(2,0) === 4.U,  // 4, 12
+                  in(1,0) === 2.U,  // 2, 6, 10, 14
+                  in(0))            // 1, 3, 5, 7, 9, 11, 13, 15
+
+    val in1 = Cat(in(3,0) === 12.U ||
+                  in(3,0) === 10.U ||
+                  in(3,0) === 9.U,  // 9, 10, 12
+                  in(2,0) === 6.U ||
+                  in(2,0) === 5.U,  // 5, 6, 13, 14
+                  in(1,0) === 3.U,  // 3, 7, 11, 15
+                  false.B)
+
+    val in2 = Cat(in(3,0) === 14.U ||
+                  in(3,0) === 13.U ||
+                  in(3,0) === 11.U,  // 11, 13, 14
+                  in(2,0) === 15.U ||
+                  in(2,0) === 7.U,   // 7, 15
+                  false.B, false.B)
+
+    val in3 = Cat(in(3,0) === 15.U,  // 15
+                  false.B, false.B, false.B)
+
+    (in0.asUInt, in1.asUInt, in2.asUInt, in3.asUInt)
+  }
+}
+
+class Fifo4[T <: Data](t: T, n: Int) extends Module {
+  val io = IO(new Bundle {
+    val in  = Flipped(Decoupled(Vec(4, Valid(t))))
+    val out = Decoupled(t)
+    val count = Output(UInt(log2Ceil(n+1).W))
+  })
+
+  val m = n - 1  // n = Mem(n-1) + Slice
+
+  def Increment(a: UInt, b: UInt): UInt = {
+    val c = a +& b
+    val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
+    d
+  }
+
+  val mem = Mem(m, t)
+  val mslice = Slice(t, false, true)
+
+  val in0pos = RegInit(0.U(log2Ceil(m).W))
+  val in1pos = RegInit(1.U(log2Ceil(m).W))
+  val in2pos = RegInit(2.U(log2Ceil(m).W))
+  val in3pos = RegInit(3.U(log2Ceil(m).W))
+  val outpos = RegInit(0.U(log2Ceil(m).W))
+  val mcount = RegInit(0.U(log2Ceil(n+1).W))
+
+  io.count := mcount + io.out.valid
+
+  val ivalid = io.in.valid && io.in.ready
+  val ovalid = mslice.io.in.valid && mslice.io.in.ready
+
+  val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid,
+                    io.in.bits(1).valid, io.in.bits(0).valid).asUInt
+
+  val icount = io.in.bits(0).valid +& io.in.bits(1).valid +
+               io.in.bits(2).valid +& io.in.bits(3).valid
+
+  // ---------------------------------------------------------------------------
+  // Fifo Control.
+  when (ivalid) {
+    in0pos := Increment(in0pos, icount)
+    in1pos := Increment(in1pos, icount)
+    in2pos := Increment(in2pos, icount)
+    in3pos := Increment(in3pos, icount)
+  }
+
+  when (ovalid) {
+    outpos := Increment(outpos, 1.U)
+  }
+
+  val inc = MuxOR(ivalid, icount)
+  val dec = mslice.io.in.valid && mslice.io.in.ready
+
+  when (ivalid || ovalid) {
+    mcount := mcount + inc - dec
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Input.
+  val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive)
+
+  for (i <- 0 until m) {
+    val valid = Cat(in0pos === i.U && in0valid(3) ||
+                    in1pos === i.U && in1valid(3) ||
+                    in2pos === i.U && in2valid(3) ||
+                    in3pos === i.U && in3valid(3),
+                    in0pos === i.U && in0valid(2) ||
+                    in1pos === i.U && in1valid(2) ||
+                    in2pos === i.U && in2valid(2),
+                    in0pos === i.U && in0valid(1) ||
+                    in1pos === i.U && in1valid(1),
+                    in0pos === i.U && in0valid(0))
+
+    // Couldn't get the following to work properly.
+    //
+    // val data = MuxOR(valid(0), io.in.bits(0).bits.asUInt) |
+    //            MuxOR(valid(1), io.in.bits(1).bits.asUInt) |
+    //            MuxOR(valid(2), io.in.bits(2).bits.asUInt) |
+    //            MuxOR(valid(3), io.in.bits(3).bits.asUInt)
+    //
+    // when (ivalid && valid =/= 0.U) {
+    //   mem(i) := data.asTypeOf(t)
+    // }
+    when (ivalid) {
+      when (valid(0)) {
+        mem(i) := io.in.bits(0).bits
+      } .elsewhen (valid(1)) {
+        mem(i) := io.in.bits(1).bits
+      } .elsewhen (valid(2)) {
+        mem(i) := io.in.bits(2).bits
+      } .elsewhen (valid(3)) {
+        mem(i) := io.in.bits(3).bits
+      }
+    }
+  }
+
+  mslice.io.in.valid := false.B
+  mslice.io.in.bits := io.in.bits(0).bits  // defaults
+
+  when (mcount > 0.U) {
+    when (io.out.ready) {
+      mslice.io.in.valid := true.B
+    }
+  } .otherwise {
+    when (ivalid && iactive =/= 0.U) {
+      mslice.io.in.valid := true.B
+    }
+  }
+
+  when (mcount > 0.U) {
+    mslice.io.in.bits := mem(outpos)
+  } .elsewhen (ivalid) {
+    // As above, couldn't get MuxOR to work.
+    when (iactive(0)) {
+      mslice.io.in.bits := io.in.bits(0).bits
+    } .elsewhen (iactive(1)) {
+      mslice.io.in.bits := io.in.bits(1).bits
+    } .elsewhen (iactive(2)) {
+      mslice.io.in.bits := io.in.bits(2).bits
+    } .elsewhen (iactive(3)) {
+      mslice.io.in.bits := io.in.bits(3).bits
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Valid Entries.
+  val active = RegInit(0.U(m.W))
+
+  val activeSet = MuxOR(ivalid,
+      ((icount >= 1.U) << in0pos) | ((icount >= 2.U) << in1pos) |
+      ((icount >= 3.U) << in2pos) | ((icount >= 4.U) << in3pos))
+
+  val activeClr = MuxOR(mslice.io.in.valid && mslice.io.in.ready, 1.U << outpos)
+
+  active := (active | activeSet) & ~activeClr
+
+  // ---------------------------------------------------------------------------
+  // Interface.
+  io.in.ready := mcount <= (m.U - icount)
+  io.out <> mslice.io.out
+
+  assert(mcount <= m.U)
+}
+
+object EmitFifo4 extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Fifo4(UInt(8.W), 11), args)
+}
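
For reference, a plain-Scala sanity model of the Fifo4Valid decode above (written for this note, not repository code): in0..in3 select, in order, the lanes whose valid bits are set.

    // Runnable reference: lanes(mask) lists the lane feeding the 1st, 2nd, ...
    // accepted slot, i.e. the lane picked by in0, in1, in2, in3 respectively.
    object Fifo4ValidExample extends App {
      def lanes(mask: Int): Seq[Int] =
        (0 until 4).filter(i => ((mask >> i) & 1) == 1)

      // mask = 0b1010: slot 0 takes lane 1 (in0 = 0b0010),
      //                slot 1 takes lane 3 (in1 = 0b1000).
      assert(lanes(0xA) == Seq(1, 3))
      assert(lanes(0xF) == Seq(0, 1, 2, 3))
    }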
diff --git a/hdl/chisel/src/common/Fifo4e.scala b/hdl/chisel/src/common/Fifo4e.scala
new file mode 100644
index 0000000..bca120d
--- /dev/null
+++ b/hdl/chisel/src/common/Fifo4e.scala
@@ -0,0 +1,143 @@
+package common
+
+import chisel3._
+import chisel3.util._
+
+// A Fifo4 variant that exposes its entries, with no output registration stage.
+
+object Fifo4e {
+  def apply[T <: Data](t: T, n: Int) = {
+    Module(new Fifo4e(t, n))
+  }
+}
+
+class Fifo4e[T <: Data](t: T, n: Int) extends Module {
+  val io = IO(new Bundle {
+    val in  = Flipped(Decoupled(Vec(4, Valid(t))))
+    val out = Decoupled(t)
+    val count = Output(UInt(log2Ceil(n+1).W))
+    val entry = Output(Vec(n, Valid(t)))
+    val nempty = Output(Bool())
+  })
+
+  def Increment(a: UInt, b: UInt): UInt = {
+    val c = a +& b
+    val d = Mux(c < n.U, c, c - n.U)(a.getWidth - 1, 0)
+    d
+  }
+
+  val mem = Mem(n, t)
+
+  val in0pos = RegInit(0.U(log2Ceil(n).W))
+  val in1pos = RegInit(1.U(log2Ceil(n).W))
+  val in2pos = RegInit(2.U(log2Ceil(n).W))
+  val in3pos = RegInit(3.U(log2Ceil(n).W))
+  val outpos = RegInit(0.U(log2Ceil(n).W))
+  val mcount = RegInit(0.U(log2Ceil(n+1).W))
+  val nempty = RegInit(false.B)
+
+  io.count := mcount
+  io.nempty := nempty
+
+  val ivalid = io.in.valid && io.in.ready
+  val ovalid = io.out.valid && io.out.ready
+
+  val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid,
+                    io.in.bits(1).valid, io.in.bits(0).valid).asUInt
+
+  val icount = io.in.bits(0).valid +& io.in.bits(1).valid +
+               io.in.bits(2).valid +& io.in.bits(3).valid
+
+  // ---------------------------------------------------------------------------
+  // Fifo Control.
+  when (ivalid) {
+    in0pos := Increment(in0pos, icount)
+    in1pos := Increment(in1pos, icount)
+    in2pos := Increment(in2pos, icount)
+    in3pos := Increment(in3pos, icount)
+  }
+
+  when (ovalid) {
+    outpos := Increment(outpos, 1.U)
+  }
+
+  val inc = MuxOR(ivalid, icount)
+  val dec = ovalid
+
+  when (ivalid || ovalid) {
+    val nxtcount = mcount + inc - dec
+    mcount := nxtcount
+    nempty := nxtcount =/= 0.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Input.
+  val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive)
+
+  for (i <- 0 until n) {
+    val valid = Cat(in0pos === i.U && in0valid(3) ||
+                    in1pos === i.U && in1valid(3) ||
+                    in2pos === i.U && in2valid(3) ||
+                    in3pos === i.U && in3valid(3),
+                    in0pos === i.U && in0valid(2) ||
+                    in1pos === i.U && in1valid(2) ||
+                    in2pos === i.U && in2valid(2),
+                    in0pos === i.U && in0valid(1) ||
+                    in1pos === i.U && in1valid(1),
+                    in0pos === i.U && in0valid(0))
+
+    // Couldn't get the following to work properly.
+    //
+    // val data = MuxOR(valid(0), io.in.bits(0).bits.asUInt) |
+    //            MuxOR(valid(1), io.in.bits(1).bits.asUInt) |
+    //            MuxOR(valid(2), io.in.bits(2).bits.asUInt) |
+    //            MuxOR(valid(3), io.in.bits(3).bits.asUInt)
+    //
+    // when (ivalid && valid =/= 0.U) {
+    //   mem(i) := data.asTypeOf(t)
+    // }
+    when (ivalid) {
+      when (valid(0)) {
+        mem(i) := io.in.bits(0).bits
+      } .elsewhen (valid(1)) {
+        mem(i) := io.in.bits(1).bits
+      } .elsewhen (valid(2)) {
+        mem(i) := io.in.bits(2).bits
+      } .elsewhen (valid(3)) {
+        mem(i) := io.in.bits(3).bits
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Valid Entries.
+  val active = RegInit(0.U(n.W))
+
+  val activeSet = MuxOR(ivalid,
+      ((icount >= 1.U) << in0pos) | ((icount >= 2.U) << in1pos) |
+      ((icount >= 3.U) << in2pos) | ((icount >= 4.U) << in3pos))
+
+  val activeClr = MuxOR(io.out.valid && io.out.ready, 1.U << outpos)
+
+  when (io.in.valid && io.in.ready || io.out.valid && io.out.ready) {
+    active := (active | activeSet) & ~activeClr
+  }
+
+  // ---------------------------------------------------------------------------
+  // Interface.
+  io.in.ready := mcount <= (n.U - icount)
+
+  io.out.valid := mcount =/= 0.U
+  io.out.bits := mem(outpos)
+
+  assert(mcount <= n.U)
+
+  for (i <- 0 until n) {
+    io.entry(i).valid := active(i)
+    io.entry(i).bits := mem(i)
+  }
+}
+
+object EmitFifo4e extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Fifo4e(UInt(8.W), 10), args)
+}
diff --git a/hdl/chisel/src/common/Fifo4x4.scala b/hdl/chisel/src/common/Fifo4x4.scala
new file mode 100644
index 0000000..5d02731
--- /dev/null
+++ b/hdl/chisel/src/common/Fifo4x4.scala
@@ -0,0 +1,183 @@
+package common
+
+import chisel3._
+import chisel3.util._
+
+object Fifo4x4 {
+  def apply[T <: Data](t: T, n: Int) = {
+    Module(new Fifo4x4(t, n))
+  }
+}
+
+// Inputs are accepted with a common handshake and a per-lane select.
+// Outputs are transacted independently and ordered {[0], [0,1], [0,1,2], [0,1,2,3]}.
+// Outputs are not registered; they are assumed to feed directly into shallow combinational logic.
+class Fifo4x4[T <: Data](t: T, n: Int) extends Module {
+  val io = IO(new Bundle {
+    val in  = Flipped(Decoupled(Vec(4, Valid(t))))
+    val out = Vec(4, Decoupled(t))
+    val count = Output(UInt(log2Ceil(n+1).W))
+    val nempty = Output(Bool())
+  })
+
+  val m = n
+
+  val mb  = log2Ceil(m)
+  val n1b = log2Ceil(n + 1)
+
+  def Increment(a: UInt, b: UInt): UInt = {
+    val c = a +& b
+    val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
+    d
+  }
+
+  val mem = Reg(Vec(n, t))
+
+  val inpos  = Reg(Vec(4, UInt(mb.W)))  // reset below
+  val outpos = Reg(Vec(4, UInt(mb.W)))  // reset below
+
+  val mcount = RegInit(0.U(n1b.W))
+  val nempty = RegInit(false.B)
+  val inready = RegInit(false.B)
+  val outvalid = RegInit(0.U(4.W))
+
+  val ivalid = io.in.valid && io.in.ready
+
+  val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid,
+                    io.in.bits(1).valid, io.in.bits(0).valid).asUInt
+
+  val icount = (io.in.bits(0).valid +& io.in.bits(1).valid +&
+                io.in.bits(2).valid +& io.in.bits(3).valid)(2,0)
+
+  val oactiveBits = Cat(io.out(3).valid && io.out(3).ready,
+                        io.out(2).valid && io.out(2).ready,
+                        io.out(1).valid && io.out(1).ready,
+                        io.out(0).valid && io.out(0).ready)
+
+  val ovalid = oactiveBits =/= 0.U
+
+  val ocount = (oactiveBits(0) +& oactiveBits(1) +&
+                oactiveBits(2) +& oactiveBits(3))(2,0)
+
+  assert(!(oactiveBits(1) === 1.U && oactiveBits(0,0) =/= 1.U))
+  assert(!(oactiveBits(2) === 1.U && oactiveBits(1,0) =/= 3.U))
+  assert(!(oactiveBits(3) === 1.U && oactiveBits(2,0) =/= 7.U))
+
+  val ovalidBits = Cat(io.out(3).valid, io.out(2).valid,
+                       io.out(1).valid, io.out(0).valid)
+
+  assert(!(ovalidBits(1) === 1.U && ovalidBits(0,0) =/= 1.U))
+  assert(!(ovalidBits(2) === 1.U && ovalidBits(1,0) =/= 3.U))
+  assert(!(ovalidBits(3) === 1.U && ovalidBits(2,0) =/= 7.U))
+
+  val oreadyBits = Cat(io.out(3).ready, io.out(2).ready,
+                       io.out(1).ready, io.out(0).ready)
+
+  assert(!(oreadyBits(1) === 1.U && oreadyBits(0,0) =/= 1.U))
+  assert(!(oreadyBits(2) === 1.U && oreadyBits(1,0) =/= 3.U))
+  assert(!(oreadyBits(3) === 1.U && oreadyBits(2,0) =/= 7.U))
+
+  // ---------------------------------------------------------------------------
+  // Fifo Control.
+  when (reset.asBool) {
+    for (i <- 0 until 4) {
+      inpos(i) := i.U
+    }
+  } .elsewhen (ivalid) {
+    for (i <- 0 until 4) {
+      inpos(i) := Increment(inpos(i), icount)
+    }
+  }
+
+  when (reset.asBool) {
+    for (i <- 0 until 4) {
+      outpos(i) := i.U
+    }
+  } .elsewhen (ovalid) {
+    for (i <- 0 until 4) {
+      outpos(i) := Increment(outpos(i), ocount)
+    }
+  }
+
+  val inc = MuxOR(ivalid, icount)
+  val dec = MuxOR(ovalid, ocount)
+
+  when (ivalid || ovalid) {
+    val nxtmcount = mcount + inc - dec
+    inready := nxtmcount <= (m.U - 4.U)
+    mcount := nxtmcount
+    nempty := nxtmcount =/= 0.U
+    outvalid := Cat(nxtmcount >= 4.U,
+                    nxtmcount >= 3.U,
+                    nxtmcount >= 2.U,
+                    nxtmcount >= 1.U)
+  } .otherwise {
+    inready := mcount <= (m.U - 4.U)
+    outvalid := Cat(mcount >= 4.U,
+                    mcount >= 3.U,
+                    mcount >= 2.U,
+                    mcount >= 1.U)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Input.
+  val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive)
+
+  for (i <- 0 until m) {
+    val valid = Cat(inpos(0) === i.U && in0valid(3) ||
+                    inpos(1) === i.U && in1valid(3) ||
+                    inpos(2) === i.U && in2valid(3) ||
+                    inpos(3) === i.U && in3valid(3),
+
+                    inpos(0) === i.U && in0valid(2) ||
+                    inpos(1) === i.U && in1valid(2) ||
+                    inpos(2) === i.U && in2valid(2),
+
+                    inpos(0) === i.U && in0valid(1) ||
+                    inpos(1) === i.U && in1valid(1),
+
+                    inpos(0) === i.U && in0valid(0))
+
+    if (true) {
+      val data = MuxOR(valid(0), io.in.bits(0).bits.asUInt) |
+                 MuxOR(valid(1), io.in.bits(1).bits.asUInt) |
+                 MuxOR(valid(2), io.in.bits(2).bits.asUInt) |
+                 MuxOR(valid(3), io.in.bits(3).bits.asUInt)
+
+      when (ivalid && valid =/= 0.U) {
+        mem(i) := data.asTypeOf(t)
+      }
+    } else {
+      when (ivalid) {
+        when (valid(0)) {
+          mem(i) := io.in.bits(0).bits
+        } .elsewhen (valid(1)) {
+          mem(i) := io.in.bits(1).bits
+        } .elsewhen (valid(2)) {
+          mem(i) := io.in.bits(2).bits
+        } .elsewhen (valid(3)) {
+          mem(i) := io.in.bits(3).bits
+        }
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Interface.
+  io.in.ready := inready
+
+  for (i <- 0 until 4) {
+    io.out(i).valid := outvalid(i)
+    io.out(i).bits := mem(outpos(i))  // TODO: VecAt()
+  }
+
+  io.count := mcount
+
+  io.nempty := nempty
+
+  assert(io.count <= m.U)
+}
+
+object EmitFifo4x4 extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Fifo4x4(UInt(32.W), 24), args)
+}
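
The ordering contract above, restated as a runnable plain-Scala check (illustrative only): the per-port valid/ready vectors must form a thermometer code, which is what the oactiveBits/ovalidBits/oreadyBits assertions enforce.

    // Port i may only be active if all lower-numbered ports are active too.
    object Fifo4x4OrderingExample extends App {
      def isOrdered(bits: Int): Boolean =
        Set(0x0, 0x1, 0x3, 0x7, 0xF).contains(bits & 0xF)

      assert(isOrdered(0x3))   // out(0) and out(1) active together: allowed
      assert(!isOrdered(0x2))  // out(1) without out(0): rejected by the asserts
    }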
diff --git a/hdl/chisel/src/common/IDiv.scala b/hdl/chisel/src/common/IDiv.scala
new file mode 100644
index 0000000..070eb3e
--- /dev/null
+++ b/hdl/chisel/src/common/IDiv.scala
@@ -0,0 +1,175 @@
+package common
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// An integer divide unit, to be fused with fdiv.
+
+object IDiv {
+  def apply(n: Int): IDiv = {
+    return Module(new IDiv(n))
+  }
+
+  val Stages = 4
+  val Rcnt = 32 / Stages
+}
+
+case class IDivOp() {
+  val DIV  = 0
+  val DIVU = 1
+  val REM  = 2
+  val REMU = 3
+  val Entries = 4
+}
+
+class IDiv(n: Int) extends Module {
+  val io = IO(new Bundle {
+    val req = Input(UInt(new IDivOp().Entries.W))
+    val ina = Flipped(Decoupled(Vec(n, UInt(32.W))))
+    val inb = Flipped(Decoupled(Vec(n, UInt(32.W))))
+    val out = Decoupled(Vec(n, UInt(32.W)))
+  })
+
+  val dvu = new IDivOp()
+
+  val active = RegInit(false.B)
+  val result = RegInit(false.B)
+  val count = Reg(UInt(6.W))
+
+  val state = Reg(Vec(n, new IDivState()))
+
+  val ivalid = io.ina.valid && io.ina.ready && io.inb.valid && io.inb.ready
+  val ovalid = io.out.valid && io.out.ready
+
+  when (ivalid) {
+    active := true.B
+  } .elsewhen (active && count === IDiv.Rcnt.U) {
+    active := false.B
+  }
+
+  when (ovalid) {
+    result := false.B
+  } .elsewhen (active && count === IDiv.Rcnt.U) {
+    result := true.B
+  }
+
+  when (ivalid) {
+    count := 0.U
+  } .elsewhen (active) {
+    count := count + 1.U
+  }
+
+  for (i <- 0 until n) {
+    val ina = io.ina.bits(i)
+    val inb = io.inb.bits(i)
+    val st = state(i)
+
+    when (ivalid) {
+      val divide = io.req(dvu.DIV) || io.req(dvu.DIVU)
+      val signed = io.req(dvu.DIV) || io.req(dvu.REM)
+      state(i) := IDivComb1(ina, inb, signed, divide)
+    } .elsewhen (active) {
+      state(i) := IDivComb2(state(i), count)
+    }
+  }
+
+  io.ina.ready := io.inb.valid && !active && (!result || io.out.ready)
+  io.inb.ready := io.ina.valid && !active && (!result || io.out.ready)
+
+  io.out.valid := result
+
+  for (i <- 0 until n) {
+    io.out.bits(i) := IDivComb3(state(i))
+  }
+}
+
+class IDivState extends Bundle {
+  val denom = UInt(32.W)  // output is placed first
+  val divide = UInt(32.W)
+  val remain = UInt(32.W)
+  val opdiv = Bool()
+  val opneg = Bool()
+}
+
+object IDivComb1 {
+  def apply(ina: UInt, inb: UInt, signed: Bool, divide: Bool): IDivState = {
+    val out = Wire(new IDivState())
+
+    val divByZero = inb === 0.U
+    val divsign = signed && (ina(31) =/= inb(31)) && !divByZero
+    val remsign = signed && ina(31)
+    val inp = Mux(signed && ina(31), ~ina + 1.U, ina)
+
+    out.opdiv := divide
+    out.opneg := Mux(divide, divsign, remsign)
+    out.denom := Mux(signed && inb(31), ~inb + 1.U, inb)
+    out.divide := inp
+    out.remain := 0.U
+
+    out
+  }
+}
+
+object IDivComb2 {
+  def apply(in: IDivState, count: UInt): IDivState = {
+    val out = Wire(new IDivState())
+    out := in
+
+    when (count < IDiv.Rcnt.U) {
+      val (div1, rem1) = Divide(in.divide, in.remain, in.denom)
+      if (IDiv.Stages == 1) {
+        out.divide := div1
+        out.remain := rem1
+      } else if (IDiv.Stages == 2) {
+        val (div2, rem2) = Divide(div1, rem1, in.denom)
+        out.divide := div2
+        out.remain := rem2
+      } else if (IDiv.Stages == 4) {
+        val (div2, rem2) = Divide(div1, rem1, in.denom)
+        val (div3, rem3) = Divide(div2, rem2, in.denom)
+        val (div4, rem4) = Divide(div3, rem3, in.denom)
+        out.divide := div4
+        out.remain := rem4
+      } else {
+        assert(false)
+      }
+    } .otherwise {
+      val div = Mux(in.opneg, ~in.divide + 1.U, in.divide)
+      val rem = Mux(in.opneg, ~in.remain + 1.U, in.remain)
+      out.denom := Mux(in.opdiv, div, rem)
+    }
+
+    out
+  }
+
+  def Divide(prvDivide: UInt, prvRemain: UInt, denom: UInt): (UInt, UInt) = {
+    val shfRemain = Cat(prvRemain(30,0), prvDivide(31))
+    val subtract = shfRemain -& denom
+    assert(subtract.getWidth == 33)
+    val divDivide = Wire(UInt(32.W))
+    val divRemain = Wire(UInt(32.W))
+
+    when (!subtract(32)) {
+      divDivide := Cat(prvDivide(30,0), 1.U(1.W))
+      divRemain := subtract(31,0)
+    } .otherwise {
+      divDivide := Cat(prvDivide(30,0), 0.U(1.W))
+      divRemain := shfRemain
+    }
+
+    (divDivide, divRemain)
+  }
+}
+
+object IDivComb3 {
+  def apply(in: IDivState): UInt = {
+    val result = in.denom
+    assert(result.getWidth == 32)
+    result
+  }
+}
+
+object EmitIDiv extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new IDiv(1), args)
+}
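
For reference, a plain-Scala model of the shift-and-subtract step iterated by IDivComb2.Divide above (an illustrative reference written for this note, not repository code); it runs 32 restoring-division steps on unsigned operands.

    // Software model of the 32-bit restoring divider: each step shifts the next
    // dividend bit into the partial remainder and subtracts the denominator
    // when it fits, setting the corresponding quotient bit.
    object IDivExample extends App {
      def restoringDiv(num: Long, den: Long): (Long, Long) = {
        var divide = num & 0xFFFFFFFFL   // holds the quotient at the end
        var remain = 0L                  // partial remainder
        for (_ <- 0 until 32) {
          val shf = ((remain << 1) | (divide >>> 31)) & 0x1FFFFFFFFL
          val sub = shf - (den & 0xFFFFFFFFL)
          if (sub >= 0) { divide = ((divide << 1) | 1) & 0xFFFFFFFFL; remain = sub }
          else          { divide = (divide << 1) & 0xFFFFFFFFL;       remain = shf }
        }
        (divide, remain)                 // (quotient, remainder)
      }

      assert(restoringDiv(100, 7) == (14L, 2L))  // 100 = 7 * 14 + 2
    }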
diff --git a/hdl/chisel/src/common/Library.scala b/hdl/chisel/src/common/Library.scala
new file mode 100644
index 0000000..9fa4682
--- /dev/null
+++ b/hdl/chisel/src/common/Library.scala
@@ -0,0 +1,14 @@
+package common
+
+import chisel3._
+import chisel3.util._
+
+object MuxOR {
+  def apply(valid: Bool, data: UInt): UInt = {
+    Mux(valid, data, 0.U(data.getWidth.W))
+  }
+
+  def apply(valid: Bool, data: Bool): Bool = {
+    Mux(valid, data, false.B)
+  }
+}
diff --git a/hdl/chisel/src/common/Slice.scala b/hdl/chisel/src/common/Slice.scala
new file mode 100644
index 0000000..ac93641
--- /dev/null
+++ b/hdl/chisel/src/common/Slice.scala
@@ -0,0 +1,128 @@
+package common
+
+import chisel3._
+import chisel3.util._
+
+object Slice {
+  def apply[T <: Data](t: T, doubleBuffered: Boolean = true,
+      passReady: Boolean = false, passValid: Boolean = false) = {
+    Module(new Slice(t, doubleBuffered, passReady, passValid))
+  }
+}
+
+class Slice[T <: Data](t: T, doubleBuffered: Boolean,
+    passReady: Boolean, passValid: Boolean) extends Module {
+  val io = IO(new Bundle {
+    val in  = Flipped(Decoupled(t))
+    val out = Decoupled(t)
+    val count = Output(UInt(2.W))
+    val value = Output(Vec(if (doubleBuffered) 2 else 1, Valid(t)))
+  })
+
+  val size = if (doubleBuffered) 2 else 1
+
+  val ipos = RegInit(0.U(size.W))
+  val opos = RegInit(0.U(size.W))
+  val count = RegInit(0.U(size.W))
+  val mem = Reg(Vec(size, t))
+
+  val empty = ipos === opos
+  val bypass = if (passValid) io.in.valid && io.out.ready && empty else false.B
+  val ivalid = io.in.valid && io.in.ready && !bypass
+  val ovalid = io.out.valid && io.out.ready && !bypass
+
+  when (ivalid) {
+    ipos := ipos + 1.U
+  }
+
+  when (ovalid) {
+    opos := opos + 1.U
+  }
+
+  when (ivalid =/= ovalid) {
+    count := count + ivalid - ovalid
+  }
+
+  if (doubleBuffered) {
+    val full = ipos(0) === opos(0) && ipos(1) =/= opos(1)
+    if (passReady) {
+      io.in.ready := !full || io.out.ready                      // pass-through
+    } else {
+      io.in.ready := !full
+    }
+
+    when (ovalid && full) {
+      mem(0) := mem(1)
+    }
+
+    when (ivalid && !ovalid && empty ||
+          ivalid && ovalid && !full) {
+      mem(0) := io.in.bits
+    }
+
+    when (ivalid && !ovalid && !empty ||
+          ivalid && ovalid && full) {
+      mem(1) := io.in.bits
+    }
+
+    io.value(0).valid := !empty
+    io.value(1).valid := full
+    io.value(0).bits := mem(0)
+    io.value(1).bits := mem(1)
+  } else {
+    if (passReady) {
+      io.in.ready := empty || io.out.ready                      // pass-through
+    } else {
+      io.in.ready := empty
+    }
+
+    when (ivalid) {
+      mem(0) := io.in.bits
+    }
+
+    io.value(0).valid := !empty
+    io.value(0).bits := mem(0)
+  }
+
+  if (!passValid) {
+    io.out.valid := !empty
+    io.out.bits  := mem(0)
+  } else {
+    io.out.valid := !empty || io.in.valid                       // pass-through
+    io.out.bits  := Mux(!empty, mem(0), io.in.bits)             // pass-through
+  }
+
+  io.count := count
+}
+
+object EmitSlice extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), false, false, false), args)
+}
+
+object EmitSlice_1 extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), false, false, true), args)
+}
+
+object EmitSlice_2 extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), false, true, false), args)
+}
+
+object EmitSlice_3 extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), false, true, true), args)
+}
+
+object EmitSlice_4 extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), true, false, false), args)
+}
+
+object EmitSlice_5 extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), true, false, true), args)
+}
+
+object EmitSlice_6 extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), true, true, false), args)
+}
+
+object EmitSlice_7 extends App {
+  (new chisel3.stage.ChiselStage).emitVerilog(new Slice(UInt(32.W), true, true, true), args)
+}
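
For reference, a minimal usage sketch of Slice as a skid buffer (illustrative only; the wrapper and 32-bit payload are assumptions, matching the EmitSlice_4 configuration above).

    import chisel3._
    import chisel3.util._
    import common._

    // Hypothetical example: a double-buffered Slice breaking the combinational
    // ready/valid path between a producer and a consumer.
    class SliceExample extends Module {
      val io = IO(new Bundle {
        val in  = Flipped(Decoupled(UInt(32.W)))
        val out = Decoupled(UInt(32.W))
      })

      // doubleBuffered = true sustains back-to-back transfers while ready and
      // valid are derived only from registered state (no pass-through).
      val slice = Slice(UInt(32.W), true)

      slice.io.in <> io.in
      io.out <> slice.io.out
    }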
diff --git a/hdl/chisel/src/kelvin/Axi.scala b/hdl/chisel/src/kelvin/Axi.scala
new file mode 100644
index 0000000..bd5009a
--- /dev/null
+++ b/hdl/chisel/src/kelvin/Axi.scala
@@ -0,0 +1,169 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+case object AxiResponse {
+  val okay = 0
+  val rsvd = 1
+  val slverr = 2
+  val mmuerr = 3
+}
+
+// case object AxiBurst {
+//   val fixed = 0
+//   val incr = 1
+//   val wrap = 2
+// }
+
+// case object AxiSize {
+//   val bytes1 = 0
+//   val bytes2 = 1
+//   val bytes4 = 2
+//   val bytes8 = 3
+//   val bytes16 = 4
+//   val bytes32 = 5
+//   val bytes64 = 6
+//   val bytes128 = 7
+// }
+
+class AxiAddress(addrWidthBits: Int, idBits: Int) extends Bundle {
+  val addr  = UInt(addrWidthBits.W)
+  val id    = UInt(idBits.W)
+  // val burst = UInt(2.W)
+  // val size  = UInt(3.W)
+
+  def defaults() = {
+    addr  := 0.U
+    id    := 0.U
+    // burst := new AxiBurst().fixed
+    // size  := new AxiSize().bytes4
+  }
+}
+
+class AxiWriteData(dataWidthBits: Int) extends Bundle {
+  val data = UInt(dataWidthBits.W)
+  val strb = UInt((dataWidthBits/8).W)
+
+  def defaults() = {
+    data := 0.U
+    strb := ((BigInt(1) << (dataWidthBits/8)) - 1).U
+  }
+}
+
+class AxiWriteResponse(idBits: Int) extends Bundle {
+  val id   = UInt(idBits.W)
+  val resp = UInt(2.W)
+
+  def defaults() = {
+    id   := 0.U
+    resp := 0.U
+  }
+
+  def defaultsFlipped() = {
+    id   := 0.U
+    resp := 0.U
+  }
+}
+
+class AxiReadData(dataWidthBits: Int, idBits: Int) extends Bundle {
+  val resp = UInt(2.W)
+  val id   = UInt(idBits.W)
+  val data = UInt(dataWidthBits.W)
+  // val last = Bool()
+
+  def defaultsFlipped() = {
+    resp := 0.U
+    id := 0.U
+    data := 0.U
+    // last := false.B
+  }
+}
+
+class AxiLiteAddress(addrWidthBits: Int) extends Bundle {
+  val addr = UInt(addrWidthBits.W)
+  val prot = UInt(3.W)
+}
+
+class AxiLiteWriteData(dataWidthBits: Int) extends Bundle {
+  val data = UInt(dataWidthBits.W)
+  val strb = UInt((dataWidthBits/8).W)
+}
+
+class AxiLiteReadData(dataWidthBits: Int) extends Bundle {
+  val data = UInt(dataWidthBits.W)
+  val resp = UInt(2.W)
+}
+
+class AxiMasterIO(addrWidthBits: Int, dataWidthBits: Int, idBits: Int)
+    extends Bundle {
+  val write = new AxiMasterWriteIO(addrWidthBits, dataWidthBits, idBits)
+  val read = new AxiMasterReadIO(addrWidthBits, dataWidthBits, idBits)
+
+  def defaults() = {
+    write.defaults()
+    read.defaults()
+  }
+
+  def defaultsFlipped() = {
+    write.defaultsFlipped()
+    read.defaultsFlipped()
+  }
+}
+
+class AxiMasterWriteIO(addrWidthBits: Int, dataWidthBits: Int, idBits: Int)
+    extends Bundle {
+  val addr = Decoupled(new AxiAddress(addrWidthBits, idBits))
+  val data = Decoupled(new AxiWriteData(dataWidthBits))
+  val resp = Flipped(Decoupled(new AxiWriteResponse(idBits)))
+
+  def defaults() = {
+    addr.bits.defaults()
+    data.bits.defaults()
+    addr.valid := false.B
+    data.valid := false.B
+    resp.ready := true.B
+  }
+
+  def defaultsFlipped() = {
+    addr.ready := false.B
+    data.ready := false.B
+    resp.valid := false.B
+    resp.bits.defaultsFlipped()
+  }
+}
+
+class AxiMasterReadIO(addrWidthBits: Int, dataWidthBits: Int, idBits: Int)
+    extends Bundle {
+  val addr = Decoupled(new AxiAddress(addrWidthBits, idBits))
+  val data = Flipped(Decoupled(new AxiReadData(dataWidthBits, idBits)))
+
+  def defaults() = {
+    addr.bits.defaults()
+    addr.valid := false.B
+    data.ready := false.B
+  }
+
+  def defaultsFlipped() = {
+    addr.ready := false.B
+    data.valid := false.B
+    data.bits.defaultsFlipped()
+  }
+}
+
+class AxiLiteMasterIO(val addrWidthBits: Int, val dataWidthBits: Int) extends Bundle {
+  val read  = new AxiLiteMasterReadIO(addrWidthBits, dataWidthBits)
+  val write = new AxiLiteMasterWriteIO(addrWidthBits, dataWidthBits)
+}
+
+class AxiLiteMasterWriteIO(val addrWidthBits: Int, val dataWidthBits: Int) extends Bundle {
+  val addr = Decoupled(new AxiLiteAddress(addrWidthBits))
+  val data = Decoupled(new AxiLiteWriteData(dataWidthBits))
+  val resp = Flipped(Decoupled(UInt(2.W)))
+}
+
+class AxiLiteMasterReadIO(addrWidthBits: Int, dataWidthBits: Int)
+    extends Bundle {
+  val addr = Decoupled(new AxiLiteAddress(addrWidthBits))
+  val data = Flipped(Decoupled(new AxiLiteReadData(dataWidthBits)))
+}
diff --git a/hdl/chisel/src/kelvin/ClockGate.scala b/hdl/chisel/src/kelvin/ClockGate.scala
new file mode 100644
index 0000000..87a576e
--- /dev/null
+++ b/hdl/chisel/src/kelvin/ClockGate.scala
@@ -0,0 +1,13 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+class ClockGate extends BlackBox {
+  val io = IO(new Bundle {
+    val clk_i  = Input(Clock())
+    val enable = Input(Bool())  // '1' passthrough, '0' disable.
+    val clk_o  = Output(Clock())
+  })
+}
diff --git a/hdl/chisel/src/kelvin/Core.scala b/hdl/chisel/src/kelvin/Core.scala
new file mode 100644
index 0000000..16c8ece
--- /dev/null
+++ b/hdl/chisel/src/kelvin/Core.scala
@@ -0,0 +1,76 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Core {
+  def apply(p: Parameters): Core = {
+    return Module(new Core(p))
+  }
+}
+
+class Core(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val csr = new CsrInOutIO(p)
+    val halted = Output(Bool())
+    val fault = Output(Bool())
+
+    val ibus = new IBusIO(p)
+    val dbus = new DBusIO(p)
+    val axi0 = new AxiMasterIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+    val axi1 = new AxiMasterIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+
+    val iflush = new IFlushIO(p)
+    val dflush = new DFlushIO(p)
+    val slog = new SLogIO(p)
+
+    val debug = new DebugIO(p)
+  })
+
+  val score = SCore(p)
+  val vcore = VCore(p)
+  val dbusmux = DBusMux(p)
+
+  // ---------------------------------------------------------------------------
+  // Scalar Core outputs.
+  io.csr    <> score.io.csr
+  io.ibus   <> score.io.ibus
+  io.halted := score.io.halted
+  io.fault  := score.io.fault
+  io.iflush <> score.io.iflush
+  io.dflush <> score.io.dflush
+  io.slog   := score.io.slog
+  io.debug  := score.io.debug
+
+  // ---------------------------------------------------------------------------
+  // Vector core.
+  score.io.vcore <> vcore.io.score
+
+  // ---------------------------------------------------------------------------
+  // Local Data Bus Port
+  dbusmux.io.vldst := score.io.vldst
+  dbusmux.io.vlast := vcore.io.last
+
+  dbusmux.io.vcore <> vcore.io.dbus
+  dbusmux.io.score <> score.io.dbus
+
+  io.dbus <> dbusmux.io.dbus
+
+  // ---------------------------------------------------------------------------
+  // Scalar DBus to AXI.
+  val dbus2axi = DBus2Axi(p)
+  dbus2axi.io.dbus <> score.io.ubus
+
+  // ---------------------------------------------------------------------------
+  // AXI ports.
+  io.axi0.read  <> vcore.io.ld
+  io.axi0.write <> vcore.io.st
+
+  io.axi1 <> dbus2axi.io.axi
+}
+
+object EmitCore extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new Core(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/DBus2Axi.scala b/hdl/chisel/src/kelvin/DBus2Axi.scala
new file mode 100644
index 0000000..6fc89c0
--- /dev/null
+++ b/hdl/chisel/src/kelvin/DBus2Axi.scala
@@ -0,0 +1,65 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object DBus2Axi {
+  def apply(p: Parameters): DBus2Axi = {
+    return Module(new DBus2Axi(p))
+  }
+}
+
+class DBus2Axi(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val dbus = Flipped(new DBusIO(p))
+    val axi = new AxiMasterIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+  })
+
+  val linebit = log2Ceil(p.lsuDataBits / 8)
+
+  val sraddrActive = RegInit(false.B)
+  val sdata = Reg(UInt(p.axi2DataBits.W))
+
+  when (io.axi.read.data.valid && io.axi.read.data.ready) {
+    sraddrActive := false.B
+    assert(sraddrActive)
+    assert(!io.axi.read.addr.valid)
+  } .elsewhen (io.axi.read.addr.valid && io.axi.read.addr.ready) {
+    sraddrActive := true.B
+    assert(!sraddrActive)
+    assert(!io.axi.read.data.valid)
+  }
+
+  when (io.axi.read.data.valid && io.axi.read.data.ready) {
+    sdata := io.axi.read.data.bits.data
+  }
+
+  io.dbus.ready := Mux(io.dbus.write,
+                       io.axi.write.addr.valid && io.axi.write.addr.ready,
+                       io.axi.read.data.valid && sraddrActive)
+  io.dbus.rdata := sdata
+
+  val saddr = Cat(io.dbus.addr(31, linebit), 0.U(linebit.W))
+
+  io.axi.write.addr.valid := io.dbus.valid && io.dbus.write
+  io.axi.write.addr.bits.addr := saddr
+  io.axi.write.addr.bits.id := 0.U
+
+  io.axi.write.data.valid := io.dbus.valid && io.dbus.write
+  io.axi.write.data.bits.strb := io.dbus.wmask
+  io.axi.write.data.bits.data := io.dbus.wdata
+
+  io.axi.write.resp.ready := true.B
+
+  io.axi.read.addr.valid := io.dbus.valid && !io.dbus.write && !sraddrActive
+  io.axi.read.addr.bits.addr := saddr
+  io.axi.read.addr.bits.id := 0.U
+
+  io.axi.read.data.ready := true.B
+}
+
+object EmitDBus2Axi extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new DBus2Axi(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/DBusMux.scala b/hdl/chisel/src/kelvin/DBusMux.scala
new file mode 100644
index 0000000..30f523f
--- /dev/null
+++ b/hdl/chisel/src/kelvin/DBusMux.scala
@@ -0,0 +1,41 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object DBusMux {
+  def apply(p: Parameters): DBusMux = {
+    return Module(new DBusMux(p))
+  }
+}
+
+class DBusMux(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val vldst = Input(Bool())  // score.lsu
+    val vlast = Input(Bool())  // vcore.vldst
+    val vcore = Flipped(new DBusIO(p))
+    val score = Flipped(new DBusIO(p))
+    val dbus  = new DBusIO(p)
+  })
+
+  io.dbus.valid := Mux(io.vldst, io.vcore.valid, io.score.valid)
+  io.dbus.write := Mux(io.vldst, io.vcore.write, io.score.write)
+  io.dbus.addr  := Mux(io.vldst, io.vcore.addr,  io.score.addr)
+  io.dbus.adrx  := Mux(io.vldst, io.vcore.adrx,  io.score.adrx)
+  io.dbus.size  := Mux(io.vldst, io.vcore.size,  io.score.size)
+  io.dbus.wdata := Mux(io.vldst, io.vcore.wdata, io.score.wdata)
+  io.dbus.wmask := Mux(io.vldst, io.vcore.wmask, io.score.wmask)
+
+  io.score.rdata := io.dbus.rdata
+  io.vcore.rdata := io.dbus.rdata
+
+  // Scalar core fifo syncs to vector core vldst, removed on last transaction.
+  io.score.ready := io.dbus.ready && (!io.vldst || io.vcore.valid && io.vlast)
+  io.vcore.ready := io.dbus.ready && io.vldst
+}
+
+object EmitDBusMux extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new DBusMux(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/L1DCache.scala b/hdl/chisel/src/kelvin/L1DCache.scala
new file mode 100644
index 0000000..ac006e7
--- /dev/null
+++ b/hdl/chisel/src/kelvin/L1DCache.scala
@@ -0,0 +1,676 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import chisel3.experimental.ChiselEnum
+import common._
+
+object L1DCache {
+  def apply(p: Parameters): L1DCache = {
+    return Module(new L1DCache(p))
+  }
+}
+
+object L1DCacheBank {
+  def apply(p: Parameters): L1DCacheBank = {
+    return Module(new L1DCacheBank(p))
+  }
+}
+
+class L1DCache(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val dbus = Flipped(new DBusIO(p))
+    val axi = new AxiMasterIO(p.axi1AddrBits, p.axi1DataBits, p.axi1IdBits)
+    val flush = Flipped(new DFlushIO(p))
+  })
+
+  assert(p.axi1IdBits == 4)
+  assert(p.axi1DataBits == 256)
+
+  val bank0 = Module(new L1DCacheBank(p))
+  val bank1 = Module(new L1DCacheBank(p))
+
+  val linebit = log2Ceil(p.lsuDataBits / 8)
+  val linebytes = 1 << linebit
+
+  // Remove bank select bit from address.
+  def BankInAddress(addr: UInt): UInt = {
+    assert(addr.getWidth == 32)
+    val output = Cat(addr(31, linebit + 1), addr(linebit - 1, 0))
+    assert(output.getWidth == 31)
+    output
+  }
+
+  // Add bank select bit to address.
+  def BankOutAddress(addr: UInt, bank: Int): UInt = {
+    assert(addr.getWidth == 31)
+    val output = Cat(addr(30, linebit), bank.U(1.W), addr(linebit - 1, 0))
+    assert(output.getWidth == 32)
+    output
+  }
+
+  assert(io.dbus.size <= linebytes.U)
+
+  // ---------------------------------------------------------------------------
+  // Data bus multiplexor.
+  val lineend = (io.dbus.addr(linebit - 1, 0) + io.dbus.size) > linebytes.U
+  val dempty = io.dbus.size === 0.U
+  val dsel0 = io.dbus.addr(linebit) === 0.U && !dempty || lineend
+  val dsel1 = io.dbus.addr(linebit) === 1.U && !dempty || lineend
+  val preread = ~io.dbus.addr(11, linebit) =/= 0.U && !io.dbus.write && !dempty  // Within 4KB
+  val addrA = Mux(io.dbus.addr(linebit), BankInAddress(io.dbus.adrx), BankInAddress(io.dbus.addr))
+  val addrB = Mux(io.dbus.addr(linebit), BankInAddress(io.dbus.addr), BankInAddress(io.dbus.adrx))
+  val rsel = Reg(Vec(linebytes, Bool()))
+
+  assert(!(io.dbus.valid && io.dbus.adrx =/= (io.dbus.addr + linebytes.U)))
+
+  // Write masks
+  val wmaskSA = ((~0.U(linebytes.W)) << io.dbus.addr(linebit - 1, 0))(linebytes - 1, 0)
+  val wmaskSB = ((~0.U(linebytes.W)) >> (linebytes.U - io.dbus.addr(linebit - 1, 0)))(linebytes - 1, 0)
+  val wmaskA = io.dbus.wmask & wmaskSA
+  val wmaskB = io.dbus.wmask & wmaskSB
+  assert(wmaskSA.getWidth == io.dbus.wmask.getWidth)
+  assert(wmaskSB.getWidth == io.dbus.wmask.getWidth)
+  assert(wmaskA.getWidth == io.dbus.wmask.getWidth)
+  assert(wmaskB.getWidth == io.dbus.wmask.getWidth)
+  assert((wmaskSA | wmaskSB) === ~0.U(linebytes.W))
+  assert((wmaskSA & wmaskSB) === 0.U)
+
+  bank0.io.dbus.valid := io.dbus.valid && (dsel0 || preread)
+  bank0.io.dbus.write := io.dbus.write
+  bank0.io.dbus.wmask := Mux(io.dbus.addr(linebit), wmaskB, wmaskA)
+  bank0.io.dbus.size  := io.dbus.size
+  bank0.io.dbus.addr  := addrA
+  bank0.io.dbus.adrx  := addrB
+  bank0.io.dbus.wdata := io.dbus.wdata
+
+  bank1.io.dbus.valid := io.dbus.valid && (dsel1 || preread)
+  bank1.io.dbus.write := io.dbus.write
+  bank1.io.dbus.wmask := Mux(io.dbus.addr(linebit), wmaskA, wmaskB)
+  bank1.io.dbus.size  := io.dbus.size
+  bank1.io.dbus.addr  := addrB
+  bank1.io.dbus.adrx  := addrA
+  bank1.io.dbus.wdata := io.dbus.wdata
+
+  val dbusready = (bank0.io.dbus.ready || !dsel0) &&
+                  (bank1.io.dbus.ready || !dsel1)
+
+  // Read bank selection.
+  when (io.dbus.valid && dbusready && !io.dbus.write) {
+    val addr = io.dbus.addr(linebit, 0)
+    for (i <- 0 until linebytes) {
+      // reverse order to index usage
+      rsel(linebytes - 1 - i) := (addr + i.U)(linebit)
+    }
+  }
+
+  def RData(data: UInt = 0.U(1.W), i: Int = 0): UInt = {
+    if (i < p.lsuDataBits / 8) {
+      val d0 = bank0.io.dbus.rdata(8 * i + 7, 8 * i)
+      val d1 = bank1.io.dbus.rdata(8 * i + 7, 8 * i)
+      val d = Mux(rsel(i), d1, d0)
+      val r = if (i == 0) d else Cat(d, data)
+      assert(d.getWidth == 8)
+      assert(r.getWidth == (i + 1) * 8)
+      RData(r, i + 1)
+    } else {
+      data
+    }
+  }
+
+  io.dbus.rdata := RData()
+
+  io.dbus.ready := dbusready
+
+  // dbus transaction must latch until completion.
+  val addrLatchActive = RegInit(false.B)
+  val addrLatchData = Reg(UInt(32.W))
+
+  when (io.dbus.valid && !io.dbus.ready && !addrLatchActive) {
+    addrLatchActive := true.B
+    addrLatchData := io.dbus.addr
+  } .elsewhen (addrLatchActive && io.dbus.ready) {
+    addrLatchActive := false.B
+  }
+
+  // assert(!(addrLatchActive && !io.dbus.valid)) -- do not use, allow temporary deassertion
+  assert(!(addrLatchActive && addrLatchData =/= io.dbus.addr))
+
+  // ---------------------------------------------------------------------------
+  // AXI read bus multiplexor.
+  val rresp0 = io.axi.read.data.bits.id(p.axi1IdBits - 1) === 0.U
+  val rresp1 = io.axi.read.data.bits.id(p.axi1IdBits - 1) === 1.U
+
+  val raxi0 = bank0.io.axi.read.addr.valid
+  val raxi1 = !raxi0
+
+  io.axi.read.addr.valid     := bank0.io.axi.read.addr.valid || bank1.io.axi.read.addr.valid
+  io.axi.read.addr.bits.addr := Mux(raxi0, BankOutAddress(bank0.io.axi.read.addr.bits.addr, 0),
+                                           BankOutAddress(bank1.io.axi.read.addr.bits.addr, 1))
+  io.axi.read.addr.bits.id   := Mux(raxi0, Cat(0.U(1.W), bank0.io.axi.read.addr.bits.id), Cat(1.U(1.W), bank1.io.axi.read.addr.bits.id))
+
+  bank0.io.axi.read.addr.ready := io.axi.read.addr.ready && raxi0
+  bank1.io.axi.read.addr.ready := io.axi.read.addr.ready && raxi1
+
+  bank0.io.axi.read.data.valid := io.axi.read.data.valid && rresp0
+  bank0.io.axi.read.data.bits := io.axi.read.data.bits
+
+  bank1.io.axi.read.data.valid := io.axi.read.data.valid && rresp1
+  bank1.io.axi.read.data.bits := io.axi.read.data.bits
+
+  io.axi.read.data.ready := bank0.io.axi.read.data.ready && rresp0 ||
+                            bank1.io.axi.read.data.ready && rresp1
+
+  // ---------------------------------------------------------------------------
+  // AXI write bus multiplexor.
+  val waxi0 = Wire(Bool())
+  val waxi1 = Wire(Bool())
+  val wresp0 = io.axi.write.resp.bits.id(p.axi1IdBits - 1) === 0.U
+  val wresp1 = io.axi.write.resp.bits.id(p.axi1IdBits - 1) === 1.U
+
+  if (true) {
+    waxi0 := bank0.io.axi.write.addr.valid
+    waxi1 := !waxi0
+  } else {
+    // Flushes interleave banks for whole line writes.
+    // Change when selected bank not active and other is active.
+    // Change on last transaction in a line write.
+    val wsel = RegInit(false.B)
+
+    when (wsel) {
+      when (bank0.io.axi.write.addr.valid && !bank1.io.axi.write.addr.valid) {
+        wsel := false.B
+      } .elsewhen (bank1.io.axi.write.addr.valid && bank1.io.axi.write.addr.ready && bank1.io.axi.write.addr.bits.id === ~0.U((p.axi1IdBits - 1).W)) {
+        wsel := false.B
+      }
+    } .otherwise {
+      when (bank1.io.axi.write.addr.valid && !bank0.io.axi.write.addr.valid) {
+        wsel := true.B
+      } .elsewhen (bank0.io.axi.write.addr.valid && bank0.io.axi.write.addr.ready && bank0.io.axi.write.addr.bits.id === ~0.U((p.axi1IdBits - 1).W)) {
+        wsel := true.B
+      }
+    }
+
+    waxi0 := wsel === false.B
+    waxi1 := wsel === true.B
+  }
+
+  io.axi.write.addr.valid := bank0.io.axi.write.addr.valid && waxi0 ||
+                             bank1.io.axi.write.addr.valid && waxi1
+  io.axi.write.addr.bits.addr := Mux(waxi0, BankOutAddress(bank0.io.axi.write.addr.bits.addr, 0),
+                                            BankOutAddress(bank1.io.axi.write.addr.bits.addr, 1))
+  io.axi.write.addr.bits.id := Mux(waxi0, Cat(0.U(1.W), bank0.io.axi.write.addr.bits.id),
+                                          Cat(1.U(1.W), bank1.io.axi.write.addr.bits.id))
+
+  io.axi.write.data.valid := bank0.io.axi.write.data.valid && waxi0 ||
+                             bank1.io.axi.write.data.valid && waxi1
+  io.axi.write.data.bits := Mux(waxi0, bank0.io.axi.write.data.bits, bank1.io.axi.write.data.bits)
+
+  bank0.io.axi.write.addr.ready := io.axi.write.addr.ready && waxi0
+  bank1.io.axi.write.addr.ready := io.axi.write.addr.ready && waxi1
+  bank0.io.axi.write.data.ready := io.axi.write.data.ready && waxi0
+  bank1.io.axi.write.data.ready := io.axi.write.data.ready && waxi1
+
+  bank0.io.axi.write.resp.valid := io.axi.write.resp.valid && wresp0
+  bank0.io.axi.write.resp.bits  := io.axi.write.resp.bits
+
+  bank1.io.axi.write.resp.valid := io.axi.write.resp.valid && wresp1
+  bank1.io.axi.write.resp.bits  := io.axi.write.resp.bits
+
+  io.axi.write.resp.ready := bank0.io.axi.write.resp.ready && wresp0 ||
+                             bank1.io.axi.write.resp.ready && wresp1
+
+  assert(!(io.axi.write.addr.valid && !io.axi.write.data.valid))
+  assert(!(io.axi.write.addr.valid && (io.axi.write.addr.ready =/= io.axi.write.data.ready)))
+
+  // ---------------------------------------------------------------------------
+  // Flush controls.
+  // bank0.io.flush.valid := io.flush.valid && bank1.io.flush.ready
+  // bank1.io.flush.valid := io.flush.valid && bank0.io.flush.ready
+  bank0.io.flush.valid := io.flush.valid
+  bank0.io.flush.all   := io.flush.all
+  bank0.io.flush.clean := io.flush.clean
+
+  bank1.io.flush.valid := io.flush.valid
+  bank1.io.flush.all   := io.flush.all
+  bank1.io.flush.clean := io.flush.clean
+
+  io.flush.ready := bank0.io.flush.ready && bank1.io.flush.ready
+}
+
+class L1DCacheBank(p: Parameters) extends Module {
+  // A relatively simple cache block. Only one transaction may be posted at a time.
+  // 2^8 * 256  / 8 = 8KiB    4-way  Tag[31,12] + Index[11,6] + Data[5,0]
+  val slots = p.l1dslots
+  val slotBits = log2Ceil(slots)
+  val assoc = 4  // 2, 4, 8, 16, slots
+  val sets = slots / assoc
+  val setLsb = log2Ceil(p.lsuDataBits / 8)
+  val setMsb = log2Ceil(sets) + setLsb - 1
+  val tagLsb = setMsb + 1
+  val tagMsb = 30
+
+  val io = IO(new Bundle {
+    val dbus = Flipped(new DBusIO(p, true))
+    val axi = new AxiMasterIO(p.axi1AddrBits - 1, p.axi1DataBits, p.axi1IdBits - 1)
+    val flush = Flipped(new DFlushIO(p))
+  })
+
+  // AXI memory consistency, maintain per-byte strobes.
+  val bytes = p.lsuDataBits / 8
+
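+  // Each byte is stored in the SRAM as 9 bits: {strobe, data[7:0]}. Mem8to9
+  // packs data and write-mask into that format, Mem9to8 recovers the data
+  // bytes, and Mem9to1 recovers the per-byte strobes for AXI writeback.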
+  def Mem8to9(d: UInt, m: UInt): UInt = {
+    assert(d.getWidth == p.lsuDataBits)
+    assert(m.getWidth == p.lsuDataBits / 8)
+    val data = Wire(Vec(bytes, UInt(9.W)))
+    for (i <- 0 until bytes) {
+      data(i) := Cat(m(i), d(7 + i * 8, 0 + i * 8))
+    }
+    data.asUInt
+  }
+
+  def Mem9to8(d: UInt): UInt = {
+    assert(d.getWidth == p.lsuDataBits * 9 / 8)
+    val data = Wire(Vec(bytes, UInt(8.W)))
+    for (i <- 0 until bytes) {
+      data(i) := d(7 + i * 9, 0 + i * 9)
+    }
+    data.asUInt
+  }
+
+  def Mem9to1(d: UInt): UInt = {
+    assert(d.getWidth == p.lsuDataBits * 9 / 8)
+    val data = Wire(Vec(bytes, UInt(1.W)))
+    for (i <- 0 until bytes) {
+      data(i) := Cat(d(8 + i * 9))
+    }
+    data.asUInt
+  }
+
+  val checkBit = if (p.lsuDataBits == 128) 4
+                 else if (p.lsuDataBits == 256) 5 else 6
+  assert(assoc == 2 ||  assoc == 4 || assoc == 8 || assoc == 16 || assoc == slots)
+  assert(assoc != 2 ||  setLsb == checkBit && setMsb == (checkBit + 6) && tagLsb == (checkBit + 7))
+  assert(assoc != 4 ||  setLsb == checkBit && setMsb == (checkBit + 5) && tagLsb == (checkBit + 6))
+  assert(assoc != 8 ||  setLsb == checkBit && setMsb == (checkBit + 4) && tagLsb == (checkBit + 5))
+  assert(assoc != 16 || setLsb == checkBit && setMsb == (checkBit + 3) && tagLsb == (checkBit + 4))
+  assert(assoc != slots || tagLsb == checkBit)
+
+  class Sram_1rwm_256x288 extends BlackBox {
+    val io = IO(new Bundle {
+      val clock = Input(Clock())
+      val valid = Input(Bool())
+      val write = Input(Bool())
+      val addr  = Input(UInt(slotBits.W))
+      val wdata = Input(UInt((p.axi1DataBits * 9 / 8).W))
+      val wmask = Input(UInt((p.axi1DataBits * 1 / 8).W))
+      val rdata = Output(UInt((p.axi1DataBits * 9 / 8).W))
+    })
+  }
+
+  // Check io.dbus.wmask is in range of addr and size.
+  val busbytes = p.lsuDataBits / 8
+  val linemsb = log2Ceil(busbytes)
+  val chkmask0 = (~0.U(busbytes.W)) >> (busbytes.U - io.dbus.size)
+  val chkmask1 = Cat(chkmask0, chkmask0) << io.dbus.addr(linemsb - 1, 0)
+  val chkmask = chkmask1(2 * busbytes - 1, busbytes)
+  assert(!(io.dbus.valid && io.dbus.write) || (io.dbus.wmask & ~chkmask) === 0.U)
+
+  // ---------------------------------------------------------------------------
+  // CAM state.
+  val valid = RegInit(VecInit(Seq.fill(slots)(false.B)))
+  val dirty = RegInit(VecInit(Seq.fill(slots)(false.B)))
+  val camaddr = Reg(Vec(slots, UInt(32.W)))
+  // val mem = Mem1RWM(slots, p.lsuDataBits * 9 / 8, 9)
+  val mem = Module(new Sram_1rwm_256x288())
+
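+  // Pseudo-LRU replacement: each way in a set holds an age counter, and the
+  // way whose counter is zero is the victim. On an access the hit way is
+  // promoted to assoc-1 and counters above its previous value are decremented.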
+  val history = Reg(Vec(slots / assoc, Vec(assoc, UInt(log2Ceil(assoc).W))))
+
+  val matchSet = Wire(Vec(slots, Bool()))
+  val matchAddr = Wire(Vec(assoc, Bool()))
+
+  val matchSlotB = Wire(Vec(slots, Bool()))
+  val matchSlot = matchSlotB.asUInt
+  val replaceSlotB = Wire(Vec(slots, Bool()))
+  val replaceSlot = replaceSlotB.asUInt
+
+  // OR mux lookup of associative entries.
+  def camaddrRead(i: Int, value: UInt = 0.U(32.W)): UInt = {
+    if (i < slots) {
+      camaddrRead(i + assoc, value | MuxOR(matchSet(i), camaddr(i)))
+    } else {
+      value
+    }
+  }
+
+  for (i <- 0 until assoc) {
+    val ca = camaddrRead(i)
+    matchAddr(i) := io.dbus.addr(tagMsb, tagLsb) === ca(tagMsb, tagLsb)
+  }
+
+  for (i <- 0 until slots) {
+    val set = i / assoc
+    val setMatch = if (assoc == slots) true.B else io.dbus.addr(setMsb, setLsb) === set.U
+    matchSet(i) := setMatch
+  }
+
+  for (i <- 0 until slots) {
+    val set = i / assoc
+    val index = i % assoc
+
+    matchSlotB(i) := valid(i) && matchSet(i) && matchAddr(index)
+
+    val historyMatch = history(set)(index) === 0.U
+    replaceSlotB(i) := matchSet(i) && historyMatch
+    assert((i - set * assoc) == index)
+  }
+
+  assert(PopCount(matchSlot) <= 1.U)
+  assert(PopCount(replaceSlot) <= 1.U)
+
+  val found = matchSlot =/= 0.U
+
+  val replaceNum = Wire(Vec(slots, UInt(slotBits.W)))
+  for (i <- 0 until slots) {
+    replaceNum(i) := MuxOR(replaceSlot(i), i.U)
+  }
+
+  val replaceId = VecOR(replaceNum, slots)
+  assert(replaceId.getWidth == slotBits)
+
+  val readNum = Wire(Vec(slots, UInt(slotBits.W)))
+  for (i <- 0 until slots) {
+    readNum(i) := MuxOR(matchSlotB(i), i.U)
+  }
+  val foundId = VecOR(readNum, slots)
+
+  for (i <- 0 until slots / assoc) {
+    // Get the matched value from the OneHot encoding of the set.
+    val matchSet = matchSlot((i + 1) * assoc - 1, i * assoc)
+    assert(PopCount(matchSet) <= 1.U)
+    val matchIndices = Wire(Vec(assoc, UInt(log2Ceil(assoc).W)))
+    for (j <- 0 until assoc) {
+      matchIndices(j) := MuxOR(matchSet(j), j.U)
+    }
+    val matchIndex = VecOR(matchIndices, assoc)
+    assert(matchIndex.getWidth == log2Ceil(assoc))
+    val matchValue = history(i)(matchIndex)
+
+    // History is based on count values so that larger set sizes use fewer DFFs.
+    when (io.dbus.valid && io.dbus.ready && (if (assoc == slots) true.B else io.dbus.addr(setMsb, setLsb) === i.U)) {
+      for (j <- 0 until assoc) {
+        when (matchSet(j)) {
+          history(i)(j) := (assoc - 1).U
+        } .elsewhen (history(i)(j) > matchValue) {
+          history(i)(j) := history(i)(j) - 1.U
+          assert(history(i)(j) > 0.U)
+        }
+      }
+    }
+  }
+
+  // Reset history to unique values within sets.
+  // Must be placed below all other assignments.
+  // Note the definition is Reg(), so it will generate an asynchronous reset.
+  when (reset.asBool) {
+    for (i <- 0 until slots / assoc) {
+      for (j <- 0 until assoc) {
+        history(i)(j) := j.U
+      }
+    }
+  }
+
+  // These checks are extremely slow to compile.
+  if (false) {
+    for (i <- 0 until slots / assoc) {
+      for (j <- 0 until assoc) {
+        for (k <- 0 until assoc) {
+          if (j != k) {
+            assert(history(i)(j) =/= history(i)(k))
+          }
+        }
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Flush interface.
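+  // Writeback sequencing: sCapture snapshots the dirty lines to flush,
+  // sProcess selects the next one (or sAxiresp when none remain),
+  // sMemwaddr/sMemwdata read the line from the SRAM, sAxiready waits for the
+  // AXI write channel to accept it, and sAxiresp/sEnd drain outstanding write
+  // responses before completing the flush handshake.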
+  object FlushState extends ChiselEnum {
+    val sNone, sCapture, sProcess, sMemwaddr, sMemwdata, sAxiready, sAxiresp, sEnd = Value
+  }
+
+  val fstate = RegInit(FlushState.sNone)
+  val flush = RegInit(VecInit(Seq.fill(slots)(false.B)))
+
+  // ---------------------------------------------------------------------------
+  // AXI interface.
+  val ractive = RegInit(false.B)
+  val wactive = RegInit(false.B)
+  val active = ractive || wactive
+
+  assert(!(ractive && fstate =/= FlushState.sNone))
+
+  val axiraddrvalid = RegInit(false.B)
+  val axirdataready = RegInit(false.B)
+
+  val memwaddrEn = RegInit(false.B)
+  val memwdataEn = RegInit(false.B)
+  val axiwaddrvalid = RegInit(false.B)
+  val axiwdatavalid = RegInit(false.B)
+  val axiwdatabuf = Reg(UInt(p.axi1DataBits.W))
+  val axiwstrbbuf = Reg(UInt((p.axi1DataBits / 8).W))
+
+  val axiraddr = Reg(UInt(32.W))
+  val axiwaddr = Reg(UInt(32.W))
+
+  val replaceIdReg = Reg(UInt(slotBits.W))
+
+  val alignedAddr = Cat(io.dbus.addr(tagMsb, setLsb), 0.U(setLsb.W))
+
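+  // Miss handling: allocate the victim slot, issue the AXI read for the new
+  // line and, if the victim was dirty, schedule its writeback to the old
+  // address.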
+  when (io.dbus.valid && !io.dbus.ready && !active) {
+    ractive := true.B
+    wactive := dirty(replaceId)
+    assert(!(dirty(replaceId) && !valid(replaceId)))
+    axiraddrvalid := true.B
+    axirdataready := true.B
+    valid(replaceId) := false.B
+    dirty(replaceId) := false.B
+    replaceIdReg := replaceId
+    camaddr(replaceId) := alignedAddr
+    axiraddr := alignedAddr
+    axiwaddr := camaddr(replaceId)
+  }
+
+  // Writeback pulsed controls to memory.
+  memwaddrEn := io.dbus.valid && !io.dbus.ready && !active && dirty(replaceId)
+  memwdataEn := memwaddrEn
+
+  when (io.dbus.valid && io.dbus.ready && io.dbus.write) {
+    dirty(foundId) := true.B
+  }
+
+  when (io.axi.read.addr.valid && io.axi.read.addr.ready) {
+    axiraddrvalid := false.B
+  }
+
+  when (io.axi.read.data.valid && io.axi.read.data.ready) {
+    valid(replaceIdReg) := true.B
+    axirdataready := false.B
+    ractive := false.B
+  }
+
+  when (memwdataEn) {
+    val rdata = mem.io.rdata
+    axiwdatabuf := Mem9to8(rdata)
+    axiwstrbbuf := Mem9to1(rdata)
+    axiwaddrvalid := true.B
+    axiwdatavalid := true.B
+  }
+
+  when (io.axi.write.addr.valid && io.axi.write.addr.ready) {
+    axiwaddrvalid := false.B
+  }
+
+  when (io.axi.write.data.valid && io.axi.write.data.ready) {
+    axiwdatavalid := false.B
+  }
+
+  when (io.axi.write.resp.valid && io.axi.write.resp.ready) {
+    wactive := false.B
+  }
+
+  io.axi.read.addr.valid := axiraddrvalid
+  io.axi.read.addr.bits.addr := axiraddr
+  io.axi.read.addr.bits.id := 0.U
+  io.axi.read.data.ready := axirdataready
+  assert(!(io.axi.read.data.valid && !io.axi.read.data.ready))
+
+  io.axi.write.addr.valid     := axiwaddrvalid
+  io.axi.write.addr.bits.id   := 0.U
+  io.axi.write.addr.bits.addr := axiwaddr
+
+  io.axi.write.resp.ready     := true.B
+
+  io.axi.write.data.valid     := axiwdatavalid
+  io.axi.write.data.bits.data := axiwdatabuf.asUInt
+  io.axi.write.data.bits.strb := axiwstrbbuf.asUInt
+
+  assert(!(io.axi.read.addr.valid && !ractive))
+  assert(!(io.axi.read.data.ready && !ractive))
+  assert(!(io.axi.write.addr.valid && !wactive && fstate === FlushState.sNone))
+
+  // ---------------------------------------------------------------------------
+  // AXI write response count.
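+  // Counts AXI writes that have been issued but not yet acknowledged so the
+  // flush state machine can wait for them to drain.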
+  val wrespcnt = RegInit(0.U((slotBits + 1).W))
+  val wrespinc = io.axi.write.addr.valid && io.axi.write.addr.ready
+  val wrespdec = io.axi.write.resp.valid && io.axi.write.resp.ready
+
+  when (wrespinc && !wrespdec) {
+    wrespcnt := wrespcnt + 1.U
+  } .elsewhen (!wrespinc && wrespdec) {
+    wrespcnt := wrespcnt - 1.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Flush interface.
+  val flushId = Ctz(flush.asUInt)(slotBits - 1, 0)
+
+  for (i <- 0 until slots) {
+    assert(!(flush(i) && !dirty(i)))
+  }
+
+  switch(fstate) {
+    is (FlushState.sNone) {
+      when (io.flush.valid && !axiwaddrvalid && !axiwdatavalid && !axiraddrvalid && !axirdataready) {
+        fstate := FlushState.sCapture
+        replaceIdReg := foundId
+      }
+    }
+
+    is (FlushState.sCapture) {
+      fstate := FlushState.sProcess
+      flush(replaceIdReg) := dirty(replaceIdReg)  // matched (without .all)
+      when (io.flush.all) {
+        for (i <- 0 until slots) {
+          flush(i) := dirty(i)
+        }
+      }
+    }
+
+    is (FlushState.sProcess) {
+      when (flush.asUInt === 0.U) {
+        fstate := FlushState.sAxiresp
+      } .otherwise {
+        fstate := FlushState.sMemwaddr
+        memwaddrEn := true.B
+      }
+      replaceIdReg := flushId
+    }
+
+    is (FlushState.sMemwaddr) {
+      assert(memwaddrEn)
+      fstate := FlushState.sMemwdata
+      axiwaddr := camaddr(replaceIdReg)
+      flush(replaceIdReg) := false.B
+      dirty(replaceIdReg) := false.B
+      when (io.flush.clean) {
+        valid(replaceIdReg) := false.B
+      }
+    }
+
+    is (FlushState.sMemwdata) {
+      assert(memwdataEn)
+      fstate := FlushState.sAxiready
+    }
+
+    is (FlushState.sAxiready) {
+      when ((!axiwaddrvalid || io.axi.write.addr.valid && io.axi.write.addr.ready) &&
+            (!axiwdatavalid || io.axi.write.data.valid && io.axi.write.data.ready)) {
+        fstate := FlushState.sProcess
+      }
+    }
+
+    is (FlushState.sAxiresp) {
+      when (wrespcnt === 0.U) {
+        fstate := FlushState.sEnd
+      }
+    }
+
+    is (FlushState.sEnd) {
+      // Must complete the handshake as there are multiple banks.
+      when (io.flush.ready && !io.flush.valid) {
+        fstate := FlushState.sNone
+      }
+      when (io.flush.clean) {
+        when (io.flush.all) {
+          for (i <- 0 until slots) {
+            valid(i) := false.B
+            assert(!dirty(i))
+            assert(!flush(i))
+          }
+        }
+      }
+    }
+  }
+
+  io.flush.ready := fstate === FlushState.sEnd
+
+  assert(!(io.flush.valid && io.dbus.valid))
+
+  // ---------------------------------------------------------------------------
+  // Core Data Bus.
+  io.dbus.ready := found && !ractive
+  io.dbus.rdata := Mem9to8(mem.io.rdata)
+  assert(!(io.dbus.valid && io.dbus.size === 0.U))
+
+  // ---------------------------------------------------------------------------
+  // Memory controls.
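+  // The data SRAM is single-ported: dbus accesses and AXI refill writes are
+  // mutually exclusive (see the assert below), and writeback reads reuse the
+  // port while the dbus is stalled on the miss being serviced.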
+  val axiwrite  = memwaddrEn
+  val axiread = io.axi.read.data.valid && io.axi.read.data.ready
+  val buswrite = io.dbus.valid && io.dbus.ready && io.dbus.write
+  val busread  = io.dbus.valid && !io.dbus.write && !ractive
+
+  val wdbits = p.axi1DataBits
+  val wmbits = p.axi1DataBits / 8
+  val id = io.axi.read.data.bits.id
+  val rsel = axirdataready
+  mem.io.clock := clock
+  mem.io.valid := busread || buswrite || axiread || axiwrite
+  mem.io.write := rsel && !axiwrite || io.dbus.valid && io.dbus.write && !ractive
+  mem.io.addr  := Mux(rsel || axiwrite, replaceIdReg, foundId)
+  mem.io.wmask := Mux(rsel, ~0.U(wmbits.W), io.dbus.wmask)
+  mem.io.wdata := Mux(rsel, Mem8to9(io.axi.read.data.bits.data, 0.U(wmbits.W)),
+                            Mem8to9(io.dbus.wdata, ~0.U(wmbits.W)))
+
+  assert(PopCount(busread +& buswrite +& axiread) <= 1.U)
+}
+
+object EmitL1DCache extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new L1DCache(p), args)
+}
+
+object EmitL1DCacheBank extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new L1DCacheBank(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/L1ICache.scala b/hdl/chisel/src/kelvin/L1ICache.scala
new file mode 100644
index 0000000..cc20969
--- /dev/null
+++ b/hdl/chisel/src/kelvin/L1ICache.scala
@@ -0,0 +1,256 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object L1ICache {
+  def apply(p: Parameters): L1ICache = {
+    return Module(new L1ICache(p))
+  }
+}
+
+class L1ICache(p: Parameters) extends Module {
+  // A relatively simple cache block. Only one transaction may be posted at a time.
+  // 2^8 * 256  / 8 = 8KiB    4-way  Tag[31,12] + Index[11,6] + Data[5,0]
+  assert(p.axi0IdBits == 4)
+  assert(p.axi0DataBits == 256)
+
+  val slots = p.l1islots
+  val slotBits = log2Ceil(slots)
+  val assoc = 4  // 2, 4, 8, 16, slots
+  val sets = slots / assoc
+  val setLsb = log2Ceil(p.fetchDataBits / 8)
+  val setMsb = log2Ceil(sets) + setLsb - 1
+  val tagLsb = setMsb + 1
+  val tagMsb = 31
+
+  val io = IO(new Bundle {
+    val ibus = Flipped(new IBusIO(p))
+    val flush = Flipped(new IFlushIO(p))
+    val axi = new Bundle {
+      val read = new AxiMasterReadIO(p.axi0AddrBits, p.axi0DataBits, p.axi0IdBits)
+    }
+  })
+
+  assert(assoc == 2 ||  assoc == 4 || assoc == 8 || assoc == 16 || assoc == slots)
+  assert(assoc != 2 ||  setLsb == 5 && setMsb == 11 && tagLsb == 12)
+  assert(assoc != 4 ||  setLsb == 5 && setMsb == 10 && tagLsb == 11)
+  assert(assoc != 8 ||  setLsb == 5 && setMsb == 9  && tagLsb == 10)
+  assert(assoc != 16 || setLsb == 5 && setMsb == 8  && tagLsb == 9)
+  assert(assoc != slots || tagLsb == 5)
+
+  class Sram_1rw_256x256 extends BlackBox {
+    val io = IO(new Bundle {
+      val clock = Input(Clock())
+      val valid = Input(Bool())
+      val write = Input(Bool())
+      val addr  = Input(UInt(slotBits.W))
+      val wdata = Input(UInt(p.axi0DataBits.W))
+      val rdata = Output(UInt(p.axi0DataBits.W))
+    })
+  }
+
+  // ---------------------------------------------------------------------------
+  // CAM state.
+  val valid = RegInit(VecInit(Seq.fill(slots)(false.B)))
+  val camaddr = Reg(Vec(slots, UInt(32.W)))
+  // val mem = Mem1RW(slots, UInt(p.axi0DataBits.W))
+  val mem = Module(new Sram_1rw_256x256())
+
+  val history = Reg(Vec(slots / assoc, Vec(assoc, UInt(log2Ceil(assoc).W))))
+
+  val matchSet = Wire(Vec(slots, Bool()))
+  val matchAddr = Wire(Vec(assoc, Bool()))
+
+  val matchSlotB = Wire(Vec(slots, Bool()))
+  val matchSlot = matchSlotB.asUInt
+  val replaceSlotB = Wire(Vec(slots, Bool()))
+  val replaceSlot = replaceSlotB.asUInt
+
+  // OR mux lookup of associative entries.
+  def camaddrRead(i: Int, value: UInt = 0.U(32.W)): UInt = {
+    if (i < slots) {
+      camaddrRead(i + assoc, value | MuxOR(matchSet(i), camaddr(i)))
+    } else {
+      value
+    }
+  }
+
+  for (i <- 0 until assoc) {
+    val ca = camaddrRead(i)
+    matchAddr(i) := io.ibus.addr(tagMsb, tagLsb) === ca(tagMsb, tagLsb)
+  }
+
+  for (i <- 0 until slots) {
+    val set = i / assoc
+    val setMatch = if (assoc == slots) true.B else io.ibus.addr(setMsb, setLsb) === set.U
+    matchSet(i) := setMatch
+  }
+
+  for (i <- 0 until slots) {
+    val set = i / assoc
+    val index = i % assoc
+
+    matchSlotB(i) := valid(i) && matchSet(i) && matchAddr(index)
+
+    val historyMatch = history(set)(index) === 0.U
+    replaceSlotB(i) := matchSet(i) && historyMatch
+    assert((i - set * assoc) == index)
+  }
+
+  assert(PopCount(matchSlot) <= 1.U)
+  assert(PopCount(replaceSlot) <= 1.U)
+
+  val found = io.ibus.valid && matchSlot =/= 0.U
+
+  val replaceNum = Wire(Vec(slots, UInt(slotBits.W)))
+  for (i <- 0 until slots) {
+    replaceNum(i) := MuxOR(replaceSlot(i), i.U)
+  }
+
+  val replaceId = VecOR(replaceNum, slots)
+  assert(replaceId.getWidth == slotBits)
+
+  val readNum = Wire(Vec(slots, UInt(slotBits.W)))
+  for (i <- 0 until slots) {
+    readNum(i) := MuxOR(matchSlotB(i), i.U)
+  }
+  val readId = VecOR(readNum, slots)
+
+  for (i <- 0 until slots / assoc) {
+    // Get the matched value from the OneHot encoding of the set.
+    val matchSet = matchSlot((i + 1) * assoc - 1, i * assoc)
+    assert(PopCount(matchSet) <= 1.U)
+    val matchIndices = Wire(Vec(assoc, UInt(log2Ceil(assoc).W)))
+    for (j <- 0 until assoc) {
+      matchIndices(j) := MuxOR(matchSet(j), j.U)
+    }
+    val matchIndex = VecOR(matchIndices, assoc)
+    assert(matchIndex.getWidth == log2Ceil(assoc))
+    val matchValue = history(i)(matchIndex)
+
+    // History is based on count values so that larger set sizes use fewer DFFs.
+    when (io.ibus.valid && io.ibus.ready && (if (assoc == slots) true.B else io.ibus.addr(setMsb, setLsb) === i.U)) {
+      for (j <- 0 until assoc) {
+        when (matchSet(j)) {
+          history(i)(j) := (assoc - 1).U
+        } .elsewhen (history(i)(j) > matchValue) {
+          history(i)(j) := history(i)(j) - 1.U
+          assert(history(i)(j) > 0.U)
+        }
+      }
+    }
+  }
+
+  // Reset history to unique values within sets.
+  // Must be placed below all other assignments.
+  // Note the definition is Reg(), so it will generate an asynchronous reset.
+  when (reset.asBool) {
+    for (i <- 0 until slots / assoc) {
+      for (j <- 0 until assoc) {
+        history(i)(j) := j.U
+      }
+    }
+  }
+
+  // These checks are extremely slow to compile.
+  if (false) {
+    for (i <- 0 until slots / assoc) {
+      for (j <- 0 until assoc) {
+        for (k <- 0 until assoc) {
+          if (j != k) {
+            assert(history(i)(j) =/= history(i)(k))
+          }
+        }
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Core Instruction Bus.
+  io.ibus.ready := found
+
+  io.ibus.rdata := mem.io.rdata
+
+  // ---------------------------------------------------------------------------
+  // AXI interface.
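+  // Single outstanding refill: axivalid drives the read address channel and
+  // axiready accepts the returning data, which is written into the slot
+  // selected for replacement (replaceIdReg).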
+  val axivalid = RegInit(false.B)  // io.axi.read.addr.valid
+  val axiready = RegInit(false.B)  // io.axi.read.data.ready
+  val axiaddr = Reg(UInt(32.W))
+
+  val replaceIdReg = Reg(UInt(slotBits.W))
+
+  when (io.ibus.valid && !io.ibus.ready && !axivalid && !axiready) {
+    replaceIdReg := replaceId
+  }
+
+  when (io.axi.read.addr.valid && io.axi.read.addr.ready) {
+    axivalid := false.B
+  } .elsewhen (io.ibus.valid && !io.ibus.ready && !axivalid && !axiready) {
+    axivalid := true.B
+  }
+
+  when (io.axi.read.data.valid && io.axi.read.data.ready) {
+    axiready := false.B
+  } .elsewhen (io.axi.read.addr.valid && io.axi.read.addr.ready && !axiready) {
+    axiready := true.B
+  }
+
+  when (io.flush.valid) {
+    for (i <- 0 until slots) {
+      valid(i) := false.B
+    }
+  } .elsewhen (io.ibus.valid && !io.ibus.ready && !axivalid && !axiready) {
+    valid(replaceId) := false.B
+  } .elsewhen (io.axi.read.data.valid && io.axi.read.data.ready) {
+    valid(replaceIdReg) := true.B
+  }
+
+  when (io.ibus.valid && !io.ibus.ready && !axivalid && !axiready) {
+    val alignedAddr = Cat(io.ibus.addr(31, setLsb), 0.U(setLsb.W))
+    axiaddr := alignedAddr
+    camaddr(replaceId) := alignedAddr
+  } .elsewhen (io.axi.read.addr.valid && io.axi.read.addr.ready) {
+    axiaddr := axiaddr + (p.axi0DataBits / 8).U
+  }
+
+  io.axi.read.defaults()
+  io.axi.read.addr.valid := axivalid
+  io.axi.read.addr.bits.addr := axiaddr
+  io.axi.read.addr.bits.id := 0.U
+  io.axi.read.data.ready := axiready
+
+  io.flush.ready := true.B
+
+  // IBus transaction must be latched until completion.
+  val addrLatchActive = RegInit(false.B)
+  val addrLatchData = Reg(UInt(32.W))
+
+  when (io.flush.valid) {
+    addrLatchActive := false.B
+  } .elsewhen (io.ibus.valid && !io.ibus.ready && !addrLatchActive) {
+    addrLatchActive := true.B
+    addrLatchData := io.ibus.addr
+  } .elsewhen (addrLatchActive && io.ibus.ready) {
+    addrLatchActive := false.B
+  }
+
+  assert(!(addrLatchActive && !io.ibus.valid))
+  assert(!(addrLatchActive && addrLatchData =/= io.ibus.addr))
+
+  // ---------------------------------------------------------------------------
+  // Memory controls.
+  val memwrite = io.axi.read.data.valid && io.axi.read.data.ready
+  val memread = io.ibus.valid && !axivalid && !axiready
+  mem.io.clock := clock
+  mem.io.valid := memread || memwrite
+  mem.io.write := axiready
+  mem.io.addr  := Mux(axiready, replaceIdReg, readId)
+  mem.io.wdata := io.axi.read.data.bits.data
+}
+
+object EmitL1ICache extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new L1ICache(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/Library.scala b/hdl/chisel/src/kelvin/Library.scala
new file mode 100644
index 0000000..f0e04d5
--- /dev/null
+++ b/hdl/chisel/src/kelvin/Library.scala
@@ -0,0 +1,331 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+object Mux0 {
+  def apply(valid: Bool, data: UInt): UInt = {
+    Mux(valid, data, 0.U(data.getWidth.W))
+  }
+
+  def apply(valid: Bool, data: Bool): Bool = {
+    Mux(valid, data, false.B)
+  }
+}
+
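+// Gates `data` to zero when `valid` is low. Typically used to build OR-mux
+// trees where at most one select is active, e.g. (illustrative):
+//   out := MuxOR(sel0, a) | MuxOR(sel1, b) | MuxOR(sel2, c)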
+object MuxOR {
+  def apply(valid: Bool, data: UInt): UInt = {
+    Mux(valid, data, 0.U(data.getWidth.W))
+  }
+
+  def apply(valid: Bool, data: Bool): Bool = {
+    Mux(valid, data, false.B)
+  }
+}
+
+object Min {
+  def apply(a: UInt, b: UInt): UInt = {
+    assert(a.getWidth == b.getWidth)
+    Mux(a < b, a, b)
+  }
+}
+
+object Max {
+  def apply(a: UInt, b: UInt): UInt = {
+    assert(a.getWidth == b.getWidth)
+    Mux(a > b, a, b)
+  }
+}
+
+object Repeat {
+  def apply(b: Bool, n: Int): UInt = {
+    val r = VecInit(Seq.fill(n)(b))
+    r.asUInt
+  }
+}
+
+object SignExt {
+  def apply(v: UInt, n: Int): UInt = {
+    val s = v.getWidth
+    val r = Cat(Repeat(v(s - 1), n - s), v)
+    assert(r.getWidth == n)
+    r.asUInt
+  }
+}
+
+// ORs vector lanes.
+//  Vec(4, UInt(7.W)) -> UInt(7.W)
+//  for (i <- 0 until count) out |= in(i)
+object VecOR {
+  def apply(vec: Vec[UInt], count: Int, index: Int, bits: UInt): UInt = {
+    if (index < count) {
+      apply(vec, count, index+1, bits | vec(index))
+    } else {
+      bits
+    }
+  }
+
+  def apply(vec: Vec[Bool], count: Int, index: Int, bits: Bool): Bool = {
+    if (index < count) {
+      apply(vec, count, index+1, bits || vec(index))
+    } else {
+      bits
+    }
+  }
+
+  def apply(vec: Vec[UInt], count: Int): UInt = {
+    apply(vec, count, 0, 0.U)
+  }
+
+  def apply(vec: Vec[Bool], count: Int): Bool = {
+    apply(vec, count, 0, false.B)
+  }
+
+  def apply(vec: Vec[UInt]): UInt = {
+    val count = vec.length
+    apply(vec, count, 0, 0.U)
+  }
+
+  def apply(vec: Vec[Bool]): Bool = {
+    val count = vec.length
+    apply(vec, count, 0, false.B)
+  }
+}
+
+object IndexMask {
+  def apply(data: Vec[UInt], index: UInt): Vec[UInt] = {
+    val count = data.length
+    val width = data(0).getWidth.W
+    val value = Wire(Vec(count, UInt(width)))
+    for (i <- 0 until count) {
+      value(i) := Mux(i.U === index, data(i), 0.U)
+    }
+    value
+  }
+}
+
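+// OR-reduces a Vec to a single element by combining adjacent pairs on each
+// recursion, forming a log2-depth reduction tree.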
+object OrReduce {
+  def apply(data: Vec[UInt]): UInt = {
+    if (data.length > 1) {
+      val count = data.length / 2
+      val odd   = data.length & 1
+      val width = data(0).getWidth.W
+      val value = Wire(Vec(count + odd, UInt(width)))
+      for (i <- 0 until count) {
+        value(i) := data(2 * i + 0) | data(2 * i + 1)
+      }
+      if (odd != 0) {
+        value(count) := data(2 * count)
+      }
+      OrReduce(value)
+    } else {
+      data(0)
+    }
+  }
+}
+
+object VecAt {
+  def apply(data: Vec[Bool], index: UInt): Bool = {
+    assert(data.length == (1 << index.getWidth))
+    val count = data.length
+    val dataUInt = Wire(Vec(count, UInt(1.W)))
+    for (i <- 0 until count) {
+      dataUInt(i) := data(i)
+    }
+    OrReduce(IndexMask(dataUInt, index)) =/= 0.U
+  }
+
+  def apply(data: Vec[UInt], index: UInt): UInt = {
+    assert(data.length == (1 << index.getWidth))
+    OrReduce(IndexMask(data, index))
+  }
+}
+
+object BoolAt {
+  def apply(udata: UInt, index: UInt): Bool = {
+    assert(udata.getWidth == (1 << index.getWidth))
+    val width = udata.getWidth
+    val data = Wire(Vec(width, UInt(1.W)))
+    for (i <- 0 until width) {
+      data(i) := udata(i)
+    }
+    OrReduce(IndexMask(data, index)) =/= 0.U
+  }
+}
+
+object WiredAND {
+  def apply(bits: UInt): Bool = {
+    WiredAND(VecInit(bits.asBools))
+  }
+
+  def apply(bits: Vec[Bool]): Bool = {
+    val count = bits.length
+    if (count > 1) {
+      val limit = (count + 1) / 2
+      val value = Wire(Vec(limit, Bool()))
+      for (i <- 0 until limit) {
+        if (i * 2 + 1 >= count) {
+          value(i) := bits(2 * i + 0)
+        } else {
+          value(i) := bits(2 * i + 0) & bits(2 * i + 1)
+        }
+      }
+      WiredAND(value)
+    } else {
+      bits(0)
+    }
+  }
+}
+
+object WiredOR {
+  def apply(bits: UInt): Bool = {
+    WiredOR(VecInit(bits.asBools))
+  }
+
+  def apply(bits: Vec[Bool]): Bool = {
+    val count = bits.length
+    if (count > 1) {
+      val limit = (count + 1) / 2
+      val value = Wire(Vec(limit, Bool()))
+      for (i <- 0 until limit) {
+        if (i * 2 + 1 >= count) {
+          value(i) := bits(2 * i + 0)
+        } else {
+          value(i) := bits(2 * i + 0) | bits(2 * i + 1)
+        }
+      }
+      WiredOR(value)
+    } else {
+      bits(0)
+    }
+  }
+}
+
+object OneHot {
+  def apply(bits: UInt, count: Int): UInt = {
+    // // UIntToOH(bits, count)
+    // val bools = Wire(Vec(count, Bool()))
+    // for (i <- 0 until count) {
+    //   bools(i) := bits === i.U
+    // }
+    // val r = bools.asUInt
+    // assert(r.getWidth == count)
+    // r
+
+    UIntToOH(bits, count)
+  }
+}
+
+// Page mask for two address ranges, factoring in unaligned address overflow.
+object PageMaskShift {
+  def apply(address: UInt, length: UInt): UInt = {
+    assert(address.getWidth == 32)
+
+    // Find the power2 page size that contains the range offset+length.
+    // The address width is one less than length, as we want to use the
+    // page base of zero and length to match the page size.
+    val psel = Cat((address(9,0) +& length) <= 1024.U,
+                   (address(8,0) +& length) <= 512.U,
+                   (address(7,0) +& length) <= 256.U,
+                   (address(6,0) +& length) <= 128.U,
+                   (address(5,0) +& length) <= 64.U,
+                   (address(4,0) +& length) <= 32.U,
+                   (address(3,0) +& length) <= 16.U,
+                   (address(2,0) +& length) <= 8.U,
+                   (address(1,0) +& length) <= 4.U)
+
+    val pshift =
+        Mux(psel(0), 2.U, Mux(psel(1), 3.U, Mux(psel(2), 4.U, Mux(psel(3), 5.U,
+        Mux(psel(4), 6.U, Mux(psel(5), 7.U, Mux(psel(6), 8.U, Mux(psel(7), 9.U,
+        Mux(psel(8), 10.U, 0.U)))))))))
+
+    // Determine the longest run of lsb 1's. We OR 1's into the address lsbs so
+    // that the base+length overflow ripple extends as far as needed.
+    // Include an additional lsb 1 to round up to the next page size, as we will
+    // not perform an in-page test beyond the segmentBits size.
+    val addrmask = Cat(address(31,10), ~0.U(10.W), 1.U(1.W))
+    val cto = PriorityEncoder(~addrmask)
+    assert(cto.getWidth == 6)
+
+    // Mask shift value.
+    val shift = Mux(psel =/= 0.U, pshift, cto)
+    assert(shift.getWidth == 6)
+
+    shift
+  }
+}
+
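+// Count trailing ones (Cto) / trailing zeros (Ctz, below) via PriorityEncoder;
+// the prepended 1 bounds the result at the input width when the run covers the
+// whole input.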
+object Cto {
+  def apply(bits: UInt): UInt = {
+    PriorityEncoder(Cat(1.U(1.W), ~bits))
+  }
+}
+
+object Ctz {
+  def apply(bits: UInt): UInt = {
+    PriorityEncoder(Cat(1.U(1.W), bits))
+  }
+}
+
+// Unused
+object Clb {
+  def apply(bits: UInt): UInt = {
+    val clo = Clo(bits)
+    val clz = Clz(bits)
+    Mux(bits(bits.getWidth - 1), clo, clz)
+  }
+}
+
+// Unused
+object Clo {
+  def apply(bits: UInt): UInt = {
+    PriorityEncoder(Cat(1.U(1.W), Reverse(~bits)))
+  }
+}
+
+object Clz {
+  def apply(bits: UInt): UInt = {
+    PriorityEncoder(Cat(1.U(1.W), Reverse(bits)))
+  }
+}
+
+object WCtz {
+  def apply(bits: UInt, offset: Int = 0): UInt = {
+    assert((bits.getWidth % 32) == 0)
+    val z = Ctz(bits(31, 0))
+    val v = z | offset.U
+    assert(z.getWidth == 6)
+    if (bits.getWidth > 32) {
+      Mux(!z(5), v, WCtz(bits(bits.getWidth - 1, 32), offset + 32))
+    } else {
+      Mux(!z(5), v, (offset + 32).U)
+    }
+  }
+}
+
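+// Matches `inst` against a pattern of '0', '1' and 'x' (don't care) characters
+// consumed msb-first; '_' is a visual separator and consumes no bit. For
+// example (illustrative pattern, standard RV32I ADDI encoding assumed):
+//   val isAddi = DecodeBits(inst, "xxxxxxxxxxxx_xxxxx_000_xxxxx_0010011")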
+object DecodeBits {
+  def apply(inst: UInt, bitPattern: String, v: Bool = true.B, index: Int = 31):
+      Bool = {
+    // System.out.println(">>> String \"" + bitPattern + "\" = " + bitPattern.length + " : " + index)
+    if (bitPattern.length > 0) {
+      if (bitPattern(0) == '0') {
+        val bit = ~inst(index)
+        DecodeBits(inst, bitPattern.drop(1), v && bit, index - 1)
+      } else if (bitPattern(0) == '1') {
+        val bit = inst(index)
+        DecodeBits(inst, bitPattern.drop(1), v && bit, index - 1)
+      } else if (bitPattern(0) == 'x') {
+        // Don't care: consume the instruction bit without constraining it.
+        DecodeBits(inst, bitPattern.drop(1), v, index - 1)
+      } else if (bitPattern(0) == '_') {
+        DecodeBits(inst, bitPattern.drop(1), v, index)
+      } else {
+        assert(false)
+        v
+      }
+    } else {
+      assert(index == -1)
+      v
+    }
+  }
+}
diff --git a/hdl/chisel/src/kelvin/Parameters.scala b/hdl/chisel/src/kelvin/Parameters.scala
new file mode 100644
index 0000000..2024883
--- /dev/null
+++ b/hdl/chisel/src/kelvin/Parameters.scala
@@ -0,0 +1,72 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+case class Parameters() {
+  case object Core {
+    val tiny = 0
+    val little = 1
+    val big = 2
+  }
+
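+  // The configuration is selected with the KELVIN_SIMD environment variable
+  // (vector width in bits, default 256); any other value fails elaboration.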
+  val core = sys.env.get("KELVIN_SIMD").getOrElse("256").toInt match {
+    case 128 => Core.tiny
+    case 256 => Core.little
+    case 512 => Core.big
+  }
+
+  // Machine.
+  val programCounterBits = 32
+  val instructionBits = 32
+  val instructionLanes = 4
+
+  // Vector Length (register-file and compute).
+  val vectorBits = core match {
+    case Core.tiny   => 128
+    case Core.little => 256
+    case Core.big    => 512
+  }
+
+  val vectorCountBits = log2Ceil(vectorBits / 8) + 1 + 2  // +2 stripmine
+  assert(vectorBits == 512 || vectorBits == 256
+      || vectorBits == 128)  // 128 = faster builds, but not production(?).
+
+  // Vector queue.
+  val vectorFifoDepth = 16
+
+  // L0ICache Fetch unit.
+  // val fetchCacheBytes = 2048
+  val fetchCacheBytes = 1024
+  // val fetchCacheBytes = 128
+
+  // Scalar Core Fetch bus.
+  val fetchAddrBits = 32   // do not change
+  val fetchDataBits = 256  // do not change
+
+  // Scalar Core Load Store Unit bus.
+  val lsuAddrBits = 32  // do not change
+  val lsuDataBits = vectorBits
+
+  // [External] Core AXI interface.
+  val axiSysIdBits = 7
+  val axiSysAddrBits = 32
+  val axiSysDataBits = vectorBits
+
+  // [Internal] L1ICache interface.
+  val l1islots = 256
+  val axi0IdBits = 4  // (1x banks, 4 bits unused)
+  val axi0AddrBits = 32
+  val axi0DataBits = fetchDataBits
+
+  // [Internal] L1DCache interface.
+  val l1dslots = 256  // (x2 banks)
+  val axi1IdBits = 4  // (x2 banks, 3 bits unused)
+  val axi1AddrBits = 32
+  val axi1DataBits = vectorBits
+
+  // [Internal] TCM[Vector,Scalar] interface.
+  val axi2IdBits = 6
+  val axi2AddrBits = 32
+  val axi2DataBits = vectorBits
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Alu.scala b/hdl/chisel/src/kelvin/scalar/Alu.scala
new file mode 100644
index 0000000..2ac5855
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Alu.scala
@@ -0,0 +1,117 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Alu {
+  def apply(p: Parameters): Alu = {
+    return Module(new Alu(p))
+  }
+}
+
+case class AluOp() {
+  val ADD  = 0
+  val SUB  = 1
+  val SLT  = 2
+  val SLTU = 3
+  val XOR  = 4
+  val OR   = 5
+  val AND  = 6
+  val SLL  = 7
+  val SRL  = 8
+  val SRA  = 9
+  val LUI  = 10
+  val CLZ  = 11
+  val CTZ  = 12
+  val PCNT = 13
+  val MIN  = 14
+  val MINU = 15
+  val MAX  = 16
+  val MAXU = 17
+  val Entries = 18
+}
+
+class AluIO(p: Parameters) extends Bundle {
+  val valid = Input(Bool())
+  val addr = Input(UInt(5.W))
+  val op = Input(UInt(new AluOp().Entries.W))
+}
+
+class Alu(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val req = new AluIO(p)
+
+    // Execute cycle.
+    val rs1 = Flipped(new RegfileReadDataIO)
+    val rs2 = Flipped(new RegfileReadDataIO)
+    val rd  = Flipped(new RegfileWriteDataIO)
+  })
+
+  val alu = new AluOp()
+
+  val valid = RegInit(false.B)
+  val addr = Reg(UInt(5.W))
+  val op = RegInit(0.U(alu.Entries.W))
+
+  // Pulse the cycle after the decoded request.
+  valid := io.req.valid
+
+  // Avoid output toggles by not updating state between uses.
+  // The Regfile has the same behavior, leaving read ports unchanged.
+  when (io.req.valid) {
+    addr := io.req.addr
+    op := io.req.op
+  }
+
+  // val rs1 = MuxOR(valid, io.rs1.data)
+  // val rs2 = MuxOR(valid, io.rs2.data)
+  val rs1 = io.rs1.data
+  val rs2 = io.rs2.data
+  val shamt = rs2(4,0)
+
+  // TODO: should we be masking like this for energy?
+  // TODO: a single addsub for add/sub/slt/sltu
+  // val add  = MuxOR(op(alu.ADD), rs1) +  MuxOR(op(alu.ADD), rs2)
+  // val sub  = MuxOR(op(alu.SUB), rs1) -  MuxOR(op(alu.SUB), rs2)
+  // val sll  = MuxOR(op(alu.SLL), rs1) << MuxOR(op(alu.SLL), shamt)
+  // val srl  = MuxOR(op(alu.SRL), rs1) >> MuxOR(op(alu.SRL), shamt)
+  // val sra  = (MuxOR(op(alu.SRA), rs1.asSInt, 0.S) >> MuxOR(op(alu.SRA), shamt)).asUInt
+  // val slt  = MuxOR(op(alu.SLT), rs1.asSInt, 0.S) < MuxOR(op(alu.SLT), rs2.asSInt, 0.S)
+  // val sltu = MuxOR(op(alu.SLTU), rs1) < MuxOR(op(alu.SLTU), rs2)
+  // val and  = MuxOR(op(alu.AND), rs1) &  MuxOR(op(alu.AND), rs2)
+  // val or   = MuxOR(op(alu.OR), rs1) |  MuxOR(op(alu.OR), rs2)
+  // val xor  = MuxOR(op(alu.XOR), rs1) ^  MuxOR(op(alu.XOR), rs2)
+  // val lui  = MuxOR(op(alu.LUI), rs2)
+  // val clz  = MuxOR(op(alu.CLZ), CLZ(rs1))
+  // val ctz  = MuxOR(op(alu.CTZ), CTZ(rs1))
+  // val pcnt = MuxOR(op(alu.PCNT), PopCount(rs1))
+
+  // io.rd.data := add | sub | sll | srl | sra | slt | sltu | and | or | xor | lui
+
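+  // `op` is one-hot from decode, so the per-operation results below can be
+  // combined with an OR-mux rather than a priority mux.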
+  io.rd.valid := valid
+  io.rd.addr  := addr
+  io.rd.data  := MuxOR(op(alu.ADD),  rs1 + rs2) |
+                 MuxOR(op(alu.SUB),  rs1 - rs2) |
+                 MuxOR(op(alu.SLT),  rs1.asSInt < rs2.asSInt) |
+                 MuxOR(op(alu.SLTU), rs1 < rs2) |
+                 MuxOR(op(alu.XOR),  rs1 ^ rs2) |
+                 MuxOR(op(alu.OR),   rs1 | rs2) |
+                 MuxOR(op(alu.AND),  rs1 & rs2) |
+                 MuxOR(op(alu.SLL),  rs1 << shamt) |
+                 MuxOR(op(alu.SRL),  rs1 >> shamt) |
+                 MuxOR(op(alu.SRA),  (rs1.asSInt >> shamt).asUInt) |
+                 MuxOR(op(alu.LUI),  rs2) |
+                 MuxOR(op(alu.CLZ),  Clz(rs1)) |
+                 MuxOR(op(alu.CTZ),  Ctz(rs1)) |
+                 MuxOR(op(alu.PCNT), PopCount(rs1)) |
+                 MuxOR(op(alu.MIN),  Mux(rs1.asSInt < rs2.asSInt, rs1, rs2)) |
+                 MuxOR(op(alu.MAX),  Mux(rs1.asSInt > rs2.asSInt, rs1, rs2)) |
+                 MuxOR(op(alu.MINU), Mux(rs1 < rs2, rs1, rs2)) |
+                 MuxOR(op(alu.MAXU), Mux(rs1 > rs2, rs1, rs2))
+
+  // Assertions.
+  assert(!(valid && !io.rs1.valid && !op(alu.LUI)))
+  assert(!(valid && !io.rs2.valid))
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Bru.scala b/hdl/chisel/src/kelvin/scalar/Bru.scala
new file mode 100644
index 0000000..02d7121
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Bru.scala
@@ -0,0 +1,222 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Bru {
+  def apply(p: Parameters): Bru = {
+    return Module(new Bru(p))
+  }
+}
+
+case class BruOp() {
+  val JAL  = 0
+  val JALR = 1
+  val BEQ  = 2
+  val BNE  = 3
+  val BLT  = 4
+  val BGE  = 5
+  val BLTU = 6
+  val BGEU = 7
+  val EBREAK = 8
+  val ECALL = 9
+  val EEXIT = 10
+  val EYIELD = 11
+  val ECTXSW = 12
+  val MPAUSE = 13
+  val MRET = 14
+  val FENCEI = 15
+  val UNDEF = 16
+  val Entries = 17
+}
+
+class BruIO(p: Parameters) extends Bundle {
+  val valid = Input(Bool())
+  val fwd = Input(Bool())
+  val op = Input(UInt(new BruOp().Entries.W))
+  val pc = Input(UInt(p.programCounterBits.W))
+  val target = Input(UInt(p.programCounterBits.W))
+  val link = Input(UInt(5.W))
+}
+
+class BranchTakenIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val value = Output(UInt(p.programCounterBits.W))
+}
+
+class Bru(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val req = new BruIO(p)
+
+    // Execute cycle.
+    val csr = new CsrBruIO(p)
+    val rs1 = Input(new RegfileReadDataIO)
+    val rs2 = Input(new RegfileReadDataIO)
+    val rd  = Flipped(new RegfileWriteDataIO)
+    val taken = new BranchTakenIO(p)
+    val target = Flipped(new RegfileBranchTargetIO)
+    val interlock = Output(Bool())
+    val iflush = Output(Bool())
+  })
+
+  val branch = new BruOp()
+
+  val interlock = RegInit(false.B)
+
+  val readRs = RegInit(false.B)
+  val fwd = RegInit(false.B)
+  val op = RegInit(0.U(branch.Entries.W))
+  val target = Reg(UInt(p.programCounterBits.W))
+  val linkValid = RegInit(false.B)
+  val linkAddr = Reg(UInt(5.W))
+  val linkData = Reg(UInt(p.programCounterBits.W))
+  val pcEx = Reg(UInt(32.W))
+
+  linkValid := io.req.valid && io.req.link =/= 0.U &&
+               (io.req.op(branch.JAL) || io.req.op(branch.JALR))
+
+  op := Mux(io.req.valid, io.req.op, 0.U)
+  fwd := io.req.valid && io.req.fwd
+
+  readRs := Mux(io.req.valid,
+            io.req.op(branch.BEQ)  || io.req.op(branch.BNE) ||
+            io.req.op(branch.BLT)  || io.req.op(branch.BGE) ||
+            io.req.op(branch.BLTU) || io.req.op(branch.BGEU), false.B)
+
+  val mode = io.csr.out.mode  // (0) machine, (1) user
+
+  val pcDe  = io.req.pc
+  val pc4De = io.req.pc + 4.U
+
+  when (io.req.valid) {
+    val mret = io.req.op(branch.MRET) && !mode
+    val call = io.req.op(branch.MRET) && mode ||
+               io.req.op(branch.EBREAK) ||
+               io.req.op(branch.ECALL) ||
+               io.req.op(branch.EEXIT) ||
+               io.req.op(branch.EYIELD) ||
+               io.req.op(branch.ECTXSW) ||
+               io.req.op(branch.MPAUSE)
+    target := Mux(mret, io.csr.out.mepc,
+              Mux(call, io.csr.out.mtvec,
+              Mux(io.req.fwd || io.req.op(branch.FENCEI), pc4De,
+              Mux(io.req.op(branch.JALR), io.target.data,
+                  io.req.target))))
+    linkAddr := io.req.link
+    linkData := pc4De
+    pcEx := pcDe
+  }
+
+  interlock := io.req.valid && (io.req.op(branch.EBREAK) ||
+                 io.req.op(branch.ECALL) || io.req.op(branch.EEXIT) ||
+                 io.req.op(branch.EYIELD) || io.req.op(branch.ECTXSW) ||
+                 io.req.op(branch.MPAUSE) || io.req.op(branch.MRET))
+
+  io.interlock := interlock
+
+  // This mux sits on the critical path.
+  // val rs1 = Mux(readRs, io.rs1.data, 0.U)
+  // val rs2 = Mux(readRs, io.rs2.data, 0.U)
+  val rs1 = io.rs1.data
+  val rs2 = io.rs2.data
+
+  val eq  = rs1 === rs2
+  val neq = !eq
+  val lt  = rs1.asSInt < rs2.asSInt
+  val ge  = !lt
+  val ltu = rs1 < rs2
+  val geu = !ltu
+
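+  // The trap, return and fence cases below redirect unconditionally; ordinary
+  // jumps and branches redirect only when the resolved direction differs from
+  // the decode-time forwarded direction, hence the =/= against fwd.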
+  io.taken.valid := op(branch.EBREAK) && mode ||
+                    op(branch.ECALL)  && mode ||
+                    op(branch.EEXIT)  && mode ||
+                    op(branch.EYIELD) && mode ||
+                    op(branch.ECTXSW) && mode ||
+                    op(branch.MRET)   && !mode ||
+                    op(branch.MRET)   && mode ||  // fault
+                    op(branch.MPAUSE) && mode ||  // fault
+                    op(branch.FENCEI) ||
+                    (op(branch.JAL) ||
+                     op(branch.JALR) ||
+                     op(branch.BEQ)  && eq  ||
+                     op(branch.BNE)  && neq ||
+                     op(branch.BLT)  && lt  ||
+                     op(branch.BGE)  && ge  ||
+                     op(branch.BLTU) && ltu ||
+                     op(branch.BGEU) && geu) =/= fwd
+
+  io.taken.value := target
+
+  io.rd.valid := linkValid
+  io.rd.addr := linkAddr
+  io.rd.data := linkData
+
+  // Undefined Fault.
+  val undefFault = op(branch.UNDEF)
+
+  // Usage Fault.
+  val usageFault = op(branch.EBREAK) && !mode ||
+                   op(branch.ECALL)  && !mode ||
+                   op(branch.EEXIT)  && !mode ||
+                   op(branch.EYIELD) && !mode ||
+                   op(branch.ECTXSW) && !mode ||
+                   op(branch.MPAUSE) && mode ||
+                   op(branch.MRET)   && mode
+
+  io.csr.in.mode.valid := op(branch.EBREAK) && mode ||
+                          op(branch.ECALL)  && mode ||
+                          op(branch.EEXIT)  && mode ||
+                          op(branch.EYIELD) && mode ||
+                          op(branch.ECTXSW) && mode ||
+                          op(branch.MPAUSE) && mode ||  // fault
+                          op(branch.MRET)   && mode ||  // fault
+                          op(branch.MRET)   && !mode
+  io.csr.in.mode.bits := MuxOR(op(branch.MRET) && !mode, true.B)
+
+  io.csr.in.mepc.valid := op(branch.EBREAK) && mode ||
+                          op(branch.ECALL)  && mode ||
+                          op(branch.EEXIT)  && mode ||
+                          op(branch.EYIELD) && mode ||
+                          op(branch.ECTXSW) && mode ||
+                          op(branch.MPAUSE) && mode ||  // fault
+                          op(branch.MRET)   && mode     // fault
+  io.csr.in.mepc.bits := Mux(op(branch.EYIELD), linkData, pcEx)
+
+  io.csr.in.mcause.valid := undefFault || usageFault ||
+                            op(branch.EBREAK) && mode ||
+                            op(branch.ECALL)  && mode ||
+                            op(branch.EEXIT)  && mode ||
+                            op(branch.EYIELD) && mode ||
+                            op(branch.ECTXSW) && mode
+
+  val faultMsb = 1.U << 31
+  io.csr.in.mcause.bits := Mux(undefFault, 2.U  | faultMsb,
+                           Mux(usageFault, 16.U | faultMsb,
+                             MuxOR(op(branch.EBREAK), 1.U) |
+                             MuxOR(op(branch.ECALL),  2.U) |
+                             MuxOR(op(branch.EEXIT),  3.U) |
+                             MuxOR(op(branch.EYIELD), 4.U) |
+                             MuxOR(op(branch.ECTXSW), 5.U)))
+
+  io.csr.in.mtval.valid := undefFault || usageFault
+  io.csr.in.mtval.bits := pcEx
+
+  io.iflush := op(branch.FENCEI)
+
+  // Pipeline will be halted.
+  io.csr.in.halt := op(branch.MPAUSE) && !mode || io.csr.in.fault
+  io.csr.in.fault := undefFault && !mode || usageFault && !mode
+
+  // Assertions.
+  val valid = RegInit(false.B)
+  val ignore = op(branch.JAL) || op(branch.JALR) || op(branch.EBREAK) ||
+               op(branch.ECALL) || op(branch.EEXIT) || op(branch.EYIELD) ||
+               op(branch.ECTXSW) || op(branch.MPAUSE) || op(branch.MRET) ||
+               op(branch.FENCEI) || op(branch.UNDEF)
+
+  valid := io.req.valid
+  assert(!(valid && !io.rs1.valid) || ignore)
+  assert(!(valid && !io.rs2.valid) || ignore)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Csr.scala b/hdl/chisel/src/kelvin/scalar/Csr.scala
new file mode 100644
index 0000000..7c78680
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Csr.scala
@@ -0,0 +1,273 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Csr {
+  def apply(p: Parameters): Csr = {
+    return Module(new Csr(p))
+  }
+}
+
+case class CsrOp() {
+  val CSRRW = 0
+  val CSRRS = 1
+  val CSRRC = 2
+  val Entries = 3
+}
+
+class CsrInIO(p: Parameters) extends Bundle {
+  val value = Input(Vec(12, UInt(32.W)))
+}
+
+class CsrOutIO(p: Parameters) extends Bundle {
+  val value = Output(Vec(8, UInt(32.W)))
+}
+
+class CsrInOutIO(p: Parameters) extends Bundle {
+  val in  = new CsrInIO(p)
+  val out = new CsrOutIO(p)
+}
+
+class CsrBruIO(p: Parameters) extends Bundle {
+  val in = new Bundle {
+    val mode   = Valid(Bool())
+    val mcause = Valid(UInt(32.W))
+    val mepc   = Valid(UInt(32.W))
+    val mtval  = Valid(UInt(32.W))
+    val halt   = Output(Bool())
+    val fault  = Output(Bool())
+  }
+  val out = new Bundle {
+    val mode  = Input(Bool())
+    val mepc  = Input(UInt(32.W))
+    val mtvec = Input(UInt(32.W))
+  }
+  def defaults() = {
+    out.mode := false.B
+    out.mepc := 0.U
+    out.mtvec := 0.U
+  }
+}
+
+class CsrIO(p: Parameters) extends Bundle {
+  val valid = Input(Bool())
+  val addr = Input(UInt(5.W))
+  val index = Input(UInt(12.W))
+  val op = Input(UInt(new CsrOp().Entries.W))
+}
+
+class Csr(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Reset and shutdown.
+    val csr = new CsrInOutIO(p)
+
+    // Decode cycle.
+    val req = new CsrIO(p)
+
+    // Execute cycle.
+    val rs1 = Flipped(new RegfileReadDataIO)
+    val rd  = Flipped(new RegfileWriteDataIO)
+    val bru = Flipped(new CsrBruIO(p))
+
+    // Vector core.
+    val vcore = Input(new Bundle { val undef = Bool() })
+
+    // Pipeline Control.
+    val halted = Output(Bool())
+    val fault  = Output(Bool())
+  })
+
+  val csr = new CsrOp()
+
+  val valid = RegInit(false.B)
+  val addr = Reg(UInt(5.W))
+  val index = Reg(UInt(12.W))
+  val op = RegInit(0.U(csr.Entries.W))
+
+  // Pipeline Control.
+  val halted = RegInit(false.B)
+  val fault = RegInit(false.B)
+
+  // Machine(0)/User(1) Mode.
+  val mode = RegInit(false.B)
+
+  // CSRs parallel loaded when(reset).
+  val mpc       = Reg(UInt(32.W))
+  val msp       = Reg(UInt(32.W))
+  val mcause    = Reg(UInt(32.W))
+  val mtval     = Reg(UInt(32.W))
+  val mcontext0 = Reg(UInt(32.W))
+  val mcontext1 = Reg(UInt(32.W))
+  val mcontext2 = Reg(UInt(32.W))
+  val mcontext3 = Reg(UInt(32.W))
+  val mcontext4 = Reg(UInt(32.W))
+  val mcontext5 = Reg(UInt(32.W))
+  val mcontext6 = Reg(UInt(32.W))
+  val mcontext7 = Reg(UInt(32.W))
+
+  // CSRs with initialization.
+  val fflags    = RegInit(0.U(5.W))
+  val frm       = RegInit(0.U(3.W))
+  val mie       = RegInit(0.U(1.W))
+  val mtvec     = RegInit(0.U(32.W))
+  val mscratch  = RegInit(0.U(32.W))
+  val mepc      = RegInit(0.U(32.W))
+
+  val fcsr = Cat(frm, fflags)
+
+  // Decode the Index.
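+  // 0x001-0x343 are standard RISC-V CSR addresses; 0x7C0-0x7C7 and 0x7E0/0x7E1
+  // are in the custom machine-mode CSR space (contexts and mpc/msp).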
+  val fflagsEn    = index === 0x001.U
+  val frmEn       = index === 0x002.U
+  val fcsrEn      = index === 0x003.U
+  val mieEn       = index === 0x304.U
+  val mtvecEn     = index === 0x305.U
+  val mscratchEn  = index === 0x340.U
+  val mepcEn      = index === 0x341.U
+  val mcauseEn    = index === 0x342.U
+  val mtvalEn     = index === 0x343.U
+  val mcontext0En = index === 0x7C0.U
+  val mcontext1En = index === 0x7C1.U
+  val mcontext2En = index === 0x7C2.U
+  val mcontext3En = index === 0x7C3.U
+  val mcontext4En = index === 0x7C4.U
+  val mcontext5En = index === 0x7C5.U
+  val mcontext6En = index === 0x7C6.U
+  val mcontext7En = index === 0x7C7.U
+  val mpcEn       = index === 0x7E0.U
+  val mspEn       = index === 0x7E1.U
+
+  // Control registers.
+  when (io.req.valid) {
+    valid := io.req.valid
+    addr := io.req.addr
+    index := io.req.index
+    op := io.req.op
+  } .elsewhen (valid) {
+    valid := false.B
+    addr := 0.U
+    index := 0.U
+    op := 0.U
+  }
+
+  // Pipeline Control.
+  when (io.bru.in.halt || io.vcore.undef) {
+    halted := true.B
+  }
+
+  when (io.bru.in.fault || io.vcore.undef) {
+    fault := true.B
+  }
+
+  io.halted := halted
+  io.fault  := fault
+
+  assert(!(io.fault && !io.halted))
+
+  // Register state.
+  val rs1 = io.rs1.data
+
+  val rdata = MuxOR(fflagsEn,     fflags) |
+              MuxOR(frmEn,        frm) |
+              MuxOR(fcsrEn,       fcsr) |
+              MuxOR(mieEn,        mie) |
+              MuxOR(mtvecEn,      mtvec) |
+              MuxOR(mscratchEn,   mscratch) |
+              MuxOR(mepcEn,       mepc) |
+              MuxOR(mcauseEn,     mcause) |
+              MuxOR(mtvalEn,      mtval) |
+              MuxOR(mcontext0En,  mcontext0) |
+              MuxOR(mcontext1En,  mcontext1) |
+              MuxOR(mcontext2En,  mcontext2) |
+              MuxOR(mcontext3En,  mcontext3) |
+              MuxOR(mcontext4En,  mcontext4) |
+              MuxOR(mcontext5En,  mcontext5) |
+              MuxOR(mcontext6En,  mcontext6) |
+              MuxOR(mcontext7En,  mcontext7) |
+              MuxOR(mpcEn,        mpc) |
+              MuxOR(mspEn,        msp)
+
+  val wdata = MuxOR(op(csr.CSRRW), rs1) |
+              MuxOR(op(csr.CSRRS), rdata | rs1) |
+              MuxOR(op(csr.CSRRC), rdata & ~rs1)
+
+  when (valid) {
+    when (fflagsEn)     { fflags    := wdata }
+    when (frmEn)        { frm       := wdata }
+    when (fcsrEn)       { fflags    := wdata(4,0)
+                          frm       := wdata(7,5) }
+    when (mieEn)        { mie       := wdata }
+    when (mtvecEn)      { mtvec     := wdata }
+    when (mscratchEn)   { mscratch  := wdata }
+    when (mepcEn)       { mepc      := wdata }
+    when (mcauseEn)     { mcause    := wdata }
+    when (mtvalEn)      { mtval     := wdata }
+    when (mpcEn)        { mpc       := wdata }
+    when (mspEn)        { msp       := wdata }
+    when (mcontext0En)  { mcontext0 := wdata }
+    when (mcontext1En)  { mcontext1 := wdata }
+    when (mcontext2En)  { mcontext2 := wdata }
+    when (mcontext3En)  { mcontext3 := wdata }
+    when (mcontext4En)  { mcontext4 := wdata }
+    when (mcontext5En)  { mcontext5 := wdata }
+    when (mcontext6En)  { mcontext6 := wdata }
+    when (mcontext7En)  { mcontext7 := wdata }
+  }
+
+  when (io.bru.in.mode.valid) {
+    mode := io.bru.in.mode.bits
+  }
+
+  val firstFault = !mcause(31)
+
+  when (io.bru.in.mcause.valid && firstFault) {
+    mcause := io.bru.in.mcause.bits
+  }
+
+  when (io.bru.in.mtval.valid && firstFault) {
+    mtval := io.bru.in.mtval.bits
+  }
+
+  when (io.bru.in.mepc.valid) {
+    mepc := io.bru.in.mepc.bits
+  }
+
+  // This pattern of separate when() blocks requires the reset assignments to
+  // be placed after the data assignments.
+  when (reset.asBool) {
+    mpc       := io.csr.in.value(0)
+    msp       := io.csr.in.value(1)
+    mcause    := io.csr.in.value(2)
+    mtval     := io.csr.in.value(3)
+    mcontext0 := io.csr.in.value(4)
+    mcontext1 := io.csr.in.value(5)
+    mcontext2 := io.csr.in.value(6)
+    mcontext3 := io.csr.in.value(7)
+    mcontext4 := io.csr.in.value(8)
+    mcontext5 := io.csr.in.value(9)
+    mcontext6 := io.csr.in.value(10)
+    mcontext7 := io.csr.in.value(11)
+  }
+
+  // Forwarding.
+  io.bru.out.mode  := mode
+  io.bru.out.mepc  := Mux(mepcEn, wdata, mepc)
+  io.bru.out.mtvec := Mux(mtvecEn, wdata, mtvec)
+
+  io.csr.out.value(0) := mpc
+  io.csr.out.value(1) := msp
+  io.csr.out.value(2) := mcause
+  io.csr.out.value(3) := mtval
+  io.csr.out.value(4) := mcontext0
+  io.csr.out.value(5) := mcontext1
+  io.csr.out.value(6) := mcontext2
+  io.csr.out.value(7) := mcontext3
+
+  // Write port.
+  io.rd.valid := valid
+  io.rd.addr  := addr
+  io.rd.data  := rdata
+
+  // Assertions.
+  assert(!(valid && !io.rs1.valid))
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Debug.scala b/hdl/chisel/src/kelvin/scalar/Debug.scala
new file mode 100644
index 0000000..d7fedf3
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Debug.scala
@@ -0,0 +1,19 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Debug signals for HDL development.
+class DebugIO(p: Parameters) extends Bundle {
+  val en = Output(UInt(4.W))
+  val addr0 = Output(UInt(32.W))
+  val addr1 = Output(UInt(32.W))
+  val addr2 = Output(UInt(32.W))
+  val addr3 = Output(UInt(32.W))
+  val inst0 = Output(UInt(32.W))
+  val inst1 = Output(UInt(32.W))
+  val inst2 = Output(UInt(32.W))
+  val inst3 = Output(UInt(32.W))
+  val cycles = Output(UInt(32.W))
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Decode.scala b/hdl/chisel/src/kelvin/scalar/Decode.scala
new file mode 100644
index 0000000..c7a7373
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Decode.scala
@@ -0,0 +1,660 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Decode {
+  def apply(p: Parameters, pipeline: Int): Decode = {
+    return Module(new Decode(p, pipeline))
+  }
+}
+
+class DecodeSerializeIO extends Bundle {
+  val lsu = Output(Bool())
+  val mul = Output(Bool())
+  val jump = Output(Bool())
+  val brcond = Output(Bool())
+  val vinst = Output(Bool())     // all vector instructions
+
+  def defaults() = {
+    lsu := false.B
+    mul := false.B
+    jump := false.B
+    brcond := false.B
+    vinst := false.B
+  }
+}
+
+class Decode(p: Parameters, pipeline: Int) extends Module {
+  val io = IO(new Bundle {
+    // Core controls.
+    val halted = Input(Bool())
+
+    // Decode input interface.
+    val inst = Flipped(new FetchInstruction(p))
+    val scoreboard = new Bundle {
+      val regd = Input(UInt(32.W))
+      val comb = Input(UInt(32.W))
+      val spec = Output(UInt(32.W))
+    }
+    val mactive = Input(Bool())  // memory active
+
+    // Register file decode cycle interface.
+    val rs1Read = Flipped(new RegfileReadAddrIO)
+    val rs1Set  = Flipped(new RegfileReadSetIO)
+    val rs2Read = Flipped(new RegfileReadAddrIO)
+    val rs2Set  = Flipped(new RegfileReadSetIO)
+    val rdMark  = Flipped(new RegfileWriteAddrIO)
+    val busRead = Flipped(new RegfileBusAddrIO)
+
+    // ALU interface.
+    val alu = Flipped(new AluIO(p))
+
+    // Branch interface.
+    val bru = Flipped(new BruIO(p))
+
+    // CSR interface.
+    val csr = Flipped(new CsrIO(p))
+
+    // LSU interface.
+    val lsu = Flipped(new LsuIO(p))
+
+    // Multiplier interface.
+    val mlu = Flipped(new MluIO(p))
+
+    // Divide interface.
+    val dvu = Flipped(new DvuIO(p))
+
+    // Vector interface.
+    val vinst = Flipped(new VInstIO)
+
+    // Branch status.
+    val branchTaken = Input(Bool())
+
+    // Interlock Controls
+    val interlock = Input(Bool())
+    val serializeIn  = Flipped(new DecodeSerializeIO)
+    val serializeOut = new DecodeSerializeIO
+
+    // Scalar logging.
+    val slog = Output(Bool())
+  })
+
+  val decodeEn = io.inst.valid && io.inst.ready && !io.branchTaken
+
+  // The decode logic.
+  val d = Module(new DecodedInstruction(p, pipeline))
+  d.io.addr := io.inst.addr
+  d.io.inst := io.inst.inst
+
+  val vldst = d.io.vld || d.io.vst
+  val vldst_wb = vldst && io.inst.inst(28)
+
+  val rdAddr  = Mux(vldst, io.inst.inst(19,15), io.inst.inst(11,7))
+  val rs1Addr = io.inst.inst(19,15)
+  val rs2Addr = io.inst.inst(24,20)
+  val rs3Addr = io.inst.inst(31,27)
+
+  val isAluImm = d.io.addi || d.io.slti || d.io.sltiu || d.io.xori ||
+                 d.io.ori || d.io.andi || d.io.slli || d.io.srli || d.io.srai
+
+  val isAluReg = d.io.add || d.io.sub || d.io.slt || d.io.sltu || d.io.xor ||
+                 d.io.or || d.io.and || d.io.sll || d.io.srl || d.io.sra
+
+  val isAlu1Bit = d.io.clz || d.io.ctz || d.io.pcnt
+  val isAlu2Bit = d.io.min || d.io.minu || d.io.max || d.io.maxu
+
+  val isCondBr = d.io.beq || d.io.bne || d.io.blt || d.io.bge ||
+                 d.io.bltu || d.io.bgeu
+
+  val isCsr = d.io.csrrw || d.io.csrrs || d.io.csrrc
+  val isCsrImm = isCsr &&  io.inst.inst(14)
+  val isCsrReg = isCsr && !io.inst.inst(14)
+
+  val isLoad = d.io.lb || d.io.lh || d.io.lw || d.io.lbu || d.io.lhu
+  val isStore = d.io.sb || d.io.sh || d.io.sw
+  val isLsu = isLoad || isStore || d.io.vld || d.io.vst || d.io.flushat || d.io.flushall
+
+  val isMul = d.io.mul || d.io.mulh || d.io.mulhsu || d.io.mulhu || d.io.mulhr || d.io.mulhsur || d.io.mulhur || d.io.dmulh || d.io.dmulhr
+
+  val isDvu = d.io.div || d.io.divu || d.io.rem || d.io.remu
+
+  val isVIop = io.vinst.op(new VInstOp().VIOP)
+
+  val isVIopVs1 = isVIop
+  val isVIopVs2 = isVIop && io.inst.inst(1,0) === 0.U  // exclude: .vv
+  val isVIopVs3 = isVIop && io.inst.inst(2,0) === 1.U  // exclude: .vvv
+
+  // Use the forwarded scoreboard to interlock on multicycle operations.
+  val aluRdEn  = !io.scoreboard.comb(rdAddr)  || isVIopVs1 || isStore || isCondBr
+  val aluRs1En = !io.scoreboard.comb(rs1Addr) || isVIopVs1 || isLsu || d.io.auipc
+  val aluRs2En = !io.scoreboard.comb(rs2Addr) || isVIopVs2 || isLsu || d.io.auipc || isAluImm || isAlu1Bit
+  // val aluRs3En = !io.scoreboard.comb(rs3Addr) || isVIopVs3
+  // val aluEn = aluRdEn && aluRs1En && aluRs2En && aluRs3En  // TODO: is aluRs3En needed?
+  val aluEn = aluRdEn && aluRs1En && aluRs2En
+
+  // Interlock jalr, but special-case return (jalr with a zero immediate).
+  val bruEn = !d.io.jalr || !io.scoreboard.regd(rs1Addr) ||
+              io.inst.inst(31,20) === 0.U
+
+  // Require interlock on address generation as there is no write forwarding.
+  val lsuEn = !isLsu ||
+              !io.serializeIn.lsu && io.lsu.ready &&
+              (!isLsu || !io.serializeIn.brcond) &&  // TODO: can this line be removed?
+              !(Mux(io.busRead.bypass, io.scoreboard.comb(rs1Addr),
+                    io.scoreboard.regd(rs1Addr)) ||
+                    io.scoreboard.comb(rs2Addr) && (isStore || vldst))
+
+  // Interlock mul; only one lane is accepted.
+  val mulEn = !isMul || !io.serializeIn.mul
+
+  // Vector extension interlock.
+  val vinstEn = !(io.serializeIn.vinst || isVIop && io.serializeIn.brcond) &&
+                !(io.vinst.op =/= 0.U && !io.vinst.ready)
+
+  // Fence interlock.
+  // Input mactive is used as a passthrough; prefer to avoid registers in Decode.
+  val fenceEn = !(d.io.fence && io.mactive)
+
+  // ALU opcode.
+  val alu = new AluOp()
+  val aluOp = Wire(Vec(alu.Entries, Bool()))
+  val aluValid = WiredOR(io.alu.op)  // used without decodeEn
+  io.alu.valid := decodeEn && aluValid
+  io.alu.addr := rdAddr
+  io.alu.op := aluOp.asUInt
+
+  aluOp(alu.ADD)  := d.io.auipc || d.io.addi || d.io.add
+  aluOp(alu.SUB)  := d.io.sub
+  aluOp(alu.SLT)  := d.io.slti || d.io.slt
+  aluOp(alu.SLTU) := d.io.sltiu || d.io.sltu
+  aluOp(alu.XOR)  := d.io.xori || d.io.xor
+  aluOp(alu.OR)   := d.io.ori || d.io.or
+  aluOp(alu.AND)  := d.io.andi || d.io.and
+  aluOp(alu.SLL)  := d.io.slli || d.io.sll
+  aluOp(alu.SRL)  := d.io.srli || d.io.srl
+  aluOp(alu.SRA)  := d.io.srai || d.io.sra
+  aluOp(alu.LUI)  := d.io.lui
+  aluOp(alu.CLZ)  := d.io.clz
+  aluOp(alu.CTZ)  := d.io.ctz
+  aluOp(alu.PCNT) := d.io.pcnt
+  aluOp(alu.MIN)  := d.io.min
+  aluOp(alu.MINU) := d.io.minu
+  aluOp(alu.MAX)  := d.io.max
+  aluOp(alu.MAXU) := d.io.maxu
+
+  // Branch conditional opcode.
+  val bru = new BruOp()
+  val bruOp = Wire(Vec(bru.Entries, Bool()))
+  val bruValid = WiredOR(io.bru.op)  // used without decodeEn
+  io.bru.valid := decodeEn && bruValid
+  io.bru.fwd := io.inst.brchFwd
+  io.bru.op := bruOp.asUInt
+  io.bru.pc := io.inst.addr
+  io.bru.target := io.inst.addr + Mux(io.inst.inst(2), d.io.immjal, d.io.immbr)
+  io.bru.link := rdAddr
+
+  bruOp(bru.JAL)  := d.io.jal
+  bruOp(bru.JALR) := d.io.jalr
+  bruOp(bru.BEQ)  := d.io.beq
+  bruOp(bru.BNE)  := d.io.bne
+  bruOp(bru.BLT)  := d.io.blt
+  bruOp(bru.BGE)  := d.io.bge
+  bruOp(bru.BLTU) := d.io.bltu
+  bruOp(bru.BGEU) := d.io.bgeu
+  bruOp(bru.EBREAK) := d.io.ebreak
+  bruOp(bru.ECALL)  := d.io.ecall
+  bruOp(bru.EEXIT)  := d.io.eexit
+  bruOp(bru.EYIELD) := d.io.eyield
+  bruOp(bru.ECTXSW) := d.io.ectxsw
+  bruOp(bru.MPAUSE) := d.io.mpause
+  bruOp(bru.MRET)   := d.io.mret
+  bruOp(bru.FENCEI) := d.io.fencei
+  bruOp(bru.UNDEF)  := d.io.undef
+
+  // CSR opcode.
+  val csr = new CsrOp()
+  val csrOp = Wire(Vec(csr.Entries, Bool()))
+  val csrValid = WiredOR(io.csr.op)  // used without decodeEn
+  io.csr.valid := decodeEn && csrValid
+  io.csr.addr := rdAddr
+  io.csr.index := io.inst.inst(31,20)
+  io.csr.op := csrOp.asUInt
+
+  csrOp(csr.CSRRW) := d.io.csrrw
+  csrOp(csr.CSRRS) := d.io.csrrs
+  csrOp(csr.CSRRC) := d.io.csrrc
+
+  // LSU opcode.
+  val lsu = new LsuOp()
+  val lsuOp = Wire(Vec(lsu.Entries, Bool()))
+  val lsuValid = WiredOR(io.lsu.op)  // used without decodeEn
+  io.lsu.valid := decodeEn && lsuValid
+  io.lsu.store := io.inst.inst(5)
+  io.lsu.addr := rdAddr
+  io.lsu.op := lsuOp.asUInt
+
+  lsuOp(lsu.LB)  := d.io.lb
+  lsuOp(lsu.LH)  := d.io.lh
+  lsuOp(lsu.LW)  := d.io.lw
+  lsuOp(lsu.LBU) := d.io.lbu
+  lsuOp(lsu.LHU) := d.io.lhu
+  lsuOp(lsu.SB)  := d.io.sb
+  lsuOp(lsu.SH)  := d.io.sh
+  lsuOp(lsu.SW)  := d.io.sw
+  lsuOp(lsu.FENCEI)   := d.io.fencei
+  lsuOp(lsu.FLUSHAT)  := d.io.flushat
+  lsuOp(lsu.FLUSHALL) := d.io.flushall
+
+  lsuOp(lsu.VLDST) := d.io.vld || d.io.vst
+
+  // MLU opcode.
+  val mlu = new MluOp()
+  val mluOp = Wire(Vec(mlu.Entries, Bool()))
+  val mluValid = WiredOR(io.mlu.op)  // used without decodeEn
+  io.mlu.valid := decodeEn && mluValid
+  io.mlu.addr := rdAddr
+  io.mlu.op := mluOp.asUInt
+
+  mluOp(mlu.MUL)     := d.io.mul
+  mluOp(mlu.MULH)    := d.io.mulh
+  mluOp(mlu.MULHSU)  := d.io.mulhsu
+  mluOp(mlu.MULHU)   := d.io.mulhu
+  mluOp(mlu.MULHR)   := d.io.mulhr
+  mluOp(mlu.MULHSUR) := d.io.mulhsur
+  mluOp(mlu.MULHUR)  := d.io.mulhur
+  mluOp(mlu.DMULH)   := d.io.dmulh
+  mluOp(mlu.DMULHR)  := d.io.dmulhr
+
+  // DIV opcode.
+  val dvu = new DvuOp()
+  val dvuOp = Wire(Vec(dvu.Entries, Bool()))
+  val dvuValid = WiredOR(io.dvu.op)  // used without decodeEn
+  io.dvu.valid := decodeEn && dvuValid
+  io.dvu.addr := rdAddr
+  io.dvu.op := dvuOp.asUInt
+
+  dvuOp(dvu.DIV)  := d.io.div
+  dvuOp(dvu.DIVU) := d.io.divu
+  dvuOp(dvu.REM)  := d.io.rem
+  dvuOp(dvu.REMU) := d.io.remu
+
+  val dvuEn = WiredOR(io.dvu.op) === 0.U || io.dvu.ready
+
+  // Vector instructions.
+  val vinst = new VInstOp()
+  val vinstOp = Wire(Vec(vinst.Entries, Bool()))
+  val vinstValid = WiredOR(vinstOp)  // used without decodeEn
+
+  io.vinst.valid := decodeEn && vinstValid
+  io.vinst.addr := rdAddr
+  io.vinst.inst := io.inst.inst
+  io.vinst.op := vinstOp.asUInt
+
+  vinstOp(vinst.VLD) := d.io.vld
+  vinstOp(vinst.VST) := d.io.vst
+  vinstOp(vinst.VIOP) := d.io.viop
+  vinstOp(vinst.GETVL) := d.io.getvl
+  vinstOp(vinst.GETMAXVL) := d.io.getmaxvl
+
+  // Scalar logging.
+  io.slog := decodeEn && d.io.slog
+
+  // Register file read ports.
+  io.rs1Read.valid := decodeEn && (isCondBr || isAluReg || isAluImm || isAlu1Bit || isAlu2Bit ||
+                      isCsrImm || isCsrReg || isMul || isDvu || d.io.slog ||
+                      d.io.getvl || d.io.vld || d.io.vst)
+  io.rs2Read.valid := decodeEn && (isCondBr || isAluReg || isAlu2Bit || isStore ||
+                      isCsrReg || isMul || isDvu || d.io.slog || d.io.getvl ||
+                      d.io.vld || d.io.vst || d.io.viop)
+
+  // rs1 is on critical path to busPortAddr.
+  io.rs1Read.addr := Mux(io.inst.inst(0), rs1Addr, rs3Addr)
+
+  // rs2 is used for the vector operation scalar value.
+  io.rs2Read.addr := rs2Addr
+
+  // Register file set ports.
+  io.rs1Set.valid := decodeEn && (d.io.auipc || isCsrImm)
+  io.rs2Set.valid := io.rs1Set.valid || decodeEn && (isAluImm || isAlu1Bit || d.io.lui)
+
+  io.rs1Set.value := Mux(isCsr, d.io.immcsr, io.inst.addr)  // Program Counter (PC)
+
+  io.rs2Set.value := MuxCase(d.io.imm12,
+                     IndexedSeq((d.io.auipc || d.io.lui) -> d.io.imm20))
+
+  // Register file write address ports. We speculate without knowing the decode
+  // enable status to improve timing; the result is ignored under a branch anyway.
+  val rdMark_valid =
+      aluValid || csrValid || mluValid || dvuValid && io.dvu.ready ||
+      lsuValid && isLoad ||
+      d.io.getvl || d.io.getmaxvl || vldst_wb ||
+      bruValid && (bruOp(bru.JAL) || bruOp(bru.JALR)) && rdAddr =/= 0.U
+
+  // val scoreboard_spec = Mux(rdMark_valid || d.io.vst, OneHot(rdAddr, 32), 0.U)  // TODO: why was d.io.vst included?
+  val scoreboard_spec = Mux(rdMark_valid, OneHot(rdAddr, 32), 0.U)
+  io.scoreboard.spec := Cat(scoreboard_spec(31,1), 0.U(1.W))
+
+  io.rdMark.valid := decodeEn && rdMark_valid
+  io.rdMark.addr  := rdAddr
+
+  // Register file bus address port.
+  // Pointer chasing bypass if immediate is zero.
+  // Load/Store immediate selection keys off bit5, and RET off bit6.
+  io.busRead.valid := lsuValid
+  io.busRead.bypass := io.inst.inst(31,25) === 0.U &&
+    Mux(!io.inst.inst(5) || io.inst.inst(6), io.inst.inst(24,20) === 0.U,
+                                             io.inst.inst(11,7) === 0.U)
+
+  // SB,SH,SW   0100011
+  // FSW        0100111 //TODO(hoangm)
+  val storeSelect = io.inst.inst(6,3) === 4.U && io.inst.inst(1,0) === 3.U
+  io.busRead.immen := !d.io.flushat
+  io.busRead.immed := Cat(d.io.imm12(31,5),
+                          Mux(storeSelect, d.io.immst(4,0), d.io.imm12(4,0)))
+
+  // Decode ready signalling to fetch.
+  // This must not factor in branchTaken, which is handled directly in the
+  // fetch unit. Note that decodeEn above already resolves the branch for
+  // execute usage.
+  io.inst.ready := aluEn && bruEn && lsuEn && mulEn && dvuEn && vinstEn && fenceEn &&
+                   !io.serializeIn.jump && !io.halted && !io.interlock &&
+                   (pipeline.U === 0.U || !d.io.undef)
+
+  // Serialize Interface.
+  // io.serializeOut.lsu  := io.serializeIn.lsu || lsuValid || vldst  // vldst interlock for address generation cycle in vinst
+  // io.serializeOut.lsu  := io.serializeIn.lsu || vldst  // vldst interlock for address generation cycle in vinst
+  io.serializeOut.lsu  := io.serializeIn.lsu
+  io.serializeOut.mul  := io.serializeIn.mul || mluValid
+  io.serializeOut.jump := io.serializeIn.jump || d.io.jal || d.io.jalr ||
+                          d.io.ebreak || d.io.ecall || d.io.eexit ||
+                          d.io.eyield || d.io.ectxsw || d.io.mpause || d.io.mret
+  io.serializeOut.brcond := io.serializeIn.brcond |
+      d.io.beq || d.io.bne || d.io.blt || d.io.bge || d.io.bltu || d.io.bgeu
+  io.serializeOut.vinst := io.serializeIn.vinst
+}
+
+class DecodedInstruction(p: Parameters, pipeline: Int) extends Module {
+  val io = IO(new Bundle {
+    val addr = Input(UInt(32.W))
+    val inst = Input(UInt(32.W))
+
+    // Immediates
+    val imm12  = Output(UInt(32.W))
+    val imm20  = Output(UInt(32.W))
+    val immjal = Output(UInt(32.W))
+    val immbr  = Output(UInt(32.W))
+    val immcsr = Output(UInt(32.W))
+    val immst  = Output(UInt(32.W))
+
+    // RV32I
+    val lui   = Output(Bool())
+    val auipc = Output(Bool())
+    val jal   = Output(Bool())
+    val jalr  = Output(Bool())
+    val beq   = Output(Bool())
+    val bne   = Output(Bool())
+    val blt   = Output(Bool())
+    val bge   = Output(Bool())
+    val bltu  = Output(Bool())
+    val bgeu  = Output(Bool())
+    val csrrw = Output(Bool())
+    val csrrs = Output(Bool())
+    val csrrc = Output(Bool())
+    val lb    = Output(Bool())
+    val lh    = Output(Bool())
+    val lw    = Output(Bool())
+    val lbu   = Output(Bool())
+    val lhu   = Output(Bool())
+    val sb    = Output(Bool())
+    val sh    = Output(Bool())
+    val sw    = Output(Bool())
+    val fence = Output(Bool())
+    val addi  = Output(Bool())
+    val slti  = Output(Bool())
+    val sltiu = Output(Bool())
+    val xori  = Output(Bool())
+    val ori   = Output(Bool())
+    val andi  = Output(Bool())
+    val slli  = Output(Bool())
+    val srli  = Output(Bool())
+    val srai  = Output(Bool())
+    val add   = Output(Bool())
+    val sub   = Output(Bool())
+    val slt   = Output(Bool())
+    val sltu  = Output(Bool())
+    val xor   = Output(Bool())
+    val or    = Output(Bool())
+    val and   = Output(Bool())
+    val sll   = Output(Bool())
+    val srl   = Output(Bool())
+    val sra   = Output(Bool())
+
+    // RV32M
+    val mul     = Output(Bool())
+    val mulh    = Output(Bool())
+    val mulhsu  = Output(Bool())
+    val mulhu   = Output(Bool())
+    val mulhr   = Output(Bool())
+    val mulhsur = Output(Bool())
+    val mulhur  = Output(Bool())
+    val dmulh   = Output(Bool())
+    val dmulhr  = Output(Bool())
+    val div     = Output(Bool())
+    val divu    = Output(Bool())
+    val rem     = Output(Bool())
+    val remu    = Output(Bool())
+
+    // RV32B
+    val clz  = Output(Bool())
+    val ctz  = Output(Bool())
+    val pcnt = Output(Bool())
+    val min  = Output(Bool())
+    val minu = Output(Bool())
+    val max  = Output(Bool())
+    val maxu = Output(Bool())
+
+    // Vector instructions.
+    val getvl = Output(Bool())
+    val getmaxvl = Output(Bool())
+    val vld = Output(Bool())
+    val vst = Output(Bool())
+    val viop = Output(Bool())
+
+    // Core controls.
+    val ebreak = Output(Bool())
+    val ecall  = Output(Bool())
+    val eexit  = Output(Bool())
+    val eyield = Output(Bool())
+    val ectxsw = Output(Bool())
+    val mpause = Output(Bool())
+    val mret   = Output(Bool())
+    val undef  = Output(Bool())
+
+    // Fences.
+    val fencei = Output(Bool())
+    val flushat = Output(Bool())
+    val flushall = Output(Bool())
+
+    // Scalar logging.
+    val slog = Output(Bool())
+  })
+
+  val op = io.inst
+
+  // Immediates
+  io.imm12  := Cat(Fill(20, op(31)), op(31,20))
+  io.imm20  := Cat(op(31,12), 0.U(12.W))
+  io.immjal := Cat(Fill(12, op(31)), op(19,12), op(20), op(30,21), 0.U(1.W))
+  io.immbr  := Cat(Fill(20, op(31)), op(7), op(30,25), op(11,8), 0.U(1.W))
+  io.immcsr := op(19,15)
+  io.immst  := Cat(Fill(20, op(31)), op(31,25), op(11,7))
+
+  // RV32I
+  io.lui   := DecodeBits(op, "xxxxxxxxxxxxxxxxxxxx_xxxxx_0110111")
+  io.auipc := DecodeBits(op, "xxxxxxxxxxxxxxxxxxxx_xxxxx_0010111")
+  io.jal   := DecodeBits(op, "xxxxxxxxxxxxxxxxxxxx_xxxxx_1101111")
+  io.jalr  := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_000_xxxxx_1100111")
+  io.beq   := DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_000_xxxxx_1100011")
+  io.bne   := DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_001_xxxxx_1100011")
+  io.blt   := DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_100_xxxxx_1100011")
+  io.bge   := DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_101_xxxxx_1100011")
+  io.bltu  := DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_110_xxxxx_1100011")
+  io.bgeu  := DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_111_xxxxx_1100011")
+  io.csrrw := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_x01_xxxxx_1110011")
+  io.csrrs := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_x10_xxxxx_1110011")
+  io.csrrc := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_x11_xxxxx_1110011")
+  io.lb    := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_000_xxxxx_0000011")
+  io.lh    := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_001_xxxxx_0000011")
+  io.lw    := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_010_xxxxx_0000011")
+  io.lbu   := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_100_xxxxx_0000011")
+  io.lhu   := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_101_xxxxx_0000011")
+  io.sb    := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_000_xxxxx_0100011")
+  io.sh    := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_001_xxxxx_0100011")
+  io.sw    := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_010_xxxxx_0100011")
+  io.fence := DecodeBits(op, "0000_xxxx_xxxx_00000_000_00000_0001111")
+  io.addi  := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_000_xxxxx_0010011")
+  io.slti  := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_010_xxxxx_0010011")
+  io.sltiu := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_011_xxxxx_0010011")
+  io.xori  := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_100_xxxxx_0010011")
+  io.ori   := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_110_xxxxx_0010011")
+  io.andi  := DecodeBits(op, "xxxxxxxxxxxx_xxxxx_111_xxxxx_0010011")
+  io.slli  := DecodeBits(op, "0000000_xxxxx_xxxxx_001_xxxxx_0010011")
+  io.srli  := DecodeBits(op, "0000000_xxxxx_xxxxx_101_xxxxx_0010011")
+  io.srai  := DecodeBits(op, "0100000_xxxxx_xxxxx_101_xxxxx_0010011")
+  io.add   := DecodeBits(op, "0000000_xxxxx_xxxxx_000_xxxxx_0110011")
+  io.sub   := DecodeBits(op, "0100000_xxxxx_xxxxx_000_xxxxx_0110011")
+  io.slt   := DecodeBits(op, "0000000_xxxxx_xxxxx_010_xxxxx_0110011")
+  io.sltu  := DecodeBits(op, "0000000_xxxxx_xxxxx_011_xxxxx_0110011")
+  io.xor   := DecodeBits(op, "0000000_xxxxx_xxxxx_100_xxxxx_0110011")
+  io.or    := DecodeBits(op, "0000000_xxxxx_xxxxx_110_xxxxx_0110011")
+  io.and   := DecodeBits(op, "0000000_xxxxx_xxxxx_111_xxxxx_0110011")
+  io.sll   := DecodeBits(op, "0000000_xxxxx_xxxxx_001_xxxxx_0110011")
+  io.srl   := DecodeBits(op, "0000000_xxxxx_xxxxx_101_xxxxx_0110011")
+  io.sra   := DecodeBits(op, "0100000_xxxxx_xxxxx_101_xxxxx_0110011")
+
+  // RV32M
+  io.mul     := DecodeBits(op, "0000_001_xxxxx_xxxxx_000_xxxxx_0110011")
+  io.mulh    := DecodeBits(op, "0000_001_xxxxx_xxxxx_001_xxxxx_0110011")
+  io.mulhsu  := DecodeBits(op, "0000_001_xxxxx_xxxxx_010_xxxxx_0110011")
+  io.mulhu   := DecodeBits(op, "0000_001_xxxxx_xxxxx_011_xxxxx_0110011")
+  io.mulhr   := DecodeBits(op, "0010_001_xxxxx_xxxxx_001_xxxxx_0110011")
+  io.mulhsur := DecodeBits(op, "0010_001_xxxxx_xxxxx_010_xxxxx_0110011")
+  io.mulhur  := DecodeBits(op, "0010_001_xxxxx_xxxxx_011_xxxxx_0110011")
+  io.dmulh   := DecodeBits(op, "0000_010_xxxxx_xxxxx_001_xxxxx_0110011")
+  io.dmulhr  := DecodeBits(op, "0010_010_xxxxx_xxxxx_001_xxxxx_0110011")
+  io.div     := DecodeBits(op, "0000_001_xxxxx_xxxxx_100_xxxxx_0110011")
+  io.divu    := DecodeBits(op, "0000_001_xxxxx_xxxxx_101_xxxxx_0110011")
+  io.rem     := DecodeBits(op, "0000_001_xxxxx_xxxxx_110_xxxxx_0110011")
+  io.remu    := DecodeBits(op, "0000_001_xxxxx_xxxxx_111_xxxxx_0110011")
+
+  // RV32B
+  io.clz  := DecodeBits(op, "0110000_00000_xxxxx_001_xxxxx_0010011")
+  io.ctz  := DecodeBits(op, "0110000_00001_xxxxx_001_xxxxx_0010011")
+  io.pcnt := DecodeBits(op, "0110000_00010_xxxxx_001_xxxxx_0010011")
+  io.min  := DecodeBits(op, "0000101_xxxxx_xxxxx_100_xxxxx_0110011")
+  io.minu := DecodeBits(op, "0000101_xxxxx_xxxxx_101_xxxxx_0110011")
+  io.max  := DecodeBits(op, "0000101_xxxxx_xxxxx_110_xxxxx_0110011")
+  io.maxu := DecodeBits(op, "0000101_xxxxx_xxxxx_111_xxxxx_0110011")
+
+  // Decode scalar log.
+  val slog = DecodeBits(op, "01111_00_00000_xxxxx_0xx_00000_11101_11")
+
+  // Vector length.
+  io.getvl    := DecodeBits(op, "0001x_xx_xxxxx_xxxxx_000_xxxxx_11101_11") && op(26,25) =/= 3.U && (op(24,20) =/= 0.U || op(19,15) =/= 0.U)
+  io.getmaxvl := DecodeBits(op, "0001x_xx_00000_00000_000_xxxxx_11101_11") && op(26,25) =/= 3.U
+
+  // Vector load/store.
+  io.vld := DecodeBits(op, "000xxx_0xxxxx_xxxxx0_xx_xxxxxx_x_111_11")     // vld
+
+  io.vst := DecodeBits(op, "001xxx_0xxxxx_xxxxx0_xx_xxxxxx_x_111_11") ||  // vst
+            DecodeBits(op, "011xxx_0xxxxx_xxxxx0_xx_xxxxxx_x_111_11")     // vstq
+
+  // Convolution: transfer accumulators to vregs. Also decodes acset/actr ops.
+  val vconv = DecodeBits(op, "010100_000000_000000_xx_xxxxxx_x_111_11")
+
+  // Duplicate
+  val vdup = DecodeBits(op, "01000x_0xxxxx_000000_xx_xxxxxx_x_111_11") && op(13,12) <= 2.U
+  val vdupi = vdup && op(26) === 0.U
+
+  // Vector instructions.
+  io.viop := op(0) === 0.U ||     // .vv .vx
+             op(1,0) === 1.U ||  // .vvv .vxv
+             vconv || vdupi
+
+  // [extensions] Core controls.
+  io.ebreak := DecodeBits(op, "000000000001_00000_000_00000_11100_11")
+  io.ecall  := DecodeBits(op, "000000000000_00000_000_00000_11100_11")
+  io.eexit  := DecodeBits(op, "000000100000_00000_000_00000_11100_11")
+  io.eyield := DecodeBits(op, "000001000000_00000_000_00000_11100_11")
+  io.ectxsw := DecodeBits(op, "000001100000_00000_000_00000_11100_11")
+  io.mpause := DecodeBits(op, "000010000000_00000_000_00000_11100_11")
+  io.mret   := DecodeBits(op, "001100000010_00000_000_00000_11100_11")
+
+  // Fences.
+  io.fencei   := DecodeBits(op, "0000_0000_0000_00000_001_00000_0001111")
+  io.flushat  := DecodeBits(op, "0010x_xx_00000_xxxxx_000_00000_11101_11") && op(19,15) =/= 0.U
+  io.flushall := DecodeBits(op, "0010x_xx_00000_00000_000_00000_11101_11")
+
+  // [extensions] Scalar logging.
+  io.slog := slog
+
+  // Stub out decoder state not used beyond pipeline0.
+  if (pipeline > 0) {
+    io.csrrw := false.B
+    io.csrrs := false.B
+    io.csrrc := false.B
+
+    io.div := false.B
+    io.divu := false.B
+    io.rem := false.B
+    io.remu := false.B
+
+    io.ebreak := false.B
+    io.ecall  := false.B
+    io.eexit  := false.B
+    io.eyield := false.B
+    io.ectxsw := false.B
+    io.mpause := false.B
+    io.mret   := false.B
+
+    io.fence    := false.B
+    io.fencei   := false.B
+    io.flushat  := false.B
+    io.flushall := false.B
+
+    io.slog := false.B
+  }
+
+  // Generate the undefined opcode.
+  val decoded = Cat(io.lui, io.auipc,
+                    io.jal, io.jalr,
+                    io.beq, io.bne, io.blt, io.bge, io.bltu, io.bgeu,
+                    io.csrrw, io.csrrs, io.csrrc,
+                    io.lb, io.lh, io.lw, io.lbu, io.lhu,
+                    io.sb, io.sh, io.sw, io.fence,
+                    io.addi, io.slti, io.sltiu, io.xori, io.ori, io.andi,
+                    io.add, io.sub, io.slt, io.sltu, io.xor, io.or, io.and,
+                    io.slli, io.srli, io.srai, io.sll, io.srl, io.sra,
+                    io.mul, io.mulh, io.mulhsu, io.mulhu, io.mulhr, io.mulhsur, io.mulhur, io.dmulh, io.dmulhr,
+                    io.div, io.divu, io.rem, io.remu,
+                    io.clz, io.ctz, io.pcnt, io.min, io.minu, io.max, io.maxu,
+                    io.viop, io.vld, io.vst,
+                    io.getvl, io.getmaxvl,
+                    io.ebreak, io.ecall, io.eexit, io.eyield, io.ectxsw,
+                    io.mpause, io.mret, io.fencei, io.flushat, io.flushall, io.slog)
+
+  io.undef := !WiredOR(decoded)
+
+  // Delay the assert until the next cycle, so that logs appear on console.
+  val onehot_failed = RegInit(false.B)
+  assert(!onehot_failed)
+
+  val onehot_decode = PopCount(decoded)
+  when ((onehot_decode + io.undef) =/= 1.U) {
+    onehot_failed := true.B
+    printf("[FAIL] decode  inst=%x  addr=%x  decoded=0b%b  pipeline=%d\n",
+      io.inst, io.addr, decoded, pipeline.U)
+  }
+}
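+
+// Illustrative sketch only (an assumption, not part of the synthesized design):
+// DecodeBits is assumed to match `op` against a pattern in which '0'/'1' are
+// required bit values, 'x' is a don't-care, and '_' is a readability
+// separator. A plain-Scala model of that assumed behaviour, handy when
+// writing new patterns:
+object DecodeBitsSketch {
+  // e.g. matches(BigInt("6f", 16), "xxxxxxxxxxxxxxxxxxxx_xxxxx_1101111") is
+  // true, since 0x6f carries the JAL opcode in its low seven bits.
+  def matches(inst: BigInt, pattern: String): Boolean = {
+    val bits = pattern.filterNot(_ == '_')            // drop separators
+    bits.reverse.zipWithIndex.forall { case (c, i) =>  // index i is bit i
+      c match {
+        case 'x' => true                               // don't care
+        case '1' => inst.testBit(i)
+        case _   => !inst.testBit(i)                   // '0'
+      }
+    }
+  }
+}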
diff --git a/hdl/chisel/src/kelvin/scalar/Dvu.scala b/hdl/chisel/src/kelvin/scalar/Dvu.scala
new file mode 100644
index 0000000..8f117d1
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Dvu.scala
@@ -0,0 +1,145 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Dvu {
+  def apply(p: Parameters): Dvu = {
+    return Module(new Dvu(p))
+  }
+}
+
+case class DvuOp() {
+  val DIV  = 0
+  val DIVU = 1
+  val REM  = 2
+  val REMU = 3
+  val Entries = 4
+}
+
+class DvuIO(p: Parameters) extends Bundle {
+  val valid = Input(Bool())
+  val ready = Output(Bool())
+  val addr = Input(UInt(5.W))
+  val op = Input(UInt(new DvuOp().Entries.W))
+}
+
+class Dvu(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val req = new DvuIO(p)
+
+    // Execute cycle.
+    val rs1 = Flipped(new RegfileReadDataIO)
+    val rs2 = Flipped(new RegfileReadDataIO)
+    val rd  = new Bundle {  // RegfileWriteDataIO
+      val valid = Output(Bool())
+      val ready = Input(Bool())
+      val addr  = Output(UInt(5.W))
+      val data  = Output(UInt(32.W))
+    }
+  })
+
+  // This implementation differs from common::idiv by supporting early
+  // termination, and it computes only one quotient bit per cycle.
+  val dvu = new DvuOp()
+
+  def Divide(prvDivide: UInt, prvRemain: UInt, denom: UInt): (UInt, UInt) = {
+    val shfRemain = Cat(prvRemain(30,0), prvDivide(31))
+    val subtract = shfRemain -& denom
+    assert(subtract.getWidth == 33)
+    val divDivide = Wire(UInt(32.W))
+    val divRemain = Wire(UInt(32.W))
+
+    when (!subtract(32)) {
+      divDivide := Cat(prvDivide(30,0), 1.U(1.W))
+      divRemain := subtract(31,0)
+    } .otherwise {
+      divDivide := Cat(prvDivide(30,0), 0.U(1.W))
+      divRemain := shfRemain
+    }
+
+    (divDivide, divRemain)
+  }
+
+  val active = RegInit(false.B)
+  val compute = RegInit(false.B)
+
+  val addr1    = Reg(UInt(5.W))
+  val signed1  = Reg(Bool())
+  val divide1  = Reg(Bool())
+  val addr2    = Reg(UInt(5.W))
+  val signed2d = Reg(Bool())
+  val signed2r = Reg(Bool())
+  val divide2  = Reg(Bool())
+
+  val count = Reg(UInt(6.W))
+
+  val divide = Reg(UInt(32.W))
+  val remain = Reg(UInt(32.W))
+  val denom  = Reg(UInt(32.W))
+
+  val divByZero = io.rs2.data === 0.U
+
+  io.req.ready := !active && !compute && !count(5)
+
+  // Not a full Clz: for non-zero inputs with the MSB clear, the result is one
+  // less than the true leading-zero count.
+  def Clz1(bits: UInt): UInt = {
+    val msb = bits.getWidth - 1
+    Mux(bits(msb), 0.U, PriorityEncoder(Reverse(bits(msb - 1, 0))))
+  }
+
+  // Deassert active on the second-to-last cycle.
+  when (io.req.valid && io.req.ready) {
+    active := true.B
+  } .elsewhen (count === 30.U) {
+    active := false.B
+  }
+
+  // Compute is delayed by one cycle.
+  compute := active
+
+  when (io.req.valid && io.req.ready) {
+    addr1   := io.req.addr
+    signed1 := io.req.op(dvu.DIV) || io.req.op(dvu.REM)
+    divide1 := io.req.op(dvu.DIV) || io.req.op(dvu.DIVU)
+  }
+
+  when (active && !compute) {
+    addr2    := addr1
+    signed2d := signed1 && (io.rs1.data(31) =/= io.rs2.data(31)) && !divByZero
+    signed2r := signed1 && io.rs1.data(31)
+    divide2  := divide1
+
+    val inp = Mux(signed1 && io.rs1.data(31), ~io.rs1.data + 1.U, io.rs1.data)
+
+    // A divide by zero takes the full latency to simplify the logic.
+    // Count the leading zeroes; Clz1 may undercount by one, which only costs
+    // an extra divide iteration.
+    val clz = Mux(io.rs2.data === 0.U, 0.U, Clz1(inp))
+
+    denom  := Mux(signed1 && io.rs2.data(31), ~io.rs2.data + 1.U, io.rs2.data)
+    divide := inp << clz
+    remain := 0.U
+    count  := clz
+  } .elsewhen (compute && count < 32.U) {
+    val (div, rem) = Divide(divide, remain, denom)
+    divide := div
+    remain := rem
+    count := count + 1.U
+  } .elsewhen (io.rd.valid && io.rd.ready) {
+    count := 0.U
+  }
+
+  val div = Mux(signed2d, ~divide + 1.U, divide)
+  val rem = Mux(signed2r, ~remain + 1.U, remain)
+
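+  // count(5) (i.e. count >= 32) marks the division as complete; the result is
+  // held until the writeback handshake clears count.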
+  io.rd.valid := count(5)
+  io.rd.addr := addr2
+  io.rd.data := Mux(divide2, div, rem)
+}
+
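+// Illustrative software model (an assumption, not part of the synthesized
+// design) of the restoring-division step in Divide() above: each cycle shifts
+// the next dividend bit into the partial remainder and conditionally subtracts
+// the denominator, producing one quotient bit.
+object DvuDivideSketch {
+  // Unsigned 32-bit divide/remainder, e.g. divRem(BigInt(100), BigInt(7)) == (14, 2).
+  def divRem(num: BigInt, den: BigInt): (BigInt, BigInt) = {
+    require(num >= 0 && den > 0)
+    var quotient = num            // dividend bits shift out of the top
+    var remainder = BigInt(0)
+    for (_ <- 0 until 32) {
+      val shifted = (remainder << 1) | ((quotient >> 31) & 1)
+      quotient = (quotient << 1) & BigInt("ffffffff", 16)
+      if (shifted >= den) {       // the "subtract did not borrow" branch
+        remainder = shifted - den
+        quotient = quotient | 1
+      } else {
+        remainder = shifted
+      }
+    }
+    (quotient, remainder)
+  }
+}
+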
+object EmitDvu extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new Dvu(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Fetch.scala b/hdl/chisel/src/kelvin/scalar/Fetch.scala
new file mode 100644
index 0000000..0351317
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Fetch.scala
@@ -0,0 +1,507 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Fetch {
+  def apply(p: Parameters): Fetch = {
+    return Module(new Fetch(p))
+  }
+}
+
+class IBusIO(p: Parameters) extends Bundle {
+  // Control Phase.
+  val valid = Output(Bool())
+  val ready = Input(Bool())
+  val addr = Output(UInt(p.fetchAddrBits.W))
+  // Read Phase.
+  val rdata = Input(UInt(p.fetchDataBits.W))
+}
+
+class FetchInstruction(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val ready = Input(Bool())
+  val addr = Output(UInt(p.programCounterBits.W))
+  val inst = Output(UInt(p.instructionBits.W))
+  val brchFwd = Output(Bool())
+}
+
+class FetchIO(p: Parameters) extends Bundle {
+  val lanes = Vec(p.instructionLanes, new FetchInstruction(p))
+}
+
+class Fetch(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val csr = new CsrInIO(p)
+    val ibus = new IBusIO(p)
+    val inst = new FetchIO(p)
+    val branch = Flipped(Vec(4, new BranchTakenIO(p)))
+    val linkPort = Flipped(new RegfileLinkPortIO)
+    val iflush = Flipped(new IFlushIO(p))
+  })
+
+  // This is the only compiled and tested configuration (at this time).
+  assert(p.fetchAddrBits == 32)
+  assert(p.fetchDataBits == 256)
+
+  val aslice = Slice(UInt(p.fetchAddrBits.W), true)
+  val readAddr = Reg(UInt(p.fetchAddrBits.W))
+  val readDataEn = RegInit(false.B)
+
+  val readAddrEn = io.ibus.valid && io.ibus.ready
+  val readData = io.ibus.rdata
+  readDataEn := readAddrEn && !io.iflush.valid
+
+  io.iflush.ready := !aslice.io.out.valid
+
+  // L0 cache
+  // ____________________________________
+  // |        Tag           |Index|xxxxx|
+  // ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
+  val lanes    = p.fetchDataBits / p.instructionBits  // input lanes
+  val indices  = p.fetchCacheBytes * 8 / p.fetchDataBits
+  val indexLsb = log2Ceil(p.fetchDataBits / 8)
+  val indexMsb = log2Ceil(indices) + indexLsb - 1
+  val tagLsb   = indexMsb + 1
+  val tagMsb   = p.fetchAddrBits - 1
+  val indexCountBits = log2Ceil(indices - 1)
+
+  if (p.fetchCacheBytes == 1024) {
+    assert(indexLsb == 5)
+    assert(indexMsb == 9)
+    assert(tagLsb == 10)
+    assert(tagMsb == 31)
+    assert(indices == 32)
+    assert(indexCountBits == 5)
+    assert(lanes == 8)
+  }
+
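+  // Per-index cache state: valid bit, outstanding-fill request bit, tag, and
+  // cached data line.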
+  val l0valid = RegInit(0.U(indices.W))
+  val l0req   = RegInit(0.U(indices.W))
+  val l0tag   = Reg(Vec(indices, UInt((tagMsb - tagLsb + 1).W)))
+  val l0data  = Reg(Vec(indices, UInt(p.fetchDataBits.W)))
+
+  // Instruction outputs.
+  val instValid = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val instAddr  = Reg(Vec(4, UInt(p.instructionBits.W)))
+  val instBits  = Reg(Vec(4, UInt(p.instructionBits.W)))
+
+  val instAligned0 = Cat(instAddr(0)(31, indexLsb), 0.U(indexLsb.W))
+  val instAligned1 = instAligned0 + Cat(1.U, 0.U(indexLsb.W))
+
+  val instIndex0 = instAligned0(indexMsb, indexLsb)
+  val instIndex1 = instAligned1(indexMsb, indexLsb)
+
+  val instTag0 = instAligned0(tagMsb, tagLsb)
+  val instTag1 = instAligned1(tagMsb, tagLsb)
+
+  val l0valid0 = l0valid(instIndex0)
+  val l0valid1 = l0valid(instIndex1)
+
+  val l0tag0 = VecAt(l0tag, instIndex0)
+  val l0tag1 = VecAt(l0tag, instIndex1)
+
+  val match0 = l0valid0 && instTag0 === l0tag0
+  val match1 = l0valid1 && instTag1 === l0tag1
+
+  // Read interface.
+  // Do not request entries that are already in flight.
+  // Perform a branch tag lookup to see if the target is in the cache.
+  def Predecode(addr: UInt, op: UInt): (Bool, UInt) = {
+    val jal = DecodeBits(op, "xxxxxxxxxxxxxxxxxxxx_xxxxx_1101111")
+    val immed = Cat(Fill(12, op(31)), op(19,12), op(20), op(30,21), 0.U(1.W))
+    val target = addr + immed
+    (jal, target)
+  }
+
+  val (preBranchTaken0, preBranchTarget0) =
+      Predecode(instAddr(0), instBits(0))
+  val (preBranchTaken1, preBranchTarget1) =
+      Predecode(instAddr(1), instBits(1))
+  val (preBranchTaken2, preBranchTarget2) =
+      Predecode(instAddr(2), instBits(2))
+  val (preBranchTaken3, preBranchTarget3) =
+      Predecode(instAddr(3), instBits(3))
+
+  val preBranchTaken = io.inst.lanes(0).valid && preBranchTaken0 ||
+                       io.inst.lanes(1).valid && preBranchTaken1 ||
+                       io.inst.lanes(2).valid && preBranchTaken2 ||
+                       io.inst.lanes(3).valid && preBranchTaken3
+
+  val preBranchTarget = Mux(preBranchTaken0, preBranchTarget0,
+                        Mux(preBranchTaken1, preBranchTarget1,
+                        Mux(preBranchTaken2, preBranchTarget2,
+                            preBranchTarget3)))
+
+  val preBranchTag = preBranchTarget(tagMsb, tagLsb)
+  val preBranchIndex = preBranchTarget(indexMsb, indexLsb)
+
+  val branchTag0 = io.branch(0).value(tagMsb, tagLsb)
+  val branchTag1 = io.branch(1).value(tagMsb, tagLsb)
+  val branchTag2 = io.branch(2).value(tagMsb, tagLsb)
+  val branchTag3 = io.branch(3).value(tagMsb, tagLsb)
+  val branchIndex0 = io.branch(0).value(indexMsb, indexLsb)
+  val branchIndex1 = io.branch(1).value(indexMsb, indexLsb)
+  val branchIndex2 = io.branch(2).value(indexMsb, indexLsb)
+  val branchIndex3 = io.branch(3).value(indexMsb, indexLsb)
+
+  val l0validB0 = l0valid(branchIndex0)
+  val l0validB1 = l0valid(branchIndex1)
+  val l0validB2 = l0valid(branchIndex2)
+  val l0validB3 = l0valid(branchIndex3)
+  val l0validP  = l0valid(preBranchIndex)
+
+  val l0tagB0 = VecAt(l0tag, branchIndex0)
+  val l0tagB1 = VecAt(l0tag, branchIndex1)
+  val l0tagB2 = VecAt(l0tag, branchIndex2)
+  val l0tagB3 = VecAt(l0tag, branchIndex3)
+  val l0tagP  = VecAt(l0tag, preBranchIndex)
+
+  val reqB0 = io.branch(0).valid && !l0req(branchIndex0) &&
+      (branchTag0 =/= l0tagB0 || !l0validB0)
+  val reqB1 = io.branch(1).valid && !l0req(branchIndex1) &&
+      (branchTag1 =/= l0tagB1 || !l0validB1) &&
+      !io.branch(0).valid
+  val reqB2 = io.branch(2).valid && !l0req(branchIndex2) &&
+      (branchTag2 =/= l0tagB2 || !l0validB2) &&
+      !io.branch(0).valid && !io.branch(1).valid
+  val reqB3 = io.branch(3).valid && !l0req(branchIndex3) &&
+      (branchTag3 =/= l0tagB3 || !l0validB3) &&
+      !io.branch(0).valid && !io.branch(1).valid && !io.branch(2).valid
+  val reqP = preBranchTaken && !l0req(preBranchIndex) && (preBranchTag =/= l0tagP || !l0validP)
+  val req0 = !match0 && !l0req(instIndex0)
+  val req1 = !match1 && !l0req(instIndex1)
+
+  aslice.io.in.valid := (reqB0 || reqB1 || reqB2 || reqB3 || reqP || req0 || req1) && !io.iflush.valid
+  aslice.io.in.bits  := Mux(reqB0, Cat(io.branch(0).value(31,indexLsb), 0.U(indexLsb.W)),
+                        Mux(reqB1, Cat(io.branch(1).value(31,indexLsb), 0.U(indexLsb.W)),
+                        Mux(reqB2, Cat(io.branch(2).value(31,indexLsb), 0.U(indexLsb.W)),
+                        Mux(reqB3, Cat(io.branch(3).value(31,indexLsb), 0.U(indexLsb.W)),
+                        Mux(reqP,  Cat(preBranchTarget(31,indexLsb), 0.U(indexLsb.W)),
+                        Mux(req0, instAligned0, instAligned1))))))
+
+  when (readAddrEn) {
+    readAddr := io.ibus.addr
+  }
+
+  io.ibus.valid := aslice.io.out.valid
+  aslice.io.out.ready := io.ibus.ready || io.iflush.valid
+  io.ibus.addr := aslice.io.out.bits
+
+  // initialize tags to 1s as 0xfffxxxxx are invalid instruction addresses
+  val l0validClr = WireInit(0.U(indices.W))
+  val l0validSet = WireInit(0.U(indices.W))
+  val l0reqClr = WireInit(0.U(indices.W))
+  val l0reqSet = WireInit(0.U(indices.W))
+
+  val readIdx = readAddr(indexMsb, indexLsb)
+
+  for (i <- 0 until indices) {
+    when (readDataEn && readIdx === i.U) {
+      l0tag(i.U)  := readAddr(tagMsb, tagLsb)
+      l0data(i.U) := readData
+    }
+  }
+
+  when (readDataEn) {
+    val bits = OneHot(readIdx, indices)
+    l0validSet := bits
+    l0reqClr   := bits
+  }
+
+  when (io.iflush.valid) {
+    val clr = ~(0.U(l0validClr.getWidth.W))
+    l0validClr := clr
+    l0reqClr   := clr
+  }
+
+  when (aslice.io.in.valid && aslice.io.in.ready) {
+    l0reqSet := OneHot(aslice.io.in.bits(indexMsb, indexLsb), indices)
+  }
+
+  when (l0validClr =/= 0.U || l0validSet =/= 0.U) {
+    l0valid := (l0valid | l0validSet) & ~l0validClr
+  }
+
+  when (l0reqClr =/= 0.U || l0reqSet =/= 0.U) {
+    l0req := (l0req | l0reqSet) & ~l0reqClr
+  }
+
+  // Instruction Outputs
+  // Do not use the next instruction address directly in the lookup, as that
+  // creates excessive timing pressure. We know that the match is either on
+  // the old line or the next line, so we can late-mux between lookups of the
+  // prior values. Widen the arithmetic paths and select from the results.
+  val fetchEn = Wire(Vec(4, Bool()))
+
+  for (i <- 0 until 4) {
+    fetchEn(i) := io.inst.lanes(i).valid && io.inst.lanes(i).ready
+  }
+
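+  // One-hot advance select: bit k set means exactly k instructions issued
+  // this cycle (bit 0 means none), relying on the in-order lane ready chain.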
+  val fsel = Cat(fetchEn(3),
+                 fetchEn(2) && !fetchEn(3),
+                 fetchEn(1) && !fetchEn(2) && !fetchEn(3),
+                 fetchEn(0) && !fetchEn(1) && !fetchEn(2) && !fetchEn(3),
+                 !fetchEn(0) && !fetchEn(1) && !fetchEn(2) && !fetchEn(3))
+
+  val nxtInstAddr0 = instAddr(0)          // 0
+  val nxtInstAddr1 = instAddr(1)          // 4
+  val nxtInstAddr2 = instAddr(2)          // 8
+  val nxtInstAddr3 = instAddr(3)          // 12
+  val nxtInstAddr4 = instAddr(0) + 16.U   // 16
+  val nxtInstAddr5 = instAddr(1) + 16.U   // 20
+  val nxtInstAddr6 = instAddr(2) + 16.U   // 24
+  val nxtInstAddr7 = instAddr(3) + 16.U   // 28
+
+  val nxtInstAddr = Wire(Vec(4, UInt(p.instructionBits.W)))
+
+  nxtInstAddr(0) := Mux(fsel(4), nxtInstAddr4, 0.U) |
+                    Mux(fsel(3), nxtInstAddr3, 0.U) |
+                    Mux(fsel(2), nxtInstAddr2, 0.U) |
+                    Mux(fsel(1), nxtInstAddr1, 0.U) |
+                    Mux(fsel(0), nxtInstAddr0, 0.U)
+
+  nxtInstAddr(1) := Mux(fsel(4), nxtInstAddr5, 0.U) |
+                    Mux(fsel(3), nxtInstAddr4, 0.U) |
+                    Mux(fsel(2), nxtInstAddr3, 0.U) |
+                    Mux(fsel(1), nxtInstAddr2, 0.U) |
+                    Mux(fsel(0), nxtInstAddr1, 0.U)
+
+  nxtInstAddr(2) := Mux(fsel(4), nxtInstAddr6, 0.U) |
+                    Mux(fsel(3), nxtInstAddr5, 0.U) |
+                    Mux(fsel(2), nxtInstAddr4, 0.U) |
+                    Mux(fsel(1), nxtInstAddr3, 0.U) |
+                    Mux(fsel(0), nxtInstAddr2, 0.U)
+
+  nxtInstAddr(3) := Mux(fsel(4), nxtInstAddr7, 0.U) |
+                    Mux(fsel(3), nxtInstAddr6, 0.U) |
+                    Mux(fsel(2), nxtInstAddr5, 0.U) |
+                    Mux(fsel(1), nxtInstAddr4, 0.U) |
+                    Mux(fsel(0), nxtInstAddr3, 0.U)
+
+  val nxtInstIndex0 = nxtInstAddr(0)(indexMsb, indexLsb)
+  val nxtInstIndex1 = nxtInstAddr(3)(indexMsb, indexLsb)
+
+  val readFwd0 =
+      readDataEn && readAddr(31,indexLsb) === instAligned0(31,indexLsb)
+  val readFwd1 =
+      readDataEn && readAddr(31,indexLsb) === instAligned1(31,indexLsb)
+
+  val nxtMatch0Fwd = match0 || readFwd0
+  val nxtMatch1Fwd = match1 || readFwd1
+
+  val nxtMatch0 =
+      Mux(instIndex0(0) === nxtInstIndex0(0), nxtMatch0Fwd, nxtMatch1Fwd)
+  val nxtMatch1 =
+      Mux(instIndex0(0) === nxtInstIndex1(0), nxtMatch0Fwd, nxtMatch1Fwd)
+
+  val nxtInstValid = Wire(Vec(4, Bool()))
+
+  val nxtInstBits0 = Mux(readFwd0, readData, VecAt(l0data, instIndex0))
+  val nxtInstBits1 = Mux(readFwd1, readData, VecAt(l0data, instIndex1))
+  val nxtInstBits = Wire(Vec(16, UInt(p.instructionBits.W)))
+
+  for (i <- 0 until 8) {
+    val offset = 32 * i
+    nxtInstBits(i + 0) := nxtInstBits0(31 + offset, offset)
+    nxtInstBits(i + 8) := nxtInstBits1(31 + offset, offset)
+  }
+
+  def BranchMatchDe(valid: Bool, value: UInt):
+      (Bool, UInt, Vec[UInt], Vec[UInt]) = {
+
+    val addr = VecInit(value,
+                       value + 4.U,
+                       value + 8.U,
+                       value + 12.U)
+
+    val match0 = l0valid(addr(0)(indexMsb,indexLsb)) &&
+        addr(0)(tagMsb,tagLsb) === VecAt(l0tag, addr(0)(indexMsb,indexLsb))
+    val match1 = l0valid(addr(3)(indexMsb,indexLsb)) &&
+        addr(3)(tagMsb,tagLsb) === VecAt(l0tag, addr(3)(indexMsb,indexLsb))
+
+    val vvalid = VecInit(Mux(addr(0)(4,2) <= 7.U, match0, match1),
+                         Mux(addr(0)(4,2) <= 6.U, match0, match1),
+                         Mux(addr(0)(4,2) <= 5.U, match0, match1),
+                         Mux(addr(0)(4,2) <= 4.U, match0, match1))
+
+    val muxbits0 = VecAt(l0data, addr(0)(indexMsb,indexLsb))
+    val muxbits1 = VecAt(l0data, addr(3)(indexMsb,indexLsb))
+    val muxbits = Wire(Vec(16, UInt(p.instructionBits.W)))
+
+    for (i <- 0 until 8) {
+      val offset = 32 * i
+      muxbits(i + 0) := muxbits0(31 + offset, offset)
+      muxbits(i + 8) := muxbits1(31 + offset, offset)
+    }
+
+    val bits = Wire(Vec(4, UInt(p.instructionBits.W)))
+    for (i <- 0 until 4) {
+      val idx = Cat(addr(0)(5) =/= addr(i)(5), addr(i)(4,2))
+      bits(i) := VecAt(muxbits, idx)
+    }
+
+    (valid, vvalid.asUInt, addr, bits)
+  }
+
+  def BranchMatchEx(branch: Vec[BranchTakenIO]):
+      (Bool, UInt, Vec[UInt], Vec[UInt]) = {
+    val valid = branch(0).valid || branch(1).valid ||
+                branch(2).valid || branch(3).valid
+
+    val addr = VecInit(Mux(branch(0).valid, branch(0).value,
+                       Mux(branch(1).valid, branch(1).value,
+                       Mux(branch(2).valid, branch(2).value,
+                                            branch(3).value))),
+                       Mux(branch(0).valid, branch(0).value + 4.U,
+                       Mux(branch(1).valid, branch(1).value + 4.U,
+                       Mux(branch(2).valid, branch(2).value + 4.U,
+                                            branch(3).value + 4.U))),
+                       Mux(branch(0).valid, branch(0).value + 8.U,
+                       Mux(branch(1).valid, branch(1).value + 8.U,
+                       Mux(branch(2).valid, branch(2).value + 8.U,
+                                            branch(3).value + 8.U))),
+                       Mux(branch(0).valid, branch(0).value + 12.U,
+                       Mux(branch(1).valid, branch(1).value + 12.U,
+                       Mux(branch(2).valid, branch(2).value + 12.U,
+                                            branch(3).value + 12.U))))
+
+    val match0 = l0valid(addr(0)(indexMsb,indexLsb)) &&
+        addr(0)(tagMsb,tagLsb) === VecAt(l0tag, addr(0)(indexMsb,indexLsb))
+    val match1 = l0valid(addr(3)(indexMsb,indexLsb)) &&
+        addr(3)(tagMsb,tagLsb) === VecAt(l0tag, addr(3)(indexMsb,indexLsb))
+
+    val vvalid = VecInit(Mux(addr(0)(4,2) <= 7.U, match0, match1),
+                         Mux(addr(0)(4,2) <= 6.U, match0, match1),
+                         Mux(addr(0)(4,2) <= 5.U, match0, match1),
+                         Mux(addr(0)(4,2) <= 4.U, match0, match1))
+
+    val muxbits0 = VecAt(l0data, addr(0)(indexMsb,indexLsb))
+    val muxbits1 = VecAt(l0data, addr(3)(indexMsb,indexLsb))
+    val muxbits = Wire(Vec(16, UInt(p.instructionBits.W)))
+
+    for (i <- 0 until 8) {
+      val offset = 32 * i
+      muxbits(i + 0) := muxbits0(31 + offset, offset)
+      muxbits(i + 8) := muxbits1(31 + offset, offset)
+    }
+
+    val bits = Wire(Vec(4, UInt(p.instructionBits.W)))
+    for (i <- 0 until 4) {
+      val idx = Cat(addr(0)(5) =/= addr(i)(5), addr(i)(4,2))
+      bits(i) := VecAt(muxbits, idx)
+    }
+
+    (valid, vvalid.asUInt, addr, bits)
+  }
+
+  def PredecodeDe(addr: UInt, op: UInt): (Bool, UInt) = {
+    val jal = DecodeBits(op, "xxxxxxxxxxxxxxxxxxxx_xxxxx_1101111")
+    val ret = DecodeBits(op, "000000000000_00001_000_00000_1100111") &&
+                io.linkPort.valid
+    val bxx = DecodeBits(op, "xxxxxxx_xxxxx_xxxxx_xxx_xxxxx_1100011") &&
+                op(31) && op(14,13) =/= 1.U
+    val immjal = Cat(Fill(12, op(31)), op(19,12), op(20), op(30,21), 0.U(1.W))
+    val immbxx = Cat(Fill(20, op(31)), op(7), op(30,25), op(11,8), 0.U(1.W))
+    val immed = Mux(op(2), immjal, immbxx)
+    val target = Mux(ret, io.linkPort.value, addr + immed)
+    (jal || ret || bxx, target)
+  }
+
+  val (brchTakenDe0, brchTargetDe0) = PredecodeDe(instAddr(0), instBits(0))
+  val (brchTakenDe1, brchTargetDe1) = PredecodeDe(instAddr(1), instBits(1))
+  val (brchTakenDe2, brchTargetDe2) = PredecodeDe(instAddr(2), instBits(2))
+  val (brchTakenDe3, brchTargetDe3) = PredecodeDe(instAddr(3), instBits(3))
+
+  val brchTakenDeOr =
+      io.inst.lanes(0).valid && io.inst.lanes(0).ready && brchTakenDe0 ||
+      io.inst.lanes(1).valid && io.inst.lanes(1).ready && brchTakenDe1 ||
+      io.inst.lanes(2).valid && io.inst.lanes(2).ready && brchTakenDe2 ||
+      io.inst.lanes(3).valid && io.inst.lanes(3).ready && brchTakenDe3
+
+  val brchTargetDe = Mux(brchTakenDe0, brchTargetDe0,
+                     Mux(brchTakenDe1, brchTargetDe1,
+                     Mux(brchTakenDe2, brchTargetDe2,
+                         brchTargetDe3)))
+
+  val (brchTakenDe, brchValidDe, brchAddrDe, brchBitsDe) =
+      BranchMatchDe(brchTakenDeOr, brchTargetDe)
+
+  val (brchTakenEx, brchValidEx, brchAddrEx, brchBitsEx) =
+      BranchMatchEx(io.branch)
+
+  val brchValidDeMask =
+      Cat(!brchTakenDe0 && !brchTakenDe1 && !brchTakenDe2,
+          !brchTakenDe0 && !brchTakenDe1,
+          !brchTakenDe0,
+          true.B)
+
+  val brchFwd = Cat(
+      brchTakenDe3 && !brchTakenDe0 && !brchTakenDe1 && !brchTakenDe2,
+      brchTakenDe2 && !brchTakenDe0 && !brchTakenDe1,
+      brchTakenDe1 && !brchTakenDe0,
+      brchTakenDe0)
+
+  for (i <- 0 until 4) {
+    // 1, 11, 111, ...
+    nxtInstValid(i) := Mux(nxtInstAddr(0)(4,2) <= (7 - i).U, nxtMatch0, nxtMatch1)
+
+    val nxtInstValidUInt = nxtInstValid.asUInt
+    instValid(i) := Mux(brchTakenEx, brchValidEx(i,0) === ~0.U((i+1).W),
+                    Mux(brchTakenDe, brchValidDe(i,0) === ~0.U((i+1).W),
+                    nxtInstValidUInt(i,0) === ~0.U((i+1).W))) && !io.iflush.valid
+
+    instAddr(i) := Mux(brchTakenEx, brchAddrEx(i),
+                   Mux(brchTakenDe, brchAddrDe(i), nxtInstAddr(i)))
+
+    // Index bits (2,0) are the word offset within the base line plus the next
+    // line. Index bit (3) must account for the difference between the
+    // line-aligned bases of instAddr and nxtInstAddr.
+    val idx = Cat(instAddr(0)(5) =/= nxtInstAddr(i)(5), nxtInstAddr(i)(4,2))
+    instBits(i) := Mux(brchTakenEx, brchBitsEx(i),
+                   Mux(brchTakenDe, brchBitsDe(i),
+                   VecAt(nxtInstBits, idx)))
+  }
+
+  // This pattern of separate when() blocks requires the reset assignment to
+  // come after the data assignments.
+  when (reset.asBool) {
+    val addr = Cat(io.csr.value(0)(31,2), 0.U(2.W))
+    instAddr(0) := addr
+    instAddr(1) := addr + 4.U
+    instAddr(2) := addr + 8.U
+    instAddr(3) := addr + 12.U
+  }
+
+  // Outputs
+  for (i <- 0 until 4) {
+    io.inst.lanes(i).valid := instValid(i) & brchValidDeMask(i)
+    io.inst.lanes(i).addr  := instAddr(i)
+    io.inst.lanes(i).inst  := instBits(i)
+    io.inst.lanes(i).brchFwd := brchFwd(i)
+  }
+
+  // Assertions.
+  assert(instAddr(0) + 4.U === instAddr(1))
+  assert(instAddr(0) + 8.U === instAddr(2))
+  assert(instAddr(0) + 12.U === instAddr(3))
+
+  assert(fsel.getWidth == 5)
+  assert(PopCount(fsel) <= 1.U)
+
+  val instValidUInt = instValid.asUInt
+  assert(!(!instValidUInt(0) && (instValidUInt(3,1) =/= 0.U)))
+  assert(!(!instValidUInt(1) && (instValidUInt(3,2) =/= 0.U)))
+  assert(!(!instValidUInt(2) && (instValidUInt(3,3) =/= 0.U)))
+
+  val instLanesReady = Cat(io.inst.lanes(3).ready, io.inst.lanes(2).ready,
+                           io.inst.lanes(1).ready, io.inst.lanes(0).ready)
+  assert(!(!instLanesReady(0) && (instLanesReady(3,1) =/= 0.U)))
+  assert(!(!instLanesReady(1) && (instLanesReady(3,2) =/= 0.U)))
+  assert(!(!instLanesReady(2) && (instLanesReady(3,3) =/= 0.U)))
+}
+
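+// Illustrative helper (an assumption, not part of the synthesized design)
+// showing in plain Scala how Predecode()/PredecodeDe() reassemble the
+// scrambled RISC-V J-type immediate, i.e.
+// Cat(Fill(12, op(31)), op(19,12), op(20), op(30,21), 0.U(1.W)).
+object JalImmediateSketch {
+  // `inst` holds the 32-bit instruction in its low bits.
+  def jalImm(inst: Long): Long = {
+    def field(hi: Int, lo: Int): Long = (inst >> lo) & ((1L << (hi - lo + 1)) - 1)
+    val imm = (field(31, 31) << 20) |  // imm[20]    (sign bit)
+              (field(19, 12) << 12) |  // imm[19:12]
+              (field(20, 20) << 11) |  // imm[11]
+              (field(30, 21) << 1)     // imm[10:1], imm[0] is always zero
+    (imm << 43) >> 43                  // sign-extend the 21-bit immediate
+  }
+}
+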
+object EmitFetch extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new Fetch(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Flush.scala b/hdl/chisel/src/kelvin/scalar/Flush.scala
new file mode 100644
index 0000000..dddfcdf
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Flush.scala
@@ -0,0 +1,21 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+class IFlushIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val ready = Input(Bool())
+}
+
+class DFlushIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val ready = Input(Bool())
+  val all   = Output(Bool())  // all=0, see io.dbus.addr for line address.
+  val clean = Output(Bool())  // clean and flush
+}
+
+class DFlushFenceiIO(p: Parameters) extends DFlushIO(p) {
+  val fencei = Output(Bool())
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
new file mode 100644
index 0000000..7b6f82e
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -0,0 +1,288 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Lsu {
+  def apply(p: Parameters): Lsu = {
+    return Module(new Lsu(p))
+  }
+}
+
+class DBusIO(p: Parameters, bank: Boolean = false) extends Bundle {
+  // Control Phase.
+  val valid = Output(Bool())
+  val ready = Input(Bool())
+  val write = Output(Bool())
+  val addr = Output(UInt((p.lsuAddrBits - (if (bank) 1 else 0)).W))
+  val adrx = Output(UInt((p.lsuAddrBits - (if (bank) 1 else 0)).W))
+  val size = Output(UInt((log2Ceil(p.lsuDataBits / 8) + 1).W))
+  val wdata = Output(UInt(p.lsuDataBits.W))
+  val wmask = Output(UInt((p.lsuDataBits / 8).W))
+  // Read Phase.
+  val rdata = Input(UInt(p.lsuDataBits.W))
+}
+
+case class LsuOp() {
+  val LB  = 0
+  val LH  = 1
+  val LW  = 2
+  val LBU = 3
+  val LHU = 4
+  val SB  = 5
+  val SH  = 6
+  val SW  = 7
+  val FENCEI = 8
+  val FLUSHAT = 9
+  val FLUSHALL = 10
+  val VLDST = 11
+  val Entries = 12
+}
+
+class LsuIO(p: Parameters) extends Bundle {
+  val valid = Input(Bool())
+  val ready = Output(Bool())
+  val store = Input(Bool())
+  val addr = Input(UInt(5.W))
+  val op = Input(UInt(new LsuOp().Entries.W))
+}
+
+class LsuCtrl(p: Parameters) extends Bundle {
+  val addr = UInt(32.W)
+  val adrx = UInt(32.W)
+  val data = UInt(32.W)
+  val index = UInt(5.W)
+  val size = UInt((log2Ceil(p.lsuDataBits / 8) + 1).W)
+  val write = Bool()
+  val sext = Bool()
+  val iload = Bool()
+  val fencei = Bool()
+  val flushat = Bool()
+  val flushall = Bool()
+  val sldst = Bool()  // scalar load/store cached
+  val vldst = Bool()  // vector load/store
+  val suncd = Bool()  // scalar load/store uncached
+}
+
+class LsuReadData(p: Parameters) extends Bundle {
+  val addr = UInt(32.W)
+  val index = UInt(5.W)
+  val size = UInt((log2Ceil(p.lsuDataBits / 8) + 1).W)
+  val sext = Bool()
+  val iload = Bool()
+  val sldst = Bool()
+  val suncd = Bool()
+}
+
+class Lsu(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val req = Vec(4, new LsuIO(p))
+    val busPort = Flipped(new RegfileBusPortIO)
+
+    // Execute cycle(s).
+    val rd = Flipped(new RegfileWriteDataIO)
+
+    // Cached interface.
+    val dbus = new DBusIO(p)
+    val flush = new DFlushFenceiIO(p)
+
+    // Uncached interface.
+    val ubus = new DBusIO(p)
+
+    // Vector switch.
+    val vldst = Output(Bool())
+  })
+
+  val lsu = new LsuOp()
+
+  // AXI Queues.
+  val n = 8
+  val ctrl = Fifo4(new LsuCtrl(p), n)
+  val data = Slice(new LsuReadData(p), true, true)
+
+  // Match and mask: lane i may enqueue only when the FIFO has room for i+1 entries.
+  val ctrlready = Cat(ctrl.io.count <= (n - 4).U,
+                      ctrl.io.count <= (n - 3).U,
+                      ctrl.io.count <= (n - 2).U,
+                      ctrl.io.count <= (n - 1).U)
+
+  io.req(0).ready := ctrlready(0) && data.io.in.ready
+  io.req(1).ready := ctrlready(1) && data.io.in.ready
+  io.req(2).ready := ctrlready(2) && data.io.in.ready
+  io.req(3).ready := ctrlready(3) && data.io.in.ready
+
+  // The address phase must use simple logic to resolve the mask for unaligned addresses.
+  val linebit = log2Ceil(p.lsuDataBits / 8)
+  val lineoffset = (p.lsuDataBits / 8)
+
+  // ---------------------------------------------------------------------------
+  // Control Port Inputs.
+  ctrl.io.in.valid := io.req(0).valid || io.req(1).valid ||
+                      io.req(2).valid || io.req(3).valid
+
+  for (i <- 0 until 4) {
+    val uncached = io.busPort.addr(i)(31)
+
+    val opstore = io.req(i).op(lsu.SW) || io.req(i).op(lsu.SH) || io.req(i).op(lsu.SB)
+    val opiload = io.req(i).op(lsu.LW) || io.req(i).op(lsu.LH) || io.req(i).op(lsu.LB) || io.req(i).op(lsu.LHU) || io.req(i).op(lsu.LBU)
+    val opload  = opiload
+    val opfencei   = io.req(i).op(lsu.FENCEI)
+    val opflushat  = io.req(i).op(lsu.FLUSHAT)
+    val opflushall = io.req(i).op(lsu.FLUSHALL)
+    val opsldst = opstore || opload
+    val opvldst = io.req(i).op(lsu.VLDST)
+    val opsext = io.req(i).op(lsu.LB) || io.req(i).op(lsu.LH)
+    val opsize = Cat(io.req(i).op(lsu.LW) || io.req(i).op(lsu.SW),
+                     io.req(i).op(lsu.LH) || io.req(i).op(lsu.LHU) || io.req(i).op(lsu.SH),
+                     io.req(i).op(lsu.LB) || io.req(i).op(lsu.LBU) || io.req(i).op(lsu.SB))
+
+    ctrl.io.in.bits(i).valid := io.req(i).valid && ctrlready(i) && !(opvldst && uncached)
+
+    ctrl.io.in.bits(i).bits.addr := io.busPort.addr(i)
+    ctrl.io.in.bits(i).bits.adrx := io.busPort.addr(i) + lineoffset.U
+    ctrl.io.in.bits(i).bits.data := io.busPort.data(i)
+    ctrl.io.in.bits(i).bits.index := io.req(i).addr
+    ctrl.io.in.bits(i).bits.sext := opsext
+    ctrl.io.in.bits(i).bits.size := opsize
+    ctrl.io.in.bits(i).bits.iload := opiload
+    ctrl.io.in.bits(i).bits.fencei   := opfencei
+    ctrl.io.in.bits(i).bits.flushat  := opflushat
+    ctrl.io.in.bits(i).bits.flushall := opflushall
+    ctrl.io.in.bits(i).bits.sldst := opsldst && !uncached
+    ctrl.io.in.bits(i).bits.vldst := opvldst
+    ctrl.io.in.bits(i).bits.suncd := opsldst && uncached
+    ctrl.io.in.bits(i).bits.write := !opload
+  }
+
+  // ---------------------------------------------------------------------------
+  // Control Port Outputs.
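+  // The 32-bit store data is byte-rotated within the word and replicated
+  // across the full bus line; the write mask (size ones rotated by the byte
+  // offset within the line) then selects which replicated bytes are written.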
+  val wsel = ctrl.io.out.bits.addr(1,0)
+  val wda = ctrl.io.out.bits.data
+  val wdataS =
+    MuxOR(wsel === 0.U, wda(31,0)) |
+    MuxOR(wsel === 1.U, Cat(wda(23,16), wda(15,8), wda(7,0), wda(31,24))) |
+    MuxOR(wsel === 2.U, Cat(wda(15,8), wda(7,0), wda(31,24), wda(23,16))) |
+    MuxOR(wsel === 3.U, Cat(wda(7,0), wda(31,24), wda(23,16), wda(15,8)))
+  val wmaskB = p.lsuDataBits / 8
+  val wmaskT = (~0.U(wmaskB.W)) >> (wmaskB.U - ctrl.io.out.bits.size)
+  val wmaskS = (wmaskT << ctrl.io.out.bits.addr(linebit-1,0)) |
+               (wmaskT >> (lineoffset.U - ctrl.io.out.bits.addr(linebit-1,0)))
+  val wdata = Wire(UInt(p.lsuDataBits.W))
+  val wmask = wmaskS(lineoffset - 1, 0)
+
+  if (p.lsuDataBits == 128) {
+    wdata := Cat(wdataS, wdataS, wdataS, wdataS)
+  } else if (p.lsuDataBits == 256) {
+    wdata := Cat(wdataS, wdataS, wdataS, wdataS,
+                 wdataS, wdataS, wdataS, wdataS)
+  } else if (p.lsuDataBits == 512) {
+    wdata := Cat(wdataS, wdataS, wdataS, wdataS,
+                 wdataS, wdataS, wdataS, wdataS,
+                 wdataS, wdataS, wdataS, wdataS,
+                 wdataS, wdataS, wdataS, wdataS)
+  } else {
+    assert(false)
+  }
+
+  io.dbus.valid := ctrl.io.out.valid && ctrl.io.out.bits.sldst
+  io.dbus.write := ctrl.io.out.bits.write
+  io.dbus.addr  := Cat(0.U(1.W), ctrl.io.out.bits.addr(30,0))
+  io.dbus.adrx  := Cat(0.U(1.W), ctrl.io.out.bits.adrx(30,0))
+  io.dbus.size  := ctrl.io.out.bits.size
+  io.dbus.wdata := wdata
+  io.dbus.wmask := wmask
+  assert(!(io.dbus.valid && ctrl.io.out.bits.addr(31)))
+  assert(!(io.dbus.valid && io.dbus.addr(31)))
+  assert(!(io.dbus.valid && io.dbus.adrx(31)))
+
+  io.ubus.valid := ctrl.io.out.valid && ctrl.io.out.bits.suncd
+  io.ubus.write := ctrl.io.out.bits.write
+  io.ubus.addr  := Cat(0.U(1.W), ctrl.io.out.bits.addr(30,0))
+  io.ubus.adrx  := Cat(0.U(1.W), ctrl.io.out.bits.adrx(30,0))
+  io.ubus.size  := ctrl.io.out.bits.size
+  io.ubus.wdata := wdata
+  io.ubus.wmask := wmask
+  assert(!(io.ubus.valid && !ctrl.io.out.bits.addr(31)))
+  assert(!(io.ubus.valid && io.dbus.addr(31)))
+  assert(!(io.ubus.valid && io.dbus.adrx(31)))
+
+  io.flush.valid  := ctrl.io.out.valid && (ctrl.io.out.bits.fencei || ctrl.io.out.bits.flushat || ctrl.io.out.bits.flushall)
+  io.flush.all    := ctrl.io.out.bits.fencei || ctrl.io.out.bits.flushall
+  io.flush.clean  := true.B
+  io.flush.fencei := ctrl.io.out.bits.fencei
+
+  ctrl.io.out.ready := io.flush.valid && io.flush.ready ||
+                       io.dbus.valid && io.dbus.ready ||
+                       io.ubus.valid && io.ubus.ready ||
+                       ctrl.io.out.bits.vldst && io.dbus.ready
+
+  io.vldst := ctrl.io.out.valid && ctrl.io.out.bits.vldst
+
+  // ---------------------------------------------------------------------------
+  // Load response.
+  data.io.in.valid := io.dbus.valid && io.dbus.ready && !io.dbus.write ||
+                      io.ubus.valid && io.ubus.ready && !io.ubus.write
+
+  data.io.in.bits.addr  := ctrl.io.out.bits.addr
+  data.io.in.bits.index := ctrl.io.out.bits.index
+  data.io.in.bits.sext  := ctrl.io.out.bits.sext
+  data.io.in.bits.size  := ctrl.io.out.bits.size
+  data.io.in.bits.iload := ctrl.io.out.bits.iload
+  data.io.in.bits.sldst := ctrl.io.out.bits.sldst
+  data.io.in.bits.suncd := ctrl.io.out.bits.suncd
+
+  data.io.out.ready := true.B
+
+  assert(!(ctrl.io.in.valid && !data.io.in.ready))
+
+  // ---------------------------------------------------------------------------
+  // Register file ports.
+  val rvalid = data.io.out.valid
+  val rsext = data.io.out.bits.sext
+  val rsize = data.io.out.bits.size
+  val rsel  = data.io.out.bits.addr(linebit - 1, 0)
+
+  // Rotate and sign extend.
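+  // For each byte offset i, RotSignExt forms the little-endian word starting
+  // at byte i (wrapping within the bus line), masks it to the access size,
+  // and optionally sign-extends it; the lane whose offset matches rsel is
+  // OR-selected. E.g. a signed halfword load with rsel = 5 returns the
+  // sign-extended bytes 6:5 of the returned line.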
+  def RotSignExt(datain: UInt, dataout: UInt = 0.U(p.lsuDataBits.W), i: Int = 0): UInt = {
+    assert(datain.getWidth  == p.lsuDataBits)
+    assert(dataout.getWidth == p.lsuDataBits)
+
+    if (i < p.lsuDataBits / 8) {
+      val mod = p.lsuDataBits
+
+      val rdata = Cat(datain((8 * (i + 3) + 7) % mod, (8 * (i + 3)) % mod),
+                      datain((8 * (i + 2) + 7) % mod, (8 * (i + 2)) % mod),
+                      datain((8 * (i + 1) + 7) % mod, (8 * (i + 1)) % mod),
+                      datain((8 * (i + 0) + 7) % mod, (8 * (i + 0)) % mod))
+
+      val sizeMask = Mux(rsize === 4.U, 0xffffffff.S(32.W).asUInt,
+                     Mux(rsize === 2.U, 0x0000ffff.U(32.W), 0x000000ff.U(32.W)))
+
+      val signExtend = Mux(rsext,
+                         Mux(rsize === 2.U,
+                           Mux(rdata(15), 0xffff0000.S(32.W).asUInt, 0.U(32.W)),
+                           Mux(rdata(7),  0xffffff00.S(32.W).asUInt, 0.U(32.W))),
+                         0.U)
+      assert(sizeMask.getWidth == 32)
+      assert(signExtend.getWidth == 32)
+
+      val sdata = MuxOR(rsel === i.U, rdata & sizeMask | signExtend)
+      RotSignExt(datain, dataout | sdata, i + 1)
+    } else {
+      dataout
+    }
+  }
+
+  val rdata = RotSignExt(MuxOR(data.io.out.bits.sldst, io.dbus.rdata) |
+                         MuxOR(data.io.out.bits.suncd, io.ubus.rdata))
+
+  // pass-through
+  io.rd.valid := rvalid && data.io.out.bits.iload
+  io.rd.addr  := data.io.out.bits.index
+  io.rd.data  := rdata
+
+  assert(!ctrl.io.out.valid || PopCount(Cat(ctrl.io.out.bits.sldst, ctrl.io.out.bits.vldst, ctrl.io.out.bits.suncd)) <= 1.U)
+  assert(!data.io.out.valid || PopCount(Cat(data.io.out.bits.sldst, data.io.out.bits.suncd)) <= 1.U)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Mlu.scala b/hdl/chisel/src/kelvin/scalar/Mlu.scala
new file mode 100644
index 0000000..173909d
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Mlu.scala
@@ -0,0 +1,140 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Mlu {
+  def apply(p: Parameters): Mlu = {
+    return Module(new Mlu(p))
+  }
+}
+
+case class MluOp() {
+  val MUL = 0
+  val MULH = 1
+  val MULHSU = 2
+  val MULHU = 3
+  val MULHR = 4
+  val MULHSUR = 5
+  val MULHUR = 6
+  val DMULH = 7
+  val DMULHR = 8
+  val Entries = 9
+}
+
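+// The op request field is a one-hot (bit-per-operation) vector of Entries
+// bits indexed by the constants above, e.g. bit MULH requests the upper
+// 32 bits of the signed 32x32 product.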
+class MluIO(p: Parameters) extends Bundle {
+  val valid = Input(Bool())
+  val addr = Input(UInt(5.W))
+  val op = Input(UInt(new MluOp().Entries.W))
+}
+
+class Mlu(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val req = Vec(4, new MluIO(p))
+
+    // Execute cycle.
+    val rs1 = Vec(4, Flipped(new RegfileReadDataIO))
+    val rs2 = Vec(4, Flipped(new RegfileReadDataIO))
+    val rd  = Flipped(new RegfileWriteDataIO)
+  })
+
+  val mlu = new MluOp()
+
+  val op = RegInit(0.U(mlu.Entries.W))
+  val valid1 = RegInit(false.B)
+  val valid2 = RegInit(false.B)
+  val addr1 = Reg(UInt(5.W))
+  val addr2 = Reg(UInt(5.W))
+  val sel = Reg(UInt(4.W))
+
+  valid1 := io.req(0).valid || io.req(1).valid ||
+            io.req(2).valid || io.req(3).valid
+  valid2 := valid1
+
+  when (io.req(0).valid) {
+    op := io.req(0).op
+    addr1 := io.req(0).addr
+    sel := 1.U
+  } .elsewhen (io.req(1).valid) {
+    op := io.req(1).op
+    addr1 := io.req(1).addr
+    sel := 2.U
+  } .elsewhen (io.req(2).valid) {
+    op := io.req(2).op
+    addr1 := io.req(2).addr
+    sel := 4.U
+  } .elsewhen (io.req(3).valid) {
+    op := io.req(3).op
+    addr1 := io.req(3).addr
+    sel := 8.U
+  } .otherwise {
+    op := 0.U
+    sel := 0.U
+  }
+
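+  // sel is a one-hot record of the accepted issue slot, used below to read
+  // rs1/rs2 from the matching register-file ports.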
+  val rs1 = MuxOR(valid1 & sel(0), io.rs1(0).data) |
+            MuxOR(valid1 & sel(1), io.rs1(1).data) |
+            MuxOR(valid1 & sel(2), io.rs1(2).data) |
+            MuxOR(valid1 & sel(3), io.rs1(3).data)
+
+  val rs2 = MuxOR(valid1 & sel(0), io.rs2(0).data) |
+            MuxOR(valid1 & sel(1), io.rs2(1).data) |
+            MuxOR(valid1 & sel(2), io.rs2(2).data) |
+            MuxOR(valid1 & sel(3), io.rs2(3).data)
+
+  // Multiplier has a registered output.
+  val mul2 = Reg(UInt(32.W))
+  val round2 = Reg(UInt(1.W))
+
+  when (valid1) {
+    val rs2signed = op(mlu.MULH) || op(mlu.MULHR) || op(mlu.DMULH) || op(mlu.DMULHR)
+    val rs1signed = op(mlu.MULHSU) || op(mlu.MULHSUR) || rs2signed
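+    // Prepending a sign bit (the operand MSB for signed variants, zero for
+    // unsigned) makes both operands 33-bit signed, so a single 33x33 signed
+    // multiplier covers MUL/MULH/MULHSU/MULHU and yields a 66-bit product.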
+    val rs1s = Cat(rs1signed && rs1(31), rs1).asSInt
+    val rs2s = Cat(rs2signed && rs2(31), rs2).asSInt
+    val prod = rs1s.asSInt * rs2s.asSInt
+    assert(prod.getWidth == 66)
+
+    addr2 := addr1
+    round2 := prod(30) && op(mlu.DMULHR) ||
+              prod(31) && (op(mlu.MULHR) || op(mlu.MULHSUR) || op(mlu.MULHUR))
+
+    when (op(mlu.MUL)) {
+      mul2 := prod(31,0)
+    } .elsewhen (op(mlu.MULH) || op(mlu.MULHSU) || op(mlu.MULHU) || op(mlu.MULHR) || op(mlu.MULHSUR) || op(mlu.MULHUR)) {
+      mul2 := prod(63,32)
+    } .elsewhen (op(mlu.DMULH) || op(mlu.DMULHR)) {
+      val maxneg = 2.U(2.W)
+      val halfneg = 1.U(2.W)
+      val sat = rs1(29,0) === 0.U && rs2(29,0) === 0.U &&
+                (rs1(31,30) === maxneg && rs2(31,30) === maxneg ||
+                 rs1(31,30) === maxneg && rs2(31,30) === halfneg ||
+                 rs2(31,30) === maxneg && rs1(31,30) === halfneg)
+      when (sat) {
+        when (prod(65)) {
+          mul2 := 0x7fffffff.U(32.W)
+        } .otherwise {
+          mul2 := Cat(1.U(1.W), 0.U(31.W))
+        }
+      } .otherwise {
+        mul2 := prod(62,31)
+      }
+    }
+  }
+
+  io.rd.valid := valid2
+  io.rd.addr  := addr2
+  io.rd.data  := mul2 + round2
+
+  // Assertions.
+  for (i <- 0 until 4) {
+    assert(!(valid1 && sel(i) && !io.rs1(i).valid))
+    assert(!(valid1 && sel(i) && !io.rs2(i).valid))
+  }
+}
+
+object EmitMlu extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new Mlu(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/Regfile.scala b/hdl/chisel/src/kelvin/scalar/Regfile.scala
new file mode 100644
index 0000000..800aaac
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/Regfile.scala
@@ -0,0 +1,255 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object Regfile {
+  def apply(p: Parameters): Regfile = {
+    return Module(new Regfile(p))
+  }
+}
+
+class RegfileReadAddrIO extends Bundle {
+  val valid = Input(Bool())
+  val addr  = Input(UInt(5.W))
+}
+
+class RegfileReadSetIO extends Bundle {
+  val valid = Input(Bool())
+  val value = Input(UInt(32.W))
+}
+
+class RegfileReadDataIO extends Bundle {
+  val valid = Output(Bool())
+  val data  = Output(UInt(32.W))
+}
+
+class RegfileWriteAddrIO extends Bundle {
+  val valid = Input(Bool())
+  val addr  = Input(UInt(5.W))
+}
+
+class RegfileWriteDataIO extends Bundle {
+  val valid = Input(Bool())
+  val addr  = Input(UInt(5.W))
+  val data  = Input(UInt(32.W))
+}
+
+class RegfileBusAddrIO extends Bundle {
+  val valid = Input(Bool())
+  val bypass = Input(Bool())
+  val immen = Input(Bool())
+  val immed = Input(UInt(32.W))
+}
+
+class RegfileBusPortIO extends Bundle {
+  val addr = Output(Vec(4, UInt(32.W)))
+  val data = Output(Vec(4, UInt(32.W)))
+}
+
+class RegfileLinkPortIO extends Bundle {
+  val valid = Output(Bool())
+  val value = Output(UInt(32.W))
+}
+
+class RegfileBranchTargetIO extends Bundle {
+  val data = Output(UInt(32.W))
+}
+
+class Regfile(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val readAddr = Vec(8, new RegfileReadAddrIO)
+    val readSet  = Vec(8, new RegfileReadSetIO)
+    val writeAddr = Vec(4, new RegfileWriteAddrIO)
+    val busAddr = Vec(4, new RegfileBusAddrIO)
+    val target = Vec(4, new RegfileBranchTargetIO)
+    val linkPort = new RegfileLinkPortIO
+    val busPort = new RegfileBusPortIO
+
+    // Execute cycle.
+    val readData = Vec(8, new RegfileReadDataIO)
+    val writeData = Vec(6, new RegfileWriteDataIO)
+    val writeMask = Vec(5, new Bundle {val valid = Input(Bool())})
+    val scoreboard = new Bundle {
+      val regd = Output(UInt(32.W))
+      val comb = Output(UInt(32.W))
+    }
+  })
+
+  // 8R6W
+  // 8 read ports
+  // 6 write ports
+
+  // The scalar registers: integer (floating-point support is a TODO).
+  val regfile = Reg(Vec(32, UInt(32.W)))
+
+  // ***************************************************************************
+  // The scoreboard.
+  // ***************************************************************************
+  val scoreboard = RegInit(0.U(32.W))
+
+  // The write Addr:Data contract covers speculated opcodes: an opcode in the
+  // shadow of a taken branch still sets and clears the scoreboard, but its
+  // actual register write is masked.
+  val scoreboard_set =
+    MuxOR(io.writeAddr(0).valid, OneHot(io.writeAddr(0).addr, 32)) |
+    MuxOR(io.writeAddr(1).valid, OneHot(io.writeAddr(1).addr, 32)) |
+    MuxOR(io.writeAddr(2).valid, OneHot(io.writeAddr(2).addr, 32)) |
+    MuxOR(io.writeAddr(3).valid, OneHot(io.writeAddr(3).addr, 32))
+
+  val scoreboard_clr0 =
+    MuxOR(io.writeData(0).valid, OneHot(io.writeData(0).addr, 32)) |
+    MuxOR(io.writeData(1).valid, OneHot(io.writeData(1).addr, 32)) |
+    MuxOR(io.writeData(2).valid, OneHot(io.writeData(2).addr, 32)) |
+    MuxOR(io.writeData(3).valid, OneHot(io.writeData(3).addr, 32)) |
+    MuxOR(io.writeData(4).valid, OneHot(io.writeData(4).addr, 32)) |
+    MuxOR(io.writeData(5).valid, OneHot(io.writeData(5).addr, 32))
+
+  val scoreboard_clr = Cat(scoreboard_clr0(31,1), 0.U(1.W))
+
+  when (scoreboard_set =/= 0.U || scoreboard_clr =/= 0.U) {
+    val nxtScoreboard = (scoreboard & ~scoreboard_clr) | scoreboard_set
+    scoreboard := Cat(nxtScoreboard(31,1), 0.U(1.W))
+  }
+
+  io.scoreboard.regd := scoreboard
+  io.scoreboard.comb := scoreboard & ~scoreboard_clr
+
+  // ***************************************************************************
+  // The read port response.
+  // ***************************************************************************
+  val readDataReady = RegInit(VecInit(Seq.fill(8){false.B}))
+  val readDataBits  = Reg(Vec(8, UInt(32.W)))
+  val nxtReadDataBits = Wire(Vec(8, UInt(32.W)))
+
+  for (i <- 0 until 8) {
+    io.readData(i).valid := readDataReady(i)
+    io.readData(i).data  := readDataBits(i)
+  }
+
+  // ***************************************************************************
+  // One hot write ports.
+  // ***************************************************************************
+  val writeValid = Wire(Vec(32, Bool()))
+  val writeData  = Wire(Vec(32, UInt(32.W)))
+
+  writeValid(0) := true.B  // do not require special casing of indices
+  writeData(0)  := 0.U     // regfile(0) is optimized away
+
+  for (i <- 1 until 32) {
+    val valid = Cat(io.writeData(5).valid && io.writeData(5).addr === i.U,
+                    io.writeData(4).valid && io.writeData(4).addr === i.U &&
+                      !io.writeMask(4).valid,
+                    io.writeData(3).valid && io.writeData(3).addr === i.U &&
+                      !io.writeMask(3).valid,
+                    io.writeData(2).valid && io.writeData(2).addr === i.U &&
+                      !io.writeMask(2).valid,
+                    io.writeData(1).valid && io.writeData(1).addr === i.U &&
+                      !io.writeMask(1).valid,
+                    io.writeData(0).valid && io.writeData(0).addr === i.U &&
+                      !io.writeMask(0).valid)
+
+    val data  = MuxOR(valid(0), io.writeData(0).data) |
+                MuxOR(valid(1), io.writeData(1).data) |
+                MuxOR(valid(2), io.writeData(2).data) |
+                MuxOR(valid(3), io.writeData(3).data) |
+                MuxOR(valid(4), io.writeData(4).data) |
+                MuxOR(valid(5), io.writeData(5).data)
+
+    writeValid(i) := valid =/= 0.U
+    writeData(i)  := data
+
+    assert(PopCount(valid) <= 1.U)
+  }
+
+  for (i <- 0 until 32) {
+    when (writeValid(i)) {
+      regfile(i) := writeData(i)
+    }
+  }
+
+  // ***************************************************************************
+  // Read ports with write forwarding.
+  // ***************************************************************************
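+  // rwdata forwards this cycle's write data when the addressed register is
+  // being written, so a read issued in the same cycle does not observe a
+  // stale value.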
+  val rdata = Wire(Vec(8, UInt(32.W)))
+  val wdata = Wire(Vec(8, UInt(32.W)))
+  val rwdata = Wire(Vec(8, UInt(32.W)))
+  for (i <- 0 until 8) {
+    val idx = io.readAddr(i).addr
+    val write = VecAt(writeValid, idx)
+    rdata(i) := VecAt(regfile, idx)
+    wdata(i) := VecAt(writeData, idx)
+    rwdata(i) := Mux(write, wdata(i), rdata(i))
+  }
+
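+  // readSet lets decode supply the read value directly; when it fires, the
+  // provided value is captured instead of the (forwarded) regfile data.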
+  for (i <- 0 until 8) {
+    val setValid = io.readSet(i).valid
+    val setValue = io.readSet(i).value
+
+    val nxtReadDataReady = io.readAddr(i).valid || setValid
+
+    readDataReady(i) := nxtReadDataReady
+
+    nxtReadDataBits(i) := Mux(setValid, setValue, rwdata(i))
+
+    when (nxtReadDataReady) {
+      readDataBits(i) := nxtReadDataBits(i)
+    }
+  }
+
+  // Bus port priority encoded address.
+  val busAddr = Wire(Vec(4, UInt(32.W)))
+  val busValid = Cat(io.busAddr(3).valid, io.busAddr(2).valid,
+                     io.busAddr(1).valid, io.busAddr(0).valid)
+
+  for (i <- 0 until 4) {
+    busAddr(i) := Mux(io.busAddr(i).bypass, rwdata(2 * i),
+                  Mux(io.busAddr(i).immen, rdata(2 * i) + io.busAddr(i).immed,
+                      rdata(2 * i)))
+  }
+
+  for (i <- 0 until 4) {
+    io.busPort.addr(i) := busAddr(i)
+    io.busPort.data(i) := nxtReadDataBits(2 * i + 1)
+  }
+
+  // Branch target address combinatorial.
+  for (i <- 0 until 4) {
+    io.target(i).data := busAddr(i)
+  }
+
+  // ***************************************************************************
+  // Link port.
+  // ***************************************************************************
+  io.linkPort.valid := !scoreboard(1)
+  io.linkPort.value := regfile(1)
+
+  // ***************************************************************************
+  // Assertions.
+  // ***************************************************************************
+  for (i <- 0 until 4) {
+    assert(busAddr(i).getWidth == p.lsuAddrBits)
+  }
+
+  for (i <- 0 until 6) {
+    for (j <- (i+1) until 6) {
+      // Delay the failure a cycle for debugging purposes.
+      val write_fail = RegInit(false.B)
+      write_fail := io.writeData(i).valid && io.writeData(j).valid &&
+                    io.writeData(i).addr === io.writeData(j).addr &&
+                    io.writeData(i).addr =/= 0.U
+      assert(!write_fail)
+    }
+  }
+
+  val scoreboard_error = RegInit(false.B)
+  scoreboard_error := (scoreboard & scoreboard_clr) =/= scoreboard_clr
+  assert(!scoreboard_error)
+}
+
+object EmitRegfile extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new Regfile(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala
new file mode 100644
index 0000000..786bbe6
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -0,0 +1,350 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object SCore {
+  def apply(p: Parameters): SCore = {
+    return Module(new SCore(p))
+  }
+}
+
+class SCore(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val csr = new CsrInOutIO(p)
+    val halted = Output(Bool())
+    val fault = Output(Bool())
+
+    val ibus = new IBusIO(p)
+    val dbus = new DBusIO(p)
+    val ubus = new DBusIO(p)
+    val vldst = Output(Bool())
+
+    val vcore = Flipped(new VCoreIO(p))
+
+    val iflush = new IFlushIO(p)
+    val dflush = new DFlushIO(p)
+    val slog = new SLogIO(p)
+
+    val debug = new DebugIO(p)
+  })
+
+  // The functional units that make up the core.
+  val regfile = Regfile(p)
+  val fetch = Fetch(p)
+  val decode = Seq(Decode(p, 0), Decode(p, 1), Decode(p, 2), Decode(p, 3))
+  val alu = Seq.fill(4)(Alu(p))
+  val bru = Seq.fill(4)(Bru(p))
+  val csr = Csr(p)
+  val lsu = Lsu(p)
+  val mlu = Mlu(p)
+  val dvu = Dvu(p)
+
+  // Wire up the core.
+  val branchTaken = bru(0).io.taken.valid || bru(1).io.taken.valid ||
+                    bru(2).io.taken.valid || bru(3).io.taken.valid
+
+  // ---------------------------------------------------------------------------
+  // IFlush
+  val iflush = RegInit(false.B)
+
+  when (bru(0).io.iflush) {
+    iflush := true.B
+  } .elsewhen (fetch.io.iflush.ready && io.iflush.ready &&
+               lsu.io.flush.ready && lsu.io.flush.fencei) {
+    iflush := false.B
+  }
+
+  io.dflush.valid := lsu.io.flush.valid
+  io.dflush.all   := lsu.io.flush.all
+  io.dflush.clean := lsu.io.flush.clean
+  lsu.io.flush.ready := io.dflush.ready
+
+  assert(!bru(1).io.iflush)
+  assert(!bru(2).io.iflush)
+  assert(!bru(3).io.iflush)
+
+  // ---------------------------------------------------------------------------
+  // Fetch
+  fetch.io.csr := io.csr.in
+
+  for (i <- 0 until 4) {
+    fetch.io.branch(i) := bru(i).io.taken
+  }
+
+  fetch.io.linkPort := regfile.io.linkPort
+
+  fetch.io.iflush.valid := iflush
+
+  // ---------------------------------------------------------------------------
+  // Decode
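+  // Issue mask: lane i may only dispatch if every earlier decode lane can
+  // accept its instruction this cycle (in-order issue within the fetch group).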
+  val mask = VecInit(true.B,
+                     decode(0).io.inst.ready,
+                     decode(0).io.inst.ready && decode(1).io.inst.ready,
+                     decode(0).io.inst.ready && decode(1).io.inst.ready &&
+                       decode(2).io.inst.ready)
+
+  for (i <- 0 until 4) {
+    decode(i).io.inst.valid := fetch.io.inst.lanes(i).valid && mask(i)
+    fetch.io.inst.lanes(i).ready := decode(i).io.inst.ready && mask(i)
+    decode(i).io.inst.addr := fetch.io.inst.lanes(i).addr
+    decode(i).io.inst.inst := fetch.io.inst.lanes(i).inst
+    decode(i).io.inst.brchFwd := fetch.io.inst.lanes(i).brchFwd
+
+    decode(i).io.branchTaken := branchTaken
+    decode(i).io.halted := csr.io.halted
+  }
+
+  // Interlock based on regfile write port dependencies.
+  decode(0).io.interlock := bru(0).io.interlock
+  decode(1).io.interlock := decode(0).io.interlock
+  decode(2).io.interlock := decode(1).io.interlock
+  decode(3).io.interlock := decode(2).io.interlock
+
+  // Serialize opcodes with only one pipeline.
+  decode(0).io.serializeIn.defaults()
+  decode(1).io.serializeIn := decode(0).io.serializeOut
+  decode(2).io.serializeIn := decode(1).io.serializeOut
+  decode(3).io.serializeIn := decode(2).io.serializeOut
+
+  // In decode, update the multi-issue scoreboard state.
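+  // Each later lane ORs the destinations speculatively marked by earlier
+  // lanes in the same group into its scoreboard view, so same-cycle
+  // dependencies are detected before the register-file scoreboard updates.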
+  val scoreboard_spec1 = decode(0).io.scoreboard.spec
+  val scoreboard_spec2 = decode(1).io.scoreboard.spec | scoreboard_spec1
+  val scoreboard_spec3 = decode(2).io.scoreboard.spec | scoreboard_spec2
+  assert(scoreboard_spec1.getWidth == 32)
+  assert(scoreboard_spec2.getWidth == 32)
+  assert(scoreboard_spec3.getWidth == 32)
+
+  decode(0).io.scoreboard.comb := regfile.io.scoreboard.comb
+  decode(0).io.scoreboard.regd := regfile.io.scoreboard.regd
+  decode(1).io.scoreboard.comb := regfile.io.scoreboard.comb | scoreboard_spec1
+  decode(1).io.scoreboard.regd := regfile.io.scoreboard.regd | scoreboard_spec1
+  decode(2).io.scoreboard.comb := regfile.io.scoreboard.comb | scoreboard_spec2
+  decode(2).io.scoreboard.regd := regfile.io.scoreboard.regd | scoreboard_spec2
+  decode(3).io.scoreboard.comb := regfile.io.scoreboard.comb | scoreboard_spec3
+  decode(3).io.scoreboard.regd := regfile.io.scoreboard.regd | scoreboard_spec3
+
+
+  decode(0).io.mactive := io.vcore.mactive
+  decode(1).io.mactive := false.B
+  decode(2).io.mactive := false.B
+  decode(3).io.mactive := false.B
+
+  // ---------------------------------------------------------------------------
+  // ALU
+  for (i <- 0 until 4) {
+    alu(i).io.req := decode(i).io.alu
+    alu(i).io.rs1 := regfile.io.readData(2 * i + 0)
+    alu(i).io.rs2 := regfile.io.readData(2 * i + 1)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Branch Unit
+  for (i <- 0 until 4) {
+    bru(i).io.req := decode(i).io.bru
+    bru(i).io.rs1 := regfile.io.readData(2 * i + 0)
+    bru(i).io.rs2 := regfile.io.readData(2 * i + 1)
+    bru(i).io.target := regfile.io.target(i)
+  }
+
+  bru(0).io.csr <> csr.io.bru
+  bru(1).io.csr.defaults()
+  bru(2).io.csr.defaults()
+  bru(3).io.csr.defaults()
+
+  io.iflush.valid := iflush
+
+  // ---------------------------------------------------------------------------
+  // Control Status Unit
+  csr.io.csr <> io.csr
+
+  csr.io.req <> decode(0).io.csr
+  csr.io.rs1 := regfile.io.readData(0)
+
+  csr.io.vcore.undef := io.vcore.undef
+
+  // ---------------------------------------------------------------------------
+  // Status
+  io.halted := csr.io.halted
+  io.fault  := csr.io.fault
+
+  // ---------------------------------------------------------------------------
+  // Load/Store Unit
+  lsu.io.busPort := regfile.io.busPort
+
+  for (i <- 0 until 4) {
+    lsu.io.req(i).valid := decode(i).io.lsu.valid
+    lsu.io.req(i).store := decode(i).io.lsu.store
+    lsu.io.req(i).addr  := decode(i).io.lsu.addr
+    lsu.io.req(i).op    := decode(i).io.lsu.op
+    decode(i).io.lsu.ready := lsu.io.req(i).ready
+  }
+
+  // ---------------------------------------------------------------------------
+  // Multiplier Unit
+  mlu.io.req(0) := decode(0).io.mlu
+  mlu.io.req(1) := decode(1).io.mlu
+  mlu.io.req(2) := decode(2).io.mlu
+  mlu.io.req(3) := decode(3).io.mlu
+  mlu.io.rs1(0) := regfile.io.readData(0)
+  mlu.io.rs1(1) := regfile.io.readData(2)
+  mlu.io.rs1(2) := regfile.io.readData(4)
+  mlu.io.rs1(3) := regfile.io.readData(6)
+  mlu.io.rs2(0) := regfile.io.readData(1)
+  mlu.io.rs2(1) := regfile.io.readData(3)
+  mlu.io.rs2(2) := regfile.io.readData(5)
+  mlu.io.rs2(3) := regfile.io.readData(7)
+
+  // On taken branches, multicycle MLU execute must be masked
+  val mluInvalidate = RegInit(false.B)
+  mluInvalidate := branchTaken
+
+  // ---------------------------------------------------------------------------
+  // Divide Unit
+  dvu.io.req <> decode(0).io.dvu
+  dvu.io.rs1 := regfile.io.readData(0)
+  dvu.io.rs2 := regfile.io.readData(1)
+  dvu.io.rd.ready := !mlu.io.rd.valid
+
+  // TODO: make port conditional on pipeline index.
+  for (i <- 1 until 4) {
+    decode(i).io.dvu.ready := false.B
+  }
+
+  // ---------------------------------------------------------------------------
+  // Register File
+  for (i <- 0 until 4) {
+    regfile.io.readAddr(2 * i + 0) := decode(i).io.rs1Read
+    regfile.io.readAddr(2 * i + 1) := decode(i).io.rs2Read
+    regfile.io.readSet(2 * i + 0) := decode(i).io.rs1Set
+    regfile.io.readSet(2 * i + 1) := decode(i).io.rs2Set
+    regfile.io.writeAddr(i) := decode(i).io.rdMark
+    regfile.io.busAddr(i) := decode(i).io.busRead
+
+    val csr0Valid = if (i == 0) csr.io.rd.valid else false.B
+    val csr0Addr  = if (i == 0) csr.io.rd.addr else 0.U
+    val csr0Data  = if (i == 0) csr.io.rd.data else 0.U
+
+
+    regfile.io.writeData(i).valid := csr0Valid ||
+                                     alu(i).io.rd.valid || bru(i).io.rd.valid ||
+                                     io.vcore.rd(i).valid
+
+    regfile.io.writeData(i).addr :=
+        MuxOR(csr0Valid, csr0Addr) |
+        MuxOR(alu(i).io.rd.valid, alu(i).io.rd.addr) |
+        MuxOR(bru(i).io.rd.valid, bru(i).io.rd.addr) |
+        MuxOR(io.vcore.rd(i).valid, io.vcore.rd(i).addr)
+
+    regfile.io.writeData(i).data :=
+        MuxOR(csr0Valid, csr0Data) |
+        MuxOR(alu(i).io.rd.valid, alu(i).io.rd.data) |
+        MuxOR(bru(i).io.rd.valid, bru(i).io.rd.data) |
+        MuxOR(io.vcore.rd(i).valid, io.vcore.rd(i).data)
+
+    assert((csr0Valid +&
+            alu(i).io.rd.valid +& bru(i).io.rd.valid +&
+            io.vcore.rd(i).valid) <= 1.U)
+  }
+
+  regfile.io.writeData(4).valid := mlu.io.rd.valid || dvu.io.rd.valid
+  regfile.io.writeData(4).addr := Mux(mlu.io.rd.valid, mlu.io.rd.addr, dvu.io.rd.addr)
+  regfile.io.writeData(4).data := Mux(mlu.io.rd.valid, mlu.io.rd.data, dvu.io.rd.data)
+  assert(!(mlu.io.rd.valid && (dvu.io.rd.valid && dvu.io.rd.ready)))  // TODO: stall dvu on mlu write
+
+  regfile.io.writeData(5).valid := lsu.io.rd.valid
+  regfile.io.writeData(5).addr  := lsu.io.rd.addr
+  regfile.io.writeData(5).data  := lsu.io.rd.data
+
+  regfile.io.writeMask(0).valid := false.B
+  regfile.io.writeMask(1).valid := regfile.io.writeMask(0).valid ||
+                                     bru(0).io.taken.valid
+  regfile.io.writeMask(2).valid := regfile.io.writeMask(1).valid ||
+                                     bru(1).io.taken.valid
+  regfile.io.writeMask(3).valid := regfile.io.writeMask(2).valid ||
+                                     bru(2).io.taken.valid
+  regfile.io.writeMask(4).valid := mluInvalidate
+
+  // ---------------------------------------------------------------------------
+  // Vector Extension
+  for (i <- 0 until 4) {
+    io.vcore.vinst(i) <> decode(i).io.vinst
+  }
+
+  for (i <- 0 until 8) {
+    io.vcore.rs(i) := regfile.io.readData(i)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fetch Bus
+  io.ibus <> fetch.io.ibus
+
+  // ---------------------------------------------------------------------------
+  // Local Data Bus Port
+  io.dbus <> lsu.io.dbus
+  io.ubus <> lsu.io.ubus
+
+  io.vldst := lsu.io.vldst
+
+  // ---------------------------------------------------------------------------
+  // Scalar logging interface
+  val slogValid = RegInit(false.B)
+  val slogAddr = Reg(UInt(3.W))  // funct3 field, inst(14,12)
+  val slogEn = decode(0).io.slog
+
+  slogValid := slogEn
+  when (slogEn) {
+    slogAddr := decode(0).io.inst.inst(14,12)
+  }
+
+  io.slog.valid := slogValid
+  io.slog.addr  := MuxOR(slogValid, slogAddr)
+  io.slog.data  := MuxOR(slogValid, regfile.io.readData(0).data)
+
+  // ---------------------------------------------------------------------------
+  // DEBUG
+  val cycles = RegInit(0.U(32.W))
+  cycles := cycles + 1.U
+  io.debug.cycles := cycles
+
+  val debugEn = RegInit(0.U(4.W))
+  val debugAddr = Reg(Vec(4, UInt(32.W)))
+  val debugInst = Reg(Vec(4, UInt(32.W)))
+
+  val debugBrch =
+    Cat(bru(0).io.taken.valid || bru(1).io.taken.valid || bru(2).io.taken.valid,
+        bru(0).io.taken.valid || bru(1).io.taken.valid,
+        bru(0).io.taken.valid,
+        false.B)
+
+  debugEn := Cat(fetch.io.inst.lanes(3).valid && fetch.io.inst.lanes(3).ready && !branchTaken,
+                 fetch.io.inst.lanes(2).valid && fetch.io.inst.lanes(2).ready && !branchTaken,
+                 fetch.io.inst.lanes(1).valid && fetch.io.inst.lanes(1).ready && !branchTaken,
+                 fetch.io.inst.lanes(0).valid && fetch.io.inst.lanes(0).ready && !branchTaken)
+
+  debugAddr(0) := fetch.io.inst.lanes(0).addr
+  debugAddr(1) := fetch.io.inst.lanes(1).addr
+  debugAddr(2) := fetch.io.inst.lanes(2).addr
+  debugAddr(3) := fetch.io.inst.lanes(3).addr
+  debugInst(0) := fetch.io.inst.lanes(0).inst
+  debugInst(1) := fetch.io.inst.lanes(1).inst
+  debugInst(2) := fetch.io.inst.lanes(2).inst
+  debugInst(3) := fetch.io.inst.lanes(3).inst
+
+  io.debug.en := debugEn & ~debugBrch
+
+  io.debug.addr0 := debugAddr(0)
+  io.debug.addr1 := debugAddr(1)
+  io.debug.addr2 := debugAddr(2)
+  io.debug.addr3 := debugAddr(3)
+  io.debug.inst0 := debugInst(0)
+  io.debug.inst1 := debugInst(1)
+  io.debug.inst2 := debugInst(2)
+  io.debug.inst3 := debugInst(3)
+}
+
+object EmitSCore extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new SCore(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/scalar/SLog.scala b/hdl/chisel/src/kelvin/scalar/SLog.scala
new file mode 100644
index 0000000..b52963a
--- /dev/null
+++ b/hdl/chisel/src/kelvin/scalar/SLog.scala
@@ -0,0 +1,12 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// Scalar instrumentation logging (printf).
+class SLogIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val addr = Output(UInt(5.W))
+  val data = Output(UInt(32.W))
+}
diff --git a/hdl/chisel/src/kelvin/vector/VAlu.scala b/hdl/chisel/src/kelvin/vector/VAlu.scala
new file mode 100644
index 0000000..b074a13
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VAlu.scala
@@ -0,0 +1,395 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VAlu {
+  def apply(p: Parameters): VAlu = {
+    return Module(new VAlu(p))
+  }
+}
+
+class VAlu(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Instructions.
+    val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+    val active = Output(UInt(64.W))
+
+    // VRegfile.
+    val vrfsb = Input(UInt(128.W))
+    val read  = Vec(7, new VRegfileReadIO(p))
+    val write = Vec(4, new VRegfileWriteIO(p))
+    val whint = Vec(4, new VRegfileWhintIO(p))
+    val scalar = Vec(2, new VRegfileScalarIO(p))
+
+    // Testbench signals.
+    val read_0_ready = Output(Bool())
+    val read_1_ready = Output(Bool())
+    val read_2_ready = Output(Bool())
+    val read_3_ready = Output(Bool())
+    val read_4_ready = Output(Bool())
+    val read_5_ready = Output(Bool())
+    val read_6_ready = Output(Bool())
+  })
+
+  val cmdqDepth = 8
+
+  val e = new VEncodeOp()
+
+  // ---------------------------------------------------------------------------
+  // Tie-offs.
+  for (i <- 0 until 7) {
+    io.read(i).valid := false.B
+    io.read(i).addr := 0.U
+    io.read(i).tag  := 0.U
+  }
+
+  for (i <- 0 until 4) {
+    io.write(i).valid := false.B
+    io.write(i).addr := 0.U
+    io.write(i).data := 0.U
+  }
+
+  for (i <- 0 until 4) {
+    io.whint(i).valid := false.B
+    io.whint(i).addr := 0.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Opcode checks.
+  for (i <- 0 until 4) {
+    when (io.in.valid && io.in.ready) {
+      when (io.in.bits(i).valid) {
+        val op = io.in.bits(i).bits.op
+        val supported =
+            // Arithmetic
+            op === e.vabsd.U ||
+            op === e.vacc.U ||
+            op === e.vadd.U ||
+            op === e.vadds.U ||
+            op === e.vaddw.U ||
+            op === e.vadd3.U ||
+            op === e.vdup.U ||
+            op === e.vhadd.U ||
+            op === e.vhsub.U ||
+            op === e.vmax.U ||
+            op === e.vmin.U ||
+            op === e.vpadd.U ||
+            op === e.vpsub.U ||
+            op === e.vrsub.U ||
+            op === e.vsub.U ||
+            op === e.vsubs.U ||
+            op === e.vsubw.U ||
+            // Compare.
+            op === e.veq.U ||
+            op === e.vne.U ||
+            op === e.vlt.U ||
+            op === e.vle.U ||
+            op === e.vgt.U ||
+            op === e.vge.U ||
+            // Logical.
+            op === e.vand.U ||
+            op === e.vclb.U ||
+            op === e.vclz.U ||
+            op === e.vcpop.U ||
+            op === e.vmv.U ||
+            op === e.vmv2.U ||
+            op === e.vmvp.U ||
+            op === e.adwinit.U ||
+            op === e.vnot.U ||
+            op === e.vor.U ||
+            op === e.vrev.U ||
+            op === e.vror.U ||
+            op === e.vxor.U ||
+            // Shift.
+            op === e.vshl.U ||
+            op === e.vshr.U ||
+            op === e.vshf.U ||
+            op === e.vsrans.U ||
+            op === e.vsraqs.U ||
+            // Multiply.
+            op === e.vdmulh.U ||
+            op === e.vdmulh2.U ||
+            op === e.vmadd.U ||
+            op === e.vmul.U ||
+            op === e.vmul2.U ||
+            op === e.vmulh.U ||
+            op === e.vmulh2.U ||
+            op === e.vmuls.U ||
+            op === e.vmuls2.U ||
+            op === e.vmulw.U ||
+            // Shuffle.
+            op === e.vslidevn.U ||
+            op === e.vslidevp.U ||
+            op === e.vslidehn2.U ||
+            op === e.vslidehp2.U ||
+            op === e.vsel.U ||
+            op === e.vevn.U ||
+            op === e.vodd.U ||
+            op === e.vevnodd.U ||
+            op === e.vzip.U ||
+            // ML
+            op === e.vdwconv.U ||
+            op === e.adwconv.U
+
+        when (!supported) {
+          printf("**Op=%d unsupported\n", op)
+        }
+        assert(supported)
+
+        assert(!(io.in.bits(i).bits.vt.valid && io.in.bits(i).bits.sv.valid))
+
+        when (op === e.vdwconv.U || op === e.adwconv.U) {
+          val sparse = io.in.bits(i).bits.sv.data(3,2)
+          assert(io.in.bits(i).bits.m === false.B)
+          assert(io.in.bits(i).bits.sz === 4.U)
+          assert(io.in.bits(i).bits.sv.valid === false.B)
+          assert(sparse < 3.U)
+        }
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Command Queue.
+  class VAluCmdq extends Bundle {
+    val op = UInt(new VEncodeOp().bits.W)
+    val f2 = UInt(3.W)
+    val sz = UInt(3.W)
+    val vd = new VAddr()
+    val ve = new VAddr()
+    val vs = new VAddrTag()
+    val vt = new VAddrTag()
+    val vu = new VAddrTag()
+    val sv = new SData()
+    val cmdsync = Bool()
+  }
+
+  def Fin(in: VDecodeBits, alu: Int): VAluCmdq = {
+    val out = Wire(new VAluCmdq)
+    out.op := in.op
+    out.f2 := in.f2
+    out.sz := in.sz
+    out.cmdsync := in.cmdsync
+    when ((alu == 0).B || !in.cmdsync) {
+      out.vd := in.vd
+      out.ve := in.ve
+      out.vs := in.vs
+      out.vt := in.vt
+      out.vu := in.vu
+    } .otherwise {
+      out.vd := in.vf
+      out.ve := in.vg
+      out.vs := in.vx
+      out.vt := in.vy
+      out.vu := in.vz
+    }
+    out.sv := in.sv
+    out
+  }
+
+  def Fin0(in: VDecodeBits): VAluCmdq = {
+    Fin(in, 0)
+  }
+
+  def Fin1(in: VDecodeBits): VAluCmdq = {
+    Fin(in, 1)
+  }
+
+  def Fout(in: VAluCmdq, m: Bool, step: UInt, valid: Bool): (VAluCmdq, Bool) = {
+    val vevnodd = in.op === e.vevn.U || in.op === e.vodd.U || in.op === e.vevnodd.U
+    val vzip = in.op === e.vzip.U
+    val out = Wire(new VAluCmdq)
+    val last = !m || step === 3.U
+    out := in
+    out.vd.addr := in.vd.addr + 1.U
+    out.ve.addr := in.ve.addr + 1.U
+    out.vs.addr := in.vs.addr + 1.U
+    out.vt.addr := in.vt.addr + 1.U
+    out.vu.addr := in.vu.addr + 1.U
+    when (m && vevnodd) {
+      out.vu.addr := in.vu.addr
+      when (step === 1.U) {  // halfway
+        out.vs.addr := in.vu.addr + 0.U
+        out.vt.addr := in.vu.addr + 1.U
+      } .otherwise {
+        out.vs.addr := in.vs.addr + 2.U
+        out.vt.addr := in.vt.addr + 2.U
+      }
+    }
+    when (vzip) {
+      assert(in.ve.addr === (in.vd.addr + 1.U))
+      out.vd.addr := in.vd.addr + 2.U
+      out.ve.addr := in.ve.addr + 2.U
+    }
+    (out, last)
+  }
+
+  def Factive(in: VAluCmdq, m: Bool, step: UInt): UInt = {
+    assert(step.getWidth == 5)
+    assert(step <= 4.U)
+    // Only reads are reported in active; the vrfsb scoreboard tracks writes.
+    val active = MuxOR(in.vs.valid, RegActive(m, step(2,0), in.vs.addr)) |
+                 MuxOR(in.vt.valid, RegActive(m, step(2,0), in.vt.addr)) |
+                 MuxOR(in.vu.valid, RegActive(m, step(2,0), in.vu.addr))
+    assert(active.getWidth == 64)
+    active
+  }
+
+  val q0 = VCmdq(cmdqDepth, new VAluCmdq, Fin0, Fout, Factive)
+  val q1 = VCmdq(cmdqDepth, new VAluCmdq, Fin1, Fout, Factive)
+
+  q0.io.in.valid := io.in.valid && q1.io.in.ready
+  q1.io.in.valid := io.in.valid && q0.io.in.ready
+  io.in.ready := q0.io.in.ready && q1.io.in.ready
+
+  q0.io.in.bits := io.in.bits
+  q1.io.in.bits := io.in.bits
+
+  val q0ready = ScoreboardReady(q0.io.out.bits.vs, io.vrfsb) &&
+                ScoreboardReady(q0.io.out.bits.vt, io.vrfsb) &&
+                ScoreboardReady(q0.io.out.bits.vu, io.vrfsb)
+
+  val q1ready = ScoreboardReady(q1.io.out.bits.vs, io.vrfsb) &&
+                ScoreboardReady(q1.io.out.bits.vt, io.vrfsb) &&
+                ScoreboardReady(q1.io.out.bits.vu, io.vrfsb)
+
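+  // A cmdsync pair dequeues only when both queues present their halves with
+  // all source registers ready, keeping the two ALUs in lockstep.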
+  q0.io.out.ready := q0ready && (!q0.io.out.bits.cmdsync || q1.io.out.valid && q1ready && q1.io.out.bits.cmdsync)
+  q1.io.out.ready := q1ready && (!q1.io.out.bits.cmdsync || q0.io.out.valid && q0ready && q0.io.out.bits.cmdsync)
+
+  // ---------------------------------------------------------------------------
+  // ALU Selection interleaving.
+  val alureg = RegInit(false.B)
+  val alusel = Wire(Vec(5, Bool()))
+
+  // Toggle if previous was valid and was not a synchronized dual command.
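+  // E.g. with alureg = 0 and four valid non-cmdsync slots, slots 0..3 route
+  // to ALU0, ALU1, ALU0, ALU1 and alureg remains 0 for the next group;
+  // cmdsync slots are pushed to both queues and do not toggle the selection.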
+  alusel(0) := alureg
+  alusel(1) := Mux(io.in.bits(0).valid && !io.in.bits(0).bits.cmdsync, !alusel(0), alusel(0))
+  alusel(2) := Mux(io.in.bits(1).valid && !io.in.bits(1).bits.cmdsync, !alusel(1), alusel(1))
+  alusel(3) := Mux(io.in.bits(2).valid && !io.in.bits(2).bits.cmdsync, !alusel(2), alusel(2))
+  alusel(4) := Mux(io.in.bits(3).valid && !io.in.bits(3).bits.cmdsync, !alusel(3), alusel(3))
+
+  when (io.in.valid && io.in.ready) {
+    alureg := alusel(4)
+  }
+
+  for (i <- 0 until 4) {
+    q0.io.in.bits(i).valid := io.in.bits(i).valid && (alusel(i) === 0.U || io.in.bits(i).bits.cmdsync)
+    q1.io.in.bits(i).valid := io.in.bits(i).valid && (alusel(i) === 1.U || io.in.bits(i).bits.cmdsync)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Read ports.
+  io.read(0).valid := q0.io.out.bits.vs.valid
+  io.read(1).valid := q0.io.out.bits.vt.valid
+  io.read(2).valid := q0.io.out.bits.vu.valid
+  io.read(3).valid := q1.io.out.bits.vs.valid
+  io.read(4).valid := q1.io.out.bits.vt.valid
+  io.read(5).valid := q1.io.out.bits.vu.valid
+
+  io.read(0).addr := q0.io.out.bits.vs.addr
+  io.read(1).addr := q0.io.out.bits.vt.addr
+  io.read(2).addr := q0.io.out.bits.vu.addr
+  io.read(3).addr := q1.io.out.bits.vs.addr
+  io.read(4).addr := q1.io.out.bits.vt.addr
+  io.read(5).addr := q1.io.out.bits.vu.addr
+
+  io.read(0).tag := OutTag(q0.io.out.bits.vs)
+  io.read(1).tag := OutTag(q0.io.out.bits.vt)
+  io.read(2).tag := OutTag(q0.io.out.bits.vu)
+  io.read(3).tag := OutTag(q1.io.out.bits.vs)
+  io.read(4).tag := OutTag(q1.io.out.bits.vt)
+  io.read(5).tag := OutTag(q1.io.out.bits.vu)
+
+  io.scalar(0).valid := q0.io.out.bits.sv.valid
+  io.scalar(1).valid := q1.io.out.bits.sv.valid
+
+  io.scalar(0).data := q0.io.out.bits.sv.data
+  io.scalar(1).data := q1.io.out.bits.sv.data
+
+  io.read_0_ready := io.read(0).valid && q0.io.out.ready
+  io.read_1_ready := io.read(1).valid && q0.io.out.ready
+  io.read_2_ready := io.read(2).valid && q0.io.out.ready
+  io.read_3_ready := io.read(3).valid && q1.io.out.ready
+  io.read_4_ready := io.read(4).valid && q1.io.out.ready
+  io.read_5_ready := io.read(5).valid && q1.io.out.ready
+  io.read_6_ready := false.B
+
+  // ---------------------------------------------------------------------------
+  // Alu0.
+  val alu0 = Module(new VAluInt(p, 0))
+
+  alu0.io.in.valid := q0.io.out.valid && q0.io.out.ready
+  alu0.io.in.op := q0.io.out.bits.op
+  alu0.io.in.f2 := q0.io.out.bits.f2
+  alu0.io.in.sz := q0.io.out.bits.sz
+  alu0.io.in.vd.addr := q0.io.out.bits.vd.addr
+  alu0.io.in.ve.addr := q0.io.out.bits.ve.addr
+  alu0.io.in.sv.data := q0.io.out.bits.sv.data
+
+  alu0.io.read(0).data := io.read(0).data
+  alu0.io.read(1).data := io.read(1).data
+  alu0.io.read(2).data := io.read(2).data
+  alu0.io.read(3).data := io.read(3).data
+  alu0.io.read(4).data := io.read(4).data
+  alu0.io.read(5).data := io.read(5).data
+  alu0.io.read(6).data := io.read(6).data
+
+  io.write(0).valid := alu0.io.write(0).valid
+  io.write(0).addr := alu0.io.write(0).addr
+  io.write(0).data := alu0.io.write(0).data
+
+  io.write(1).valid := alu0.io.write(1).valid
+  io.write(1).addr := alu0.io.write(1).addr
+  io.write(1).data := alu0.io.write(1).data
+
+  io.whint(0).valid := alu0.io.whint(0).valid
+  io.whint(0).addr := alu0.io.whint(0).addr
+
+  io.whint(1).valid := alu0.io.whint(1).valid
+  io.whint(1).addr := alu0.io.whint(1).addr
+
+  // ---------------------------------------------------------------------------
+  // Alu1.
+  val alu1 = Module(new VAluInt(p, 1))
+
+  alu1.io.in.valid := q1.io.out.valid && q1.io.out.ready
+  alu1.io.in.op := q1.io.out.bits.op
+  alu1.io.in.f2 := q1.io.out.bits.f2
+  alu1.io.in.sz := q1.io.out.bits.sz
+  alu1.io.in.vd.addr := q1.io.out.bits.vd.addr
+  alu1.io.in.ve.addr := q1.io.out.bits.ve.addr
+  alu1.io.in.sv.data := q1.io.out.bits.sv.data
+
+  alu1.io.read(0).data := io.read(3).data
+  alu1.io.read(1).data := io.read(4).data
+  alu1.io.read(2).data := io.read(5).data
+  alu1.io.read(3).data := io.read(0).data
+  alu1.io.read(4).data := io.read(1).data
+  alu1.io.read(5).data := io.read(2).data
+  alu1.io.read(6).data := io.read(6).data
+
+  io.write(2).valid := alu1.io.write(0).valid
+  io.write(2).addr := alu1.io.write(0).addr
+  io.write(2).data := alu1.io.write(0).data
+
+  io.write(3).valid := alu1.io.write(1).valid
+  io.write(3).addr := alu1.io.write(1).addr
+  io.write(3).data := alu1.io.write(1).data
+
+  io.whint(2).valid := alu1.io.whint(0).valid
+  io.whint(2).addr := alu1.io.whint(0).addr
+
+  io.whint(3).valid := alu1.io.whint(1).valid
+  io.whint(3).addr := alu1.io.whint(1).addr
+
+  // ---------------------------------------------------------------------------
+  // Active.
+  io.active := q0.io.active | q1.io.active
+}
+
+object EmitVAlu extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VAlu(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VAluInt.scala b/hdl/chisel/src/kelvin/vector/VAluInt.scala
new file mode 100644
index 0000000..619f5d1
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VAluInt.scala
@@ -0,0 +1,1529 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+// VAluInt is foremost an ML depthwise-convolution and activation unit, with
+// pipelining behavior optimized for that role. All operations are pipelined
+// with a result latency of 2cc, favoring simplicity of design.
+//
+// Note: widening operations convert the size from the ISA-defined destination
+// size to the source read-register size (sz/2).
+
+class VAluInt(p: Parameters, aluid: Int) extends Module {
+  val e = new VEncodeOp()
+
+  val io = IO(new Bundle {
+    val in = Input(new Bundle {
+      val valid = Bool()
+      val op = UInt(e.bits.W)
+      val f2 = UInt(3.W)
+      val sz = UInt(3.W)
+      val vd = new AluAddr()  // write port 0
+      val ve = new AluAddr()  // write port 1
+      val sv = new Bundle { val data = UInt(32.W) }  // scalar value
+    })
+    val read = Vec(7, Input(new Bundle {
+      val data = UInt(p.vectorBits.W)
+    }))
+    val write = Vec(2, Output(new Bundle {
+      val valid = Bool()
+      val addr = UInt(6.W)
+      val data = UInt(p.vectorBits.W)
+    }))
+    val whint = Vec(2, Output(new Bundle {
+      val valid = Bool()
+      val addr = UInt(6.W)
+    }))
+  })
+
+  class AluAddr extends Bundle {
+    val addr = UInt(6.W)
+  }
+
+  val lanes = p.vectorBits / 32
+  assert(lanes == 4 || lanes == 8 || lanes == 16)
+
+  assert(!io.in.valid || PopCount(io.in.sz) <= 1.U)
+
+  // ---------------------------------------------------------------------------
+  // Tie-offs.
+  for (i <- 0 until 2) {
+    io.write(i).valid := false.B
+    io.write(i).addr := 0.U
+    io.write(i).data := 0.U
+  }
+  for (i <- 0 until 2) {
+    io.whint(i).valid := false.B
+    io.whint(i).addr := 0.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Encodings.
+  val e_absd  = io.in.op === e.vabsd.U
+  val e_acc   = io.in.op === e.vacc.U
+  val e_dup   = io.in.op === e.vdup.U
+  val e_max   = io.in.op === e.vmax.U
+  val e_min   = io.in.op === e.vmin.U
+  val e_rsub  = io.in.op === e.vrsub.U
+  val e_srans = io.in.op === e.vsrans.U
+  val e_sraqs = if (aluid == 0) io.in.op === e.vsraqs.U else false.B
+
+  val e_slidevn = io.in.op === e.vslidevn.U || io.in.op === e.vslidehn.U || io.in.op === e.vslidehn2.U
+  val e_slidevp = io.in.op === e.vslidevp.U || io.in.op === e.vslidehp.U || io.in.op === e.vslidehp2.U
+  val e_slidehn2 = io.in.op === e.vslidehn2.U
+  val e_slidehp2 = io.in.op === e.vslidehp2.U
+  val e_sel = io.in.op === e.vsel.U
+  val e_evn = io.in.op === e.vevn.U || io.in.op === e.vevnodd.U
+  val e_odd = io.in.op === e.vodd.U || io.in.op === e.vevnodd.U
+  val e_zip = io.in.op === e.vzip.U
+
+  val e_dwinit = io.in.op === e.adwinit.U
+  val e_dwconv = io.in.op === e.vdwconv.U || io.in.op === e.adwconv.U
+  val e_dwconva = io.in.op === e.adwconv.U
+
+  val e_add_add  = io.in.op === e.vadd.U
+  val e_add_adds = io.in.op === e.vadds.U
+  val e_add_addw = io.in.op === e.vaddw.U
+  val e_add_add3 = io.in.op === e.vadd3.U
+  val e_add_hadd = io.in.op === e.vhadd.U
+  val e_add = e_add_add || e_add_adds || e_add_addw || e_add_add3 || e_add_hadd
+
+  val e_cmp_eq = io.in.op === e.veq.U
+  val e_cmp_ne = io.in.op === e.vne.U
+  val e_cmp_lt = io.in.op === e.vlt.U
+  val e_cmp_le = io.in.op === e.vle.U
+  val e_cmp_gt = io.in.op === e.vgt.U
+  val e_cmp_ge = io.in.op === e.vge.U
+  val e_cmp    = e_cmp_eq || e_cmp_ne || e_cmp_lt || e_cmp_le || e_cmp_gt || e_cmp_ge
+  assert(PopCount(Cat(e_cmp_eq, e_cmp_ne, e_cmp_lt, e_cmp_le, e_cmp_gt, e_cmp_ge)) <= 1.U)
+
+  val e_log_and  = io.in.op === e.vand.U
+  val e_log_or   = io.in.op === e.vor.U
+  val e_log_xor  = io.in.op === e.vxor.U
+  val e_log_not  = io.in.op === e.vnot.U
+  val e_log_rev  = io.in.op === e.vrev.U
+  val e_log_ror  = io.in.op === e.vror.U
+  val e_log_clb  = io.in.op === e.vclb.U
+  val e_log_clz  = io.in.op === e.vclz.U
+  val e_log_cpop = io.in.op === e.vcpop.U
+  val e_log = e_log_and || e_log_or || e_log_xor || e_log_not || e_log_rev || e_log_ror || e_log_clb || e_log_clz || e_log_cpop
+  assert(PopCount(Cat(e_log_and, e_log_or, e_log_xor, e_log_not, e_log_rev, e_log_ror, e_log_clb, e_log_clz, e_log_cpop)) <= 1.U)
+
+  val e_mul0_dmulh = io.in.op === e.vdmulh.U || io.in.op === e.vdmulh2.U
+  val e_mul0_mul   = io.in.op === e.vmul.U || io.in.op === e.vmul2.U
+  val e_mul0_mulh  = io.in.op === e.vmulh.U || io.in.op === e.vmulh2.U
+  val e_mul0_muls  = io.in.op === e.vmuls.U || io.in.op === e.vmuls2.U
+  val e_mul0_mulw  = io.in.op === e.vmulw.U
+  val e_mul0_madd  = io.in.op === e.vmadd.U
+  val e_mul0 = e_mul0_dmulh || e_mul0_mul || e_mul0_mulh || e_mul0_muls || e_mul0_mulw || e_mul0_madd
+
+  val e_mul1_dmulh = io.in.op === e.vdmulh2.U
+  val e_mul1_mul   = io.in.op === e.vmul2.U
+  val e_mul1_mulh  = io.in.op === e.vmulh2.U
+  val e_mul1_muls  = io.in.op === e.vmuls2.U
+  val e_mul1 = e_mul1_dmulh || e_mul1_mul || e_mul1_mulh || e_mul1_muls
+
+  val e_mv2 = io.in.op === e.vmv2.U
+  val e_mvp = io.in.op === e.vmvp.U
+  val e_mv  = io.in.op === e.vmv.U || e_mv2 || e_mvp
+
+  val e_padd_add = io.in.op === e.vpadd.U
+  val e_padd_sub = io.in.op === e.vpsub.U
+  val e_padd = e_padd_add || e_padd_sub
+
+  val e_shf_shl = io.in.op === e.vshl.U
+  val e_shf_shr = io.in.op === e.vshr.U
+  val e_shf_shf = io.in.op === e.vshf.U
+  val e_shf_l = e_shf_shl || e_shf_shf
+  val e_shf_r = e_shf_shr || e_shf_shf
+
+  val e_sub_sub  = io.in.op === e.vsub.U
+  val e_sub_subs = io.in.op === e.vsubs.U
+  val e_sub_subw = io.in.op === e.vsubw.U
+  val e_sub_hsub = io.in.op === e.vhsub.U
+  val e_sub = e_sub_sub || e_sub_subs || e_sub_subw || e_sub_hsub
+
+  val e_negative = io.in.f2(0) && e_mul0_dmulh
+  val e_round    = io.in.f2(1) && (e_add_hadd || e_sub_hsub || e_mul0_dmulh || e_mul0_mulh || e_shf_shf || e_srans || e_sraqs)
+  val e_signed   = !io.in.f2(0) || e_mul0_dmulh
+
+  assert(!(e_mul1_dmulh && !e_mul0_dmulh))
+  assert(!(e_mul1_mul   && !e_mul0_mul))
+  assert(!(e_mul1_mulh  && !e_mul0_mulh))
+  assert(!(e_mul1_muls  && !e_mul0_muls))
+
+  // ---------------------------------------------------------------------------
+  // Control.
+  val vdvalid0 = RegInit(false.B)
+  val vdvalid1 = RegInit(false.B)
+  val vevalid0 = RegInit(false.B)
+  val vevalid1 = RegInit(false.B)
+  val wmask = RegInit(false.B)
+  val vdaddr0 = Reg(new AluAddr())
+  val vdaddr1 = Reg(new AluAddr())
+  val veaddr0 = Reg(new AluAddr())
+  val veaddr1 = Reg(new AluAddr())
+  val sz = RegInit(0.U(3.W))
+  val f2 = RegInit(0.U(3.W))
+  val sv = RegInit(0.U(32.W))
+
+  when (io.in.valid) {
+    // Note: sz is the source element size, not the ISA-defined destination size.
+    val nxt_vdvalid = e_dwconv || e_mul0 || e_absd || e_acc || e_add || e_cmp || e_dup || e_log || e_evn || e_max || e_min || e_mv || e_padd || e_rsub || e_sel || e_shf_l || e_shf_r || e_slidevn || e_slidevp || e_srans || e_sraqs || e_sub || e_zip
+    val nxt_vevalid = e_dwconv || e_mul1 || e_mul0_mulw || e_acc || e_add_addw || e_mv2 || e_mvp || e_odd || e_slidehn2 || e_slidehp2 || e_sub_subw || e_zip
+    val nxt_widen = e_acc || e_add_addw || e_mul0_mulw || e_sub_subw
+    vdvalid0 := nxt_vdvalid
+    vevalid0 := nxt_vevalid
+    wmask := e_dwconva
+    sz := MuxOR(nxt_vdvalid || nxt_vevalid, Mux(nxt_widen, io.in.sz >> 1.U, io.in.sz))
+    f2 := io.in.f2
+    sv := io.in.sv.data
+  } .elsewhen (vdvalid0 || vevalid0) {
+    vdvalid0 := false.B
+    vevalid0 := false.B
+    wmask := false.B
+    sz := 0.U
+    f2 := 0.U
+    sv := 0.U
+  }
+
+  // Register VAluIntLane results, but mask io.write.valid outputs.
+  vdvalid1 := vdvalid0 && !wmask
+  vevalid1 := vevalid0 && !wmask
+
+  when (io.in.valid) {
+    vdaddr0 := io.in.vd
+    veaddr0 := io.in.ve
+  }
+
+  when (vdvalid0) {
+    vdaddr1 := vdaddr0
+  }
+
+  when (vevalid0) {
+    veaddr1 := veaddr0
+  }
+
+  // ---------------------------------------------------------------------------
+  // Side-bands.
+  val negative = Reg(Bool())
+  val round    = Reg(Bool())
+  val signed   = Reg(Bool())
+
+  when (io.in.valid) {
+    negative := e_negative
+    round    := e_round
+    signed   := e_signed
+  }
+
+  // ---------------------------------------------------------------------------
+  // Operations.
+  val absd  = Reg(Bool())
+  val acc   = Reg(Bool())
+  val dup   = Reg(Bool())
+  val max   = Reg(Bool())
+  val min   = Reg(Bool())
+  val srans = Reg(Bool())
+  val sraqs = Reg(Bool())
+
+  val slidevn  = Reg(Bool())
+  val slidevp  = Reg(Bool())
+  val slidehn2 = Reg(Bool())
+  val slidehp2 = Reg(Bool())
+  val sel      = Reg(Bool())
+  val evn      = Reg(Bool())
+  val odd      = Reg(Bool())
+  val zip      = Reg(Bool())
+
+  val dwinit     = Reg(Bool())
+  val dwconv     = Reg(Bool())
+  val dwconvData = Reg(Bool())
+
+  val add      = Reg(Bool())
+  val add_add  = Reg(Bool())
+  val add_adds = Reg(Bool())
+  val add_addw = Reg(Bool())
+  val add_add3 = Reg(Bool())
+  val add_hadd = Reg(Bool())
+
+  val padd = Reg(Bool())
+  val padd_add = Reg(Bool())
+  val padd_sub = Reg(Bool())
+
+  val rsub      = Reg(Bool())
+  val rsub_rsub = Reg(Bool())
+
+  val sub      = Reg(Bool())
+  val sub_sub  = Reg(Bool())
+  val sub_subs = Reg(Bool())
+  val sub_subw = Reg(Bool())
+  val sub_hsub = Reg(Bool())
+
+  val cmp    = Reg(Bool())
+  val cmp_eq = Reg(Bool())
+  val cmp_ne = Reg(Bool())
+  val cmp_lt = Reg(Bool())
+  val cmp_le = Reg(Bool())
+  val cmp_gt = Reg(Bool())
+  val cmp_ge = Reg(Bool())
+
+  val log      = Reg(Bool())
+  val log_and  = Reg(Bool())
+  val log_or   = Reg(Bool())
+  val log_xor  = Reg(Bool())
+  val log_not  = Reg(Bool())
+  val log_rev  = Reg(Bool())
+  val log_ror  = Reg(Bool())
+  val log_clb  = Reg(Bool())
+  val log_clz  = Reg(Bool())
+  val log_cpop = Reg(Bool())
+
+  val mul0       = Reg(Bool())
+  val mul0_dmulh = Reg(Bool())
+  val mul0_mul   = Reg(Bool())
+  val mul0_mulh  = Reg(Bool())
+  val mul0_muls  = Reg(Bool())
+  val mul0_mulw  = Reg(Bool())
+  val mul0_madd  = Reg(Bool())
+
+  val mul1       = Reg(Bool())
+  val mul1_dmulh = Reg(Bool())
+  val mul1_mul   = Reg(Bool())
+  val mul1_mulh  = Reg(Bool())
+  val mul1_muls  = Reg(Bool())
+
+  val mv  = Reg(Bool())
+  val mv2 = Reg(Bool())
+  val mvp = Reg(Bool())
+
+  val shf_l   = Reg(Bool())
+  val shf_r   = Reg(Bool())
+  val shf_shl = Reg(Bool())
+  val shf_shr = Reg(Bool())
+  val shf_shf = Reg(Bool())
+
+  val validClr = RegInit(false.B)
+  validClr := io.in.valid
+
+  when (io.in.valid || validClr) {
+    val valid = io.in.valid
+
+    absd  := valid && e_absd
+    acc   := valid && e_acc
+    dup   := valid && e_dup
+    max   := valid && e_max
+    min   := valid && e_min
+    srans := valid && e_srans
+    sraqs := valid && e_sraqs
+
+    slidevn  := valid && e_slidevn
+    slidevp  := valid && e_slidevp
+    slidehn2 := valid && e_slidehn2
+    slidehp2 := valid && e_slidehp2
+    sel      := valid && e_sel
+    evn      := valid && e_evn
+    odd      := valid && e_odd
+    zip      := valid && e_zip
+
+    dwinit   := valid && e_dwinit
+    dwconv   := valid && e_dwconv
+
+    add := valid && e_add  // unit activation
+    add_add  := valid && e_add_add
+    add_adds := valid && e_add_adds
+    add_addw := valid && e_add_addw
+    add_add3 := valid && e_add_add3
+    add_hadd := valid && e_add_hadd
+
+    padd := valid && e_padd
+    padd_add := valid && e_padd_add
+    padd_sub := valid && e_padd_sub
+
+    cmp := valid && (e_cmp || e_absd || e_max || e_min)  // unit activation
+    cmp_eq := valid && e_cmp_eq
+    cmp_ne := valid && e_cmp_ne
+    cmp_lt := valid && e_cmp_lt
+    cmp_le := valid && e_cmp_le
+    cmp_gt := valid && e_cmp_gt
+    cmp_ge := valid && e_cmp_ge
+
+    log := valid && e_log  // unit activation
+    log_and  := valid && e_log_and
+    log_or   := valid && e_log_or
+    log_xor  := valid && e_log_xor
+    log_not  := valid && e_log_not
+    log_rev  := valid && e_log_rev
+    log_ror  := valid && e_log_ror
+    log_clb  := valid && e_log_clb
+    log_clz  := valid && e_log_clz
+    log_cpop := valid && e_log_cpop
+
+    mul0 := valid && e_mul0  // unit activation
+    mul0_dmulh := valid && e_mul0_dmulh
+    mul0_mul   := valid && e_mul0_mul
+    mul0_mulh  := valid && e_mul0_mulh
+    mul0_muls  := valid && e_mul0_muls
+    mul0_mulw  := valid && e_mul0_mulw
+    mul0_madd  := valid && e_mul0_madd
+
+    mul1 := valid && e_mul1  // unit activation
+    mul1_dmulh := valid && e_mul1_dmulh
+    mul1_mul   := valid && e_mul1_mul
+    mul1_mulh  := valid && e_mul1_mulh
+    mul1_muls  := valid && e_mul1_muls
+
+    mv  := valid && e_mv
+    mv2 := valid && e_mv2
+    mvp := valid && e_mvp
+
+    rsub := valid && (e_rsub || e_absd)  // unit activation
+    rsub_rsub := valid && e_rsub
+
+    shf_l := valid && e_shf_l  // unit activation
+    shf_r := valid && e_shf_r  // unit activation
+    shf_shl := valid && e_shf_shl
+    shf_shr := valid && e_shf_shr
+    shf_shf := valid && e_shf_shf
+
+    sub := valid && (e_sub || e_absd)
+    sub_sub  := valid && e_sub_sub
+    sub_subs := valid && e_sub_subs
+    sub_subw := valid && e_sub_subw
+    sub_hsub := valid && e_sub_hsub
+  }
+
+  // Second cycle of ALU pipeline.
+  dwconvData := dwconv
+
+  // ---------------------------------------------------------------------------
+  // ALU segments.
+  val valu = for (i <- 0 until lanes) yield {
+    Module(new VAluIntLane)
+  }
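+  // The datapath is p.vectorBits / 32 identical 32-bit lanes; each lane
+  // receives its 32-bit slice of every read port and of the two load buses.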
+
+  val load = Wire(Vec(2, UInt(p.vectorBits.W)))
+
+  for (i <- 0 until lanes) {
+    val msb = 32 * i + 31
+    val lsb = 32 * i
+    valu(i).io.in.vdvalid := vdvalid0
+    valu(i).io.in.vevalid := vevalid0
+    valu(i).io.in.sz := sz
+    for (j <- 0 until 7) {
+      valu(i).io.read(j).data := io.read(j).data(msb, lsb)
+    }
+    for (j <- 0 until 2) {
+      valu(i).io.load(j) := load(j)(msb, lsb)
+    }
+  }
+
+  for (i <- 0 until lanes) {
+    valu(i).io.in.negative := negative
+    valu(i).io.in.round    := round
+    valu(i).io.in.signed   := signed
+  }
+
+  for (i <- 0 until lanes) {
+    valu(i).io.op.absd := absd
+    valu(i).io.op.acc  := acc
+    valu(i).io.op.dup  := dup
+    valu(i).io.op.max  := max
+    valu(i).io.op.min  := min
+    valu(i).io.op.mv   := mv
+    valu(i).io.op.mv2  := mv2
+    valu(i).io.op.mvp  := mvp
+    valu(i).io.op.srans := srans
+    valu(i).io.op.sraqs := sraqs
+
+    valu(i).io.op.dwinit := dwinit
+    valu(i).io.op.dwconv := dwconv
+    valu(i).io.op.dwconvData := dwconvData
+
+    valu(i).io.op.add.en := add
+    valu(i).io.op.add.add  := add_add
+    valu(i).io.op.add.adds := add_adds
+    valu(i).io.op.add.addw := add_addw
+    valu(i).io.op.add.add3 := add_add3
+    valu(i).io.op.add.hadd := add_hadd
+
+    valu(i).io.op.cmp.en := cmp
+    valu(i).io.op.cmp.eq := cmp_eq
+    valu(i).io.op.cmp.ne := cmp_ne
+    valu(i).io.op.cmp.lt := cmp_lt
+    valu(i).io.op.cmp.le := cmp_le
+    valu(i).io.op.cmp.gt := cmp_gt
+    valu(i).io.op.cmp.ge := cmp_ge
+
+    valu(i).io.op.log.en := log
+    valu(i).io.op.log.and  := log_and
+    valu(i).io.op.log.or   := log_or
+    valu(i).io.op.log.xor  := log_xor
+    valu(i).io.op.log.not  := log_not
+    valu(i).io.op.log.rev  := log_rev
+    valu(i).io.op.log.ror  := log_ror
+    valu(i).io.op.log.clb  := log_clb
+    valu(i).io.op.log.clz  := log_clz
+    valu(i).io.op.log.cpop := log_cpop
+
+    valu(i).io.op.mul0.en := mul0
+    valu(i).io.op.mul0.dmulh := mul0_dmulh
+    valu(i).io.op.mul0.mul   := mul0_mul
+    valu(i).io.op.mul0.mulh  := mul0_mulh
+    valu(i).io.op.mul0.muls  := mul0_muls
+    valu(i).io.op.mul0.mulw  := mul0_mulw
+    valu(i).io.op.mul0.madd  := mul0_madd
+
+    valu(i).io.op.mul1.en := mul1
+    valu(i).io.op.mul1.dmulh := mul1_dmulh
+    valu(i).io.op.mul1.mul   := mul1_mul
+    valu(i).io.op.mul1.mulh  := mul1_mulh
+    valu(i).io.op.mul1.muls  := mul1_muls
+
+    valu(i).io.op.padd.en := padd
+    valu(i).io.op.padd.add := padd_add
+    valu(i).io.op.padd.sub := padd_sub
+
+    valu(i).io.op.rsub.en := rsub
+    valu(i).io.op.rsub.rsub := rsub_rsub
+
+    valu(i).io.op.shf.en.l := shf_l
+    valu(i).io.op.shf.en.r := shf_r
+    valu(i).io.op.shf.shl := shf_shl
+    valu(i).io.op.shf.shr := shf_shr
+    valu(i).io.op.shf.shf := shf_shf
+
+    valu(i).io.op.sub.en := sub
+    valu(i).io.op.sub.sub  := sub_sub
+    valu(i).io.op.sub.subs := sub_subs
+    valu(i).io.op.sub.subw := sub_subw
+    valu(i).io.op.sub.hsub := sub_hsub
+  }
+
+  // ---------------------------------------------------------------------------
+  // VSlide.
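+  // VSliden: result element i is element (i + sel + 1) of the {b, a}
+  // concatenation, i.e. the lanes slide down with b filling in from the top.
+  // VSlidep mirrors this: b's elements slide up by (sel + 1), with a's top
+  // elements filling the low lanes.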
+  def VSliden(sz: Int, sel: UInt, a: UInt, b: UInt): UInt = {
+    val size = 8 << sz
+    assert(sz == 0 || sz == 1 || sz == 2)
+    assert(size == 8 || size == 16 || size == 32)
+    assert(sel.getWidth == 2)
+
+    val cnt = a.getWidth / size
+    val cnt2 = cnt * 2
+    val in = Wire(Vec(cnt2, UInt(size.W)))
+    val sout1 = Wire(Vec(cnt, UInt(size.W)))
+    val sout2 = Wire(Vec(cnt, UInt(size.W)))
+    val sout3 = Wire(Vec(cnt, UInt(size.W)))
+    val sout4 = Wire(Vec(cnt, UInt(size.W)))
+
+    for (i <- 0 until cnt) {
+      val l = i * size      // lsb
+      val m = l + size - 1  // msb
+      in(i)       := a(m,l)
+      in(i + cnt) := b(m,l)
+    }
+
+    for (i <- 0 until cnt) {
+      sout1(i) := in(i + 1)
+      sout2(i) := in(i + 2)
+      sout3(i) := in(i + 3)
+      sout4(i) := in(i + 4)
+    }
+
+    val out = MuxOR(sel === 0.U, sout1.asUInt) |
+              MuxOR(sel === 1.U, sout2.asUInt) |
+              MuxOR(sel === 2.U, sout3.asUInt) |
+              MuxOR(sel === 3.U, sout4.asUInt)
+    assert(out.getWidth == a.getWidth)
+
+    out
+  }
+
+  def VSlidep(sz: Int, sel: UInt, a: UInt, b: UInt): UInt = {
+    val size = 8 << sz
+    assert(sz == 0 || sz == 1 || sz == 2)
+    assert(size == 8 || size == 16 || size == 32)
+    assert(sel.getWidth == 2)
+
+    val cnt = a.getWidth / size
+    val cnt2 = cnt * 2
+    val in = Wire(Vec(cnt2, UInt(size.W)))
+    val sout1 = Wire(Vec(cnt, UInt(size.W)))
+    val sout2 = Wire(Vec(cnt, UInt(size.W)))
+    val sout3 = Wire(Vec(cnt, UInt(size.W)))
+    val sout4 = Wire(Vec(cnt, UInt(size.W)))
+
+    for (i <- 0 until cnt) {
+      val l = i * size      // lsb
+      val m = l + size - 1  // msb
+      in(i)       := a(m,l)
+      in(i + cnt) := b(m,l)
+    }
+
+    for (i <- 0 until cnt) {
+      sout1(i) := in(i - 1 + cnt)
+      sout2(i) := in(i - 2 + cnt)
+      sout3(i) := in(i - 3 + cnt)
+      sout4(i) := in(i - 4 + cnt)
+    }
+
+    val out = MuxOR(sel === 0.U, sout1.asUInt) |
+              MuxOR(sel === 1.U, sout2.asUInt) |
+              MuxOR(sel === 2.U, sout3.asUInt) |
+              MuxOR(sel === 3.U, sout4.asUInt)
+    assert(out.getWidth == a.getWidth)
+
+    out
+  }
+
+  val slidenb0 = VSliden(0, f2(1,0), MuxOR(slidevn && sz(0), io.read(0).data), MuxOR(slidevn && sz(0), io.read(1).data))
+  val slidenh0 = VSliden(1, f2(1,0), MuxOR(slidevn && sz(1), io.read(0).data), MuxOR(slidevn && sz(1), io.read(1).data))
+  val slidenw0 = VSliden(2, f2(1,0), MuxOR(slidevn && sz(2), io.read(0).data), MuxOR(slidevn && sz(2), io.read(1).data))
+
+  val slidenb1 = VSliden(0, f2(1,0), MuxOR(slidehn2 && sz(0), io.read(1).data), MuxOR(slidehn2 && sz(0), io.read(2).data))
+  val slidenh1 = VSliden(1, f2(1,0), MuxOR(slidehn2 && sz(1), io.read(1).data), MuxOR(slidehn2 && sz(1), io.read(2).data))
+  val slidenw1 = VSliden(2, f2(1,0), MuxOR(slidehn2 && sz(2), io.read(1).data), MuxOR(slidehn2 && sz(2), io.read(2).data))
+
+  val slidepb0 = VSlidep(0, f2(1,0), MuxOR(slidevp && sz(0), io.read(0).data), MuxOR(slidevp && sz(0), io.read(1).data))
+  val slideph0 = VSlidep(1, f2(1,0), MuxOR(slidevp && sz(1), io.read(0).data), MuxOR(slidevp && sz(1), io.read(1).data))
+  val slidepw0 = VSlidep(2, f2(1,0), MuxOR(slidevp && sz(2), io.read(0).data), MuxOR(slidevp && sz(2), io.read(1).data))
+
+  val slidepb1 = VSlidep(0, f2(1,0), MuxOR(slidehp2 && sz(0), io.read(1).data), MuxOR(slidehp2 && sz(0), io.read(2).data))
+  val slideph1 = VSlidep(1, f2(1,0), MuxOR(slidehp2 && sz(1), io.read(1).data), MuxOR(slidehp2 && sz(1), io.read(2).data))
+  val slidepw1 = VSlidep(2, f2(1,0), MuxOR(slidehp2 && sz(2), io.read(1).data), MuxOR(slidehp2 && sz(2), io.read(2).data))
+
+  val slide0 = slidenb0 | slidenh0 | slidenw0 |
+               slidepb0 | slideph0 | slidepw0
+
+  val slide1 = slidenb1 | slidenh1 | slidenw1 |
+               slidepb1 | slideph1 | slidepw1
+
+  // ---------------------------------------------------------------------------
+  // Select.
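+  // VSel: the low bit of each lane of a selects between the corresponding
+  // lanes of c (bit set) and b (bit clear).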
+  def VSel(sz: Int, a: UInt, b: UInt, c: UInt): UInt = {
+    val size = 8 << sz
+    assert(sz == 0 || sz == 1 || sz == 2)
+    assert(size == 8 || size == 16 || size == 32)
+
+    val cnt = a.getWidth / size
+    val sout = Wire(Vec(cnt, UInt(size.W)))
+
+    for (i <- 0 until cnt) {
+      val l = i * size      // lsb
+      val m = l + size - 1  // msb
+      sout(i) := Mux(a(l), c(m,l), b(m,l))
+    }
+
+    val out = sout.asUInt
+    assert(out.getWidth == a.getWidth)
+
+    out
+  }
+
+  val selb0 = VSel(0, MuxOR(sel && sz(0), io.read(0).data), MuxOR(sel && sz(0), io.read(1).data), MuxOR(sel && sz(0), io.read(2).data))
+  val selh0 = VSel(1, MuxOR(sel && sz(1), io.read(0).data), MuxOR(sel && sz(1), io.read(1).data), MuxOR(sel && sz(1), io.read(2).data))
+  val selw0 = VSel(2, MuxOR(sel && sz(2), io.read(0).data), MuxOR(sel && sz(2), io.read(1).data), MuxOR(sel && sz(2), io.read(2).data))
+
+  val sel0 = selb0 | selh0 | selw0
+
+  // ---------------------------------------------------------------------------
+  // Even/Odd.
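+  // VEvnOdd packs the even (sel = 0) or odd (sel = 1) elements: the lower
+  // half of the result is taken from a, the upper half from b.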
+  def VEvnOdd(sel: Int, sz: Int, a: UInt, b: UInt): UInt = {
+    val size = 8 << sz
+    assert(sz == 0 || sz == 1 || sz == 2)
+    assert(size == 8 || size == 16 || size == 32)
+    assert(sel == 0 || sel == 1)
+
+    val cnt = a.getWidth / size
+    val h = a.getWidth / 2
+    val evnodd = Wire(Vec(cnt, UInt(size.W)))
+
+    for (i <- 0 until cnt / 2) {
+      val j = i * 2 + sel
+      val l = j * size      // lsb
+      val m = l + size - 1  // msb
+      evnodd(i) := a(m,l)
+    }
+
+    for (i <- cnt / 2 until cnt) {
+      val j = (i - cnt / 2) * 2 + sel
+      val l = j * size      // lsb
+      val m = l + size - 1  // msb
+      evnodd(i) := b(m,l)
+    }
+
+    val out = evnodd.asUInt
+    assert(out.getWidth == a.getWidth)
+
+    out
+  }
+
+  val evnb = VEvnOdd(0, 0, MuxOR(evn && sz(0), io.read(0).data), MuxOR(evn && sz(0), io.read(1).data))
+  val evnh = VEvnOdd(0, 1, MuxOR(evn && sz(1), io.read(0).data), MuxOR(evn && sz(1), io.read(1).data))
+  val evnw = VEvnOdd(0, 2, MuxOR(evn && sz(2), io.read(0).data), MuxOR(evn && sz(2), io.read(1).data))
+  val oddb = VEvnOdd(1, 0, MuxOR(odd && sz(0), io.read(0).data), MuxOR(odd && sz(0), io.read(1).data))
+  val oddh = VEvnOdd(1, 1, MuxOR(odd && sz(1), io.read(0).data), MuxOR(odd && sz(1), io.read(1).data))
+  val oddw = VEvnOdd(1, 2, MuxOR(odd && sz(2), io.read(0).data), MuxOR(odd && sz(2), io.read(1).data))
+
+  val evn0 = evnb | evnh | evnw
+  val odd1 = oddb | oddh | oddw
+
+  // ---------------------------------------------------------------------------
+  // VZip.
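+  // VZip interleaves a and b element-wise: output 0 zips their lower halves,
+  // output 1 zips their upper halves.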
+  def VZip(sz: Int, a: UInt, b: UInt): (UInt, UInt) = {
+    val size = 8 << sz
+    assert(sz == 0 || sz == 1 || sz == 2)
+    assert(size == 8 || size == 16 || size == 32)
+
+    val cnt = a.getWidth / size
+    val h = a.getWidth / 2
+    val zip0 = Wire(Vec(cnt, UInt(size.W)))
+    val zip1 = Wire(Vec(cnt, UInt(size.W)))
+
+    for (i <- 0 until cnt) {
+      val j = i / 2
+      val l = j * size      // lsb
+      val m = l + size - 1  // msb
+      if ((i & 1) == 0) {
+        zip0(i) := a(m+0,l+0)
+        zip1(i) := a(m+h,l+h)
+      } else {
+        zip0(i) := b(m+0,l+0)
+        zip1(i) := b(m+h,l+h)
+      }
+    }
+
+    val out0 = zip0.asUInt
+    val out1 = zip1.asUInt
+    assert(out0.getWidth == a.getWidth)
+    assert(out1.getWidth == a.getWidth)
+
+    (out0, out1)
+  }
+
+  val (zipb0, zipb1) = VZip(0, MuxOR(zip && sz(0), io.read(0).data), MuxOR(zip && sz(0), io.read(1).data))
+  val (ziph0, ziph1) = VZip(1, MuxOR(zip && sz(1), io.read(0).data), MuxOR(zip && sz(1), io.read(1).data))
+  val (zipw0, zipw1) = VZip(2, MuxOR(zip && sz(2), io.read(0).data), MuxOR(zip && sz(2), io.read(1).data))
+
+  val zip0 = zipb0 | ziph0 | zipw0
+  val zip1 = zipb1 | ziph1 | zipw1
+
+  // ---------------------------------------------------------------------------
+  // Depthwise.
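+  // aluid only selects which read ports (0..2 or 3..5) feed the first and
+  // second operand groups of VDot.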
+  val (dwconv0, dwconv1) =
+    if (aluid == 0) {
+      VDot(aluid, dwconv,
+           VecInit(io.read(0).data, io.read(1).data, io.read(2).data),
+           VecInit(io.read(3).data, io.read(4).data, io.read(5).data), sv)
+    } else {
+      VDot(aluid, dwconv,
+           VecInit(io.read(3).data, io.read(4).data, io.read(5).data),
+           VecInit(io.read(0).data, io.read(1).data, io.read(2).data), sv)
+    }
+
+  // ---------------------------------------------------------------------------
+  // Parallel load into the registered VAluIntLane stage.
+  load(0) := evn0 | zip0 | slide0 | dwconv0 | sel0
+  load(1) := odd1 | zip1 | slide1 | dwconv1
+
+  // ---------------------------------------------------------------------------
+  // Outputs.
+  val vddata = Wire(Vec(lanes, UInt(32.W)))
+  val vedata = Wire(Vec(lanes, UInt(32.W)))
+
+  for (i <- 0 until lanes) {
+    vddata(i) := valu(i).io.write(0).data
+    vedata(i) := valu(i).io.write(1).data
+  }
+
+  io.write(0).valid := vdvalid1
+  io.write(0).addr := vdaddr1.addr
+  io.write(0).data := vddata.asUInt
+
+  io.write(1).valid := vevalid1
+  io.write(1).addr := veaddr1.addr
+  io.write(1).data := vedata.asUInt
+
+  io.whint(0).valid := vdvalid0 && !wmask
+  io.whint(0).addr := vdaddr0.addr
+
+  io.whint(1).valid := vevalid0 && !wmask
+  io.whint(1).addr := veaddr0.addr
+}
+
+class VAluIntLane extends Module {
+  val e = new VEncodeOp()
+
+  val io = IO(new Bundle {
+    val in = Input(new Bundle {
+      val vdvalid = Bool()
+      val vevalid = Bool()
+      val sz = UInt(3.W)
+      val negative = Bool()
+      val round = Bool()
+      val signed = Bool()
+    })
+    val op = Input(new Bundle {
+      val absd  = Bool()
+      val acc   = Bool()
+      val dup   = Bool()
+      val max   = Bool()
+      val min   = Bool()
+      val mv   = Bool()
+      val mv2  = Bool()
+      val mvp  = Bool()
+      val srans = Bool()
+      val sraqs = Bool()
+
+      val dwinit = Bool()
+      val dwconv = Bool()
+      val dwconvData = Bool()
+
+      val add = new Bundle {
+        val en = Bool()
+        val add  = Bool()
+        val adds = Bool()
+        val addw = Bool()
+        val add3 = Bool()
+        val hadd = Bool()
+      }
+
+      val cmp = new Bundle {
+        val en = Bool()
+        val eq = Bool()
+        val ne = Bool()
+        val lt = Bool()
+        val le = Bool()
+        val gt = Bool()
+        val ge = Bool()
+      }
+
+      val log = new Bundle {
+        val en = Bool()
+        val and  = Bool()
+        val or   = Bool()
+        val xor  = Bool()
+        val not  = Bool()
+        val rev  = Bool()
+        val ror  = Bool()
+        val clb  = Bool()
+        val clz  = Bool()
+        val cpop = Bool()
+      }
+
+      val mul0 = new Bundle {
+        val en = Bool()
+        val dmulh = Bool()
+        val mul   = Bool()
+        val mulh  = Bool()
+        val muls  = Bool()
+        val mulw  = Bool()
+        val madd  = Bool()
+      }
+
+      val mul1 = new Bundle {
+        val en = Bool()
+        val dmulh = Bool()
+        val mul   = Bool()
+        val mulh  = Bool()
+        val muls  = Bool()
+      }
+
+      val padd = new Bundle {
+        val en = Bool()
+        val add = Bool()
+        val sub = Bool()
+      }
+
+      val rsub  = new Bundle {
+        val en = Bool()
+        val rsub = Bool()
+      }
+
+      val shf = new Bundle {
+        val en = new Bundle {
+          val l = Bool()  // left
+          val r = Bool()  // right
+        }
+        val shl = Bool()
+        val shr = Bool()
+        val shf = Bool()
+      }
+
+      val sub  = new Bundle {
+        val en = Bool()
+        val sub  = Bool()
+        val subs = Bool()
+        val subw = Bool()
+        val hsub = Bool()
+      }
+    })
+    val read = Vec(7, Input(new Bundle {
+      val data = UInt(32.W)
+    }))
+    val write = Vec(2, Output(new Bundle {
+      val data = UInt(32.W)
+    }))
+    val load = Vec(2, Input(UInt(32.W)))  // parallel load data
+  })
+
+  def VAlu(sz: Int, a: UInt, b: UInt, c: UInt, d: UInt, e: UInt, f: UInt): (UInt, UInt, UInt, UInt, UInt, UInt) = {
+    // Note: sz is the source element size, not the destination size as defined by the ISA.
+    val size = 8 << sz
+    assert(sz == 0 || sz == 1 || sz == 2)
+    assert(size == 8 || size == 16 || size == 32)
+    assert(a.getWidth == b.getWidth)
+    assert(a.getWidth == c.getWidth)
+    assert(a.getWidth == 32)
+    val cnt = a.getWidth / size
+    val alu0 = Wire(Vec(cnt, UInt(size.W)))
+    val alu1 = Wire(Vec(cnt, UInt(size.W)))
+    val aluw0 = Wire(Vec(cnt / 2, UInt((2 * size).W)))
+    val aluw1 = Wire(Vec(cnt / 2, UInt((2 * size).W)))
+    val rnd0 = Wire(Vec(cnt, UInt(size.W)))
+    val rnd1 = Wire(Vec(cnt, UInt(size.W)))
+
+    // -------------------------------------------------------------------------
+    // Controls.
+    val negative = io.in.negative
+    val round    = io.in.round
+    val signed   = io.in.signed
+
+    // -------------------------------------------------------------------------
+    // Datapath.
+    val aw = a
+    val bw = b
+    val cw = c
+    val dw = d
+    val ew = e
+    val fw = f
+
+    val acc_a = MuxOR(io.op.acc, aw)
+    val acc_b = MuxOR(io.op.acc, bw)
+    val acc_c = MuxOR(io.op.acc, cw)
+
+    val add_a = MuxOR(io.op.add.en, aw)
+    val add_b = MuxOR(io.op.add.en, bw)
+    val add_r = io.op.add.hadd && round
+
+    val cmp_a = MuxOR(io.op.cmp.en, aw)
+    val cmp_b = MuxOR(io.op.cmp.en, bw)
+
+    val log_a = MuxOR(io.op.log.en, aw)
+    val log_b = MuxOR(io.op.log.en, bw)
+
+    val mul0_a = MuxOR(io.op.mul0.en, aw)
+    val mul0_b = MuxOR(io.op.mul0.en, bw)
+    val mul1_a = MuxOR(io.op.mul1.en, cw)
+    val mul1_b = MuxOR(io.op.mul1.en, bw)
+
+    val padd_a = MuxOR(io.op.padd.en, aw)
+
+    val rsub_a = MuxOR(io.op.rsub.en, aw)
+    val rsub_b = MuxOR(io.op.rsub.en, bw)
+
+    val shl_a = MuxOR(io.op.shf.en.l, aw)
+    val shl_b = MuxOR(io.op.shf.en.l, bw)
+    val shr_a = MuxOR(io.op.shf.en.r, aw)
+    val shr_b = MuxOR(io.op.shf.en.r, bw)
+
+    val srans_a = MuxOR(io.op.srans, aw)
+    val srans_b = MuxOR(io.op.srans, bw)
+    val srans_c = MuxOR(io.op.srans, cw)
+
+    val sraqs_a = MuxOR(io.op.sraqs, aw)
+    val sraqs_b = MuxOR(io.op.sraqs, bw)
+    val sraqs_c = MuxOR(io.op.sraqs, cw)
+    val sraqs_d = MuxOR(io.op.sraqs, dw)
+    val sraqs_f = MuxOR(io.op.sraqs, fw)
+
+    val sub_a = MuxOR(io.op.sub.en, aw)
+    val sub_b = MuxOR(io.op.sub.en, bw)
+    val sub_r = io.op.sub.hsub && round
+
+    // -------------------------------------------------------------------------
+    // Functions.
+    for (i <- 0 until cnt) {
+      val l = i * size      // lsb
+      val m = l + size - 1  // msb
+      val ln = (i / 2) * 2 * size  // lsb narrowing
+      val mn = ln + 2 * size - 1   // msb narrowing
+      val lq = (i / 4) * 4 * size  // lsb, 4x narrowing
+      val mq = lq + 4 * size - 1   // msb, 4x narrowing
+      val mshamt = l + log2Ceil(size) - 1
+
+      // -----------------------------------------------------------------------
+      // Arithmetic.
+      val add_sa = add_a(m) && signed
+      val add_sb = add_b(m) && signed
+      val adder = (Cat(add_sa, add_a(m,l)).asSInt +& Cat(add_sb, add_b(m,l)).asSInt).asUInt + add_r
+      val sataddmsb = adder(size, size - 1)
+      val sataddsel =
+        Cat( signed && sataddmsb === 2.U,  // vadd.s -ve
+             signed && sataddmsb === 1.U,  // vadd.s +ve
+            !signed && sataddmsb(1))       // vadd.su +ve
+      assert(PopCount(sataddsel) <= 1.U)
+
+      val sub_sa = sub_a(m) && signed
+      val sub_sb = sub_b(m) && signed
+      val subtr = (Cat(sub_sa, sub_a(m,l)).asSInt -& Cat(sub_sb, sub_b(m,l)).asSInt).asUInt + sub_r
+      val satsubmsb = subtr(size, size - 1)
+      val satsubsel =
+        Cat( signed && satsubmsb === 2.U,  // vsub.s -ve
+             signed && satsubmsb === 1.U,  // vsub.s +ve
+            !signed && satsubmsb(1))       // vsub.su +ve
+      assert(PopCount(satsubsel) <= 1.U)
+
+      val rsubtr = rsub_b(m,l) - rsub_a(m,l)
+
+      val xeq = cmp_a(m,l) === cmp_b(m,l)
+      val xne = cmp_a(m,l) =/= cmp_b(m,l)
+      val slt = cmp_a(m,l).asSInt < cmp_b(m,l).asSInt
+      val ult = cmp_a(m,l) < cmp_b(m,l)
+      val sle = slt || xeq
+      val ule = ult || xeq
+
+      val sult = Mux(signed, slt, ult)
+
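+      // Shift: a non-negative shift amount produces a right shift (arithmetic
+      // when signed, logical otherwise); a negative amount produces a left
+      // shift that saturates to the signed/unsigned extremes on overflow.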
+      def Shift(a: UInt, b: UInt, sln: UInt, sra: UInt, srl: UInt): UInt = {
+        assert(a.getWidth == size)
+        assert(b.getWidth == size)
+        assert(sln.getWidth == (2 * size - 1))
+        assert(sra.getWidth == size)
+        assert(srl.getWidth == size)
+        val slnsz = sln(size - 1, 0)
+        val input_neg = a(size - 1)
+        val input_zero = a === 0.U
+        val shamt_neg = b(size - 1)
+        val rs = Wire(UInt(size.W))
+        val ru = Wire(UInt(size.W))
+        if (true) {
+          val shamt_negsat = b.asSInt <= (-(size - 1)).S
+          val shamt_possat = b.asSInt >= (size - 1).S
+          val signb = ~0.U(size.W) >> (b(log2Ceil(size) - 1, 0) - 1.U)
+          val possat = shamt_neg && !input_neg && (shamt_negsat || (sln(2 * size - 2, size - 1) =/= 0.U  )) && !input_zero
+          val negsat = shamt_neg &&  input_neg && (shamt_negsat || (sln(2 * size - 2, size - 1) =/= signb))
+          assert(!(possat && negsat))
+          val posmax = Cat(0.U(1.W), ~0.U((size - 1).W))
+          val negmin = Cat(1.U(1.W),  0.U((size - 1).W))
+          assert(posmax.getWidth == size)
+          assert(negmin.getWidth == size)
+
+          rs := MuxOR(!shamt_neg && !shamt_possat, sra) |
+                MuxOR(!shamt_neg &&  shamt_possat && input_neg, ~0.U(size.W)) |
+                MuxOR( shamt_neg && !possat && !negsat, slnsz) |
+                MuxOR( shamt_neg && possat, posmax) |
+                MuxOR( shamt_neg && negsat, negmin)
+        }
+        if (true) {
+          val shamt_negsat = b.asSInt <= -size.S
+          val shamt_possat = b.asSInt >= size.S
+          val possat = shamt_neg && (shamt_negsat || (sln(2 * size - 2, size) =/= 0.U)) && !input_zero
+          val posmax = ~0.U(size.W)
+          assert(posmax.getWidth == size)
+
+          ru := MuxOR(!shamt_neg && !shamt_possat, srl) |
+                MuxOR( shamt_neg && !possat, slnsz) |
+                MuxOR( shamt_neg && possat, posmax)
+        }
+        Mux(signed, rs, ru)
+      }
+
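+      // Round: the rounding increment for a right shift, i.e. the last bit
+      // shifted out (or the sign for oversized signed shift amounts).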
+      def Round(a: UInt, b: UInt): UInt = {
+        assert(a.getWidth == size)
+        assert(b.getWidth == size)
+        val input_neg = a(size - 1)
+        val shamt_neg = b(size - 1)
+        val shamt_zero = b === 0.U
+        val rbit = Cat(a(size - 2, 0), a(size - 1))(b(log2Ceil(size) - 1, 0))  // shf: idx[8] == idx[0]
+        val shamt_possat = Mux(signed, b.asSInt >= size.S, b.asSInt > size.S)
+        val r = MuxOR(round && !shamt_possat && !shamt_neg && !shamt_zero, rbit) |
+                MuxOR(round &&  shamt_possat && input_neg && signed, 1.U)
+        assert(r.getWidth == 1)
+        r
+      }
+
+      val shl = (shl_a(m,l) << shl_b(mshamt, l))(size - 1, 0)
+      val sln = (shl_a(m,l) << (size.U - shl_b(mshamt, l)))(2 * size - 2, 0)
+      val srl = shr_a(m,l) >> shr_b(mshamt, l)
+      val srs = MuxOR(shr_a(m), ((~0.U(size.W)) << ((size - 1).U - shr_b(mshamt, l)))(size - 1, 0))
+      val sra = srs | srl
+      val shf = Shift(shl_a(m,l), shl_b(m,l), sln, sra, srl)
+      val shr = Mux(signed, sra, srl)
+      assert(shl.getWidth == size)
+      assert(sln.getWidth == (2 * size - 1))
+      assert(sra.getWidth == size)
+      assert(srl.getWidth == size)
+      assert(srs.getWidth == size)
+      assert(shf.getWidth == size)
+
+      val shf_rnd = Round(shl_a(m,l), shl_b(m,l))
+      assert(shf_rnd.getWidth == 1)
+
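+      // Srans: a rounding, saturating shift-right-and-narrow from (s * size)
+      // bits down to size bits, signed or unsigned per the instruction.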
+      def Srans(s: Int, a: UInt, b: UInt): UInt = {
+        assert(s == 2 || s == 4)
+        assert(a.getWidth == size * s)
+        assert(b.getWidth == size)
+
+        val shamt = b(log2Ceil(s * size) - 1, 0)
+        val srl = a >> shamt
+        val srs = MuxOR(a(s * size - 1), ((~0.U((s * size).W)) << ((s * size - 1).U - shamt))(s * size - 1, 0))
+        val sra = srs | srl
+        assert(srl.getWidth == (s * size))
+        assert(srs.getWidth == (s * size))
+        val rbit = Cat(a(s * size - 2, 0), 0.U(1.W))(shamt)
+        assert(rbit.getWidth == 1)
+
+        val umax = ((1 << size) - 1).S((s * size).W)
+        val umin = 0.S((s * size).W)
+        val smax = ((1 << (size - 1)) - 1).S((s * size).W)
+        val smin = -(1 << (size - 1)).S((s * size).W)
+        val rshf = Mux(round && rbit, sra + 1.U, sra)
+
+        val is_umax = !signed && (rshf.asSInt > umax)
+        val is_umin = !signed && (rshf.asSInt < umin)
+        val is_smax =  signed && (rshf.asSInt > smax)
+        val is_smin =  signed && (rshf.asSInt < smin)
+        val is_norm = !(is_umax || is_umin || is_smax || is_smin)
+        assert(PopCount(Cat(is_umax, is_umin, is_smax, is_smin, is_norm)) <= 1.U)
+
+        val r = MuxOR(is_umax, umax.asUInt(size - 1, 0)) |
+                MuxOR(is_umin, umin.asUInt(size - 1, 0)) |
+                MuxOR(is_smax, smax.asUInt(size - 1, 0)) |
+                MuxOR(is_smin, smin.asUInt(size - 1, 0)) |
+                MuxOR(is_norm, rshf(size - 1, 0))
+        assert(r.getWidth == size)
+        r
+      }
+
+      def Rev(a: UInt, s: UInt): UInt = {
+        if (size == 32) {
+          val b = Mux(!s(0), a, Cat(a(30), a(31), a(28), a(29), a(26), a(27), a(24), a(25),
+                                    a(22), a(23), a(20), a(21), a(18), a(19), a(16), a(17),
+                                    a(14), a(15), a(12), a(13), a(10), a(11), a( 8), a( 9),
+                                    a( 6), a( 7), a( 4), a( 5), a( 2), a( 3), a( 0), a( 1)))
+          val c = Mux(!s(1), b, Cat(b(29,28), b(31,30), b(25,24), b(27,26),
+                                    b(21,20), b(23,22), b(17,16), b(19,18),
+                                    b(13,12), b(15,14), b( 9, 8), b(11,10),
+                                    b( 5, 4), b( 7, 6), b( 1, 0), b( 3, 2)))
+          val d = Mux(!s(2), c, Cat(c(27,24), c(31,28), c(19,16), c(23,20),
+                                    c(11, 8), c(15,12), c( 3, 0), c( 7, 4)))
+          val e = Mux(!s(3), d, Cat(d(23,16), d(31,24), d( 7, 0), d(15, 8)))
+          val f = Mux(!s(4), e, Cat(e(15, 0), e(31,16)))
+          assert(a.getWidth == 32)
+          assert(b.getWidth == 32)
+          assert(c.getWidth == 32)
+          assert(d.getWidth == 32)
+          assert(e.getWidth == 32)
+          assert(f.getWidth == 32)
+          f
+        } else if (size == 16) {
+          val b = Mux(!s(0), a, Cat(a(14), a(15), a(12), a(13), a(10), a(11), a( 8), a( 9),
+                                    a( 6), a( 7), a( 4), a( 5), a( 2), a( 3), a( 0), a( 1)))
+          val c = Mux(!s(1), b, Cat(b(13,12), b(15,14), b( 9, 8), b(11,10),
+                                    b( 5, 4), b( 7, 6), b( 1, 0), b( 3, 2)))
+          val d = Mux(!s(2), c, Cat(c(11, 8), c(15,12), c( 3, 0), c( 7, 4)))
+          val e = Mux(!s(3), d, Cat(d( 7, 0), d(15, 8)))
+          assert(a.getWidth == 16)
+          assert(b.getWidth == 16)
+          assert(c.getWidth == 16)
+          assert(d.getWidth == 16)
+          assert(e.getWidth == 16)
+          e
+        } else {
+          val b = Mux(!s(0), a, Cat(a(6), a(7), a(4), a(5), a(2), a(3), a(0), a(1)))
+          val c = Mux(!s(1), b, Cat(b(5, 4), b(7, 6), b(1, 0), b( 3, 2)))
+          val d = Mux(!s(2), c, Cat(c(3, 0), c(7, 4)))
+          assert(a.getWidth == 8)
+          assert(b.getWidth == 8)
+          assert(c.getWidth == 8)
+          assert(d.getWidth == 8)
+          d
+        }
+      }
+
+      def Ror(a: UInt, s: UInt): UInt = {
+        if (size == 32) {
+          val b = Mux(!s(0), a, Cat(a(0), a(31,1)))
+          val c = Mux(!s(1), b, Cat(b(1,0), b(31,2)))
+          val d = Mux(!s(2), c, Cat(c(3,0), c(31,4)))
+          val e = Mux(!s(3), d, Cat(d(7,0), d(31,8)))
+          val f = Mux(!s(4), e, Cat(e(15,0), e(31,16)))
+          assert(a.getWidth == 32)
+          assert(b.getWidth == 32)
+          assert(c.getWidth == 32)
+          assert(d.getWidth == 32)
+          assert(e.getWidth == 32)
+          assert(f.getWidth == 32)
+          f
+        } else if (size == 16) {
+          val b = Mux(!s(0), a, Cat(a(0), a(15,1)))
+          val c = Mux(!s(1), b, Cat(b(1,0), b(15,2)))
+          val d = Mux(!s(2), c, Cat(c(3,0), c(15,4)))
+          val e = Mux(!s(3), d, Cat(d(7,0), d(15,8)))
+          assert(a.getWidth == 16)
+          assert(b.getWidth == 16)
+          assert(c.getWidth == 16)
+          assert(d.getWidth == 16)
+          assert(e.getWidth == 16)
+          e
+        } else {
+          val b = Mux(!s(0), a, Cat(a(0), a(7,1)))
+          val c = Mux(!s(1), b, Cat(b(1,0), b(7,2)))
+          val d = Mux(!s(2), c, Cat(c(3,0), c(7,4)))
+          assert(a.getWidth == 8)
+          assert(b.getWidth == 8)
+          assert(c.getWidth == 8)
+          assert(d.getWidth == 8)
+          d
+        }
+      }
+
+      val mul0_as = Cat(signed && mul0_a(m), mul0_a(m,l))
+      val mul0_bs = Cat(signed && mul0_b(m), mul0_b(m,l))
+      val mul0_sign = mul0_a(m) =/= mul0_b(m) && mul0_a(m,l) =/= 0.U && mul0_b(m,l) =/= 0.U
+      val prod0 = (mul0_as.asSInt * mul0_bs.asSInt).asUInt
+      val prodh0 = prod0(2 * size - 1, size)
+      val proddh0 = prod0(2 * size - 2, size - 1)
+
+      val mul1_as = Cat(signed && mul1_a(m), mul1_a(m,l))
+      val mul1_bs = Cat(signed && mul1_b(m), mul1_b(m,l))
+      val mul1_sign = mul1_a(m) =/= mul1_b(m) && mul1_a(m,l) =/= 0.U && mul1_b(m,l) =/= 0.U
+      val prod1 = (mul1_as.asSInt * mul1_bs.asSInt).asUInt
+      val prodh1 = prod1(2 * size - 1, size)
+      val proddh1 = prod1(2 * size - 2, size - 1)
+
+      val muls0_umax = !signed && prodh0 =/= 0.U
+      val muls0_smax =  signed && !mul0_sign && ( prod0(size - 1) || prodh0 =/=  0.U(size.W))
+      val muls0_smin =  signed &&  mul0_sign && (!prod0(size - 1) || prodh0 =/= ~0.U(size.W))
+      val muls0_base = !(muls0_umax || muls0_smax || muls0_smin)
+      assert(PopCount(Cat(muls0_umax, muls0_smax, muls0_smin, muls0_base)) <= 1.U)
+
+      val muls1_umax = !signed && prodh1 =/= 0.U
+      val muls1_smax =  signed && !mul1_sign && ( prod1(size - 1) || prodh1 =/=  0.U(size.W))
+      val muls1_smin =  signed &&  mul1_sign && (!prod1(size - 1) || prodh1 =/= ~0.U(size.W))
+      val muls1_base = !(muls1_umax || muls1_smax || muls1_smin)
+      assert(PopCount(Cat(muls1_umax, muls1_smax, muls1_smin, muls1_base)) <= 1.U)
+
+      val maxneg  = Cat(1.U(1.W), 0.U((size - 1).W))  // 0x80...
+
+      val dmulh0_possat = mul0_a(m,l) === maxneg && mul0_b(m,l) === maxneg
+
+      val dmulh1_possat = mul1_a(m,l) === maxneg && mul1_b(m,l) === maxneg
+
+      val dmulh0 = MuxOR(!dmulh0_possat, proddh0) |
+                   MuxOR(dmulh0_possat, Cat(0.U(1.W), ~0.U((size - 1).W)))    // 0x7f...
+
+      val dmulh1 = MuxOR(!dmulh1_possat, proddh1) |
+                   MuxOR(dmulh1_possat, Cat(0.U(1.W), ~0.U((size - 1).W)))    // 0x7f...
+
+      val mulh0 = prodh0
+      val mulh1 = prodh1
+
+      val muls0 = MuxOR(muls0_umax, ~0.U(size.W)) |
+                  MuxOR(muls0_smax, ~0.U((size - 1).W)) |
+                  MuxOR(muls0_smin, Cat(1.U(1.W), 0.U((size - 1).W))) |
+                  MuxOR(muls0_base, prod0(size - 1, 0))
+
+      val muls1 = MuxOR(muls1_umax, ~0.U(size.W)) |
+                  MuxOR(muls1_smax, ~0.U((size - 1).W)) |
+                  MuxOR(muls1_smin, Cat(1.U(1.W), 0.U((size - 1).W))) |
+                  MuxOR(muls1_base, prod1(size - 1, 0))
+
+      val dmulh0_rnd = MuxOR(round && io.op.mul0.dmulh && io.in.sz(sz) && !dmulh0_possat,
+                             Mux(negative && mul0_sign,
+                                 MuxOR(!prod0(size - 2), ~0.U(size.W)),   // -1
+                                 MuxOR( prod0(size - 2),  1.U(size.W))))  // +1
+
+      val dmulh1_rnd = MuxOR(round && io.op.mul1.dmulh && io.in.sz(sz) && !dmulh1_possat,
+                             Mux(negative && mul1_sign,
+                                 MuxOR(!prod1(size - 2), ~0.U(size.W)),   // -1
+                                 MuxOR( prod1(size - 2),  1.U(size.W))))  // +1
+
+      val mulh0_rnd = round && io.op.mul0.mulh && prod0(size - 1)
+      val mulh1_rnd = round && io.op.mul1.mulh && prod1(size - 1)
+
+      // -----------------------------------------------------------------------
+      // Operations.
+      val absd = MuxOR(io.op.absd, Mux(sult, rsubtr, subtr(size - 1, 0)))
+      assert(absd.getWidth == size)
+
+      val acc = if (sz == 0 || sz == 1) {  // size / 2
+                  if ((i & 1) == 0) {
+                    acc_a(mn,ln) + SignExt(Cat(signed & acc_b(m), acc_b(m,l)), 2 * size)
+                  } else {
+                    acc_c(mn,ln) + SignExt(Cat(signed & acc_b(m), acc_b(m,l)), 2 * size)
+                  }
+                } else {
+                  0.U((2 * size).W)
+                }
+      assert(acc.getWidth == (2 * size))
+
+      val add = MuxOR(sataddsel(2) && io.op.add.adds, Cat(1.U(1.W), 0.U((size - 1).W))) |
+                MuxOR(sataddsel(1) && io.op.add.adds, ~0.U((size - 1).W)) |
+                MuxOR(sataddsel(0) && io.op.add.adds, ~0.U(size.W)) |
+                MuxOR(sataddsel === 0.U && io.op.add.adds || io.op.add.add || io.op.add.add3, adder(size - 1, 0)) |
+                MuxOR(io.op.add.hadd, adder(size, 1))
+
+      val addw = MuxOR(io.op.add.addw, SignExt(adder, 2 * size))
+      assert(addw.getWidth == (2 * size))
+
+      val dup = MuxOR(io.op.dup, io.read(1).data(m,l))
+
+      val max = MuxOR(io.op.max, Mux(sult, cmp_b(m,l), cmp_a(m,l)))
+      val min = MuxOR(io.op.min, Mux(sult, cmp_a(m,l), cmp_b(m,l)))
+
+      val mul0 = MuxOR(io.op.mul0.mul || io.op.mul0.madd, prod0(size - 1, 0)) |
+                 MuxOR(io.op.mul0.dmulh, dmulh0) |
+                 MuxOR(io.op.mul0.mulh, mulh0) |
+                 MuxOR(io.op.mul0.muls, muls0)
+
+      val mul1 = MuxOR(io.op.mul1.mul, prod1(size - 1, 0)) |
+                 MuxOR(io.op.mul1.dmulh, dmulh1) |
+                 MuxOR(io.op.mul1.mulh, mulh1) |
+                 MuxOR(io.op.mul1.muls, muls1)
+
+      val mulw = MuxOR(io.op.mul0.mulw, prod0(2 * size - 1, 0))
+
+      val padd =
+        if (sz == 1 || sz == 2) {
+          val p0 = i * size
+          val p1 = p0 + size / 2 - 1
+          val p2 = p1 + 1
+          val p3 = p0 + size - 1
+          val a = Cat(signed && padd_a(p1), padd_a(p1,p0))
+          val b = Cat(signed && padd_a(p3), padd_a(p3,p2))
+          val add = MuxOR(io.op.padd.add, SignExt((a.asSInt +& b.asSInt).asUInt, size))
+          val sub = MuxOR(io.op.padd.sub, SignExt((a.asSInt -& b.asSInt).asUInt, size))
+          assert(add.getWidth == size)
+          assert(sub.getWidth == size)
+          add | sub
+        } else {
+          0.U(size.W)
+        }
+
+      val rsub = MuxOR(io.op.rsub.rsub, rsubtr)
+
+      val srans = if (sz == 0 || sz == 1) {  // size / 2
+                    if ((i & 1) == 0) {
+                      Srans(2, srans_a(mn,ln), srans_b(m,l))
+                    } else {
+                      Srans(2, srans_c(mn,ln), srans_b(m,l))
+                    }
+                  } else {
+                    0.U(size.W)
+                  }
+
+      val sraqs = if (sz == 0) {  // size / 4
+                    if ((i & 3) == 0) {
+                      Srans(4, sraqs_a(mq,lq), sraqs_b(m,l))
+                    } else if ((i & 3) == 1) {
+                      Srans(4, sraqs_d(mq,lq), sraqs_b(m,l))
+                    } else if ((i & 3) == 2) {
+                      Srans(4, sraqs_c(mq,lq), sraqs_b(m,l))
+                    } else {
+                      Srans(4, sraqs_f(mq,lq), sraqs_b(m,l))
+                    }
+                  } else {
+                    0.U(size.W)
+                  }
+
+      val sub = MuxOR(satsubsel(2) && io.op.sub.subs, Cat(1.U(1.W), 0.U((size - 1).W))) |
+                MuxOR(satsubsel(1) && io.op.sub.subs, ~0.U((size - 1).W)) |
+                MuxOR(satsubsel(0) && io.op.sub.subs, ~0.U(size.W)) |
+                MuxOR(satsubsel === 0.U && io.op.sub.subs || io.op.sub.sub, subtr(size - 1, 0)) |
+                MuxOR(io.op.sub.hsub, subtr(size, 1))
+
+      val subw = MuxOR(io.op.sub.subw, SignExt(subtr, 2 * size))
+      assert(subw.getWidth == (2 * size))
+
+      val cmp = io.in.sz(sz) &&
+                  (MuxOR(io.op.cmp.eq, xeq) |
+                   MuxOR(io.op.cmp.ne, xne) |
+                   MuxOR(io.op.cmp.lt &&  signed, slt) |
+                   MuxOR(io.op.cmp.lt && !signed, ult) |
+                   MuxOR(io.op.cmp.le &&  signed, sle) |
+                   MuxOR(io.op.cmp.le && !signed, ule) |
+                   MuxOR(io.op.cmp.gt &&  signed, !sle) |
+                   MuxOR(io.op.cmp.gt && !signed, !ule) |
+                   MuxOR(io.op.cmp.ge &&  signed, !slt) |
+                   MuxOR(io.op.cmp.ge && !signed, !ult))
+      assert(cmp.getWidth == 1)
+
+      val log =
+        MuxOR(io.op.log.and,  log_a(m,l) & log_b(m,l)) |
+        MuxOR(io.op.log.or,   log_a(m,l) | log_b(m,l)) |
+        MuxOR(io.op.log.xor,  log_a(m,l) ^ log_b(m,l)) |
+        MuxOR(io.op.log.not,  MuxOR(io.in.sz(sz), ~log_a(m,l))) |
+        MuxOR(io.op.log.rev,  Rev(log_a(m,l), log_b(m,l))) |
+        MuxOR(io.op.log.ror,  MuxOR(io.in.sz(sz), Ror(log_a(m,l), log_b(m,l)))) |
+        MuxOR(io.op.log.clb,  MuxOR(io.in.sz(sz), Clb(log_a(m,l)))) |
+        MuxOR(io.op.log.clz,  MuxOR(io.in.sz(sz), Clz(log_a(m,l)))) |
+        MuxOR(io.op.log.cpop, PopCount(log_a(m,l)))
+      assert(log.getWidth == size)
+
+      val shift =
+        MuxOR(io.op.shf.shl, shl) |
+        MuxOR(io.op.shf.shr, shr) |
+        MuxOR(io.op.shf.shf, shf)
+      assert(shift.getWidth == size)
+
+      val alu_oh = Cat(absd  =/= 0.U,
+                       add   =/= 0.U,
+                       cmp   =/= 0.U,
+                       dup   =/= 0.U,
+                       log   =/= 0.U,
+                       max   =/= 0.U,
+                       min   =/= 0.U,
+                       mul0  =/= 0.U,
+                       padd  =/= 0.U,
+                       rsub  =/= 0.U,
+                       shift =/= 0.U,
+                       srans =/= 0.U,
+                       sraqs =/= 0.U,
+                       sub   =/= 0.U)
+
+      assert(PopCount(alu_oh) <= 1.U)
+
+      alu0(i) := mul0 | absd | add | cmp | dup | log | max | min | padd | rsub | shift | srans | sraqs | sub |
+                 MuxOR(io.op.mv, aw(m,l))
+
+      alu1(i) := mul1 |
+                 MuxOR(io.op.mvp, bw(m,l)) |
+                 MuxOR(io.op.mv2, cw(m,l))
+
+      rnd0(i) := dmulh0_rnd | mulh0_rnd | shf_rnd
+      rnd1(i) := dmulh1_rnd | mulh1_rnd
+
+      if (sz < 2) {
+        if ((i & 1) == 0) {
+          aluw0(i / 2) := acc | addw | mulw | subw
+        } else {
+          aluw1(i / 2) := acc | addw | mulw | subw
+        }
+      }
+    }
+
+    val out_alu0 = alu0.asUInt
+    val out_alu1 = alu1.asUInt
+    val out_rnd0 = rnd0.asUInt
+    val out_rnd1 = rnd1.asUInt
+    val out_aluw0 = aluw0.asUInt
+    val out_aluw1 = aluw1.asUInt
+    assert(out_alu0.getWidth == a.getWidth)
+    assert(out_alu1.getWidth == a.getWidth)
+    assert(out_rnd0.getWidth == a.getWidth)
+    if (sz < 2) {
+      assert(out_aluw0.getWidth == a.getWidth)
+      assert(out_aluw1.getWidth == a.getWidth)
+    }
+
+    (out_alu0, out_alu1, out_rnd0, out_rnd1, out_aluw0, out_aluw1)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Data mux.
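+  // Inputs for the unused element sizes are zeroed so the per-size VAlu
+  // results can simply be ORed together below.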
+  val ina_b = MuxOR(io.in.sz(0), io.read(0).data)
+  val inb_b = MuxOR(io.in.sz(0), io.read(1).data)
+  val inc_b = MuxOR(io.in.sz(0), io.read(2).data)
+  val ind_b = MuxOR(io.in.sz(0), io.read(3).data)
+  val ine_b = MuxOR(io.in.sz(0), io.read(4).data)
+  val inf_b = MuxOR(io.in.sz(0), io.read(5).data)
+
+  val ina_h = MuxOR(io.in.sz(1), io.read(0).data)
+  val inb_h = MuxOR(io.in.sz(1), io.read(1).data)
+  val inc_h = MuxOR(io.in.sz(1), io.read(2).data)
+  val ind_h = MuxOR(io.in.sz(1), io.read(4).data)
+  val ine_h = MuxOR(io.in.sz(1), io.read(5).data)
+  val inf_h = MuxOR(io.in.sz(1), io.read(6).data)
+
+  val ina_w = MuxOR(io.in.sz(2), io.read(0).data)
+  val inb_w = MuxOR(io.in.sz(2), io.read(1).data)
+  val inc_w = MuxOR(io.in.sz(2), io.read(2).data)
+  val ind_w = MuxOR(io.in.sz(2), io.read(3).data)
+  val ine_w = MuxOR(io.in.sz(2), io.read(4).data)
+  val inf_w = MuxOR(io.in.sz(2), io.read(5).data)
+
+  val (outb0, outb1, rndb0, rndb1, outwb0, outwb1) = VAlu(0, ina_b, inb_b, inc_b, ind_b, ine_b, inf_b)
+  val (outh0, outh1, rndh0, rndh1, outwh0, outwh1) = VAlu(1, ina_h, inb_h, inc_h, ind_h, ine_h, inf_h)
+  val (outw0, outw1, rndw0, rndw1,      _,      _) = VAlu(2, ina_w, inb_w, inc_w, ind_w, ine_w, inf_w)
+
+  val out0 = outb0 | outh0 | outw0 | outwb0 | outwh0
+  val out1 = outb1 | outh1 | outw1 | outwb1 | outwh1
+  val rnd0 = rndb0 | rndh0 | rndw0
+  val rnd1 = rndb1 | rndh1 | rndw1
+
+  // ---------------------------------------------------------------------------
+  // Accumulator second input.
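+  // The second accumulator operand is either a third source register
+  // (add3/madd), the rounding increment (dmulh/mulh/shf), or the dwinit
+  // initial value.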
+  val accvalid0 = io.op.dwinit || io.op.mul0.dmulh || io.op.mul0.mulh || io.op.add.add3 || io.op.mul0.madd || io.op.shf.shf
+  val accvalid1 = io.op.dwinit || io.op.mul1.dmulh || io.op.mul1.mulh
+
+  val accum0 = MuxOR(io.op.add.add3 ||
+                     io.op.mul0.madd, io.read(2).data) |
+               MuxOR(io.op.mul0.dmulh ||
+                     io.op.mul0.mulh ||
+                     io.op.shf.shf, rnd0) |
+               MuxOR(io.op.dwinit, io.read(0).data)
+
+  val accum1 = MuxOR(io.op.mul1.dmulh ||
+                     io.op.mul1.mulh, rnd1) |
+               MuxOR(io.op.dwinit, io.read(1).data)
+
+  // ---------------------------------------------------------------------------
+  // Registration.
+  val wsz = RegInit(0.U(3.W))
+  val waccvalid0 = RegInit(false.B)
+  val waccvalid1 = RegInit(false.B)
+  val wdata0 = Reg(UInt(32.W))
+  val waccm0 = Reg(UInt(32.W))
+  val wdata1 = Reg(UInt(32.W))
+  val waccm1 = Reg(UInt(32.W))
+
+  wsz := MuxOR(io.in.vdvalid || io.in.vevalid, io.in.sz)
+  waccvalid0 := accvalid0 || io.op.dwconv
+  waccvalid1 := accvalid1 || io.op.dwconv
+
+  when (io.in.vdvalid) {
+    wdata0 := out0 | io.load(0)
+  }
+
+  when (accvalid0) {
+    waccm0 := accum0
+  } .elsewhen (io.op.dwconvData) {
+    waccm0 := io.write(0).data
+  }
+
+  when (io.in.vevalid) {
+    wdata1 := out1 | io.load(1)
+  }
+
+  when (accvalid1) {
+    waccm1 := accum1
+  } .elsewhen (io.op.dwconvData) {
+    waccm1 := io.write(1).data
+  }
+
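+  // Accum adds the held accumulator operand to the datapath result lane-wise
+  // at the registered element size (wsz); when disabled the datapath result
+  // passes through unchanged.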
+  def Accum(en: Bool, d: UInt, a: UInt): UInt = {
+    val dm = MuxOR(en, d)
+    val am = MuxOR(en, a)
+    val rm = MuxOR(en && wsz(0), Cat(dm(31,24) + am(31,24),
+                                     dm(23,16) + am(23,16),
+                                     dm(15, 8) + am(15, 8),
+                                     dm( 7, 0) + am( 7, 0))) |
+             MuxOR(en && wsz(1), Cat(dm(31,16) + am(31,16),
+                                     dm(15, 0) + am(15, 0))) |
+             MuxOR(en && wsz(2), dm(31, 0) + am(31, 0))
+    val rn = MuxOR(!en, d)
+    assert(rm.getWidth == 32)
+    assert(rn.getWidth == 32)
+    rm | rn
+  }
+
+  io.write(0).data := Accum(waccvalid0, wdata0, waccm0)
+  io.write(1).data := Accum(waccvalid1, wdata1, waccm1)
+}
+
+object EmitVAluInt extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VAluInt(p, 0), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VCmdq.scala b/hdl/chisel/src/kelvin/vector/VCmdq.scala
new file mode 100644
index 0000000..41845ad
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VCmdq.scala
@@ -0,0 +1,167 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+// A queue of commands, reducing VDecodeBits to just the necessary fields.
+// <fin>     extracts (and optionally modifies) just the fields that are needed.
+// <fout>    applies the current stripmine bank step and flags the last step.
+// <factive> returns the register activation status for decode dependencies.
+
+object VCmdq {
+  def apply[T <: Data](n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) = {
+    Module(new VCmdq(n, t, fin, fout, factive))
+  }
+}
+
+class VCmdq[T <: Data](n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) extends Module {
+  val io = IO(new Bundle {
+    val in  = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+    val out = Decoupled(t)
+    val active = Output(UInt(64.W))
+    val nempty = Output(Bool())
+  })
+
+  class VCmdqWrapper extends Bundle {
+    val tin = Output(t)     // type input
+    val m = Output(Bool())  // stripmine
+  }
+
+  val f = Fifo4e(new VCmdqWrapper, n)
+
+  val active = RegInit(0.U(64.W))
+
+  val valid = RegInit(false.B)
+  val ready = io.out.ready
+  val value = Reg(new VCmdqWrapper)
+
+  // ---------------------------------------------------------------------------
+  // Step controls.
+  val step0 = 0.U(5.W)
+  val step = RegInit(step0)
+
+  val (tin, last) = fout(value.tin, value.m, step, valid)
+
+  // ---------------------------------------------------------------------------
+  // Fifo.
+  f.io.in.valid := io.in.valid
+  io.in.ready := f.io.in.ready
+
+  for (i <- 0 until 4) {
+    f.io.in.bits(i).valid := io.in.bits(i).valid
+    f.io.in.bits(i).bits.tin := fin(io.in.bits(i).bits)
+    f.io.in.bits(i).bits.m := io.in.bits(i).bits.m
+  }
+
+  f.io.out.ready := !valid || ready && last
+
+  // ---------------------------------------------------------------------------
+  // Output register.
+  when (f.io.out.valid && f.io.out.ready) {
+    valid := true.B
+    value := f.io.out.bits
+    step := 0.U
+  } .elsewhen (io.out.valid && io.out.ready) {
+    when (!last) {
+      valid := true.B
+      value.tin := tin
+      value.m := value.m
+      step := step + 1.U
+    } .otherwise {
+      // Output value.tin == 0 when not active (e.g. do not drive vreg reads).
+      valid := false.B
+      value.tin := 0.U.asTypeOf(t)
+      value.m := false.B
+      step := 0.U
+    }
+  }
+
+  when (reset.asBool) {
+    value.tin := 0.U.asTypeOf(t)
+    value.m := false.B
+  }
+
+  // ---------------------------------------------------------------------------
+  // Active.
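+  // ValueActive ORs the active masks of every FIFO entry with that of the
+  // in-flight output register (evaluated at the current or next step,
+  // depending on whether the consumer is ready).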
+  def ValueActive(data: UInt = 0.U(64.W), i: Int = 0): UInt = {
+    assert(data.getWidth == 64)
+    if (i < n) {
+      val active = MuxOR(f.io.entry(i).valid, factive(f.io.entry(i).bits.tin, f.io.entry(i).bits.m, step0))
+      ValueActive(data | active, i + 1)
+    } else {
+      val m = value.m
+      val active0 = factive(value.tin, m, step + 0.U)
+      val active1 = factive(value.tin, m, step + 1.U)
+      val active = MuxOR(valid && (!ready || !last),
+                         Mux(!ready, active0, active1))
+      data | active
+    }
+  }
+
+  when (io.in.valid && io.in.ready || io.out.valid && io.out.ready) {
+    val fvalid = MuxOR(f.io.in.valid && f.io.in.ready,
+                 Cat(f.io.in.bits(3).valid, f.io.in.bits(2).valid,
+                     f.io.in.bits(1).valid, f.io.in.bits(0).valid))
+
+    active :=
+      MuxOR(fvalid(0), factive(f.io.in.bits(0).bits.tin, f.io.in.bits(0).bits.m, step0)) |
+      MuxOR(fvalid(1), factive(f.io.in.bits(1).bits.tin, f.io.in.bits(1).bits.m, step0)) |
+      MuxOR(fvalid(2), factive(f.io.in.bits(2).bits.tin, f.io.in.bits(2).bits.m, step0)) |
+      MuxOR(fvalid(3), factive(f.io.in.bits(3).bits.tin, f.io.in.bits(3).bits.m, step0)) |
+      ValueActive()
+  }
+
+  // ---------------------------------------------------------------------------
+  // Outputs.
+  io.out.valid := valid
+  io.out.bits := value.tin
+
+  io.active := active
+
+  io.nempty := f.io.nempty || valid
+}
+
+class VCmdqTestBundle extends Bundle {
+  val op = UInt(new VEncodeOp().bits.W)
+  val sz = UInt(3.W)
+  val vd = new VAddr()
+  val vs = new VAddrTag()
+  val data = UInt(32.W)
+}
+
+object EmitVCmdq extends App {
+  def VCmdqTestFin(in: VDecodeBits): VCmdqTestBundle = {
+    val out = Wire(new VCmdqTestBundle)
+    out.op := in.op
+    out.sz := in.sz
+    out.vd := in.vd
+    out.vs := in.vs
+    out.data := in.sv.data
+    out
+  }
+
+  def VCmdqTestFout(in: VCmdqTestBundle, m: Bool, step: UInt, valid: Bool): (VCmdqTestBundle, Bool) = {
+    val out = Wire(new VCmdqTestBundle)
+    val last = !m || step === 3.U
+    out.op := in.op
+    out.sz := in.sz
+    out.vd.valid := in.vd.valid
+    out.vs.valid := in.vs.valid
+    out.vd.addr := in.vd.addr + 1.U
+    out.vs.addr := in.vs.addr + 1.U
+    out.vs.tag := in.vs.tag
+    out.data := in.data
+    (out, last)
+  }
+
+  def VCmdqTestFactive(in: VCmdqTestBundle, m: Bool, step: UInt): UInt = {
+    assert(step.getWidth == 5)
+    val active = MuxOR(in.vd.valid, RegActive(m, step(2,0), in.vd.addr)) |
+                 MuxOR(in.vs.valid, RegActive(m, step(2,0), in.vs.addr))
+    assert(active.getWidth == 64)
+    active
+  }
+
+  (new chisel3.stage.ChiselStage).emitVerilog(new VCmdq(8, new VCmdqTestBundle, VCmdqTestFin, VCmdqTestFout, VCmdqTestFactive), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VCommon.scala b/hdl/chisel/src/kelvin/vector/VCommon.scala
new file mode 100644
index 0000000..5db855e
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VCommon.scala
@@ -0,0 +1,124 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+// Convert a register port address into a one-hot active mask, with or without stripmining.
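+// With stripmining (m) the banks from the current step onward are marked
+// active; without it, only the bank addressed by regnum(1,0) is marked.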
+object RegActive {
+  def apply(m: Bool, step: UInt, regnum: UInt): UInt = {
+    assert(step.getWidth == 3)
+    assert(regnum.getWidth == 6)
+    assert(step <= 4.U)
+
+    val oh = OneHot(regnum(5,2), 16)
+
+    val oh0 = Cat(0.U(3.W), oh(15),
+                  0.U(3.W), oh(14),
+                  0.U(3.W), oh(13),
+                  0.U(3.W), oh(12),
+                  0.U(3.W), oh(11),
+                  0.U(3.W), oh(10),
+                  0.U(3.W), oh(9),
+                  0.U(3.W), oh(8),
+                  0.U(3.W), oh(7),
+                  0.U(3.W), oh(6),
+                  0.U(3.W), oh(5),
+                  0.U(3.W), oh(4),
+                  0.U(3.W), oh(3),
+                  0.U(3.W), oh(2),
+                  0.U(3.W), oh(1),
+                  0.U(3.W), oh(0))
+
+    val oh1 = Cat(0.U(2.W), oh(15), 0.U(1.W),
+                  0.U(2.W), oh(14), 0.U(1.W),
+                  0.U(2.W), oh(13), 0.U(1.W),
+                  0.U(2.W), oh(12), 0.U(1.W),
+                  0.U(2.W), oh(11), 0.U(1.W),
+                  0.U(2.W), oh(10), 0.U(1.W),
+                  0.U(2.W), oh(9), 0.U(1.W),
+                  0.U(2.W), oh(8), 0.U(1.W),
+                  0.U(2.W), oh(7), 0.U(1.W),
+                  0.U(2.W), oh(6), 0.U(1.W),
+                  0.U(2.W), oh(5), 0.U(1.W),
+                  0.U(2.W), oh(4), 0.U(1.W),
+                  0.U(2.W), oh(3), 0.U(1.W),
+                  0.U(2.W), oh(2), 0.U(1.W),
+                  0.U(2.W), oh(1), 0.U(1.W),
+                  0.U(2.W), oh(0), 0.U(1.W))
+
+    val oh2 = Cat(0.U(1.W), oh(15), 0.U(2.W),
+                  0.U(1.W), oh(14), 0.U(2.W),
+                  0.U(1.W), oh(13), 0.U(2.W),
+                  0.U(1.W), oh(12), 0.U(2.W),
+                  0.U(1.W), oh(11), 0.U(2.W),
+                  0.U(1.W), oh(10), 0.U(2.W),
+                  0.U(1.W), oh(9), 0.U(2.W),
+                  0.U(1.W), oh(8), 0.U(2.W),
+                  0.U(1.W), oh(7), 0.U(2.W),
+                  0.U(1.W), oh(6), 0.U(2.W),
+                  0.U(1.W), oh(5), 0.U(2.W),
+                  0.U(1.W), oh(4), 0.U(2.W),
+                  0.U(1.W), oh(3), 0.U(2.W),
+                  0.U(1.W), oh(2), 0.U(2.W),
+                  0.U(1.W), oh(1), 0.U(2.W),
+                  0.U(1.W), oh(0), 0.U(2.W))
+
+    val oh3 = Cat(oh(15), 0.U(3.W),
+                  oh(14), 0.U(3.W),
+                  oh(13), 0.U(3.W),
+                  oh(12), 0.U(3.W),
+                  oh(11), 0.U(3.W),
+                  oh(10), 0.U(3.W),
+                  oh(9), 0.U(3.W),
+                  oh(8), 0.U(3.W),
+                  oh(7), 0.U(3.W),
+                  oh(6), 0.U(3.W),
+                  oh(5), 0.U(3.W),
+                  oh(4), 0.U(3.W),
+                  oh(3), 0.U(3.W),
+                  oh(2), 0.U(3.W),
+                  oh(1), 0.U(3.W),
+                  oh(0), 0.U(3.W))
+
+    assert(oh.getWidth == 16)
+    assert(oh0.getWidth == 64)
+    assert(oh1.getWidth == 64)
+    assert(oh2.getWidth == 64)
+    assert(oh3.getWidth == 64)
+
+    val idx = regnum(1,0)
+
+    val active = MuxOR(!m && idx === 0.U || m && step <= 0.U, oh0) |
+                 MuxOR(!m && idx === 1.U || m && step <= 1.U, oh1) |
+                 MuxOR(!m && idx === 2.U || m && step <= 2.U, oh2) |
+                 MuxOR(!m && idx === 3.U || m && step <= 3.U, oh3)
+    assert(active.getWidth == 64)
+
+    active
+  }
+}
+
+// Convert tagged address into register file format.
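+// The 4-bit tag holds one bit for each of the four addr(1,0) positions;
+// OutTag selects the bit belonging to this address.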
+object OutTag {
+  def apply(v: VAddrTag): UInt = {
+    OutTag(v.addr, v.tag)
+  }
+
+  def apply(addr: UInt, tag: UInt): UInt = {
+    assert(addr.getWidth == 6)
+    assert(tag.getWidth == 4)
+    tag(addr(1,0))
+  }
+}
+
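+// A source operand is ready when it is not valid or when its {tag, addr}
+// slot is clear in the 128-bit scoreboard.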
+object ScoreboardReady {
+  def apply(a: VAddrTag, sb: UInt): Bool = {
+    assert(a.addr.getWidth == 6)
+    assert(a.tag.getWidth == 4)
+    assert(sb.getWidth == 128)
+    val tag = a.tag(a.addr(1,0))
+    val idx = Cat(tag, a.addr)
+    assert(idx.getWidth == 7)
+    (!a.valid || !sb(idx))
+  }
+}
diff --git a/hdl/chisel/src/kelvin/vector/VConvAlu.scala b/hdl/chisel/src/kelvin/vector/VConvAlu.scala
new file mode 100644
index 0000000..c469161
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VConvAlu.scala
@@ -0,0 +1,109 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+object VConvAlu {
+  def apply(p: Parameters): VConvAlu = {
+    Module(new VConvAlu(p))
+  }
+}
+
+class VConvAlu(p: Parameters) extends Module {
+  val tcnt = p.vectorBits / 32
+
+  val io = IO(new Bundle {
+    val op = new Bundle {
+      val conv  = Input(Bool())  // convolution
+      val init  = Input(Bool())  // initialize
+      val tran  = Input(Bool())  // transpose
+      val clear = Input(Bool())  // clear accumulator
+    }
+    val index = Input(UInt(log2Ceil(tcnt).W))
+    val adata = Input(UInt((tcnt * 32).W))
+    val bdata = Input(UInt((tcnt * 32).W))
+    val abias = Input(UInt(9.W))
+    val bbias = Input(UInt(9.W))
+    val asign = Input(Bool())
+    val bsign = Input(Bool())
+    val out = Output(Vec(tcnt, UInt((tcnt * 32).W)))
+  })
+
+  // MatMul
+  //   B B B B
+  // A . . . .
+  // A . . . .
+  // A . . . .
+  // A . . . .
+
+  val acc = Reg(Vec(tcnt, Vec(tcnt, UInt(32.W))))
+
+  assert(PopCount(Cat(io.op.conv, io.op.tran, io.op.clear)) <= 1.U)
+
+  // ---------------------------------------------------------------------------
+  // Output interleave to match shift reductions.
+  def Interleave(i: Int, j: Int): (Int, Int) = {
+    val interleave = Seq(0, 2, 1, 3)
+    val rbase = i & ~3
+    val rquad = i & 3
+    val word  = j
+    val si = rbase + interleave(word & 3)
+    val sj = rquad * (tcnt / 4) + (word / 4)
+    (si, sj)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Matrix Multiply.
+  val dpa = Wire(Vec(tcnt, Vec(tcnt, UInt(32.W))))  // dot product accumulate
+
+  for (i <- 0 until tcnt) {
+    for (j <- 0 until tcnt) {
+      val accum = MuxOR(io.op.conv, acc(i)(j))
+      dpa(i)(j) := accum + VDot(io.op.conv,
+          io.adata(i * 32 + 31, i * 32), io.bdata(j * 32 + 31, j * 32),
+          io.abias, io.bbias, io.asign, io.bsign)
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Parallel load.
+  val pload = MuxOR(io.op.tran, io.adata) |
+              MuxOR(io.op.init, io.bdata)
+
+  // ---------------------------------------------------------------------------
+  // Accumulators.
+  for (i <- 0 until tcnt) {
+    for (j <- 0 until tcnt) {
+      val (si, sj) = Interleave(i, j)
+
+      val aclr = io.op.clear || reset.asBool
+      val conv = io.op.conv
+      val load = (io.op.init || io.op.tran) && si.U === io.index
+
+      when (aclr || conv || load) {
+        acc(i)(j) := Mux(conv, dpa(i)(j),
+                         pload(sj * 32 + 31, sj * 32))
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Outputs.
+  val out = Wire(Vec(tcnt, Vec(tcnt, UInt(32.W))))
+
+  for (i <- 0 until tcnt) {
+    for (j <- 0 until tcnt) {
+      val (si, sj) = Interleave(i, j)
+      out(si)(sj) := acc(i)(j)
+    }
+  }
+
+  for (i <- 0 until tcnt) {
+    io.out(i) := out(i).asUInt
+  }
+}
+
+object EmitVConvAlu extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VConvAlu(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VConvCtrl.scala b/hdl/chisel/src/kelvin/vector/VConvCtrl.scala
new file mode 100644
index 0000000..78fa0a4
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VConvCtrl.scala
@@ -0,0 +1,197 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VConvCtrl {
+  def apply(p: Parameters): VConvCtrl = {
+    Module(new VConvCtrl(p))
+  }
+}
+
+class VConvCtrl(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Instructions.
+    val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+    val active = Output(UInt(64.W))
+
+    // RegisterFile.
+    val vrfsb = Input(UInt(128.W))
+    val out = new VRegfileConvIO(p)
+  })
+
+  // A usable depth of outstanding commands.
+  val cmdqDepth = 4
+
+  val e = new VEncodeOp()
+
+  // ---------------------------------------------------------------------------
+  // Command Queue.
+  class VConvCtrlCmdq extends Bundle {
+    val conv   = Bool()  // convolution
+    val init   = Bool()  // initialize (set)
+    val tran   = Bool()  // transpose
+    val wclr   = Bool()  // get and clear (marks last cycle)
+    val addr1  = UInt(6.W)
+    val addr2  = UInt(6.W)
+    val base2  = UInt(6.W)
+    val mode   = UInt(2.W)
+    val mark2  = UInt((p.vectorBits / 32).W)
+    val index  = UInt(log2Ceil(p.vectorBits / 32).W)
+    val end    = UInt(log2Ceil(p.vectorBits / 32).W)
+    val abias  = UInt(9.W)
+    val bbias  = UInt(9.W)
+    val asign  = Bool()
+    val bsign  = Bool()
+  }
+
+  def Fin(in: VDecodeBits): VConvCtrlCmdq = {
+    val out = Wire(new VConvCtrlCmdq)
+
+    val vcget  = in.op === e.vcget.U
+    val acset  = in.op === e.acset.U
+    val actr   = in.op === e.actr.U
+    val aconv  = in.op === e.aconv.U
+
+    val addr1 = in.vs.addr
+    val addr2 = Mux(acset, in.vs.addr, in.vu.addr)
+    val data  = in.sv.data
+    val sp    = (p.vectorBits / 32) - 1
+    val mark2 = Wire(UInt((p.vectorBits / 32).W))
+    val start = Mux(acset || actr, 0.U,  data(6,2))
+    val stop  = Mux(acset || actr, sp.U, data(11,7))
+
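+    // mark2 is a run of (stop - start + 1) ones; Factive shifts it up to
+    // base2 and strips the low bits already consumed by the step counter.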
+    if (p.vectorBits == 128) {
+      mark2 := 0xf.U >> (3.U - (stop(1,0) - start(1,0)))
+    } else if (p.vectorBits == 256) {
+      mark2 := 0xff.U >> (7.U - (stop(2,0) - start(2,0)))
+    } else if (p.vectorBits == 512) {
+      mark2 := 0xffff.U >> (15.U - (stop(3,0) - start(3,0)))
+    } else {
+      assert(false, "unsupported vectorBits")
+    }
+
+    out.conv  := aconv
+    out.init  := acset
+    out.tran  := actr
+    out.wclr  := vcget
+    out.addr1 := addr1
+    out.addr2 := addr2
+    out.base2 := addr2
+    out.mode  := data(1,0)
+    out.mark2 := mark2
+    out.index := start
+    out.end   := stop
+    out.abias := data(20,12)
+    out.asign := data(21)
+    out.bbias := data(30,22)
+    out.bsign := data(31)
+
+    out
+  }
+
+  def Fout(in: VConvCtrlCmdq, m: Bool, step: UInt, valid: Bool): (VConvCtrlCmdq, Bool) = {
+    when (valid) {
+      assert(m === false.B)
+      assert(in.index <= in.end)
+
+      if (p.vectorBits == 128) {
+        assert(in.addr1(1,0) === 0.U)
+      } else if (p.vectorBits == 256) {
+        assert(in.addr1(2,0) === 0.U)
+      } else if (p.vectorBits == 512) {
+        assert(in.addr1(3,0) === 0.U)
+      }
+    }
+
+    val out = Wire(new VConvCtrlCmdq)
+    val last = in.index === in.end || in.wclr
+
+    out := in
+    out.index := in.index + 1.U
+    out.addr2 := in.addr2 + 1.U
+
+    (out, last)
+  }
+
+  def Factive(in: VConvCtrlCmdq, m: Bool, step: UInt): UInt = {
+    val active1 = Wire(UInt(64.W))
+    val active2 = Wire(UInt(64.W))
+
+    val addr1 = in.addr1
+    val addr2 = in.addr2
+
+    // (mark2 & (mark2 << step)) clears the lsb bits.
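+    // For example: with mark2 = 0b0111 and step = 1, (mark2 << 1) = 0b1110 and
+    // the AND gives 0b0110, i.e. the low `step` bits of the window are cleared.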
+    if (p.vectorBits == 128) {
+      active1 := 0xf.U << Cat(addr1(5,2), 0.U(2.W))
+      active2 := ((in.mark2 & (in.mark2 << step(1,0))) << in.base2)(63,0)
+    } else if (p.vectorBits == 256) {
+      active1 := 0xff.U << Cat(addr1(5,3), 0.U(3.W))
+      active2 := ((in.mark2 & (in.mark2 << step(2,0))) << in.base2)(63,0)
+    } else if (p.vectorBits == 512) {
+      active1 := 0xffff.U << Cat(addr1(5,4), 0.U(4.W))
+      active2 := ((in.mark2 & (in.mark2 << step(3,0))) << in.base2)(63,0)
+    } else {
+      assert(false)
+    }
+
+    // Only reads are reported in active; vrfsb tracks writes.
+    val active = MuxOR(in.conv || in.tran, active1) |
+                 MuxOR(in.conv || in.init, active2)
+
+    active
+  }
+
+  val q = VCmdq(cmdqDepth, new VConvCtrlCmdq, Fin, Fout, Factive)
+
+  q.io.in <> io.in
+
+  // ---------------------------------------------------------------------------
+  // VRegfile Conv.
+  val active = Factive(q.io.out.bits, false.B, 0.U)
+
+  // Write ports take 2 cycles to commit to the register store, but 3 cycles must
+  // be accounted for due to ALU-to-ALU scoreboard forwarding.
+  val vrfsb0 = io.vrfsb(63,0) | io.vrfsb(127,64)
+  val vrfsb1 = RegInit(0.U(64.W))
+  val vrfsb2 = RegInit(0.U(64.W))
+  val vrfsb = vrfsb0 | vrfsb1 | vrfsb2
+  vrfsb1 := vrfsb0
+  vrfsb2 := vrfsb1
+
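+  // In effect the check below ORs the live scoreboard (vrfsb0) with two delayed
+  // copies (vrfsb1, vrfsb2), so a register remains busy here for two extra cycles
+  // after its scoreboard bit clears, covering the write-commit latency above.
+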
+  val ready = (active & vrfsb) === 0.U
+
+  q.io.out.ready := ready
+
+  io.out.valid := q.io.out.valid
+  io.out.ready := ready
+
+  io.out.op.conv := q.io.out.bits.conv
+  io.out.op.init := q.io.out.bits.init
+  io.out.op.tran := q.io.out.bits.tran
+  io.out.op.wclr := q.io.out.bits.wclr
+
+  io.out.mode  := q.io.out.bits.mode
+  io.out.index := q.io.out.bits.index
+  io.out.addr1 := q.io.out.bits.addr1
+  io.out.addr2 := q.io.out.bits.addr2
+  io.out.abias := q.io.out.bits.abias
+  io.out.asign := q.io.out.bits.asign
+  io.out.bbias := q.io.out.bits.bbias
+  io.out.bsign := q.io.out.bits.bsign
+
+  assert(!(q.io.out.bits.wclr && !q.io.out.ready))
+
+  assert(!(io.out.valid && io.out.ready) ||
+         PopCount(Cat(io.out.op.conv, io.out.op.init, io.out.op.tran, io.out.op.wclr)) === 1.U)
+
+  // ---------------------------------------------------------------------------
+  // Active.
+  io.active := q.io.active
+}
+
+object EmitVConvCtrl extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VConvCtrl(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VCore.scala b/hdl/chisel/src/kelvin/vector/VCore.scala
new file mode 100644
index 0000000..0ce01f4
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VCore.scala
@@ -0,0 +1,344 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VCore {
+  def apply(p: Parameters): VCore = {
+    return Module(new VCore(p))
+  }
+}
+
+// object VCore {
+//   def apply(p: Parameters): VCoreEmpty = {
+//     return Module(new VCoreEmpty(p))
+//   }
+// }
+
+class VCoreIO(p: Parameters) extends Bundle {
+  // Decode cycle.
+  val vinst = Vec(4, new VInstIO)
+
+  // Execute cycle.
+  val rs = Vec(8, Flipped(new RegfileReadDataIO))
+  val rd = Vec(4, Flipped(new RegfileWriteDataIO))
+
+  // Status.
+  val mactive = Output(Bool())
+
+  // Faults.
+  val undef = Output(Bool())
+}
+
+class VCore(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Score <> VCore
+    val score = new VCoreIO(p)
+
+    // Data bus interface.
+    val dbus = new DBusIO(p)
+    val last = Output(Bool())
+
+    // AXI interface.
+    val ld = new AxiMasterReadIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+    val st = new AxiMasterWriteIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+  })
+
+  // Decode    : VInst.in
+  // Execute+0 : VInst.slice
+  // Execute+1 : VInst.out <> VDec::Fifo.in
+  // Execute+2 : VDec::Fifo.out <> VDec::Shuffle.in
+  // Execute+3 : VDec::Shuffle.out <> VCmdq::Fifo.in
+  // Execute+4 : VCmdq::Fifo.out <> VCmdq::Reg.in
+  // Execute+5 : VCmdq::Reg.out <> {VLdSt, VAlu, ...}
+
+  val vinst  = VInst(p)
+  val vdec   = VDecode(p)
+  val valu   = VAlu(p)
+  val vconv  = VConvCtrl(p)
+  val vldst  = VLdSt(p)
+  val vld    = VLd(p)
+  val vst    = VSt(p)
+  val vrf    = VRegfile(p)
+
+  vinst.io.in <> io.score.vinst
+  vinst.io.rs <> io.score.rs
+  vinst.io.rd <> io.score.rd
+
+  assert(PopCount(Cat(vst.io.read.valid && vst.io.read.ready,
+                      vldst.io.read.valid && vldst.io.read.ready)) <= 1.U)
+
+  // ---------------------------------------------------------------------------
+  // VDecode.
+  vdec.io.vrfsb <> vrf.io.vrfsb
+
+  vdec.io.active := valu.io.active | vconv.io.active | vldst.io.active | vst.io.active
+
+  vdec.io.in.valid := vinst.io.out.valid
+  vinst.io.out.ready := vdec.io.in.ready
+  assert(!(vdec.io.in.valid && !vdec.io.in.ready))
+
+  vinst.io.out.stall := vdec.io.stall  // decode backpressure
+
+  for (i <- 0 until 4) {
+    vdec.io.in.bits(i) := vinst.io.out.lane(i)
+  }
+
+  io.score.undef := vdec.io.undef
+
+  // ---------------------------------------------------------------------------
+  // VRegfile.
+  for (i <- 0 until 7) {
+    vrf.io.read(i).valid := false.B
+    vrf.io.read(i).addr := 0.U
+    vrf.io.read(i).tag := 0.U
+  }
+
+  for (i <- 0 until 6) {
+    vrf.io.write(i).valid := false.B
+    vrf.io.write(i).addr := 0.U
+    vrf.io.write(i).data := 0.U
+  }
+
+  for (i <- 0 until 4) {
+    vrf.io.whint(i).valid := false.B
+    vrf.io.whint(i).addr := 0.U
+  }
+
+  for (i <- 0 until 2) {
+    vrf.io.scalar(i).valid := false.B
+    vrf.io.scalar(i).data := 0.U
+  }
+
+  vrf.io.transpose.valid := false.B
+  vrf.io.transpose.index := 0.U
+  vrf.io.transpose.addr  := 0.U
+
+  // ---------------------------------------------------------------------------
+  // VALU.
+  val aluvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).alu,
+                     vdec.io.out(2).valid && vdec.io.cmdq(2).alu,
+                     vdec.io.out(1).valid && vdec.io.cmdq(1).alu,
+                     vdec.io.out(0).valid && vdec.io.cmdq(0).alu)
+
+  val aluready = Cat(valu.io.in.ready && vdec.io.cmdq(3).alu,
+                     valu.io.in.ready && vdec.io.cmdq(2).alu,
+                     valu.io.in.ready && vdec.io.cmdq(1).alu,
+                     valu.io.in.ready && vdec.io.cmdq(0).alu)
+
+  valu.io.in.valid := aluvalid =/= 0.U
+
+  for (i <- 0 until 4) {
+    valu.io.in.bits(i).valid := aluvalid(i)
+    valu.io.in.bits(i).bits := vdec.io.out(i).bits
+  }
+
+  for (i <- 0 until 7) {
+    vrf.io.read(i).valid := valu.io.read(i).valid
+    vrf.io.read(i).addr := valu.io.read(i).addr
+    vrf.io.read(i).tag  := valu.io.read(i).tag
+  }
+
+  for (i <- 0 until 7) {
+    valu.io.read(i).data := vrf.io.read(i).data
+  }
+
+  for (i <- 0 until 4) {
+    vrf.io.write(i).valid := valu.io.write(i).valid
+    vrf.io.write(i).addr := valu.io.write(i).addr
+    vrf.io.write(i).data := valu.io.write(i).data
+
+    vrf.io.whint(i).valid := valu.io.whint(i).valid
+    vrf.io.whint(i).addr := valu.io.whint(i).addr
+  }
+
+  for (i <- 0 until 2) {
+    vrf.io.scalar(i).valid := valu.io.scalar(i).valid
+    vrf.io.scalar(i).data := valu.io.scalar(i).data
+  }
+
+  valu.io.vrfsb := vrf.io.vrfsb.data
+
+  // ---------------------------------------------------------------------------
+  // VCONV.
+  val convvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).conv,
+                      vdec.io.out(2).valid && vdec.io.cmdq(2).conv,
+                      vdec.io.out(1).valid && vdec.io.cmdq(1).conv,
+                      vdec.io.out(0).valid && vdec.io.cmdq(0).conv)
+
+  val convready = Cat(vconv.io.in.ready && vdec.io.cmdq(3).conv,
+                      vconv.io.in.ready && vdec.io.cmdq(2).conv,
+                      vconv.io.in.ready && vdec.io.cmdq(1).conv,
+                      vconv.io.in.ready && vdec.io.cmdq(0).conv)
+
+  vconv.io.in.valid := convvalid =/= 0.U
+
+  for (i <- 0 until 4) {
+    vconv.io.in.bits(i).valid := convvalid(i)
+    vconv.io.in.bits(i).bits := vdec.io.out(i).bits
+  }
+
+  vrf.io.conv := vconv.io.out
+
+  vconv.io.vrfsb := vrf.io.vrfsb.data
+
+  // ---------------------------------------------------------------------------
+  // VLdSt.
+  val ldstvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).ldst,
+                      vdec.io.out(2).valid && vdec.io.cmdq(2).ldst,
+                      vdec.io.out(1).valid && vdec.io.cmdq(1).ldst,
+                      vdec.io.out(0).valid && vdec.io.cmdq(0).ldst)
+
+  val ldstready = Cat(vldst.io.in.ready && vdec.io.cmdq(3).ldst,
+                      vldst.io.in.ready && vdec.io.cmdq(2).ldst,
+                      vldst.io.in.ready && vdec.io.cmdq(1).ldst,
+                      vldst.io.in.ready && vdec.io.cmdq(0).ldst)
+
+  vldst.io.in.valid := ldstvalid =/= 0.U
+
+  for (i <- 0 until 4) {
+    vldst.io.in.bits(i).valid := ldstvalid(i)
+    vldst.io.in.bits(i).bits := vdec.io.out(i).bits
+  }
+
+  vldst.io.read.ready := !vst.io.read.valid
+  vldst.io.read.data := vrf.io.read(6).data
+
+  vldst.io.vrfsb := vrf.io.vrfsb.data
+
+  io.dbus <> vldst.io.dbus
+  io.last := vldst.io.last
+
+  // ---------------------------------------------------------------------------
+  // VLd.
+  val ldvalid = Wire(UInt(4.W))
+  val ldready = Wire(UInt(4.W))
+
+  ldvalid := Cat(vdec.io.cmdq(3).ld && vdec.io.out(3).valid,
+                 vdec.io.cmdq(2).ld && vdec.io.out(2).valid,
+                 vdec.io.cmdq(1).ld && vdec.io.out(1).valid,
+                 vdec.io.cmdq(0).ld && vdec.io.out(0).valid)
+
+  ldready := Cat(vdec.io.cmdq(3).ld && vld.io.in.ready,
+                 vdec.io.cmdq(2).ld && vld.io.in.ready,
+                 vdec.io.cmdq(1).ld && vld.io.in.ready,
+                 vdec.io.cmdq(0).ld && vld.io.in.ready)
+
+  vld.io.in.valid := ldvalid =/= 0.U
+
+  for (i <- 0 until 4) {
+    vld.io.in.bits(i).valid := ldvalid(i)
+    vld.io.in.bits(i).bits := vdec.io.out(i).bits
+  }
+
+  io.ld <> vld.io.axi
+
+  // ---------------------------------------------------------------------------
+  // VSt.
+  val stvalid = Wire(UInt(4.W))
+  val stready = Wire(UInt(4.W))
+
+  stvalid := Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).st,
+                 vdec.io.out(2).valid && vdec.io.cmdq(2).st,
+                 vdec.io.out(1).valid && vdec.io.cmdq(1).st,
+                 vdec.io.out(0).valid && vdec.io.cmdq(0).st)
+
+  stready := Cat(vst.io.in.ready && vdec.io.cmdq(3).st,
+                 vst.io.in.ready && vdec.io.cmdq(2).st,
+                 vst.io.in.ready && vdec.io.cmdq(1).st,
+                 vst.io.in.ready && vdec.io.cmdq(0).st)
+
+  vst.io.in.valid := stvalid =/= 0.U
+
+  for (i <- 0 until 4) {
+    vst.io.in.bits(i).valid := stvalid(i)
+    vst.io.in.bits(i).bits := vdec.io.out(i).bits
+  }
+
+  io.st <> vst.io.axi
+
+  vst.io.vrfsb := vrf.io.vrfsb.data
+
+  vst.io.read.ready := true.B
+  vst.io.read.data := vrf.io.read(6).data
+
+  // ---------------------------------------------------------------------------
+  // Load write.
+  vrf.io.write(4).valid := vldst.io.write.valid
+  vrf.io.write(4).addr := vldst.io.write.addr
+  vrf.io.write(4).data := vldst.io.write.data
+
+  vrf.io.write(5).valid := vld.io.write.valid
+  vrf.io.write(5).addr := vld.io.write.addr
+  vrf.io.write(5).data := vld.io.write.data
+
+  // ---------------------------------------------------------------------------
+  // Store read.
+  vrf.io.read(6).valid := vst.io.read.valid || vldst.io.read.valid
+  vrf.io.read(6).addr := Mux(vst.io.read.valid, vst.io.read.addr,
+                             vldst.io.read.addr)
+  vrf.io.read(6).tag := Mux(vst.io.read.valid, vst.io.read.tag,
+                            vldst.io.read.tag)
+
+  // ---------------------------------------------------------------------------
+  // VDecode.
+  for (i <- 0 until 4) {
+    vdec.io.out(i).ready := aluready(i) || convready(i) || ldstready(i) ||
+                            ldready(i) || stready(i)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Memory active status.
+  io.score.mactive := vinst.io.nempty || vdec.io.nempty ||
+                      vld.io.nempty || vst.io.nempty
+}
+
+class VCoreEmpty(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Score <> VCore
+    val score = new VCoreIO(p)
+
+    // Data bus interface.
+    val dbus = new DBusIO(p)
+    val last = Output(Bool())
+
+    // AXI interface.
+    val ld = new AxiMasterReadIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+    val st = new AxiMasterWriteIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+  })
+
+  io.score.undef := io.score.vinst(0).valid || io.score.vinst(1).valid ||
+                    io.score.vinst(2).valid || io.score.vinst(3).valid
+
+  io.score.mactive := false.B
+
+  io.dbus.valid := false.B
+  io.dbus.write := false.B
+  io.dbus.size := 0.U
+  io.dbus.addr := 0.U
+  io.dbus.adrx := 0.U
+  io.dbus.wdata := 0.U
+  io.dbus.wmask := 0.U
+  io.last := false.B
+
+  for (i <- 0 until 4) {
+    io.score.vinst(i).ready := true.B
+    io.score.rd(i).valid := false.B
+    io.score.rd(i).addr := 0.U
+    io.score.rd(i).data := 0.U
+  }
+
+  io.ld.addr.valid := false.B
+  io.ld.addr.bits.addr := 0.U
+  io.ld.addr.bits.id := 0.U
+  io.ld.data.ready := false.B
+
+  io.st.addr.valid := false.B
+  io.st.addr.bits.addr := 0.U
+  io.st.addr.bits.id := 0.U
+  io.st.data.valid := false.B
+  io.st.data.bits.data := 0.U
+  io.st.data.bits.strb := 0.U
+  io.st.resp.ready := false.B
+}
diff --git a/hdl/chisel/src/kelvin/vector/VDecode.scala b/hdl/chisel/src/kelvin/vector/VDecode.scala
new file mode 100644
index 0000000..3451adc
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VDecode.scala
@@ -0,0 +1,440 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common.Fifo4x4
+
+object VDecode {
+  def apply(p: Parameters): VDecode = {
+    return Module(new VDecode(p))
+  }
+}
+
+class VDecode(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val in = Flipped(Decoupled(Vec(4, Valid(new VectorInstructionLane))))
+    val out = Vec(4, Decoupled(new VDecodeBits))
+    val cmdq = Vec(4, Output(new VDecodeCmdq))
+    val actv = Vec(4, Output(new VDecodeActive))  // used in testbench
+    val stall = Output(Bool())
+    val active = Input(UInt(64.W))
+    val vrfsb = new VRegfileScoreboardIO
+    val undef = Output(Bool())
+    val nempty = Output(Bool())
+  })
+
+  val guard = 8  // two cycles of 4-way dispatch
+  val depth = 16 + guard
+
+  val enc = new VEncodeOp()
+
+  val f = Fifo4x4(new VectorInstructionLane, depth)
+
+  val d = Seq(Module(new VDecodeInstruction(p)),
+              Module(new VDecodeInstruction(p)),
+              Module(new VDecodeInstruction(p)),
+              Module(new VDecodeInstruction(p)))
+
+  val e = Wire(Vec(4, new VDecodeBits))
+
+  val valid = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val data = Reg(Vec(4, new VDecodeBits))
+  val cmdq = Reg(Vec(4, new VDecodeCmdq))
+  val actv = Wire(Vec(4, new VDecodeActive))
+  val actv2 = Reg(Vec(4, new VDecodeActive2))
+  val dataNxt = Wire(Vec(4, new VDecodeBits))
+  val cmdqNxt = Wire(Vec(4, new VDecodeCmdq))
+  val actvNxt = Wire(Vec(4, new VDecodeActive2))
+
+  // ---------------------------------------------------------------------------
+  // Decode.
+  for (i <- 0 until 4) {
+    d(i).io.in := f.io.out(i).bits
+  }
+
+  // ---------------------------------------------------------------------------
+  // Apply "out-of-order" tags to read/write registers.
+  // Since only one write may be outstanding per register, a 1-bit tag tracks
+  // which side of that write each read occurs on.
+  val tagReg = RegInit(0.U(64.W))
+
+  val tag0 = tagReg
+  val tag1 = tag0 ^ d(0).io.actv.wactive
+  val tag2 = tag1 ^ d(1).io.actv.wactive
+  val tag3 = tag2 ^ d(2).io.actv.wactive
+  val tag4 = tag3 ^ d(3).io.actv.wactive
+
+  val tags = Seq(tag0, tag1, tag2, tag3, tag4)
+
+  // f.io.out is ordered, so a priority tree can be used.
+  when(f.io.out(3).valid && f.io.out(3).ready) {
+    tagReg := tag4
+  } .elsewhen(f.io.out(2).valid && f.io.out(2).ready) {
+    tagReg := tag3
+  } .elsewhen(f.io.out(1).valid && f.io.out(1).ready) {
+    tagReg := tag2
+  } .elsewhen(f.io.out(0).valid && f.io.out(0).ready) {
+    tagReg := tag1
+  }
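+
+  // For example (a sketch): if lane 0 writes v4, tag1 flips bit 4 relative to
+  // tag0, so reads of v4 decoded in later lanes carry the post-write tag while
+  // lane 0's own reads still carry the pre-write tag.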
+
+  def TagAddr(tag: UInt, v: VAddrTag): VAddrTag = {
+    assert(tag.getWidth == 64)
+    assert(v.addr.getWidth == 6)
+    assert(v.tag === 0.U)
+    val addr = v.addr
+    val addrm = addr(5,2)
+    val tagm = Wire(Vec(16, UInt(4.W)))
+    for (i <- 0 until 16) {
+      tagm(i) := tag(4 * i + 3, 4 * i)
+    }
+    val r = Wire(new VAddrTag())
+    r.valid := v.valid
+    r.addr := v.addr
+    r.tag := VecAt(tagm, addrm)
+    r
+  }
+
+  for (i <- 0 until 4) {
+    e(i) := d(i).io.out
+    e(i).vs := TagAddr(tags(i), d(i).io.out.vs)
+    e(i).vt := TagAddr(tags(i), d(i).io.out.vt)
+    e(i).vu := TagAddr(tags(i), d(i).io.out.vu)
+    e(i).vx := TagAddr(tags(i), d(i).io.out.vx)
+    e(i).vy := TagAddr(tags(i), d(i).io.out.vy)
+    e(i).vz := TagAddr(tags(i), d(i).io.out.vz)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Undef. (io.in.ready is ignored so undef is signaled as early as possible.)
+  io.undef := io.in.valid && (d(0).io.undef || d(1).io.undef || d(2).io.undef || d(3).io.undef)
+
+  // ---------------------------------------------------------------------------
+  // Fifo.
+  f.io.in <> io.in
+
+  val icount = MuxOR(io.in.valid, PopCount(Cat(io.in.bits(0).valid, io.in.bits(1).valid, io.in.bits(2).valid, io.in.bits(3).valid)))
+  assert(icount.getWidth == 3)
+
+  val ocount = PopCount(Cat(valid(0) && !(io.out(0).valid && io.out(0).ready),
+                            valid(1) && !(io.out(1).valid && io.out(1).ready),
+                            valid(2) && !(io.out(2).valid && io.out(2).ready),
+                            valid(3) && !(io.out(3).valid && io.out(3).ready)))
+  assert(ocount.getWidth == 3)
+
+  for (i <- 0 until 4) {
+    f.io.out(i).ready := (i.U + ocount) < 4.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Valid.
+  val fcount = PopCount(Cat(f.io.out(0).valid && f.io.out(0).ready,
+                            f.io.out(1).valid && f.io.out(1).ready,
+                            f.io.out(2).valid && f.io.out(2).ready,
+                            f.io.out(3).valid && f.io.out(3).ready))
+  assert(fcount.getWidth == 3)
+
+  for (i <- 0 until 4) {
+    valid(i) := (ocount + fcount) > i.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Stall.
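+  // Backpressure once the fifo could not absorb another two cycles of 4-way
+  // dispatch (the `guard` entries reserved above).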
+  io.stall := (f.io.count + icount) > (depth - guard).U
+
+  // ---------------------------------------------------------------------------
+  // Dependencies.
+  val depends = Wire(Vec(4, Bool()))
+
+  // Writes must not proceed past any outstanding reads or writes,
+  // or past any dispatching writes.
+  val wactive0 = io.vrfsb.data(63, 0) | io.vrfsb.data(127, 64) | io.active
+  val wactive1 = actv(0).ractive | actv(0).wactive | wactive0
+  val wactive2 = actv(1).ractive | actv(1).wactive | wactive1
+  val wactive3 = actv(2).ractive | actv(2).wactive | wactive2
+  val wactive = VecInit(wactive0, wactive1, wactive2, wactive3)
+
+  // Reads must not proceed past any dispatching writes.
+  val ractive0 = 0.U(64.W)
+  val ractive1 = actv(0).wactive | ractive0
+  val ractive2 = actv(1).wactive | ractive1
+  val ractive3 = actv(2).wactive | ractive2
+  val ractive = VecInit(ractive0, ractive1, ractive2, ractive3)
+
+  for (i <- 0 until 4) {
+    depends(i) := (wactive(i) & actv(i).wactive) =/= 0.U ||
+                  (ractive(i) & actv(i).ractive) =/= 0.U
+  }
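+
+  // For example (a sketch): if slot 0 is dispatching a write to v8, a slot 1 read
+  // or write of v8 sets depends(1), so slot 1 is not issued in the same group.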
+
+  // ---------------------------------------------------------------------------
+  // Data.
+  val fvalid = VecInit(f.io.out(0).valid, f.io.out(1).valid,
+                       f.io.out(2).valid, f.io.out(3).valid).asUInt
+  assert(!(fvalid(1) && fvalid(0,0) =/= 1.U))
+  assert(!(fvalid(2) && fvalid(1,0) =/= 3.U))
+  assert(!(fvalid(3) && fvalid(2,0) =/= 7.U))
+
+  // Registers are updated when the fifo has entries or the current contents are active.
+  val dataEn = fvalid(0) || valid.asUInt =/= 0.U
+
+  for (i <- 0 until 4) {
+    when (dataEn) {
+      data(i) := dataNxt(i)
+      cmdq(i) := cmdqNxt(i)
+      actv2(i) := actvNxt(i)
+    }
+  }
+
+  for (i <- 0 until 4) {
+    actv(i).ractive := actv2(i).ractive
+    actv(i).wactive := actv2(i).wactive(63, 0) | actv2(i).wactive(127, 64)
+  }
+
+  // Tag the decode wactive.
+  val dactv = Wire(Vec(4, new VDecodeActive2))
+  for (i <- 0 until 4) {
+    val w0 = d(i).io.actv.wactive & ~tags(i + 1)
+    val w1 = d(i).io.actv.wactive &  tags(i + 1)
+    dactv(i).ractive := d(i).io.actv.ractive
+    dactv(i).wactive := Cat(w1, w0)
+  }
+
+  // Data multiplexor of current values and fifo+decode output.
+  val dataMux = VecInit(data(0), data(1), data(2), data(3),
+                        e(0), e(1), e(2), e(3))
+
+  val cmdqMux = VecInit(cmdq(0), cmdq(1), cmdq(2), cmdq(3),
+                        d(0).io.cmdq, d(1).io.cmdq, d(2).io.cmdq, d(3).io.cmdq)
+
+  val actvMux = VecInit(actv2(0), actv2(1), actv2(2), actv2(3),
+                        dactv(0), dactv(1), dactv(2), dactv(3))
+
+  // Mark the multiplexor entries that need to be kept.
+  val marked0 = Wire(UInt(5.W))
+  val marked1 = Wire(UInt(6.W))
+  val marked2 = Wire(UInt(7.W))
+
+  assert((marked1 & marked0) === marked0)
+  assert((marked2 & marked0) === marked0)
+  assert((marked2 & marked1) === marked1)
+
+  val output = Cat(io.out(3).valid && io.out(3).ready,
+                   io.out(2).valid && io.out(2).ready,
+                   io.out(1).valid && io.out(1).ready,
+                   io.out(0).valid && io.out(0).ready)
+
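+  // Compaction example (a sketch): with valid = b1111 and output = b0001, slot 0
+  // has issued, so dataNxt(0..2) take the old slots 1..3 and dataNxt(3) takes the
+  // first fifo/decode entry (dataMux(4)); markedN records sources already consumed.
+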
+  when (valid(0) && !output(0)) {
+    dataNxt(0) := dataMux(0)
+    cmdqNxt(0) := cmdqMux(0)
+    actvNxt(0) := actvMux(0)
+    marked0 := 0x01.U
+  } .elsewhen (valid(1) && !output(1)) {
+    dataNxt(0) := dataMux(1)
+    cmdqNxt(0) := cmdqMux(1)
+    actvNxt(0) := actvMux(1)
+    marked0 := 0x03.U
+  } .elsewhen (valid(2) && !output(2)) {
+    dataNxt(0) := dataMux(2)
+    cmdqNxt(0) := cmdqMux(2)
+    actvNxt(0) := actvMux(2)
+    marked0 := 0x07.U
+  } .elsewhen (valid(3) && !output(3)) {
+    dataNxt(0) := dataMux(3)
+    cmdqNxt(0) := cmdqMux(3)
+    actvNxt(0) := actvMux(3)
+    marked0 := 0x0f.U
+  } .otherwise {
+    dataNxt(0) := dataMux(4)
+    cmdqNxt(0) := cmdqMux(4)
+    actvNxt(0) := actvMux(4)
+    marked0 := 0x1f.U
+  }
+
+  when (!marked0(1) && valid(1) && !output(1)) {
+    dataNxt(1) := dataMux(1)
+    cmdqNxt(1) := cmdqMux(1)
+    actvNxt(1) := actvMux(1)
+    marked1 := 0x03.U
+  } .elsewhen (!marked0(2) && valid(2) && !output(2)) {
+    dataNxt(1) := dataMux(2)
+    cmdqNxt(1) := cmdqMux(2)
+    actvNxt(1) := actvMux(2)
+    marked1 := 0x07.U
+  } .elsewhen (!marked0(3) && valid(3) && !output(3)) {
+    dataNxt(1) := dataMux(3)
+    cmdqNxt(1) := cmdqMux(3)
+    actvNxt(1) := actvMux(3)
+    marked1 := 0x0f.U
+  } .elsewhen (!marked0(4)) {
+    dataNxt(1) := dataMux(4)
+    cmdqNxt(1) := cmdqMux(4)
+    actvNxt(1) := actvMux(4)
+    marked1 := 0x1f.U
+  } .otherwise {
+    dataNxt(1) := dataMux(5)
+    cmdqNxt(1) := cmdqMux(5)
+    actvNxt(1) := actvMux(5)
+    marked1 := 0x3f.U
+  }
+
+  when (!marked1(2) && valid(2) && !output(2)) {
+    dataNxt(2) := dataMux(2)
+    cmdqNxt(2) := cmdqMux(2)
+    actvNxt(2) := actvMux(2)
+    marked2 := 0x07.U
+  } .elsewhen (!marked1(3) && valid(3) && !output(3)) {
+    dataNxt(2) := dataMux(3)
+    cmdqNxt(2) := cmdqMux(3)
+    actvNxt(2) := actvMux(3)
+    marked2 := 0x0f.U
+  } .elsewhen (!marked1(4)) {
+    dataNxt(2) := dataMux(4)
+    cmdqNxt(2) := cmdqMux(4)
+    actvNxt(2) := actvMux(4)
+    marked2 := 0x1f.U
+  } .elsewhen (!marked1(5)) {
+    dataNxt(2) := dataMux(5)
+    cmdqNxt(2) := cmdqMux(5)
+    actvNxt(2) := actvMux(5)
+    marked2 := 0x3f.U
+  } .otherwise {
+    dataNxt(2) := dataMux(6)
+    cmdqNxt(2) := cmdqMux(6)
+    actvNxt(2) := actvMux(6)
+    marked2 := 0x7f.U
+  }
+
+  when (!marked2(3) && valid(3) && !output(3)) {
+    dataNxt(3) := dataMux(3)
+    cmdqNxt(3) := cmdqMux(3)
+    actvNxt(3) := actvMux(3)
+  } .elsewhen (!marked2(4)) {
+    dataNxt(3) := dataMux(4)
+    cmdqNxt(3) := cmdqMux(4)
+    actvNxt(3) := actvMux(4)
+  } .elsewhen (!marked2(5)) {
+    dataNxt(3) := dataMux(5)
+    cmdqNxt(3) := cmdqMux(5)
+    actvNxt(3) := actvMux(5)
+  } .elsewhen (!marked2(6)) {
+    dataNxt(3) := dataMux(6)
+    cmdqNxt(3) := cmdqMux(6)
+    actvNxt(3) := actvMux(6)
+  } .otherwise {
+    dataNxt(3) := dataMux(7)
+    cmdqNxt(3) := cmdqMux(7)
+    actvNxt(3) := actvMux(7)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Scoreboard.
+  io.vrfsb.set.valid := output(0) || output(1) || output(2) || output(3)
+
+  io.vrfsb.set.bits := (MuxOR(output(0), actv2(0).wactive) |
+                        MuxOR(output(1), actv2(1).wactive) |
+                        MuxOR(output(2), actv2(2).wactive) |
+                        MuxOR(output(3), actv2(3).wactive))
+
+  assert((io.vrfsb.set.bits(63, 0) & io.vrfsb.set.bits(127, 64)) === 0.U)
+  assert(((io.vrfsb.data(63, 0) | io.vrfsb.data(127, 64)) & (io.vrfsb.set.bits(63, 0) | io.vrfsb.set.bits(127, 64))) === 0.U)
+
+  // ---------------------------------------------------------------------------
+  // Outputs.
+  val outvalid = Wire(Vec(4, Bool()))
+  val cmdsync = Wire(Vec(4, Bool()))
+
+  for (i <- 0 until 4) {
+    outvalid(i) := valid(i) && !depends(i)
+    cmdsync(i) := data(i).cmdsync
+  }
+
+  for (i <- 0 until 4) {
+    // Synchronize commands at a cmdsync instance or when one appears in the history.
+    // Note: in {vdwinit, vdwconv, vdmulh}, vdmulh must not issue before vdwconv.
+    val synchronize = cmdsync.asUInt(i,0) =/= 0.U
+    val ordered = (~outvalid.asUInt(i,0)) === 0.U
+    val unorder = outvalid(i)
+    if (false) {
+      io.out(i).valid := Mux(synchronize, ordered, unorder)
+    } else {
+      io.out(i).valid := ordered
+    }
+    io.out(i).bits := data(i)
+    io.cmdq(i) := cmdq(i)
+    io.actv(i) := actv(i)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Status.
+  val nempty = RegInit(false.B)
+
+  // Simple implementation, will overlap downstream units redundantly.
+  nempty := io.in.valid || f.io.nempty || valid.asUInt =/= 0.U
+
+  io.nempty := nempty
+}
+
+class VDecodeBits extends Bundle {
+  val op = UInt(new VEncodeOp().bits.W)
+  val f2 = UInt(3.W)  // func2
+  val sz = UInt(3.W)  // onehot size
+  val m  = Bool()     // stripmine
+
+  val vd = new VAddr()
+  val ve = new VAddr()
+  val vf = new VAddr()
+  val vg = new VAddr()
+  val vs = new VAddrTag()
+  val vt = new VAddrTag()
+  val vu = new VAddrTag()
+  val vx = new VAddrTag()
+  val vy = new VAddrTag()
+  val vz = new VAddrTag()
+  val sv = new SAddrData()
+
+  val cmdsync = Bool()  // Dual command queues synchronize.
+}
+
+class VDecodeCmdq extends Bundle {
+  val alu   = Bool()  // ALU
+  val conv  = Bool()  // Convolution vregfile
+  val ldst  = Bool()  // L1Dcache load/store
+  val ld    = Bool()  // Uncached load
+  val st    = Bool()  // Uncached store
+}
+
+class VDecodeActive extends Bundle {
+  val ractive = UInt(64.W)
+  val wactive = UInt(64.W)
+}
+
+class VDecodeActive2 extends Bundle {
+  val ractive = UInt(64.W)
+  val wactive = UInt(128.W)  // even/odd tags
+}
+
+class VAddr extends Bundle {
+  val valid = Bool()
+  val addr = UInt(6.W)
+}
+
+class VAddrTag extends Bundle {
+  val valid = Bool()
+  val addr = UInt(6.W)
+  val tag = UInt(4.W)
+}
+
+class SAddrData extends Bundle {
+  val valid = Bool()
+  val addr = UInt(32.W)
+  val data = UInt(32.W)
+}
+
+class SData extends Bundle {
+  val valid = Bool()
+  val data = UInt(32.W)
+}
+
+object EmitVDecode extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VDecode(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VDecodeInstruction.scala b/hdl/chisel/src/kelvin/vector/VDecodeInstruction.scala
new file mode 100644
index 0000000..33dc5f2
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VDecodeInstruction.scala
@@ -0,0 +1,623 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+class VDecodeInstruction(p: Parameters) extends Module {
+  val dec = new VDecodeOp()
+  val enc = new VEncodeOp()
+
+  val io = IO(new Bundle {
+    val in = new Bundle {
+      val inst = Input(UInt(32.W))
+      val addr = Input(UInt(32.W))
+      val data = Input(UInt(32.W))
+    }
+    val out = Output(new VDecodeBits)
+    val cmdq = Output(new VDecodeCmdq)
+    val actv = Output(new VDecodeActive)
+    val undef = Output(Bool())
+  })
+
+  val inst = io.in.inst
+  val addr = io.in.addr
+  val data = io.in.data
+
+  val v     = inst(0)  // .vv .vx
+  val x     = inst(1)  // .vx
+  val x3    = inst(2)  // .vxv
+  val func1 = inst(4,2)
+  val m     = inst(5)
+  val sz    = inst(13,12)
+  val func2 = inst(31,26)
+
+  val vdbits = inst(11,6)
+  val vsbits = inst(19,14)
+  val vtbits = inst(25,20)
+  val vubits = inst(31,26)
+
+  val quad = m && x  // dual issue across ALUs
+
+  val uncached = addr(31)
+
+  def DecodeFmt(f1: Int, f2: Int, mask: Int = 0): Bool = {
+    assert(inst.getWidth == 32)
+    val m2 = ~mask.U(6.W)  // unsigned, rounding, ...
+    v === 0.U && func1 === f1.U && (func2 & m2) === (f2.U & m2) && sz < 3.U
+  }
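+
+  // Bits set in `mask` are ignored in the func2 comparison, so variants that
+  // differ only in those bits (e.g. unsigned or rounding forms) decode to the
+  // same base operation.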
+
+  def ToM(a: UInt): UInt = {
+    val bbits = Wire(Vec(16, UInt(4.W)))
+    for (i <- 0 until 16) {
+      val v = a(i)
+      bbits(i) := Cat(v, v, v, v)
+    }
+    val b = bbits.asUInt
+    assert(a.getWidth == 16)
+    assert(b.getWidth == 64)
+    b
+  }
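+
+  // A quick sketch of what ToM computes: each bit of the 16-bit group mask is
+  // widened to a 4-bit register group, e.g. ToM("b11".U(16.W)) == "hff".U(64.W).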
+
+  def RActiveVsVt(i: Int): UInt = {
+    assert(i == 2 || i == 3)
+    val vs  = OneHot(vsbits, 64)
+    val vsm = MuxOR(m, ToM(OneHot(vsbits(5,2), 16)))
+    val vt =
+      if (i == 2) {
+        MuxOR(!x, OneHot(vtbits, 64))
+      } else {
+        MuxOR(!x3, OneHot(vtbits, 64))
+      }
+    val vtm =
+      if (i == 2) {
+        MuxOR(m && !x, ToM(OneHot(vtbits(5,2), 16)))
+      } else {
+        MuxOR(m && !x3, ToM(OneHot(vtbits(5,2), 16)))
+      }
+    assert(vs.getWidth == 64)
+    assert(vt.getWidth == 64)
+    assert(vsm.getWidth == 64)
+    assert(vtm.getWidth == 64)
+    vs | vsm | vt | vtm
+  }
+
+  def RActiveVs1(): UInt = {
+    // {vs+1} or {vsm+4}
+    val vs  = Cat(OneHot(vsbits, 64), 0.U(1.W))(63,0)
+    val vsm = MuxOR(m, Cat(ToM(OneHot(vsbits(5,2), 16)), 0.U(4.W))(63,0))
+    assert(vs.getWidth == 64)
+    assert(vsm.getWidth == 64)
+    vs | vsm
+  }
+
+  def RActiveVs2(): UInt = {
+    // {vs+2} or {vsm+8}
+    val vs  = Cat(OneHot(vsbits, 64), 0.U(2.W))(63,0)
+    val vsm = MuxOR(m, Cat(ToM(OneHot(vsbits(5,2), 16)), 0.U(8.W))(63,0))
+    assert(vs.getWidth == 64)
+    assert(vsm.getWidth == 64)
+    vs | vsm
+  }
+
+  def RActiveVs3(): UInt = {
+    // {vs+3} or {vsm+12}
+    val vs  = Cat(OneHot(vsbits, 64), 0.U(3.W))(63,0)
+    val vsm = MuxOR(m, Cat(ToM(OneHot(vsbits(5,2), 16)), 0.U(12.W))(63,0))
+    assert(vs.getWidth == 64)
+    assert(vsm.getWidth == 64)
+    vs | vsm
+  }
+
+  def RActiveVd(): UInt = {
+    val vd  = OneHot(vdbits, 64)
+    val vdm = MuxOR(m, ToM(OneHot(vdbits(5,2), 16)))
+    assert(vd.getWidth == 64)
+    assert(vdm.getWidth == 64)
+    vd | vdm
+  }
+
+  def RActiveVu(): UInt = {
+    val vu  = OneHot(vubits, 64)
+    val vum = MuxOR(m, ToM(OneHot(vubits(5,2), 16)))
+    assert(vu.getWidth == 64)
+    assert(vum.getWidth == 64)
+    vu | vum
+  }
+
+  def WActiveVd(): UInt = {
+    val vd  = OneHot(vdbits, 64)
+    val vdm = MuxOR(m, ToM(OneHot(vdbits(5,2), 16)))
+    assert(vd.getWidth == 64)
+    assert(vdm.getWidth == 64)
+    vd | vdm
+  }
+
+  def WActiveVd1(): UInt = {
+    // {vd+1} or {vdm+4}
+    val vd  = Cat(OneHot(vdbits, 64), 0.U(1.W))(63,0)
+    val vdm = MuxOR(m, Cat(ToM(OneHot(vdbits(5,2), 16)), 0.U(4.W))(63,0))
+    assert(vd.getWidth == 64)
+    assert(vdm.getWidth == 64)
+    vd | vdm
+  }
+
+  def DepthwiseRead(): (UInt, UInt, UInt, UInt, UInt, UInt, UInt) = {
+    val vstbl = VecInit(0.U, 1.U, 2.U, 3.U, 4.U, 5.U, 6.U, 1.U, 1.U, 3.U, 5.U, 7.U, 2.U, 4.U, 6.U, 8.U)
+    val vttbl = VecInit(1.U, 2.U, 3.U, 4.U, 5.U, 6.U, 7.U, 0.U, 2.U, 4.U, 6.U, 8.U, 0.U, 0.U, 0.U, 0.U)
+    val vutbl = VecInit(2.U, 3.U, 4.U, 5.U, 6.U, 7.U, 8.U, 2.U, 0.U, 0.U, 0.U, 0.U, 1.U, 1.U, 1.U, 1.U)
+
+    val regbase = data(7,4)
+
+    val vs = vsbits + vstbl(regbase)
+    val vt = vsbits + vttbl(regbase)
+    val vu = vsbits + vutbl(regbase)
+    assert(vs.getWidth == 6)
+    assert(vt.getWidth == 6)
+    assert(vu.getWidth == 6)
+
+    val vx = vubits
+    val vy = vubits + Mux(m, 4.U, 1.U)
+    val vz = vubits + Mux(m, 8.U, 2.U)
+    assert(vx.getWidth == 6)
+    assert(vy.getWidth == 6)
+    assert(vz.getWidth == 6)
+
+    val ra_vs  = OneHot(vs, 64)
+    val ra_vt  = OneHot(vt, 64)
+    val ra_vu  = OneHot(vu, 64)
+    val ra_vx  = OneHot(vx, 64)
+    val ra_vy  = OneHot(vy, 64)
+    val ra_vz  = OneHot(vz, 64)
+    val ra_vxm = MuxOR(m, ToM(OneHot(vx(5,2), 16)))
+    val ra_vym = MuxOR(m, ToM(OneHot(vy(5,2), 16)))
+    val ra_vzm = MuxOR(m, ToM(OneHot(vz(5,2), 16)))
+    assert(ra_vs.getWidth == 64)
+    assert(ra_vt.getWidth == 64)
+    assert(ra_vu.getWidth == 64)
+    assert(ra_vx.getWidth == 64)
+    assert(ra_vy.getWidth == 64)
+    assert(ra_vz.getWidth == 64)
+    assert(ra_vxm.getWidth == 64)
+    assert(ra_vym.getWidth == 64)
+    assert(ra_vzm.getWidth == 64)
+
+    val ractive = ra_vs | ra_vt | ra_vu | ra_vx | ra_vy | ra_vz | ra_vxm | ra_vym | ra_vzm
+    assert(ractive.getWidth == 64)
+
+    (vs, vt, vu, vx, vy, vz, ractive)
+  }
+
+  def SlideRead(): (UInt, UInt, UInt, UInt, UInt, UInt, UInt) = {
+    val s = func2(3)  // next(0) previous(1)
+    val vs = Mux(s, vsbits + 3.U, vsbits + 0.U)
+    val vt = Mux(s, vtbits + 0.U, vsbits + 1.U)
+    val vu = Mux(s, vtbits + 1.U, vsbits + 2.U)
+    val vx = Mux(s, vtbits + 1.U, vsbits + 2.U)
+    val vy = Mux(s, vtbits + 2.U, vsbits + 3.U)
+    val vz = Mux(s, vtbits + 3.U, vtbits + 0.U)
+    assert(vs.getWidth == 6)
+    assert(vt.getWidth == 6)
+    assert(vu.getWidth == 6)
+    assert(vx.getWidth == 6)
+    assert(vy.getWidth == 6)
+    assert(vz.getWidth == 6)
+
+    val ra_vs  =                 OneHot(vs, 64)
+    val ra_vt  = MuxOR(!x || !s, OneHot(vt, 64))
+    val ra_vu  = MuxOR(!x || !s, OneHot(vu, 64))
+    val ra_vx  = MuxOR(!x || !s, OneHot(vx, 64))
+    val ra_vy  = MuxOR(!x || !s, OneHot(vy, 64))
+    val ra_vz  = MuxOR(!x,       OneHot(vz, 64))
+    assert(ra_vs.getWidth == 64)
+    assert(ra_vt.getWidth == 64)
+    assert(ra_vu.getWidth == 64)
+    assert(ra_vx.getWidth == 64)
+    assert(ra_vy.getWidth == 64)
+    assert(ra_vz.getWidth == 64)
+
+    val ractive = ra_vs | ra_vt | ra_vu | ra_vx | ra_vy | ra_vz
+    assert(ractive.getWidth == 64)
+
+    (vs, vt, vu, vx, vy, vz, ractive)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Decode the instruction bits.
+
+  // Duplicate
+  val vdup = DecodeBits(inst, "01000x_0xxxxx_000000_xx_xxxxxx_x_111_11") && sz < 3.U
+  val vdupf2 = inst(31,27) === 8.U  // used to prevent vdup and vldst op collision only
+
+  // Load/Store
+  val vldstdec = DecodeBits(inst, "xxxxxx_0xxxxx_xxxxx0_xx_xxxxxx_x_111_11") && sz < 3.U && !vdupf2
+  assert(!(vdup && vldstdec))
+
+  val vld  = vldstdec && (func2 === 0.U || func2 === 1.U || func2 === 2.U ||
+                          func2 === 4.U || func2 === 5.U || func2 === 6.U ||
+                          func2 === 7.U)
+
+  val vst  = vldstdec && (func2 === 8.U || func2 === 9.U || func2 === 10.U ||
+                          func2 === 12.U || func2 === 13.U || func2 === 14.U ||
+                          func2 === 15.U)
+
+  val vstq = vldstdec && (func2 === 26.U || func2 === 30.U)
+
+  val vldst = vld || vst || vstq
+
+  // Format0
+  val vadd  = DecodeFmt(0, dec.vadd)
+  val vsub  = DecodeFmt(0, dec.vsub)
+  val vrsub = DecodeFmt(0, dec.vrsub)
+  val veq   = DecodeFmt(0, dec.veq)
+  val vne   = DecodeFmt(0, dec.vne)
+  val vlt   = DecodeFmt(0, dec.vlt, 1)
+  val vle   = DecodeFmt(0, dec.vle, 1)
+  val vgt   = DecodeFmt(0, dec.vgt, 1)
+  val vge   = DecodeFmt(0, dec.vge, 1)
+  val vabsd = DecodeFmt(0, dec.vabsd, 1)
+  val vmax  = DecodeFmt(0, dec.vmax, 1)
+  val vmin  = DecodeFmt(0, dec.vmin, 1)
+  val vadd3 = DecodeFmt(0, dec.vadd3)
+
+  val vfmt0 = vadd || vsub || vrsub || veq || vne || vlt || vle || vgt || vge || vabsd || vmax || vmin || vadd3
+
+  // Format1
+  val vand  = DecodeFmt(1, dec.vand)
+  val vor   = DecodeFmt(1, dec.vor)
+  val vxor  = DecodeFmt(1, dec.vxor)
+  val vnot  = DecodeFmt(1, dec.vnot)
+  val vrev  = DecodeFmt(1, dec.vrev)
+  val vror  = DecodeFmt(1, dec.vror)
+  val vclb  = DecodeFmt(1, dec.vclb)
+  val vclz  = DecodeFmt(1, dec.vclz)
+  val vcpop = DecodeFmt(1, dec.vcpop)
+  val vmv   = DecodeFmt(1, dec.vmv) && !quad
+  val vmv2  = DecodeFmt(1, dec.vmv) &&  quad
+  val vmvp  = DecodeFmt(1, dec.vmvp)
+
+  val vfmt1 = vand || vor || vxor || vnot || vrev || vror || vclb || vclz || vcpop || vmv || vmv2 || vmvp
+
+  // Do not include in 'vfmt1'.
+  val acset   = DecodeFmt(1, dec.acset) && x && !m && vtbits === 0.U
+  val actr    = DecodeFmt(1, dec.actr)  && x && !m && vtbits === 0.U
+  val adwinit = DecodeFmt(1, dec.adwinit)
+
+  // Format2
+  val vsll   = DecodeFmt(2, dec.vsll)
+  val vsra   = DecodeFmt(2, dec.vsra)
+  val vsrl   = DecodeFmt(2, dec.vsrl)
+  val vsha   = DecodeFmt(2, dec.vsha, 2)
+  val vshl   = DecodeFmt(2, dec.vshl, 2)
+  val vsrans = DecodeFmt(2, dec.vsrans, 3)
+  val vsraqs = DecodeFmt(2, dec.vsraqs, 3)
+
+  val vfmt2 = vsll || vsra || vsrl || vsha || vshl || vsrans || vsraqs
+
+  // Format3
+  val vmul    = DecodeFmt(3, dec.vmul) && !quad
+  val vmul2   = DecodeFmt(3, dec.vmul) &&  quad
+  val vmuls   = DecodeFmt(3, dec.vmuls, 1) && !quad
+  val vmuls2  = DecodeFmt(3, dec.vmuls, 1) &&  quad
+  val vmulh   = DecodeFmt(3, dec.vmulh, 2) && !quad
+  val vmulh2  = DecodeFmt(3, dec.vmulh, 2) &&  quad
+  val vmulhu  = DecodeFmt(3, dec.vmulhu, 2) && !quad
+  val vmulhu2 = DecodeFmt(3, dec.vmulhu, 2) &&  quad
+  val vdmulh  = DecodeFmt(3, dec.vdmulh, 3) && !quad
+  val vdmulh2 = DecodeFmt(3, dec.vdmulh, 3) &&  quad
+  val vmulw   = DecodeFmt(3, dec.vmulw, 1)
+  val vmacc   = DecodeFmt(3, dec.vmacc)
+  val vmadd   = DecodeFmt(3, dec.vmadd)
+
+  val vfmt3 = vmul || vmul2 || vmuls || vmuls2 || vmulh || vmulh2 || vmulhu || vmulhu2 || vdmulh || vdmulh2 || vmulw || vmacc || vmadd
+
+  // Format4
+  val vadds  = DecodeFmt(4, dec.vadds, 1)
+  val vsubs  = DecodeFmt(4, dec.vsubs, 1)
+  val vaddw  = DecodeFmt(4, dec.vaddw, 1)
+  val vsubw  = DecodeFmt(4, dec.vsubw, 1)
+  val vacc   = DecodeFmt(4, dec.vacc, 1)
+  val vpadd  = DecodeFmt(4, dec.vpadd, 1)
+  val vpsub  = DecodeFmt(4, dec.vpsub, 1)
+  val vhadd  = DecodeFmt(4, dec.vhadd, 3)
+  val vhsub  = DecodeFmt(4, dec.vhsub, 3)
+
+  val vfmt4 = vadds || vsubs || vaddw || vsubw || vacc || vpadd || vpsub || vhadd || vhsub
+
+  // Format6
+  val vslidevn  = DecodeFmt(6, dec.vslidevn, 3)
+  val vslidehn  = DecodeFmt(6, dec.vslidehn, 3) && !m
+  val vslidehn2 = DecodeFmt(6, dec.vslidehn, 3) && m
+  val vslidevp  = DecodeFmt(6, dec.vslidevp, 3)
+  val vslidehp  = DecodeFmt(6, dec.vslidehp, 3) && !m
+  val vslidehp2 = DecodeFmt(6, dec.vslidehp, 3) && m
+  val vsel      = DecodeFmt(6, dec.vsel)
+  val vevn      = DecodeFmt(6, dec.vevn)
+  val vodd      = DecodeFmt(6, dec.vodd)
+  val vevnodd   = DecodeFmt(6, dec.vevnodd)
+  val vzip      = DecodeFmt(6, dec.vzip)
+
+  val vslideh2 = vslidehn2 || vslidehp2
+  val vevn3 = vevn || vevnodd || vodd
+
+  val vfmt6 = vslidevn | vslidehn | vslidehn2 | vslidevp | vslidehp | vslidehp2 | vsel | vevn | vodd | vevnodd | vzip
+
+  // FormatVVV
+  val aconv   = DecodeBits(inst, "xxxxxx_1xxxxx_xxxxxx_10_xxxxxx_0_00_101")
+  val vcget   = DecodeBits(inst, "010100_000000_000000_xx_xxxxxx_x_111_11")
+
+  val vdwconv = DecodeBits(inst, "xxxxxx_0xxxxx_xxxxxx_10_xxxxxx_x_10_101")
+  val adwconv = DecodeBits(inst, "xxxxxx_1xxxxx_xxxxxx_10_xxxxxx_x_10_101")
+  val vadwconv = vdwconv || adwconv
+
+  // Undef
+  val vopbits = Cat(
+    // Duplicate
+    vdup,
+    // Load/Store
+    vld, vst, vstq,
+    // Misc
+    vcget,
+    // Format0
+    vadd, vsub, vrsub, veq, vne, vlt, vle, vgt, vge, vabsd, vmax, vmin, vadd3,
+    // Format1
+    vand, vor, vxor, vnot, vrev, vror, vclb, vclz, vcpop, vmv, vmv2, vmvp, acset, actr, adwinit,
+    // Format2
+    vsll, vsra, vsrl, vsha, vshl, vsrans, vsraqs,
+    // Format3
+    vmul, vmul2, vmuls, vmuls2, vmulh, vmulh2, vmulhu, vmulhu2, vdmulh, vdmulh2, vmulw, vmacc, vmadd,
+    // Format4
+    vadds, vsubs, vaddw, vsubw, vacc, vpadd, vpsub, vhadd, vhsub,
+    // Format6
+    vslidevn, vslidehn, vslidehn2, vslidevp, vslidehp, vslidehp2, vsel, vevn, vodd, vevnodd, vzip,
+    // FormatVVV
+    aconv, vdwconv, adwconv)
+
+  val undef = !WiredOR(vopbits)
+  assert(PopCount(Cat(vopbits, undef)) === 1.U)
+
+  // Encode the opcode.
+  val op =
+      // Duplicate
+      MuxOR(vdup, enc.vdup.U) |
+      // Load/Store
+      MuxOR(vld,  enc.vld.U) |
+      MuxOR(vst,  enc.vst.U) |
+      MuxOR(vstq, enc.vstq.U) |
+      // Misc
+      MuxOR(vcget, enc.vcget.U) |
+      // Format0
+      MuxOR(vadd,  enc.vadd.U) |
+      MuxOR(vsub,  enc.vsub.U) |
+      MuxOR(vrsub, enc.vrsub.U) |
+      MuxOR(veq,   enc.veq.U) |
+      MuxOR(vne,   enc.vne.U) |
+      MuxOR(vlt,   enc.vlt.U) |
+      MuxOR(vle,   enc.vle.U) |
+      MuxOR(vgt,   enc.vgt.U) |
+      MuxOR(vge,   enc.vge.U) |
+      MuxOR(vabsd, enc.vabsd.U) |
+      MuxOR(vmax,  enc.vmax.U) |
+      MuxOR(vmin,  enc.vmin.U) |
+      MuxOR(vadd3, enc.vadd3.U) |
+      // Format1
+      MuxOR(vand,  enc.vand.U) |
+      MuxOR(vor,   enc.vor.U) |
+      MuxOR(vxor,  enc.vxor.U) |
+      MuxOR(vnot,  enc.vnot.U) |
+      MuxOR(vrev,  enc.vrev.U) |
+      MuxOR(vror,  enc.vror.U) |
+      MuxOR(vclb,  enc.vclb.U) |
+      MuxOR(vclz,  enc.vclz.U) |
+      MuxOR(vcpop, enc.vcpop.U) |
+      MuxOR(vmv,   enc.vmv.U) |
+      MuxOR(vmv2,  enc.vmv2.U) |
+      MuxOR(vmvp,  enc.vmvp.U) |
+      MuxOR(acset, enc.acset.U) |
+      MuxOR(actr,  enc.actr.U) |
+      MuxOR(adwinit, enc.adwinit.U) |
+      // Format2
+      MuxOR(vsll,   enc.vshl.U) |
+      MuxOR(vsra,   enc.vshr.U) |
+      MuxOR(vsrl,   enc.vshr.U) |
+      MuxOR(vsha,   enc.vshf.U) |
+      MuxOR(vshl,   enc.vshf.U) |
+      MuxOR(vsrans, enc.vsrans.U) |
+      MuxOR(vsraqs, enc.vsraqs.U) |
+      // Format3
+      MuxOR(vmul,    enc.vmul.U) |
+      MuxOR(vmul2,   enc.vmul2.U) |
+      MuxOR(vmuls,   enc.vmuls.U) |
+      MuxOR(vmuls2,  enc.vmuls2.U) |
+      MuxOR(vmulh,   enc.vmulh.U) |
+      MuxOR(vmulh2,  enc.vmulh2.U) |
+      MuxOR(vmulhu,  enc.vmulh.U) |
+      MuxOR(vmulhu2, enc.vmulh2.U) |
+      MuxOR(vdmulh,  enc.vdmulh.U) |
+      MuxOR(vdmulh2, enc.vdmulh2.U) |
+      MuxOR(vmulw,   enc.vmulw.U) |
+      MuxOR(vmacc,   enc.vmadd.U) |
+      MuxOR(vmadd,   enc.vmadd.U) |
+      // Format4
+      MuxOR(vadds,  enc.vadds.U) |
+      MuxOR(vsubs,  enc.vsubs.U) |
+      MuxOR(vaddw,  enc.vaddw.U) |
+      MuxOR(vsubw,  enc.vsubw.U) |
+      MuxOR(vacc,   enc.vacc.U) |
+      MuxOR(vpadd,  enc.vpadd.U) |
+      MuxOR(vpsub,  enc.vpsub.U) |
+      MuxOR(vhadd,  enc.vhadd.U) |
+      MuxOR(vhsub,  enc.vhsub.U) |
+      // Format6
+      MuxOR(vslidevn,  enc.vslidevn.U) |
+      MuxOR(vslidehn,  enc.vslidehn.U) |
+      MuxOR(vslidehn2, enc.vslidehn2.U) |
+      MuxOR(vslidevp,  enc.vslidevp.U) |
+      MuxOR(vslidehp,  enc.vslidehp.U) |
+      MuxOR(vslidehp2, enc.vslidehp2.U) |
+      MuxOR(vsel,     enc.vsel.U) |
+      MuxOR(vevn,     enc.vevn.U) |
+      MuxOR(vodd,     enc.vodd.U) |
+      MuxOR(vevnodd,  enc.vevnodd.U) |
+      MuxOR(vzip,     enc.vzip.U) |
+      // FormatVVV
+      MuxOR(aconv,    enc.aconv.U) |
+      MuxOR(vdwconv,  enc.vdwconv.U) |
+      MuxOR(adwconv,  enc.adwconv.U)
+
+  // Scalar.
+  def ScalarData(sz: UInt, data: UInt): UInt = {
+    assert(sz.getWidth == 2)
+    assert(data.getWidth == 32)
+    MuxOR(sz === 0.U, Cat(data(7,0), data(7,0), data(7,0), data(7,0))) |
+          MuxOR(sz === 1.U, Cat(data(15,0), data(15,0))) |
+          MuxOR(sz === 2.U, data(31,0))
+  }
+
+  // Depthwise read.
+  val (vsdw, vtdw, vudw, vxdw, vydw, vzdw, ractivedw) = DepthwiseRead()
+
+  val ractivedi = ToM(OneHot(vsbits(5,2), 16))
+  val wactivedw = ToM(OneHot(vdbits(5,2), 16))
+
+  // Slide composite read.
+  val (vssl, vtsl, vusl, vxsl, vysl, vzsl, ractivesl) = SlideRead()
+
+  // Convolution read/write.
+  val ractiveconv1 = Wire(UInt(64.W))
+  val ractiveconv2 = Wire(UInt(64.W))
+  val ractiveaset  = Wire(UInt(64.W))
+  val wactiveconv  = Wire(UInt(64.W))
+
+  // Narrow reads (vs) are aligned to a 16-register base (v0, v16, v32, v48).
+  // Wide reads (vu) are aligned to the SIMD width (4, 8, 16); this assumes the
+  // scalar control field does not access beyond these bounds.
+  if (p.vectorBits == 128) {
+    ractiveconv1 := 0x000f.U << Cat(vsbits(5,4), 0.U(4.W))
+    ractiveconv2 := 0x000f.U << Cat(vubits(5,2), 0.U(2.W))
+    ractiveaset  := 0x000f.U << Cat(vsbits(5,2), 0.U(2.W))
+    wactiveconv  := 0x000f.U << Cat(vdbits(5,4), 0.U(4.W))
+  } else if (p.vectorBits == 256) {
+    ractiveconv1 := 0x00ff.U << Cat(vsbits(5,4), 0.U(4.W))
+    ractiveconv2 := 0x00ff.U << Cat(vubits(5,3), 0.U(3.W))
+    ractiveaset  := 0x00ff.U << Cat(vsbits(5,3), 0.U(3.W))
+    wactiveconv  := 0x00ff.U << Cat(vdbits(5,4), 0.U(4.W))
+  } else if (p.vectorBits == 512) {
+    ractiveconv1 := 0xffff.U << Cat(vsbits(5,4), 0.U(4.W))
+    ractiveconv2 := 0xffff.U << Cat(vubits(5,4), 0.U(4.W))
+    ractiveaset  := 0xffff.U << Cat(vsbits(5,4), 0.U(4.W))
+    wactiveconv  := 0xffff.U << Cat(vdbits(5,4), 0.U(4.W))
+  } else {
+    assert(false)
+  }
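+
+  // For example (a sketch), with p.vectorBits == 256 and vs = v20:
+  // Cat(vsbits(5,4), 0.U(4.W)) = 16, so ractiveconv1 marks v16..v23 as read.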
+
+  // Outputs.
+  io.undef := undef
+
+  io.out.op := op
+  io.out.f2 := func2(2,0)
+  io.out.sz := Cat(sz === 2.U, sz === 1.U, sz === 0.U)
+  io.out.m  := m && !vdmulh2 && !vmul2 && !vmulh2 && !vmulhu2 && !vmuls2 && !vmv2 && !vslidehn2 && !vslidehp2
+  io.out.cmdsync := adwinit || vadwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vslideh2 || vsraqs
+
+  io.out.vd.valid := vdwconv || vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 || vfmt6 || vld || vdup || vcget
+  io.out.ve.valid := vdwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vacc || vmv2 || vmvp || vmulw || vaddw || vsubw || vevnodd || vslideh2 || vzip
+  io.out.vf.valid := vdwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vslideh2
+  io.out.vg.valid := vdwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vslideh2
+  io.out.vs.valid := vadwconv || adwinit || vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 || vfmt6 || vst || vstq || aconv
+  io.out.vt.valid := vadwconv || adwinit || !x && (vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 || vfmt6)
+  io.out.vu.valid := vadwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vacc || vadd3 || vmacc || vmadd || aconv || vsrans || vsraqs || vsel || vslideh2 || m && vevn3
+  io.out.vx.valid := vadwconv || adwinit || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vslideh2 || vsraqs
+  io.out.vy.valid := vadwconv || adwinit || vslideh2 || !x && (vsraqs)
+  io.out.vz.valid := vadwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vslideh2 || vsraqs
+  io.out.sv.valid := x && (vdup || vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 || vfmt6)
+
+  io.out.vd.addr := vdbits
+  io.out.ve.addr := Mux(vodd, vdbits,
+                    Mux(vadwconv || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2 || vslideh2 || vzip, vdbits + 1.U,
+                    Mux(m, vdbits + 4.U, vdbits + 1.U)))
+  io.out.vf.addr := vdbits + 2.U
+  io.out.vg.addr := vdbits + 3.U
+  io.out.vs.addr := Mux(vadwconv, vsdw,
+                    Mux(vslideh2, vssl,
+                    Mux(vmadd || vst || vstq, vdbits,
+                      vsbits)))
+  io.out.vt.addr := Mux(vadwconv, vtdw,
+                    Mux(adwinit, vsbits + 1.U,
+                    Mux(vslideh2, vtsl,
+                    Mux(m && vevn3, vsbits + 1.U,
+                      vtbits))))
+  io.out.vu.addr := Mux(vadwconv, vudw,
+                    Mux(vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2, vsbits + 1.U,
+                    Mux(vslideh2, vusl,
+                    Mux(vacc || vsrans, Mux(m, vsbits + 4.U, vsbits + 1.U),
+                    Mux(vsraqs, Mux(m, vsbits + 4.U, vsbits + 1.U),
+                    Mux(vmacc || vadd3 || vsel, vdbits,
+                    Mux(vmadd, vsbits,
+                    Mux(vevn3, vtbits,
+                      vubits))))))))
+  io.out.vx.addr := Mux(vadwconv, vxdw,
+                    Mux(adwinit || vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2, vsbits + 2.U,
+                    Mux(vsraqs, Mux(m, vsbits + 8.U, vsbits + 2.U),
+                      vxsl)))
+  io.out.vy.addr := Mux(vadwconv, vydw,
+                    Mux(adwinit, vsbits + 3.U,
+                    Mux(vsraqs, vtbits,
+                      vysl)))
+  io.out.vz.addr := Mux(vadwconv, vzdw,
+                    Mux(vdmulh2 || vmul2 || vmulh2 || vmulhu2 || vmuls2 || vmv2, vsbits + 3.U,
+                    Mux(vsraqs, Mux(m, vsbits + 12.U, vsbits + 3.U),
+                      vzsl)))
+
+  io.out.vs.tag := 0.U
+  io.out.vt.tag := 0.U
+  io.out.vu.tag := 0.U
+  io.out.vx.tag := 0.U
+  io.out.vy.tag := 0.U
+  io.out.vz.tag := 0.U
+
+  io.out.sv.addr := addr
+  io.out.sv.data := Mux(vldstdec, data,
+                    Mux(vaddw || vmulw || vsubw, ScalarData(sz - 1.U, data),
+                      ScalarData(sz, data)))
+
+  assert(PopCount(io.out.sz) <= 1.U)
+  assert(!(io.out.vx.valid && !io.out.cmdsync))
+  assert(!(io.out.vy.valid && !io.out.cmdsync))
+  assert(!(io.out.vz.valid && !io.out.cmdsync))
+
+  io.cmdq.alu  := vdup || vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 || vfmt6 || vadwconv || adwinit
+  io.cmdq.conv := aconv || vcget || acset || actr
+  io.cmdq.ldst := vldst && !uncached
+  io.cmdq.ld := vld && uncached
+  io.cmdq.st := (vst || vstq) && uncached
+
+  val cmdqchk = Cat(io.undef, io.cmdq.alu, io.cmdq.conv, io.cmdq.ldst, io.cmdq.ld, io.cmdq.st)
+  assert(PopCount(cmdqchk) === 1.U)
+
+  io.actv.ractive :=
+    MuxOR(vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 ||
+          vfmt6 && !vslideh2,                                  RActiveVsVt(2)) |
+    MuxOR(vsraqs || vsrans,                                      RActiveVs1()) |
+    MuxOR(vsraqs,                                                RActiveVs2()) |
+    MuxOR(vsraqs,                                                RActiveVs3()) |
+    MuxOR(vmacc || vmadd || vst || vstq,                          RActiveVd()) |
+    MuxOR(vadwconv,                                                 ractivedw) |
+    MuxOR(adwinit,                                                  ractivedi) |
+    MuxOR(vslideh2,                                                 ractivesl) |
+    MuxOR(aconv || actr,                                         ractiveconv1) |
+    MuxOR(aconv,                                                 ractiveconv2) |
+    MuxOR(acset,                                                  ractiveaset)
+
+  io.actv.wactive :=
+    MuxOR(vfmt0 || vfmt1 || vfmt2 || vfmt3 || vfmt4 || vfmt6 ||
+          vdup || vld,                                            WActiveVd()) |
+    MuxOR(vmvp || vmulw || vacc || vaddw || vsubw || vevnodd || vzip,
+                                                                 WActiveVd1()) |
+    MuxOR(vdwconv,                                                  wactivedw) |
+    MuxOR(vcget,                                                    wactiveconv)
+}
+
+object EmitVDecodeInstruction extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VDecodeInstruction(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VDecodeOp.scala b/hdl/chisel/src/kelvin/vector/VDecodeOp.scala
new file mode 100644
index 0000000..553af1b
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VDecodeOp.scala
@@ -0,0 +1,84 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+case class VDecodeOp() {
+  // Format0
+  val vadd = 0
+  val vsub = 1
+  val vrsub = 2
+  val veq = 6
+  val vne = 7
+  val vlt = 8
+  val vle = 10
+  val vgt = 12
+  val vge = 14
+  val vabsd = 16
+  val vmax = 18
+  val vmin = 20
+  val vadd3 = 24
+
+  // Format1
+  val vand = 0
+  val vor = 1
+  val vxor = 2
+  val vnot = 3
+  val vrev = 4
+  val vror = 5
+  val vclb = 8
+  val vclz = 9
+  val vcpop = 10
+  val vmv = 12
+  val vmvp = 13
+  val acset = 16
+  val actr = 17
+  val adwinit = 18
+
+  // Format2
+  val vsll = 1
+  val vsra = 2
+  val vsrl = 3
+  val vsha = 8
+  val vshl = 9
+  val vsrans = 16
+  val vsraqs = 24
+
+  // Format3
+  val vmul = 0
+  val vmuls = 2
+  val vmulw = 4
+  val vmulh = 8
+  val vmulhu = 9
+  val vdmulh = 16
+  val vmacc = 20
+  val vmadd = 21
+
+  // Format4
+  val vadds = 0
+  val vsubs = 2
+  val vaddw = 4
+  val vsubw = 6
+  val vacc = 10
+  val vpadd = 12
+  val vpsub = 14
+  val vhadd = 16
+  val vhsub = 20
+
+  // Format6
+  val vsliden = 0
+  val vslidevn = 0
+  val vslidehn = 4
+  val vslidep = 8
+  val vslidevp = 8
+  val vslidehp = 12
+  val vsel = 16
+  val vevn = 24
+  val vodd = 25
+  val vevnodd = 26
+  val vzip = 28
+
+  // FormatVVV
+  val aconv = 8
+  val vdwconv = 10
+}
diff --git a/hdl/chisel/src/kelvin/vector/VDot.scala b/hdl/chisel/src/kelvin/vector/VDot.scala
new file mode 100644
index 0000000..50d978e
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VDot.scala
@@ -0,0 +1,174 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+object VDot {
+  // Conv2D
+  def apply(en: Bool, adata: UInt, bdata: UInt,
+      abias: UInt, bbias: UInt, asign: Bool, bsign: Bool): UInt = {
+    assert(abias.getWidth == 9)
+    assert(bbias.getWidth == 9)
+    assert(adata.getWidth == 32)
+    assert(bdata.getWidth == 32)
+
+    val mul = Wire(Vec(4, SInt(20.W)))
+
+    // input clamps
+    val adatac = MuxOR(en, adata)
+    val bdatac = MuxOR(en, bdata)
+    val abiasc = MuxOR(en, abias)
+    val bbiasc = MuxOR(en, bbias)
+
+    for (i <- 0 until 4) {
+      val as = adatac(8 * i + 7) & asign
+      val bs = bdatac(8 * i + 7) & bsign
+      val aval = Cat(as, adatac(8 * i + 7, 8 * i)).asSInt +& abiasc.asSInt
+      val bval = Cat(bs, bdatac(8 * i + 7, 8 * i)).asSInt +& bbiasc.asSInt
+      val mval = aval * bval
+      mul(i) := mval
+
+      assert(aval.getWidth == 10)
+      assert(bval.getWidth == 10)
+      assert(mval.getWidth == 20)
+    }
+
+    val dotp = (mul(0) +& mul(1)) +& (mul(2) +& mul(3))
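+    // Sign-extend the 22-bit sum to 32 bits by replicating the sign bit dotp(21)
+    // into the upper 10 bits.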
+    val sdotp = Cat(MuxOR(dotp(21), ~0.U(10.W)), dotp)
+
+    assert(dotp.getWidth == 22)
+    assert(sdotp.getWidth == 32)
+
+    sdotp
+  }
+
+  // Depthwise
+  def apply(alu: Int, en: Bool, adata: Vec[UInt], bdata: Vec[UInt],
+      scalar: UInt): (UInt, UInt) = {
+    assert(adata.length == 3)
+    assert(bdata.length == 3)
+    assert(scalar.getWidth == 32)
+    val sparse = scalar(3,2)
+    val abias = scalar(20,12)
+    val asign = scalar(21)
+    val bbias = scalar(30,22)
+    val bsign = scalar(31)
+
+    val sparse0 = sparse === 0.U
+    val sparse1 = sparse === 1.U
+    val sparse2 = sparse === 2.U
+
+    val w = adata(0).getWidth
+    val cnt = w / 32
+    val dout0 = Wire(Vec(cnt, UInt(32.W)))
+    val dout1 = Wire(Vec(cnt, UInt(32.W)))
+
+    // Input clamps and dense/sparse swizzle.
+    val adatac = Wire(Vec(3, Vec(cnt, UInt(32.W))))
+    val bdatac = Wire(Vec(3, Vec(cnt, UInt(32.W))))
+
+    val abiasc = MuxOR(en, abias)
+    val bbiasc = MuxOR(en, bbias)
+
+    // Sparse 1 [n-1,n,n+1].
+    val adata1 = Wire(Vec(cnt + 2, UInt(32.W)))
+    if (true) {
+      val lsb = (cnt - 1) * 32
+      val msb = lsb + 32 - 1
+      adata1(0) := MuxOR(en && sparse1, adata(0)(msb,lsb))
+    }
+    for (i <- 0 until cnt) {
+      val lsb = i * 32
+      val msb = lsb + 32 - 1
+      adata1(i + 1) := MuxOR(en && sparse1, adata(1)(msb,lsb))
+    }
+    if (true) {
+      val lsb = 0
+      val msb = 31
+      adata1(cnt + 1) := MuxOR(en && sparse1, adata(2)(msb,lsb))
+    }
+
+    // Sparse 2 [n,n+1,n+2].
+    val adata2 = Wire(Vec(cnt + 2, UInt(32.W)))
+    for (i <- 0 until cnt) {
+      val lsb = i * 32
+      val msb = lsb + 32 - 1
+      adata2(i) := MuxOR(en && sparse2, adata(0)(msb,lsb))
+    }
+    for (i <- 0 until 2) {
+      val lsb = i * 32
+      val msb = lsb + 32 - 1
+      adata2(cnt + i) := MuxOR(en && sparse2, adata(1)(msb,lsb))
+    }
+
+    // vdot(a,b) for sparse[0,1,2].
+    for (j <- 0 until 3) {
+      for (i <- 0 until cnt) {
+        val lsb = i * 32
+        val msb = lsb + 32 - 1
+        val k = i + j
+
+        val adata0 = MuxOR(en && sparse0, adata(j)(msb,lsb))
+
+        adatac(j)(i) := adata0 | adata1(k) | adata2(k)
+        bdatac(j)(i) := MuxOR(en, bdata(j)(msb,lsb))
+      }
+    }
+
+    for (i <- 0 until cnt) {
+      val ad = VecInit(adatac(0)(i), adatac(1)(i), adatac(2)(i))
+      val bd = VecInit(bdatac(0)(i), bdatac(1)(i), bdatac(2)(i))
+      val (o0, o1) = dwlane(alu, en, ad, bd, abiasc, bbiasc, asign, bsign)
+      dout0(i) := o0
+      dout1(i) := o1
+    }
+
+    val out0 = dout0.asUInt
+    val out1 = dout1.asUInt
+    assert(out0.getWidth == w)
+    assert(out1.getWidth == w)
+    (out0, out1)
+  }
+
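+  // One 32-bit lane: two 3-tap dot products, one per output, over the byte
+  // positions selected by 'alu'.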
+  private def dwlane(alu: Int, en: Bool, adata: Vec[UInt], bdata: Vec[UInt],
+      abias: UInt, bbias: UInt, asign: Bool, bsign: Bool):
+        (UInt, UInt) = {
+    assert(adata.length == 3)
+    assert(bdata.length == 3)
+    assert(abias.getWidth == 9)
+    assert(bbias.getWidth == 9)
+    for (i <- 0 until 3) {
+      assert(adata(i).getWidth == 32)
+      assert(bdata(i).getWidth == 32)
+    }
+
+    val out = Wire(Vec(2, UInt(32.W)))
+
+    for (j <- 0 until 2) {
+      val m = 2 * j + alu  // alu[0]: {0, 2}; alu[1]: {1, 3}
+      val mul = Wire(Vec(3, SInt(20.W)))
+
+      for (i <- 0 until 3) {
+        val as = adata(i)(8 * m + 7) & asign
+        val bs = bdata(i)(8 * m + 7) & bsign
+        val aval = Cat(as, adata(i)(8 * m + 7, 8 * m)).asSInt +& abias.asSInt
+        val bval = Cat(bs, bdata(i)(8 * m + 7, 8 * m)).asSInt +& bbias.asSInt
+        val mval = aval * bval
+        mul(i) := mval
+
+        assert(aval.getWidth == 10)
+        assert(bval.getWidth == 10)
+        assert(mval.getWidth == 20)
+      }
+
+      val dotp = (mul(0) +& mul(1)) +& mul(2)
+      val sdotp = Cat(MuxOR(dotp(21), ~0.U(10.W)), dotp)
+      assert(dotp.getWidth == 22)
+      assert(sdotp.getWidth == 32)
+
+      out(j) := sdotp
+    }
+
+    (out(0), out(1))
+  }
+}
diff --git a/hdl/chisel/src/kelvin/vector/VEncodeOp.scala b/hdl/chisel/src/kelvin/vector/VEncodeOp.scala
new file mode 100644
index 0000000..754dd69
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VEncodeOp.scala
@@ -0,0 +1,104 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+// The opcode list maintains unique IDs even for ops not populated in the command queue.
+case class VEncodeOp() {
+  val undef       = 0
+
+  // Duplicate
+  val vdup        = 1
+
+  // Load/Store
+  val vld         = 2
+  val vst         = 3
+  val vstq        = 4
+
+  // Misc
+  val vcget       = 5
+
+  // Format0
+  val vadd        = 6
+  val vsub        = 7
+  val vrsub       = 8
+  val veq         = 9
+  val vne         = 10
+  val vlt         = 11
+  val vle         = 12
+  val vgt         = 13
+  val vge         = 14
+  val vabsd       = 15
+  val vmax        = 16
+  val vmin        = 17
+  val vadd3       = 18
+
+  // Format1
+  val vand        = 19
+  val vor         = 20
+  val vxor        = 21
+  val vnot        = 22
+  val vrev        = 23
+  val vror        = 24
+  val vclb        = 25
+  val vclz        = 26
+  val vcpop       = 27
+  val vmv         = 28
+  val vmv2        = 29
+  val vmvp        = 30
+  val acset       = 31
+  val actr        = 32
+  val adwinit     = 33
+
+  // Format2
+  val vshl        = 34
+  val vshr        = 35
+  val vshf        = 36
+  val vsrans      = 37
+  val vsraqs      = 38
+
+  // Format3
+  val vmul        = 39
+  val vmul2       = 40
+  val vmuls       = 41
+  val vmuls2      = 42
+  val vmulh       = 43
+  val vmulh2      = 44
+  val vdmulh      = 45
+  val vdmulh2     = 46
+  val vmulw       = 47
+  val vmadd       = 48
+
+  // Format4
+  val vadds       = 49
+  val vsubs       = 50
+  val vaddw       = 51
+  val vsubw       = 52
+  val vacc        = 53
+  val vpadd       = 54
+  val vpsub       = 55
+  val vhadd       = 56
+  val vhsub       = 57
+
+  // Format6
+  val vslidevn    = 58
+  val vslidehn    = 59
+  val vslidehn2   = 60
+  val vslidevp    = 61
+  val vslidehp    = 62
+  val vslidehp2   = 63
+  val vsel        = 64
+  val vevn        = 65
+  val vodd        = 66
+  val vevnodd     = 67
+  val vzip        = 68
+
+  // FormatVVV
+  val aconv       = 69
+  val vdwconv     = 70
+  val adwconv     = 71
+
+  // Entries
+  val entries     = 72
+  val bits = log2Ceil(entries)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VInst.scala b/hdl/chisel/src/kelvin/vector/VInst.scala
new file mode 100644
index 0000000..2da7fe6
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VInst.scala
@@ -0,0 +1,281 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VInst {
+  def apply(p: Parameters): VInst = {
+    Module(new VInst(p))
+  }
+}
+
+case class VInstOp() {
+  val GETVL = 0
+  val GETMAXVL = 1
+  val VLD = 2
+  val VST = 3
+  val VIOP = 4
+  val Entries = 5
+  val Bits = log2Ceil(Entries)
+}
+
+class VInstIO extends Bundle {
+  val valid = Input(Bool())
+  val ready = Output(Bool())
+  val addr = Input(UInt(5.W))
+  val inst = Input(UInt(32.W))
+  val op = Input(UInt(new VInstOp().Entries.W))
+}
+
+class VectorInstructionIO extends Bundle {
+  val valid = Output(Bool())
+  val ready = Input(Bool())
+  val stall = Input(Bool())
+  val lane = Vec(4, Valid(new VectorInstructionLane))
+}
+
+class VectorInstructionLane extends Bundle {
+  val inst = UInt(32.W)
+  val addr = UInt(32.W)
+  val data = UInt(32.W)
+}
+
+class VAddressActive extends Bundle {
+  val entry = Vec(8, new Bundle {
+    val valid = Output(Bool())
+    val store = Output(Bool())
+    val addr  = Output(UInt(32.W))
+  })
+}
+
+class VInst(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Decode cycle.
+    val in = Vec(4, new VInstIO)
+
+    // Execute cycle.
+    val rs = Vec(8, Flipped(new RegfileReadDataIO))
+    val rd = Vec(4, Flipped(new RegfileWriteDataIO))
+
+    // Vector interface.
+    val out = new VectorInstructionIO
+
+    // Status.
+    val nempty = Output(Bool())
+  })
+
+  val vinst = new VInstOp()
+
+  val maxvlb  = (p.vectorBits / 8).U(p.vectorCountBits.W)
+  val maxvlh  = (p.vectorBits / 16).U(p.vectorCountBits.W)
+  val maxvlw  = (p.vectorBits / 32).U(p.vectorCountBits.W)
+  val maxvlbm = (p.vectorBits * 4 / 8).U(p.vectorCountBits.W)
+  val maxvlhm = (p.vectorBits * 4 / 16).U(p.vectorCountBits.W)
+  val maxvlwm = (p.vectorBits * 4 / 32).U(p.vectorCountBits.W)
+  assert(maxvlw >= 4.U)
+
+  val slice = Slice(Vec(4, new Bundle {
+    val vld = Output(Bool())
+    val vst = Output(Bool())
+    val lane = Valid(new VectorInstructionLane)
+  }), true)
+
+  val reqvalid = VecInit(io.in(0).valid && io.in(0).ready,
+                         io.in(1).valid && io.in(1).ready,
+                         io.in(2).valid && io.in(2).ready,
+                         io.in(3).valid && io.in(3).ready)
+
+  val reqaddr = VecInit(io.in(0).inst(19,15),
+                        io.in(1).inst(19,15),
+                        io.in(2).inst(19,15),
+                        io.in(3).inst(19,15))
+
+  // ---------------------------------------------------------------------------
+  // Response to Decode.
+  for (i <- 0 until 4) {
+    io.in(i).ready := !io.out.stall
+  }
+
+  // ---------------------------------------------------------------------------
+  // Controls.
+  val vld_o = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val vld_u = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val vst_o = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val vst_u = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val vst_q = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val getvl = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val getmaxvl = RegInit(VecInit(Seq.fill(4)(false.B)))
+
+  val rdAddr = Reg(Vec(4, UInt(5.W)))
+
+  for (i <- 0 until 4) {
+    when (reqvalid(i)) {
+      rdAddr(i) := io.in(i).addr
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Vector Interface.
+  val vvalid = RegInit(false.B)
+  val vinstValid = Reg(Vec(4, Bool()))
+  val vinstInst = Reg(Vec(4, UInt(32.W)))
+  val nxtVinstValid = Wire(Vec(4, Bool()))
+
+  vvalid := nxtVinstValid.asUInt =/= 0.U
+
+  for (i <- 0 until 4) {
+    nxtVinstValid(i) := reqvalid(i) && (io.in(i).op(vinst.VLD) ||
+                                        io.in(i).op(vinst.VST) ||
+                                        io.in(i).op(vinst.VIOP))
+    vinstValid(i) := nxtVinstValid(i)
+    vinstInst(i) := io.in(i).inst
+  }
+
+  for (i <- 0 until 4) {
+    val p = io.in(i).inst(28)  // func2
+    val q = io.in(i).inst(30)  // func2
+    vld_o(i) := reqvalid(i) && io.in(i).op(vinst.VLD) && !p
+    vld_u(i) := reqvalid(i) && io.in(i).op(vinst.VLD) &&  p
+    vst_o(i) := reqvalid(i) && io.in(i).op(vinst.VST) && !p
+    vst_u(i) := reqvalid(i) && io.in(i).op(vinst.VST) &&  p && !q
+    vst_q(i) := reqvalid(i) && io.in(i).op(vinst.VST) &&  p &&  q
+    getvl(i) := reqvalid(i) && io.in(i).op(vinst.GETVL)
+    getmaxvl(i) := reqvalid(i) && io.in(i).op(vinst.GETMAXVL)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Register write port.
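+  // lsuAdder is the post-increment address (rs1 + mode-dependent offset)
+  // written back for vld/vst forms; getvl/getmaxvl results are computed below.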
+  val lsuAdder = Wire(Vec(4, UInt(32.W)))
+  val getvlValue = Wire(Vec(4, UInt(p.vectorCountBits.W)))  // bytes
+  val getmaxvlValue = Wire(Vec(4, UInt(p.vectorCountBits.W)))  // bytes
+
+  for (i <- 0 until 4) {
+    val rs1 = io.rs(2 * i + 0).data
+    val rs2 = io.rs(2 * i + 1).data
+    val m  = vinstInst(i)(5)
+    val sz = vinstInst(i)(13,12)
+    val sl = vinstInst(i)(27,26)  // func2
+    val q  = vinstInst(i)(30)
+    val count = rs2(31,0)
+    val xs2zero = vinstInst(i)(24,20) === 0.U
+
+    val max = MuxOR(sz === 0.U && !m, maxvlb) |
+              MuxOR(sz === 1.U && !m, maxvlh) |
+              MuxOR(sz === 2.U && !m, maxvlw) |
+              MuxOR(sz === 0.U && m, maxvlbm) |
+              MuxOR(sz === 1.U && m, maxvlhm) |
+              MuxOR(sz === 2.U && m, maxvlwm)
+
+    val cmp = Mux(count < max, count, max)
+
+    val bytes = (MuxOR(sz === 0.U && sl(0), cmp) |
+                 MuxOR(sz === 1.U && sl(0), Cat(cmp, 0.U(1.W))) |
+                 MuxOR(sz === 2.U && sl(0), Cat(cmp, 0.U(2.W))) |
+                 MuxOR(!sl(0) && !m, maxvlb) |
+                 MuxOR(!sl(0) && m, maxvlbm)
+                )(31,0)
+    assert(bytes.getWidth == 32)
+
+    val rt = (MuxOR(sz === 0.U, rs2) |
+              MuxOR(sz === 1.U, Cat(rs2, 0.U(1.W))) |
+              MuxOR(sz === 2.U, Cat(rs2, 0.U(2.W)))
+             )(31,0)
+
+    val rtm = (Cat(rt, 0.U(2.W)))(31,0)
+    val rtq = (Cat(rt, 0.U(4.W)))(31,0)
+
+    val p_x   = sl === 0.U &&  xs2zero
+    val p_xx  = sl === 0.U && !xs2zero
+    val lp_xx = sl === 1.U
+    val sp_xx = sl === 2.U && !q
+    val qp_xx = sl === 2.U &&  q  // vstq.sp
+    val tp_xx = sl === 3.U
+    assert(PopCount(Cat(p_x, p_xx, lp_xx, sp_xx, qp_xx, tp_xx)) <= 1.U)
+
+    val offset = MuxOR(p_x,   Mux(m, maxvlbm, maxvlb)) |
+                 MuxOR(p_xx,  rt) |
+                 MuxOR(lp_xx, bytes) |
+                 MuxOR(sp_xx, Mux(m, rtm, rt)) |
+                 MuxOR(tp_xx, maxvlb) |
+                 MuxOR(qp_xx, Mux(m, rtq, rtm))
+    assert(offset.getWidth == 32)
+
+    lsuAdder(i) := rs1 + offset
+  }
+
+  for (i <- 0 until 4) {
+    val len = Wire(UInt(p.vectorCountBits.W))  // bytes
+    val rs1 = io.rs(2 * i + 0).data
+    val rs2 = io.rs(2 * i + 1).data
+    val getvlsz = vinstInst(i)(26,25)
+    val getvlm  = vinstInst(i)(27)
+    val maxvl = MuxOR(getvlsz === 0.U && !getvlm, maxvlb) |
+                MuxOR(getvlsz === 1.U && !getvlm, maxvlh) |
+                MuxOR(getvlsz === 2.U && !getvlm, maxvlw) |
+                MuxOR(getvlsz === 0.U &&  getvlm, maxvlbm) |
+                MuxOR(getvlsz === 1.U &&  getvlm, maxvlhm) |
+                MuxOR(getvlsz === 2.U &&  getvlm, maxvlwm)
+
+    val rs2nonzero = vinstInst(i)(24,20) =/= 0.U
+
+    when (rs2 < maxvl && rs2 < rs1 && rs2nonzero) {
+      len := rs2
+    } .elsewhen (rs1 < maxvl) {
+      len := rs1
+    } .otherwise {
+      len := maxvl
+    }
+
+    getvlValue(i) := len
+    getmaxvlValue(i) := maxvl
+  }
+
+  for (i <- 0 until 4) {
+    io.rd(i).valid := getvl(i) || getmaxvl(i) || vld_u(i) || vst_u(i) || vst_q(i)
+    io.rd(i).addr := rdAddr(i)
+
+    io.rd(i).data :=
+        MuxOR(getvl(i), getvlValue(i)) |
+        MuxOR(getmaxvl(i), getmaxvlValue(i)) |
+        MuxOR(vld_u(i) || vst_u(i) || vst_q(i), lsuAdder(i))
+  }
+
+  // ---------------------------------------------------------------------------
+  // Vector Extension Opcodes.
+  slice.io.in.valid := vvalid
+  slice.io.out.ready := io.out.ready
+  io.out.valid := slice.io.out.valid
+
+  // An instruction in execute must always succeed.
+  // Back-pressure is resolved by stalling io.in in decode.
+  assert(!(slice.io.in.valid && !slice.io.in.ready))
+
+  for (i <- 0 until 4) {
+    slice.io.in.bits(i).vld := vld_o(i) || vld_u(i)
+    slice.io.in.bits(i).vst := vst_o(i) || vst_u(i) || vst_q(i)
+    slice.io.in.bits(i).lane.valid := vinstValid(i)
+    slice.io.in.bits(i).lane.bits.inst := vinstInst(i)
+    slice.io.in.bits(i).lane.bits.addr := io.rs(2 * i + 0).data
+    slice.io.in.bits(i).lane.bits.data := io.rs(2 * i + 1).data
+  }
+
+  for (i <- 0 until 4) {
+    io.out.lane(i) := slice.io.out.bits(i).lane
+  }
+
+  // Note: slice.io.in.ready is not used in the flow control.
+  // The vector core must drive a stall signal into decode
+  // so that the double-buffered slice never overruns.
+  assert(!(vvalid && !slice.io.in.ready))
+
+  // ---------------------------------------------------------------------------
+  // Status.
+  val nempty = RegInit(false.B)
+
+  // Simple implementation; this status redundantly overlaps downstream units.
+  nempty := io.in(0).valid || io.in(1).valid || io.in(2).valid ||
+            io.in(3).valid || vvalid || io.out.valid
+
+  io.nempty := nempty
+}
diff --git a/hdl/chisel/src/kelvin/vector/VLd.scala b/hdl/chisel/src/kelvin/vector/VLd.scala
new file mode 100644
index 0000000..85a342e
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VLd.scala
@@ -0,0 +1,159 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VLd {
+  def apply(p: Parameters): VLd = {
+    Module(new VLd(p))
+  }
+}
+
+class VLd(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Instructions.
+    val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+
+    // VRegfile.
+    val write = new VRegfileWriteIO(p)
+
+    // Bus.
+    val axi = new AxiMasterReadIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+
+    // Status.
+    val nempty = Output(Bool())
+  })
+
+  // Loads do not zero out-of-size lanes; all ALU lanes will be populated.
+  // Memory may be zeroed beforehand so that one half of the operation is zero.
+  // Writes are masked, so there is no harm to non-zero entries.
+
+  // A usable depth of outstanding commands.
+  val cmdqDepth = 8
+
+  val maxvlb  = (p.vectorBits / 8).U(p.vectorCountBits.W)
+  val maxvlbm = (p.vectorBits * 4 / 8).U(p.vectorCountBits.W)
+
+  val bytes = p.lsuDataBits / 8
+
+  val e = new VEncodeOp()
+
+  // ---------------------------------------------------------------------------
+  // Command Queue.
+  class VLdCmdq extends Bundle {
+    val op = UInt(new VEncodeOp().bits.W)
+    val f2 = UInt(3.W)
+    val sz = UInt(3.W)
+    val addr = UInt(32.W)
+    val offset = UInt(32.W)
+    val remain = UInt(p.vectorCountBits.W)
+    val vd = new VAddr()
+    val last = Bool()
+  }
+
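+  // Fin: convert a decoded vld into an initial command queue entry
+  // (start address, per-step offset, remaining byte count).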
+  def Fin(in: VDecodeBits): VLdCmdq = {
+    val out = Wire(new VLdCmdq)
+    val stride = in.f2(1)
+    val length = in.f2(0)
+    assert(PopCount(in.sz) <= 1.U)
+    assert(!(in.op === e.vld.U  && (!in.vd.valid ||  in.vs.valid)))
+
+    val limit = Mux(in.m, maxvlbm, maxvlb)
+
+    val data = MuxOR(in.sz(0), in.sv.data) |
+               MuxOR(in.sz(1), Cat(in.sv.data, 0.U(1.W))) |
+               MuxOR(in.sz(2), Cat(in.sv.data, 0.U(2.W)))
+
+    val remain0 = maxvlbm
+    val remain1 = Mux(data > limit, limit, data)(p.vectorCountBits - 1, 0)
+    assert(remain0.getWidth == p.vectorCountBits)
+    assert(remain1.getWidth == p.vectorCountBits)
+
+    out.op := in.op
+    out.f2 := in.f2
+    out.sz := in.sz
+    out.addr := in.sv.addr
+    out.offset := Mux(stride, data(31,0), maxvlb)
+    out.remain := Mux(length, remain1, remain0)
+    out.vd := in.vd
+    out.last := !in.m
+
+    out
+  }
+
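+  // Fout: advance the command by one transfer (next destination register,
+  // next address, fewer remaining bytes); also returns whether this step is
+  // the last.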
+  def Fout(in: VLdCmdq, m: Bool, step: UInt, valid: Bool): (VLdCmdq, Bool) = {
+    val msb = log2Ceil(bytes) - 1
+    val addrAlign = in.addr(msb, 0)
+    val offsAlign = in.offset(msb, 0)
+    assert(addrAlign === 0.U)
+    assert(offsAlign === 0.U)
+    assert(!valid || in.op === e.vld.U)
+
+    val out = Wire(new VLdCmdq)
+    val stride = in.f2(1)
+
+    val outlast = !m || step === 2.U  // registered a cycle before 'last' usage
+
+    val last = !m || step === 3.U
+
+    out := in
+
+    out.vd.addr := in.vd.addr + 1.U
+
+    out.addr   := in.addr + in.offset
+    out.remain := Mux(in.remain <= maxvlb, 0.U, in.remain - maxvlb)
+
+    out.last := outlast
+
+    (out, last)
+  }
+
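+  // Factive: loads perform no vector register reads, so nothing is reported
+  // as active.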
+  def Factive(in: VLdCmdq, m: Bool, step: UInt): UInt = {
+    assert(step.getWidth == 5)
+    0.U
+  }
+
+  val q = VCmdq(cmdqDepth, new VLdCmdq, Fin, Fout, Factive)
+
+  q.io.in <> io.in
+
+  // ---------------------------------------------------------------------------
+  // Axi.
+  io.axi.addr.valid := q.io.out.valid
+  io.axi.addr.bits.addr := Cat(0.U(1.W), q.io.out.bits.addr(30,0))
+  io.axi.addr.bits.id := q.io.out.bits.vd.addr
+  assert(!(q.io.out.valid && !q.io.out.bits.addr(31)))
+  assert(!(io.axi.addr.valid && io.axi.addr.bits.addr(31)))
+
+  q.io.out.ready := io.axi.addr.ready
+
+  // ---------------------------------------------------------------------------
+  // Write interface.
+  io.write.valid := io.axi.data.valid
+  io.write.data := io.axi.data.bits.data
+  io.write.addr := io.axi.data.bits.id
+
+  io.axi.data.ready := true.B
+
+  // ---------------------------------------------------------------------------
+  // Memory active status.
+  val nempty = RegInit(false.B)
+  val count = RegInit(0.U(7.W))
+  val inc = io.axi.addr.valid && io.axi.addr.ready
+  val dec = io.axi.data.valid && io.axi.data.ready
+
+  when (inc || dec) {
+    val nxtcount = count + inc - dec
+    count := nxtcount
+    nempty := nxtcount =/= 0.U
+    assert(count <= 64.U)
+  }
+
+  io.nempty := q.io.nempty || nempty
+}
+
+object EmitVLd extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VLd(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VLdSt.scala b/hdl/chisel/src/kelvin/vector/VLdSt.scala
new file mode 100644
index 0000000..256ecd8
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VLdSt.scala
@@ -0,0 +1,302 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VLdSt {
+  def apply(p: Parameters): VLdSt = {
+    Module(new VLdSt(p))
+  }
+}
+
+class VLdSt(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Instructions.
+    val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+    val active = Output(UInt(64.W))
+
+    // VRegfile.
+    val vrfsb = Input(UInt(128.W))
+    val read  = new VRegfileReadHsIO(p)
+    val write = new VRegfileWriteIO(p)
+
+    // Bus.
+    val dbus = new DBusIO(p)
+    val last = Output(Bool())
+  })
+
+  // A usable amount of outstanding transactions.
+  val cmdqDepth = 8
+
+  // The minimum depth to cover pipeline delays in this unit.
+  val dbusDepth = 3
+
+  val maxvlb  = (p.vectorBits / 8).U(p.vectorCountBits.W)
+  val maxvlbm = (p.vectorBits * 4 / 8).U(p.vectorCountBits.W)
+
+  val bytes = p.lsuDataBits / 8
+
+  val e = new VEncodeOp()
+
+  // ---------------------------------------------------------------------------
+  // Swizzle datapath.
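+  // Rotate byte lanes (size=8) or mask bits (size=1) by the low address bits
+  // so bus data lines up with register lanes; 'positive' selects the rotation
+  // direction (register write path vs. store path).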
+  def Swizzle(positive: Boolean, size: Int, addr: UInt, data: UInt): UInt = {
+    val msb = log2Ceil(bytes) - 1
+    val datain = Wire(Vec(bytes, UInt(size.W)))
+    val dataout = Wire(Vec(bytes, UInt(size.W)))
+
+    for (i <- 0 until bytes) {
+      datain(i) := data(size * i + (size - 1), size * i)
+    }
+
+    val index = addr(msb, 0)
+    for (i <- 0 until bytes) {
+      val idx = if (positive) i.U + index else i.U - index
+      dataout(i) := VecAt(datain, idx)
+      assert(idx.getWidth == (msb + 1))
+    }
+
+    dataout.asUInt
+  }
+
+  // ---------------------------------------------------------------------------
+  // Command Queue.
+  class VLdStCmdq extends Bundle {
+    val op = UInt(new VEncodeOp().bits.W)
+    val f2 = UInt(3.W)
+    val sz = UInt(3.W)
+    val addr = UInt(32.W)
+    val offset = UInt(32.W)
+    val remain = UInt(p.vectorCountBits.W)
+    val vd = new VAddr()
+    val vs = new VAddrTag()
+    val quad = UInt(2.W)  // vstq position
+    val last = Bool()
+
+    def IsLoad(): Bool = {
+      op === e.vld.U
+    }
+
+    def IsStore(): Bool = {
+      op === e.vst.U || op === e.vstq.U
+    }
+  }
+
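+  // Fin/Fout/Factive mirror VLd, additionally carrying the store source
+  // register (vs) and the vstq quad position.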
+  def Fin(in: VDecodeBits): VLdStCmdq = {
+    val out = Wire(new VLdStCmdq)
+    val stride = in.f2(1)
+    val length = in.f2(0)
+    assert(PopCount(in.sz) <= 1.U)
+    assert(!(in.op === e.vst.U  && ( in.vd.valid || !in.vs.valid)))
+    assert(!(in.op === e.vstq.U && ( in.vd.valid || !in.vs.valid)))
+    assert(!(in.op === e.vld.U  && (!in.vd.valid ||  in.vs.valid)))
+
+    val limit = Mux(in.m, maxvlbm, maxvlb)
+
+    val data = MuxOR(in.sz(0), in.sv.data) |
+               MuxOR(in.sz(1), Cat(in.sv.data, 0.U(1.W))) |
+               MuxOR(in.sz(2), Cat(in.sv.data, 0.U(2.W)))
+
+    val remain0 = maxvlbm
+    val remain1 = Mux(data > limit, limit, data)(p.vectorCountBits - 1, 0)
+    assert(remain0.getWidth == p.vectorCountBits)
+    assert(remain1.getWidth == p.vectorCountBits)
+
+    out.op := in.op
+    out.f2 := in.f2
+    out.sz := in.sz
+    out.addr := in.sv.addr
+    out.offset := Mux(stride, data(31,0), Mux(in.op === e.vstq.U, maxvlb >> 2, maxvlb))
+    out.remain := Mux(length, remain1, remain0)
+    out.vd := in.vd
+    out.vs := in.vs
+    out.last := !in.m && in.op =/= e.vstq.U
+
+    out.quad := 0.U
+
+    out
+  }
+
+  def Fout(in: VLdStCmdq, m: Bool, step: UInt, valid: Bool): (VLdStCmdq, Bool) = {
+    assert(!valid || in.op === e.vld.U || in.op === e.vst.U || in.op === e.vstq.U)
+
+    val out = Wire(new VLdStCmdq)
+    val vstq = in.op === e.vstq.U
+    val stride = in.f2(1)
+
+    val fmaxvlb = Mux(in.op === e.vstq.U, maxvlb >> 2, maxvlb)
+
+    val outlast1 = !m || step === 2.U  // registered a cycle before 'last' usage
+    val outlast2 = Mux(m, step === 14.U, step === 2.U)
+    val outlast = Mux(vstq, outlast2, outlast1)
+
+    val last1 = !m || step === 3.U
+    val last2 = Mux(m, step === 15.U, step === 3.U)
+    val last = Mux(vstq, last2, last1)
+
+    out := in
+
+    out.vd.addr := Mux(vstq && step(1,0) =/= 3.U, in.vd.addr, in.vd.addr + 1.U)
+    out.vs.addr := Mux(vstq && step(1,0) =/= 3.U, in.vs.addr, in.vs.addr + 1.U)
+
+    out.addr   := in.addr + in.offset
+    out.remain := Mux(in.remain <= fmaxvlb, 0.U, in.remain - fmaxvlb)
+
+    out.last := outlast
+
+    out.quad := Mux(in.op === e.vstq.U, step + 1.U, 0.U)
+
+    (out, last)
+  }
+
+  def Factive(in: VLdStCmdq, m: Bool, step: UInt): UInt = {
+    assert(step.getWidth == 5)
+    val vstq = in.op === e.vstq.U
+    val stepq = Mux(vstq, step(4,2), step(2,0))
+    // Only reads are reported in active; vrfsb tracks writes.
+    val active = MuxOR(in.vs.valid, RegActive(m, stepq, in.vs.addr))
+    assert(active.getWidth == 64)
+    active
+  }
+
+  val q = VCmdq(cmdqDepth, new VLdStCmdq, Fin, Fout, Factive)
+
+  q.io.in <> io.in
+
+  val ctrlready = Wire(Bool())
+  q.io.out.ready := ScoreboardReady(q.io.out.bits.vs, io.vrfsb) && ctrlready
+
+  // ---------------------------------------------------------------------------
+  // Read register.
+  io.read.valid := q.io.out.valid && q.io.out.bits.vs.valid
+  io.read.stall := !q.io.out.ready  // testbench signal
+  io.read.addr := q.io.out.bits.vs.addr
+  io.read.tag := OutTag(q.io.out.bits.vs)
+
+  // ---------------------------------------------------------------------------
+  // DBus.
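+  // Control and write-data are queued separately; a store issues on dbus only
+  // when both are available, while a load needs only control.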
+  class DBusCtrl extends Bundle {
+    val last = Bool()
+    val write = Bool()
+    val addr = UInt(p.lsuAddrBits.W)
+    val adrx = UInt(p.lsuAddrBits.W)
+    val size = UInt((log2Ceil(p.lsuDataBits / 8) + 1).W)
+    val widx = UInt(6.W)
+  }
+
+  class DBusWData extends Bundle {
+    val wdata = UInt(p.lsuDataBits.W)
+    val wmask = UInt((p.lsuDataBits / 8).W)
+  }
+
+  class RegWrite extends Bundle {
+    val widx = UInt(6.W)
+    val addr = UInt(log2Ceil(bytes).W)  // bus address
+    val size = UInt((log2Ceil(p.lsuDataBits / 8) + 1).W)
+  }
+
+  val lineoffset = (p.lsuDataBits / 8)
+
+  // Combinatorial paths back to the command queue are to be avoided.
+  val ctrl = Fifo(new DBusCtrl, dbusDepth)
+  val data = Fifo(new DBusWData, dbusDepth)
+  val rdataEn = RegInit(false.B)
+  val rdataSize = Reg(UInt(p.vectorCountBits.W))
+  val rdataAddr = Reg(UInt(log2Ceil(bytes).W))
+  val rdataAshf = Reg(UInt(log2Ceil(bytes).W))
+
+  ctrlready := ctrl.io.in.ready && (io.read.ready || !ctrl.io.in.bits.write)
+
+  val qoutEn = q.io.out.valid && q.io.out.ready
+  val rdataEnNxt = qoutEn && ctrl.io.in.bits.write
+
+  val qmaxvlb = Mux(q.io.out.bits.op === e.vstq.U, maxvlb >> 2.U, maxvlb)
+  val qsize = Mux(q.io.out.bits.remain > qmaxvlb, qmaxvlb, q.io.out.bits.remain)
+  val rdataWmask = Wire(Vec(p.lsuDataBits / 8, Bool()))
+
+  when (rdataEnNxt) {
+    val quad = q.io.out.bits.quad(1,0)
+    rdataSize := qsize
+    rdataAddr := q.io.out.bits.addr
+    rdataAshf := q.io.out.bits.addr - (quad * (maxvlb >> 2.U))
+  }
+
+  for (i <- 0 until p.lsuDataBits / 8) {
+    rdataWmask(i) := rdataSize > i.U
+  }
+
+  rdataEn := rdataEnNxt
+  ctrl.io.in.valid := qoutEn
+
+  ctrl.io.in.bits.addr  := q.io.out.bits.addr
+  ctrl.io.in.bits.adrx  := q.io.out.bits.addr + lineoffset.U
+  ctrl.io.in.bits.size  := qsize
+  ctrl.io.in.bits.last  := q.io.out.bits.last
+  ctrl.io.in.bits.write := q.io.out.bits.IsStore()
+  ctrl.io.in.bits.widx  := q.io.out.bits.vd.addr
+  assert(!(ctrl.io.in.valid && !ctrl.io.in.ready))
+
+  data.io.in.valid := rdataEn
+  data.io.in.bits.wdata := Swizzle(false, 8, rdataAshf, io.read.data)
+  data.io.in.bits.wmask := Swizzle(false, 1, rdataAddr, rdataWmask.asUInt)
+  assert(!(data.io.in.valid && !data.io.in.ready))
+
+  ctrl.io.out.ready := io.dbus.ready && (data.io.out.valid || !ctrl.io.out.bits.write)
+  data.io.out.ready := io.dbus.ready && (ctrl.io.out.valid &&  ctrl.io.out.bits.write)
+  assert(!(data.io.out.valid && !ctrl.io.out.valid))
+
+  io.dbus.valid := ctrl.io.out.valid && (data.io.out.valid || !ctrl.io.out.bits.write)
+  io.dbus.write := ctrl.io.out.bits.write
+  io.dbus.addr := Cat(0.U(1.W), ctrl.io.out.bits.addr(30,0))
+  io.dbus.adrx := Cat(0.U(1.W), ctrl.io.out.bits.adrx(30,0))
+  io.dbus.size := ctrl.io.out.bits.size
+  io.dbus.wdata := data.io.out.bits.wdata
+  io.dbus.wmask := data.io.out.bits.wmask
+  assert(!(ctrl.io.out.valid && ctrl.io.out.bits.addr(31)))
+  assert(!(ctrl.io.out.valid && ctrl.io.out.bits.adrx(31)))
+  assert(!(io.dbus.valid && io.dbus.addr(31)))
+  assert(!(io.dbus.valid && io.dbus.adrx(31)))
+
+  io.last := ctrl.io.out.bits.last
+
+  // ---------------------------------------------------------------------------
+  // Write register.
+  val wrega = Slice(new RegWrite, true, true)
+  val wregd = Slice(UInt(p.vectorBits.W), false, true)
+  val wdataEn = RegInit(false.B)
+
+  wdataEn := io.dbus.valid && io.dbus.ready && !io.dbus.write
+
+  wrega.io.in.valid := ctrl.io.out.valid && io.dbus.ready && !ctrl.io.out.bits.write
+  wrega.io.in.bits.widx := ctrl.io.out.bits.widx
+  wrega.io.in.bits.addr := ctrl.io.out.bits.addr
+  wrega.io.in.bits.size := ctrl.io.out.bits.size
+  wrega.io.out.ready := wregd.io.out.valid
+  assert(!(wrega.io.in.valid && !wrega.io.in.ready))
+
+  wregd.io.in.valid := wdataEn
+  wregd.io.in.bits := io.dbus.rdata
+  wregd.io.out.ready := wrega.io.out.valid
+  assert(!(wregd.io.in.valid && !wregd.io.in.ready))
+
+  val maskb = Wire(Vec(p.vectorBits / 8, UInt(8.W)))
+  val mask = maskb.asUInt
+
+  for (i <- 0 until p.vectorBits / 8) {
+    maskb(i) := MuxOR(i.U < wrega.io.out.bits.size, 0xff.U)
+  }
+
+  io.write.valid := wrega.io.out.valid && wregd.io.out.valid
+  io.write.addr := wrega.io.out.bits.widx
+  io.write.data := Swizzle(true, 8, wrega.io.out.bits.addr, wregd.io.out.bits) & mask
+
+  // ---------------------------------------------------------------------------
+  // Active.
+  io.active := q.io.active
+}
+
+object EmitVLdSt extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VLdSt(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VRegfile.scala b/hdl/chisel/src/kelvin/vector/VRegfile.scala
new file mode 100644
index 0000000..8f3b1a4
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VRegfile.scala
@@ -0,0 +1,430 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VRegfile {
+  def apply(p: Parameters): VRegfile = {
+    Module(new VRegfile(p))
+  }
+}
+
+class VRegfileReadIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val addr = Output(UInt(6.W))
+  val tag  = Output(UInt(1.W))
+  val data = Input(UInt(p.vectorBits.W))
+}
+
+class VRegfileReadHsIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val ready = Input(Bool())   // handshake
+  val stall = Output(Bool())  // Testbench signal.
+  val addr = Output(UInt(6.W))
+  val tag  = Output(UInt(1.W))
+  val data = Input(UInt(p.vectorBits.W))
+}
+
+class VRegfileScalarIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val data = Output(UInt(32.W))
+}
+
+class VRegfileTransposeIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val tcnt = 16.min(p.vectorBits / 32)
+  val addr = Output(UInt(6.W))
+  val index = Output(UInt(log2Ceil(tcnt).W))
+  val data = Input(UInt((tcnt * 32).W))
+}
+
+class VRegfileWrite(p: Parameters) extends Bundle {
+  val addr = UInt(6.W)
+  val data = UInt(p.vectorBits.W)
+}
+
+class VRegfileWriteIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val addr = Output(UInt(6.W))
+  val data = Output(UInt(p.vectorBits.W))
+}
+
+class VRegfileWriteHsIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val ready = Input(Bool())  // handshake used in arbitration logic
+  val addr = Output(UInt(6.W))
+  val data = Output(UInt(p.vectorBits.W))
+}
+
+// Write internal.
+class VRegfileWrintIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val addr = Output(UInt(6.W))
+  val data = Output(UInt(p.vectorBits.W))
+}
+
+// Write hint internal.
+class VRegfileWhintIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())
+  val addr = Output(UInt(6.W))
+}
+
+class VRegfileConvIO(p: Parameters) extends Bundle {
+  val valid = Output(Bool())     // registered signal suitable for mux control
+  val ready = Output(Bool())     // combinatorial from scheduling logic
+  val op = new Bundle {
+    val conv = Output(Bool())  // convolution to accum
+    val init = Output(Bool())  // set accum
+    val tran = Output(Bool())  // transpose to accum
+    val wclr = Output(Bool())  // write accum to vreg and clear accum
+  }
+  val addr1 = Output(UInt(6.W))  // narrow: transpose
+  val addr2 = Output(UInt(6.W))  // wide:   internal
+  val mode  = Output(UInt(2.W))
+  val index = Output(UInt(log2Ceil(p.vectorBits / 32).W))
+  val abias = Output(UInt(9.W))
+  val bbias = Output(UInt(9.W))
+  val asign = Output(Bool())
+  val bsign = Output(Bool())
+}
+
+class VRegfileScoreboardIO extends Bundle {
+  // 64 registers sequenced from even/odd tags.
+  val set  = Valid(UInt(128.W))
+  val data = Input(UInt(128.W))
+}
+
+class VRegfile(p: Parameters) extends Module {
+  val readPorts = 7
+  val writePorts = 6
+  val whintPorts = 4
+
+  val io = IO(new Bundle {
+    val read = Vec(readPorts, Flipped(new VRegfileReadIO(p)))
+    val scalar = Vec(readPorts / 3, Flipped(new VRegfileScalarIO(p)))
+    val write = Vec(writePorts, Flipped(new VRegfileWrintIO(p)))
+    val whint = Vec(whintPorts, Flipped(new VRegfileWhintIO(p)))
+    val conv = Flipped(new VRegfileConvIO(p))
+    val transpose = Flipped(new VRegfileTransposeIO(p))
+    val vrfsb = Flipped(new VRegfileScoreboardIO)
+  })
+
+  val segcnt = p.vectorBits / 32
+  val segcntBits = log2Ceil(segcnt)
+
+  // ---------------------------------------------------------------------------
+  // Register file storage.
+  val vreg = for (i <- 0 until segcnt) yield {
+    Module(new VRegfileSegment(p))
+  }
+
+  // ---------------------------------------------------------------------------
+  // Convolution unit.
+  val vconv = VConvAlu(p)
+
+  // ---------------------------------------------------------------------------
+  // Assert state.
+  val writeCurr = Wire(UInt(64.W))
+  val writePrev = RegInit(0.U(64.W))
+  val writeSet = Wire(Vec(writePorts, UInt(64.W)))
+
+  for (i <- 0 until writePorts) {
+    writeSet(i) := MuxOR(io.write(i).valid, 1.U << io.write(i).addr)
+  }
+
+  writeCurr := VecOR(writeSet)
+  writePrev := writeCurr
+
+  // ---------------------------------------------------------------------------
+  // Write port interface and registration.
+  val writevalidBool = Wire(Vec(writePorts, Bool()))
+  val writevalid = writevalidBool.asUInt
+  val writebits = Wire(Vec(writePorts, new VRegfileWrite(p)))
+  val writevalidReg = RegInit(0.U(writePorts.W))
+  val writebitsReg = Reg(Vec(writePorts, new VRegfileWrite(p)))
+
+  for (i <- 0 until writePorts) {
+    writevalidBool(i) := io.write(i).valid
+    writebits(i).addr := io.write(i).addr
+    writebits(i).data := io.write(i).data
+  }
+
+  writevalidReg := writevalid
+
+  for (i <- 0 until writePorts) {
+    when (io.write(i).valid) {
+      writebitsReg(i).addr := io.write(i).addr
+      writebitsReg(i).data := io.write(i).data
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Write ports.
+  for (i <- 0 until writePorts) {
+    for (j <- 0 until segcnt) {
+      vreg(j).io.write(i).valid := writevalidReg(i)
+      vreg(j).io.write(i).addr := writebitsReg(i).addr
+      vreg(j).io.write(i).data := writebitsReg(i).data(32 * j + 31, 32 * j)
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Read ports.
+  val readData = Reg(Vec(readPorts, UInt(p.vectorBits.W)))
+
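+  // Read ports 1 and 4 may instead broadcast a 32-bit scalar, replicated
+  // across all lanes.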
+  def ReadScalar(i: Int): (Bool, UInt) = {
+    val valid  = Wire(Bool())
+    val scalar = Wire(UInt(32.W))
+
+    if (i == 1 || i == 4) {
+      valid  := io.scalar(i / 3).valid
+      scalar := io.scalar(i / 3).data
+    } else {
+      valid  := false.B
+      scalar := 0.U
+    }
+
+    val lanes = p.vectorBits / 32
+    val values = Wire(Vec(lanes, UInt(32.W)))
+    for (i <- 0 until lanes) {
+      values(i) := scalar
+    }
+
+    val result = values.asUInt
+    assert(result.getWidth == p.vectorBits)
+    (valid, result)
+  }
+
+  val rdata = Wire(Vec(readPorts, Vec(segcnt, UInt(32.W))))
+
+  for (i <- 0 until readPorts) {
+    for (j <- 0 until segcnt) {
+      vreg(j).io.read(i).addr := io.read(i).addr
+      rdata(i)(j) := vreg(j).io.read(i).data
+    }
+  }
+
+  for (i <- 0 until readPorts) {
+    // Forwarding of internal write-staging registers.
+    val f1validBits = Wire(Vec(writePorts, Bool()))
+    val f1valid = f1validBits.asUInt
+    assert(PopCount(f1valid) <= 1.U)
+
+    val f2validBits = Wire(Vec(writePorts, Bool()))
+    val f2valid = f2validBits.asUInt
+    assert(PopCount(f2valid) <= 1.U)
+
+    for (j <- 0 until writePorts) {
+      f1validBits(j) := writevalid(j) &&
+                        writebits(j).addr === io.read(i).addr
+    }
+
+    for (j <- 0 until writePorts) {
+      f2validBits(j) := writevalidReg(j) &&
+                        writebitsReg(j).addr === io.read(i).addr
+    }
+
+    val f1dataBits = Wire(Vec(writePorts, UInt(p.vectorBits.W)))
+    val f1data = VecOR(f1dataBits, writePorts)
+
+    for (j <- 0 until writePorts) {
+      f1dataBits(j) := MuxOR(f1valid(j), writebits(j).data)
+    }
+
+    val f2dataBits = Wire(Vec(writePorts, UInt(p.vectorBits.W)))
+    val f2data = VecOR(f2dataBits, writePorts)
+
+    for (j <- 0 until writePorts) {
+      f2dataBits(j) := MuxOR(f2valid(j), writebitsReg(j).data)
+    }
+
+    val (scalarValid, scalarData) = ReadScalar(i)
+
+    val sel = Cat(scalarValid,
+                  !scalarValid && f1valid =/= 0.U,
+                  !scalarValid && f1valid === 0.U && f2valid =/= 0.U,
+                  !scalarValid && f1valid === 0.U && f2valid === 0.U)
+    assert(PopCount(sel) <= 1.U)
+
+    val data = MuxOR(sel(3), scalarData) |
+               MuxOR(sel(2), f1data) |
+               MuxOR(sel(1), f2data) |
+               MuxOR(sel(0), rdata(i).asUInt)
+
+    val rvalid =
+      if (i == 1 || i == 4) {
+        assert(!(io.read(i).valid && io.scalar(i / 3).valid))
+        io.read(i).valid || io.scalar(i / 3).valid
+      } else {
+        io.read(i).valid
+      }
+
+    when (rvalid) {
+      readData(i) := data
+    }
+  }
+
+  for (i <- 0 until readPorts) {
+    io.read(i).data := readData(i)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Conv port.
+  val convConv = RegInit(false.B)
+  val convInit = RegInit(false.B)
+  val convTran = RegInit(false.B)
+  val convClear = RegInit(false.B)
+  val convIndex = Reg(UInt(log2Ceil(p.vectorBits / 32).W))
+  val convAbias = Reg(UInt(9.W))
+  val convBbias = Reg(UInt(9.W))
+  val convAsign = Reg(Bool())
+  val convBsign = Reg(Bool())
+  val internalData = Reg(UInt(p.vectorBits.W))
+
+  // io.conv.valid controls read multiplexors
+  // io.conv.ready frames data phase readiness
+  convConv  := io.conv.valid && io.conv.ready && io.conv.op.conv
+  convInit  := io.conv.valid && io.conv.ready && io.conv.op.init
+  convTran  := io.conv.valid && io.conv.ready && io.conv.op.tran
+  convClear := io.conv.valid && io.conv.ready && io.conv.op.wclr
+  convIndex := io.conv.index
+
+  assert(!(io.conv.valid && io.conv.ready) ||
+         PopCount(Cat(io.conv.op.conv, io.conv.op.wclr, io.conv.op.init, io.conv.op.tran)) === 1.U)
+
+  val idata = Wire(Vec(segcnt, UInt(32.W)))
+  for (i <- 0 until segcnt) {
+    idata(i) := vreg(i).io.internal.data
+  }
+
+  for (i <- 0 until segcnt) {
+    vreg(i).io.internal.addr := io.conv.addr2
+  }
+
+  when (io.conv.valid) {
+    internalData := idata.asUInt
+  }
+
+  when (io.conv.valid) {
+    convAbias  := io.conv.abias
+    convBbias  := io.conv.bbias
+    convAsign  := io.conv.asign
+    convBsign  := io.conv.bsign
+  }
+
+  for (i <- 0 until segcnt) {
+    vreg(i).io.conv.valid := convClear
+    for (j <- 0 until segcnt) {
+      vreg(i).io.conv.data(j) := vconv.io.out(j)(32 * i + 31, 32 * i)  // note: indices are reversed
+    }
+  }
+
+  // Note: do not assert if a read touches any of the conv read/write registers.
+  // Other scheduling mechanisms prevent the opcode from advancing.
+  val convRead0  = io.conv.valid && io.conv.ready && io.conv.op.conv
+  val convClear0 = io.conv.valid && io.conv.ready && io.conv.op.wclr
+
+  assert(!(convRead0 && io.conv.mode =/= 0.U))
+  // assert(!(convRead0 && io.conv.addr1(5,4) === 3.U))
+  // assert(!(convRead0 && io.conv.addr2(5,4) === 3.U))
+  assert(!(convRead0 && io.conv.addr1(3,0) =/= 0.U))
+  assert(!(convRead0 && io.conv.addr1(5,2) === io.conv.addr2(5,2) && (p.vectorBits == 128).B))
+  assert(!(convRead0 && io.conv.addr1(5,3) === io.conv.addr2(5,3) && (p.vectorBits == 256).B))
+  assert(!(convRead0 && io.conv.addr1(5,4) === io.conv.addr2(5,4) && (p.vectorBits == 512).B))
+
+  // Convolution reads must not be under pipelined writes.
+  assert(!(convRead0 && writeCurr(io.conv.addr2)))
+  assert(!(convRead0 && writePrev(io.conv.addr2)))
+
+  val convmaska = 0xffff.U << 48.U
+  assert(!(convClear0 && (writeCurr & convmaska) =/= 0.U))
+  assert(!(convClear0 && (writePrev & convmaska) =/= 0.U))
+  // // Note: writePrev check not needed since accumulator is a cycle after reads.
+  // // assert(!(convClear0 && (writePrev & convmaska) =/= 0.U))
+
+  for (i <- 0 until writePorts) {
+    assert(!((convClear0 || convClear) && io.write(i).valid && io.write(i).addr >= 48.U))
+  }
+
+  // ---------------------------------------------------------------------------
+  // Convolution.
+  vconv.io.op.conv := convConv
+  vconv.io.op.init := convInit
+  vconv.io.op.tran := convTran
+  vconv.io.op.clear := convClear
+  vconv.io.index := convIndex
+  vconv.io.adata := io.transpose.data
+  vconv.io.bdata := internalData
+  vconv.io.abias := convAbias
+  vconv.io.bbias := convBbias
+  vconv.io.asign := convAsign
+  vconv.io.bsign := convBsign
+
+  // ---------------------------------------------------------------------------
+  // Transpose port.
+  val transposeData = Reg(UInt(io.transpose.data.getWidth.W))
+  val transposeDataMux = Wire(Vec(segcnt, UInt(io.transpose.data.getWidth.W)))
+
+  for (i <- 0 until segcnt) {
+    vreg(i).io.transpose.addr := Mux(io.conv.valid, io.conv.addr1, io.transpose.addr)
+    transposeDataMux(i) := vreg(i).io.transpose.data
+  }
+
+  when (io.conv.valid || io.transpose.valid) {
+    val index = Mux(io.conv.valid, io.conv.index, io.transpose.index)
+    transposeData := VecAt(transposeDataMux, index)
+  }
+
+  io.transpose.data := transposeData
+
+  // Transpose reads must not be under pipelined writes.
+  for (i <- 0 until segcnt) {
+    assert(!(io.transpose.valid && writeCurr(io.transpose.addr + i.U)))
+    assert(!(io.transpose.valid && writePrev(io.transpose.addr + i.U)))
+  }
+
+  assert(!(io.transpose.valid && io.conv.valid))
+  assert(!(io.transpose.valid && convConv))
+
+  // ---------------------------------------------------------------------------
+  // Scoreboard.
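+  // SbClr ORs the clear masks of all write and write-hint ports (mirrored
+  // into the even/odd tag halves), plus the accumulator clear of registers
+  // 48..63 on wclr.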
+  def SbClr(valid: Bool = false.B, data: UInt = 0.U(128.W), i: Int = 0): (Bool, UInt) = {
+    if (i < writePorts) {
+      val wvalid = io.write(i).valid
+      val hvalid = if (i < whintPorts) io.whint(i).valid else false.B
+      val woh = MuxOR(io.write(i).valid, OneHot(io.write(i).addr, 64))
+      val hoh = if (i < whintPorts) MuxOR(io.whint(i).valid, OneHot(io.whint(i).addr, 64)) else 0.U
+      val whoh = woh | hoh
+      val whdata = Cat(whoh, whoh)
+      assert(whdata.getWidth == 128)
+      SbClr(valid || wvalid || hvalid, data | whdata, i + 1)
+    } else {
+      val cvalid = convClear  // delayed one cycle beyond io.conv.wclr, no forwarding to read ports
+      val cdataH = Wire(UInt(16.W))
+      val cdata  = MuxOR(cvalid, Cat(cdataH, 0.U(48.W), cdataH, 0.U(48.W)))
+      assert(cdata.getWidth == 128)
+      if (p.vectorBits == 128) cdataH := 0x000f.U
+      if (p.vectorBits == 256) cdataH := 0x00ff.U
+      if (p.vectorBits == 512) cdataH := 0xffff.U
+
+      (valid || cvalid, data | cdata)
+    }
+  }
+
+  val vrfsb = RegInit(0.U(128.W))
+  val vrfsbSetEn = io.vrfsb.set.valid
+  val vrfsbSet = MuxOR(io.vrfsb.set.valid, io.vrfsb.set.bits)
+  val (vrfsbClrEn, vrfsbClr) = SbClr()
+
+  when (vrfsbSetEn || vrfsbClrEn) {
+    vrfsb := (vrfsb & ~vrfsbClr) | vrfsbSet
+  }
+
+  io.vrfsb.data := vrfsb
+}
+
+object EmitVRegfile extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VRegfile(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala b/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala
new file mode 100644
index 0000000..5290d45
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala
@@ -0,0 +1,103 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+
+class VRegfileSegment(p: Parameters) extends Module {
+  val readPorts = 7
+  val writePorts = 6
+  val tcnt = 16.min(p.vectorBits / 32)
+
+  val io = IO(new Bundle {
+    val read = Vec(readPorts, new Bundle {
+      val addr = Input(UInt(6.W))
+      val data = Output(UInt(32.W))
+    })
+
+    val transpose = new Bundle {
+      val addr = Input(UInt(6.W))
+      val data = Output(UInt((tcnt * 32).W))
+    }
+
+    val internal = new Bundle {
+      val addr = Input(UInt(6.W))
+      val data = Output(UInt(32.W))
+    }
+
+    val write = Vec(writePorts, new Bundle {
+      val valid = Input(Bool())
+      val addr = Input(UInt(6.W))
+      val data = Input(UInt(32.W))
+    })
+
+    val conv = new Bundle {
+      val valid = Input(Bool())
+      val data = Input(Vec(tcnt, UInt(32.W)))
+    }
+  })
+
+  // Do not use a memory object; it breaks synthesis.
+  //  e.g. val vreg = Mem(64, UInt(32.W))
+  val vreg = Reg(Vec(64, UInt(32.W)))
+
+  // ---------------------------------------------------------------------------
+  // Read.
+  for (i <- 0 until readPorts) {
+    val ridx = io.read(i).addr
+    io.read(i).data := VecAt(vreg, ridx)
+  }
+
+  // ---------------------------------------------------------------------------
+  // Transpose.
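+  // Read tcnt consecutive registers from a 16-aligned base (v0/v16/v32/v48);
+  // this segment supplies one 32-bit slice of each.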
+  val tdata = Wire(Vec(tcnt, UInt(32.W)))
+  for (i <- 0 until tcnt) {
+    val tidx = Cat(io.transpose.addr(5,4), i.U(4.W))  // only supports [v0, v16, v32, v48].
+    assert(tidx.getWidth == 6)
+    tdata(i) := VecAt(vreg, tidx)
+  }
+  io.transpose.data := tdata.asUInt
+  assert(io.transpose.addr(3,0) === 0.U)
+
+  // ---------------------------------------------------------------------------
+  // Internal.
+  io.internal.data := VecAt(vreg, io.internal.addr)
+
+  // ---------------------------------------------------------------------------
+  // Write.
+  for (i <- 0 until 64) {
+    val wvalidBits = Wire(Vec(writePorts, Bool()))
+    val wdataBits = Wire(Vec(writePorts, UInt(32.W)))
+    assert(PopCount(wvalidBits.asUInt) <= 1.U)
+
+    for (j <- 0 until writePorts) {
+      wvalidBits(j) := io.write(j).valid && io.write(j).addr === i.U
+      wdataBits(j) := MuxOR(wvalidBits(j), io.write(j).data)
+    }
+
+    val wvalid = VecOR(wvalidBits, writePorts)
+    val wdata = VecOR(wdataBits, writePorts)
+
+    when (wvalid) {
+      vreg(i) := wdata
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Convolution parallel load interface.
+  // Data has been transposed in VRegfile.
+  //  [48, 49, 50, ...] = data
+  when (io.conv.valid) {
+    for (i <- 0 until tcnt) {
+      vreg(i + 48) := io.conv.data(i)
+    }
+  }
+
+  for (i <- 0 until writePorts) {
+    assert(!(io.conv.valid && io.write(i).valid && io.write(i).addr >= 48.U))
+  }
+}
+
+object EmitVRegfileSegment extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VRegfileSegment(p), args)
+}
diff --git a/hdl/chisel/src/kelvin/vector/VSt.scala b/hdl/chisel/src/kelvin/vector/VSt.scala
new file mode 100644
index 0000000..d9892c7
--- /dev/null
+++ b/hdl/chisel/src/kelvin/vector/VSt.scala
@@ -0,0 +1,309 @@
+package kelvin
+
+import chisel3._
+import chisel3.util._
+import common._
+
+object VSt {
+  def apply(p: Parameters): VSt = {
+    Module(new VSt(p))
+  }
+}
+
+class VSt(p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    // Instructions.
+    val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+    val active = Output(UInt(64.W))
+
+    // VRegfile.
+    val vrfsb = Input(UInt(128.W))
+    val read  = new VRegfileReadHsIO(p)
+
+    // Bus.
+    val axi = new AxiMasterWriteIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+
+    // Status.
+    val nempty = Output(Bool())
+  })
+
+  // A usable depth of outstanding commands.
+  val cmdqDepth = 8
+
+  val maxvlb  = (p.vectorBits / 8).U(p.vectorCountBits.W)
+  val maxvlbm = (p.vectorBits * 4 / 8).U(p.vectorCountBits.W)
+
+  val bytes = p.lsuDataBits / 8
+  val msb = log2Ceil(bytes)
+
+  val e = new VEncodeOp()
+
+  // ---------------------------------------------------------------------------
+  // Tie-offs
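+  // Defaults; the connections below override these.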
+  io.active := 0.U
+
+  io.in.ready := false.B
+
+  io.read.valid := false.B
+  io.read.stall := false.B
+  io.read.addr := 0.U
+  io.read.tag := 0.U
+
+  io.axi.addr.valid := false.B
+  io.axi.addr.bits.addr := 0.U
+  io.axi.addr.bits.id := 0.U
+
+  io.axi.data.valid := false.B
+  io.axi.data.bits.strb := 0.U
+  io.axi.data.bits.data := 0.U
+
+  io.axi.resp.ready := false.B
+
+  // ---------------------------------------------------------------------------
+  // Command Queue.
+  class VStCmdq extends Bundle {
+    val op = UInt(new VEncodeOp().bits.W)
+    val f2 = UInt(3.W)
+    val sz = UInt(3.W)
+    val addr = UInt(32.W)
+    val offset = UInt(32.W)
+    val remain = UInt(p.vectorCountBits.W)
+    val vs = new VAddrTag()
+    val quad = UInt(2.W)  // vstq position
+    val last = Bool()
+  }
+
+  def Fin(in: VDecodeBits): VStCmdq = {
+    val out = Wire(new VStCmdq)
+    val stride = in.f2(1)
+    val length = in.f2(0)
+    assert(PopCount(in.sz) <= 1.U)
+    assert(!(in.op === e.vst.U  && !in.vs.valid))
+    assert(!(in.op === e.vstq.U && !in.vs.valid))
+
+    val limit = Mux(in.m, maxvlbm, maxvlb)
+
+    val data = MuxOR(in.sz(0), in.sv.data) |
+               MuxOR(in.sz(1), Cat(in.sv.data, 0.U(1.W))) |
+               MuxOR(in.sz(2), Cat(in.sv.data, 0.U(2.W)))
+
+    val remain0 = maxvlbm
+    val remain1 = Mux(data > limit, limit, data)(p.vectorCountBits - 1, 0)
+    assert(remain0.getWidth == p.vectorCountBits)
+    assert(remain1.getWidth == p.vectorCountBits)
+
+    out.op := in.op
+    out.f2 := in.f2
+    out.sz := in.sz
+    out.addr := in.sv.addr
+    out.offset := Mux(stride, data(31,0), Mux(in.op === e.vstq.U, maxvlb >> 2, maxvlb))
+    out.remain := Mux(length, remain1, remain0)
+    out.vs := in.vs
+    out.last := !in.m && in.op =/= e.vstq.U
+
+    out.quad := 0.U
+
+    out
+  }
+
+  def Fout(in: VStCmdq, m: Bool, step: UInt, valid: Bool): (VStCmdq, Bool) = {
+    val addrAlign = Mux(in.op === e.vstq.U, in.addr(msb - 3, 0), in.addr(msb - 1, 0))
+    val offsAlign = Mux(in.op === e.vstq.U, in.offset(msb - 3, 0), in.offset(msb - 1, 0))
+    assert(addrAlign === 0.U)
+    assert(offsAlign === 0.U)
+    assert(!valid || in.op === e.vst.U || in.op === e.vstq.U)
+
+    val out = Wire(new VStCmdq)
+    val vstq = in.op === e.vstq.U
+    val stride = in.f2(1)
+
+    val fmaxvlb = Mux(in.op === e.vstq.U, maxvlb >> 2, maxvlb)
+
+    val outlast1 = !m || step === 2.U  // registered a cycle before 'last' usage
+    val outlast2 = Mux(m, step === 14.U, step === 2.U)
+    val outlast = Mux(vstq, outlast2, outlast1)
+
+    val last1 = !m || step === 3.U
+    val last2 = Mux(m, step === 15.U, step === 3.U)
+    val last = Mux(vstq, last2, last1)
+
+    out := in
+
+    out.vs.addr := Mux(vstq && step(1,0) =/= 3.U, in.vs.addr, in.vs.addr + 1.U)
+
+    out.addr   := in.addr + in.offset
+    out.remain := Mux(in.remain <= fmaxvlb, 0.U, in.remain - fmaxvlb)
+
+    out.last := outlast
+
+    out.quad := Mux(in.op === e.vstq.U, step + 1.U, 0.U)
+
+    (out, last)
+  }
+
+  def Factive(in: VStCmdq, m: Bool, step: UInt): UInt = {
+    assert(step.getWidth == 5)
+    val vstq = in.op === e.vstq.U
+    val stepq = Mux(vstq, step(4,2), step(2,0))
+    val active = MuxOR(in.vs.valid, RegActive(m, stepq, in.vs.addr))
+    assert(active.getWidth == 64)
+    active
+  }
+
+  class Ctrl extends Bundle {
+    val addr = UInt(p.lsuAddrBits.W)
+    val id   = UInt(6.W)
+    val size = UInt((log2Ceil(p.lsuDataBits / 8) + 1).W)
+    val vstq = Bool()
+    val quad = UInt(2.W)
+  }
+
+  class Data extends Bundle {
+    val data = UInt(p.lsuDataBits.W)
+    val strb = UInt((p.lsuDataBits / 8).W)
+  }
+
+  val q = VCmdq(cmdqDepth, new VStCmdq, Fin, Fout, Factive)
+
+  val ctrl = Slice(new Ctrl, false, true)
+  val data = Slice(new Data, false, true, true)
+
+  val dataEn = RegInit(false.B)
+
+  // ---------------------------------------------------------------------------
+  // Swizzle.
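+  // vstq writes one quarter of a register per beat: the selected quarter is
+  // replicated across the bus and the strobes restrict it to the addressed
+  // quarter; regular stores pass data and strobes through unchanged.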
+  def SwizzleData(): UInt = {
+    val dsb = p.vectorBits / 4
+
+    val addr = ctrl.io.out.bits.addr
+    val vstq = ctrl.io.out.bits.vstq
+    val quad = ctrl.io.out.bits.quad
+    val data = io.read.data
+
+    val d0 = data(1 * dsb - 1, 0 * dsb)
+    val d1 = data(2 * dsb - 1, 1 * dsb)
+    val d2 = data(3 * dsb - 1, 2 * dsb)
+    val d3 = data(4 * dsb - 1, 3 * dsb)
+
+    val dataout = MuxOR(!vstq, data) |
+                  MuxOR(vstq && quad === 0.U, Cat(d0, d0, d0, d0)) |
+                  MuxOR(vstq && quad === 1.U, Cat(d1, d1, d1, d1)) |
+                  MuxOR(vstq && quad === 2.U, Cat(d2, d2, d2, d2)) |
+                  MuxOR(vstq && quad === 3.U, Cat(d3, d3, d3, d3))
+    assert(dataout.getWidth == p.vectorBits)
+
+    dataout
+  }
+
+  def SwizzleStrb(): UInt = {
+    val n4 = bytes / 4
+    val n = bytes
+
+    val strbB = Wire(Vec(n, Bool()))
+    val strb = strbB.asUInt
+    val strbq = strb(n4 - 1, 0)
+    val addr = ctrl.io.out.bits.addr
+    val size = ctrl.io.out.bits.size
+    val vstq = ctrl.io.out.bits.vstq
+    val quad = addr(msb - 1, msb - 2)
+    val zeroq = Cat(0.U(n4.W))
+
+    for (i <- 0 until p.lsuDataBits / 8) {
+      strbB(i) := size > i.U
+    }
+
+    val strbout = MuxOR(!vstq, strb) |
+       MuxOR(vstq && quad === 0.U, Cat(zeroq, zeroq, zeroq, strbq)) |
+       MuxOR(vstq && quad === 1.U, Cat(zeroq, zeroq, strbq, zeroq)) |
+       MuxOR(vstq && quad === 2.U, Cat(zeroq, strbq, zeroq, zeroq)) |
+       MuxOR(vstq && quad === 3.U, Cat(strbq, zeroq, zeroq, zeroq))
+    assert(strbout.getWidth == p.vectorBits / 8)
+
+    strbout
+  }
+
+  // ---------------------------------------------------------------------------
+  // Instruction queue.
+  q.io.in <> io.in
+
+  val ctrlready = Wire(Bool())
+  q.io.out.ready := ScoreboardReady(q.io.out.bits.vs, io.vrfsb) && ctrlready
+
+  val qmaxvlb = Mux(q.io.out.bits.op === e.vstq.U, maxvlb >> 2.U, maxvlb)
+  val qsize = Mux(q.io.out.bits.remain > qmaxvlb, qmaxvlb, q.io.out.bits.remain)
+
+  val qoutEn = q.io.out.valid && q.io.out.ready
+
+  // ---------------------------------------------------------------------------
+  // Register read.
+  io.read.valid := q.io.out.valid && q.io.out.bits.vs.valid
+  io.read.stall := !q.io.out.ready
+  io.read.addr := q.io.out.bits.vs.addr
+  io.read.tag := OutTag(q.io.out.bits.vs)
+
+  dataEn := qoutEn
+
+  data.io.in.valid := dataEn
+  assert(!(data.io.in.valid && !data.io.in.ready))
+
+  data.io.out.ready := io.axi.addr.ready
+
+  data.io.in.bits.data := SwizzleData()
+  data.io.in.bits.strb := SwizzleStrb()
+
+  // ---------------------------------------------------------------------------
+  // Control.
+  ctrl.io.in.valid := qoutEn
+
+  ctrl.io.in.bits.addr := q.io.out.bits.addr
+  ctrl.io.in.bits.id   := q.io.out.bits.vs.addr
+  ctrl.io.in.bits.size := qsize
+  ctrl.io.in.bits.vstq := q.io.out.bits.op === e.vstq.U
+  ctrl.io.in.bits.quad := q.io.out.bits.quad
+
+  ctrl.io.out.ready := io.axi.addr.ready
+
+  ctrlready := io.read.ready && ctrl.io.in.ready && data.io.in.ready
+
+  // ---------------------------------------------------------------------------
+  // Axi.
+  io.axi.addr.valid := ctrl.io.out.valid
+  io.axi.addr.bits.addr := Cat(0.U(1.W), ctrl.io.out.bits.addr(30, msb), 0.U(msb.W))
+  io.axi.addr.bits.id := ctrl.io.out.bits.id
+  assert(!(ctrl.io.out.valid && !ctrl.io.out.bits.addr(31)))
+  assert(!(io.axi.addr.valid && io.axi.addr.bits.addr(31)))
+
+  io.axi.data.valid := ctrl.io.out.valid
+  io.axi.data.bits.data := data.io.out.bits.data
+  io.axi.data.bits.strb := data.io.out.bits.strb
+
+  io.axi.resp.ready := true.B
+
+  assert(io.axi.addr.valid === io.axi.data.valid)
+  assert(io.axi.addr.ready === io.axi.data.ready)
+
+  // ---------------------------------------------------------------------------
+  // Active.
+  io.active := q.io.active
+
+  // ---------------------------------------------------------------------------
+  // Memory active status.
+  val nempty = RegInit(false.B)
+  val count = RegInit(0.U(9.W))
+  val inc = io.axi.addr.valid && io.axi.addr.ready
+  val dec = io.axi.resp.valid && io.axi.resp.ready
+
+  when (inc || dec) {
+    val nxtcount = count + inc - dec
+    count := nxtcount
+    nempty := nxtcount =/= 0.U
+    assert(count <= 256.U)
+  }
+
+  io.nempty := q.io.nempty || ctrl.io.out.valid || nempty
+}
+
+object EmitVSt extends App {
+  val p = new Parameters
+  (new chisel3.stage.ChiselStage).emitVerilog(new VSt(p), args)
+}
diff --git a/hdl/verilog/BUILD b/hdl/verilog/BUILD
new file mode 100644
index 0000000..81d5af0
--- /dev/null
+++ b/hdl/verilog/BUILD
@@ -0,0 +1,19 @@
+load("@rules_hdl//verilog:providers.bzl", "VerilogInfo", "verilog_library")
+
+verilog_library(
+    name = "clock_gate",
+    srcs = ["ClockGate.v"],
+    visibility = ["//visibility:public"],
+)
+
+verilog_library(
+    name = "sram_1rw_256x256",
+    srcs = ["Sram_1rw_256x256.v"],
+    visibility = ["//visibility:public"],
+)
+
+verilog_library(
+    name = "sram_1rw_256x288",
+    srcs = ["Sram_1rwm_256x288.v"],
+    visibility = ["//visibility:public"],
+)
\ No newline at end of file
diff --git a/hdl/verilog/ClockGate.v b/hdl/verilog/ClockGate.v
new file mode 100644
index 0000000..e170ae7
--- /dev/null
+++ b/hdl/verilog/ClockGate.v
@@ -0,0 +1,26 @@
+module ClockGate(
+  input         clk_i,
+  input         enable,  // '1' passthrough, '0' disable.
+  output        clk_o
+);
+
+`ifndef CLOCKGATE_ENABLE
+
+assign clk_o = clk_i;
+
+`else
+
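+// Standard latch-based clock gate: 'enable' is held stable while clk_i is
+// high, so clk_o cannot glitch when enable changes mid-cycle.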
+reg clk_en;
+
+// Capture 'enable' during low phase of the clock.
+always_latch begin
+  if (~clk_i) begin
+    clk_en <= enable;
+  end
+end
+
+assign clk_o = clk_i & clk_en;
+
+`endif
+
+endmodule  // ClockGate
diff --git a/hdl/verilog/Sram_1rw_256x256.v b/hdl/verilog/Sram_1rw_256x256.v
new file mode 100644
index 0000000..82ebd75
--- /dev/null
+++ b/hdl/verilog/Sram_1rw_256x256.v
@@ -0,0 +1,23 @@
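+// Behavioral single-port SRAM, 256 entries x 256 bits: writes are synchronous
+// and reads register the address, so rdata is valid one cycle after a read.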
+module Sram_1rw_256x256(
+  input          clock,
+  input          valid,
+  input          write,
+  input  [7:0]   addr,
+  input  [255:0] wdata,
+  output [255:0] rdata
+);
+
+  reg [255:0] mem [0:255];
+  reg [7:0] raddr;
+
+  assign rdata = mem[raddr];
+
+  always @(posedge clock) begin
+    if (valid & write) begin
+      mem[addr] <= wdata;
+    end
+    if (valid & ~write) begin
+      raddr <= addr;
+    end
+  end
+endmodule
diff --git a/hdl/verilog/Sram_1rwm_256x288.v b/hdl/verilog/Sram_1rwm_256x288.v
new file mode 100644
index 0000000..fd76452
--- /dev/null
+++ b/hdl/verilog/Sram_1rwm_256x288.v
@@ -0,0 +1,92 @@
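+// Single-port SRAM, 256 entries x 288 bits, with a 32-lane write mask:
+// wmask[i] guards bits [i*9 +: 9]. The FPGA build keeps one behavioral array;
+// the default build splits the word across 32 Sram_1rw_256x9 banks.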
+module Sram_1rwm_256x288(
+  input          clock,
+  input          valid,
+  input          write,
+  input  [7:0]   addr,
+  input  [287:0] wdata,
+  input  [31:0]  wmask,
+  output [287:0] rdata
+);
+
+reg [287:0] mem [0:255];
+reg [7:0] raddr;
+
+assign rdata = mem[raddr];
+
+`ifdef FPGA
+
+always @(posedge clock) begin
+  for (int i = 0; i < 32; i++) begin
+    if (valid & write & wmask[i]) begin
+      mem[addr][i*9 +: 9] <= wdata[i*9 +: 9];
+    end
+  end
+  if (valid & ~write) begin
+    raddr <= addr;
+  end
+end
+
+endmodule  // Sram_1rwm_256x288
+
+`else  // !FPGA
+
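+// One 9-bit bank per mask lane: a bank is enabled for every read, and for
+// writes only when its wmask bit is set.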
+Sram_1rw_256x9 u_bl00(clock, valid & (~write | wmask[0]),  write, addr, wdata[  0 +: 9], rdata[  0 +: 9]);
+Sram_1rw_256x9 u_bl01(clock, valid & (~write | wmask[1]),  write, addr, wdata[  9 +: 9], rdata[  9 +: 9]);
+Sram_1rw_256x9 u_bl02(clock, valid & (~write | wmask[2]),  write, addr, wdata[ 18 +: 9], rdata[ 18 +: 9]);
+Sram_1rw_256x9 u_bl03(clock, valid & (~write | wmask[3]),  write, addr, wdata[ 27 +: 9], rdata[ 27 +: 9]);
+Sram_1rw_256x9 u_bl04(clock, valid & (~write | wmask[4]),  write, addr, wdata[ 36 +: 9], rdata[ 36 +: 9]);
+Sram_1rw_256x9 u_bl05(clock, valid & (~write | wmask[5]),  write, addr, wdata[ 45 +: 9], rdata[ 45 +: 9]);
+Sram_1rw_256x9 u_bl06(clock, valid & (~write | wmask[6]),  write, addr, wdata[ 54 +: 9], rdata[ 54 +: 9]);
+Sram_1rw_256x9 u_bl07(clock, valid & (~write | wmask[7]),  write, addr, wdata[ 63 +: 9], rdata[ 63 +: 9]);
+Sram_1rw_256x9 u_bl08(clock, valid & (~write | wmask[8]),  write, addr, wdata[ 72 +: 9], rdata[ 72 +: 9]);
+Sram_1rw_256x9 u_bl09(clock, valid & (~write | wmask[9]),  write, addr, wdata[ 81 +: 9], rdata[ 81 +: 9]);
+Sram_1rw_256x9 u_bl10(clock, valid & (~write | wmask[10]), write, addr, wdata[ 90 +: 9], rdata[ 90 +: 9]);
+Sram_1rw_256x9 u_bl11(clock, valid & (~write | wmask[11]), write, addr, wdata[ 99 +: 9], rdata[ 99 +: 9]);
+Sram_1rw_256x9 u_bl12(clock, valid & (~write | wmask[12]), write, addr, wdata[108 +: 9], rdata[108 +: 9]);
+Sram_1rw_256x9 u_bl13(clock, valid & (~write | wmask[13]), write, addr, wdata[117 +: 9], rdata[117 +: 9]);
+Sram_1rw_256x9 u_bl14(clock, valid & (~write | wmask[14]), write, addr, wdata[126 +: 9], rdata[126 +: 9]);
+Sram_1rw_256x9 u_bl15(clock, valid & (~write | wmask[15]), write, addr, wdata[135 +: 9], rdata[135 +: 9]);
+Sram_1rw_256x9 u_bl16(clock, valid & (~write | wmask[16]), write, addr, wdata[144 +: 9], rdata[144 +: 9]);
+Sram_1rw_256x9 u_bl17(clock, valid & (~write | wmask[17]), write, addr, wdata[153 +: 9], rdata[153 +: 9]);
+Sram_1rw_256x9 u_bl18(clock, valid & (~write | wmask[18]), write, addr, wdata[162 +: 9], rdata[162 +: 9]);
+Sram_1rw_256x9 u_bl19(clock, valid & (~write | wmask[19]), write, addr, wdata[171 +: 9], rdata[171 +: 9]);
+Sram_1rw_256x9 u_bl20(clock, valid & (~write | wmask[20]), write, addr, wdata[180 +: 9], rdata[180 +: 9]);
+Sram_1rw_256x9 u_bl21(clock, valid & (~write | wmask[21]), write, addr, wdata[189 +: 9], rdata[189 +: 9]);
+Sram_1rw_256x9 u_bl22(clock, valid & (~write | wmask[22]), write, addr, wdata[198 +: 9], rdata[198 +: 9]);
+Sram_1rw_256x9 u_bl23(clock, valid & (~write | wmask[23]), write, addr, wdata[207 +: 9], rdata[207 +: 9]);
+Sram_1rw_256x9 u_bl24(clock, valid & (~write | wmask[24]), write, addr, wdata[216 +: 9], rdata[216 +: 9]);
+Sram_1rw_256x9 u_bl25(clock, valid & (~write | wmask[25]), write, addr, wdata[225 +: 9], rdata[225 +: 9]);
+Sram_1rw_256x9 u_bl26(clock, valid & (~write | wmask[26]), write, addr, wdata[234 +: 9], rdata[234 +: 9]);
+Sram_1rw_256x9 u_bl27(clock, valid & (~write | wmask[27]), write, addr, wdata[243 +: 9], rdata[243 +: 9]);
+Sram_1rw_256x9 u_bl28(clock, valid & (~write | wmask[28]), write, addr, wdata[252 +: 9], rdata[252 +: 9]);
+Sram_1rw_256x9 u_bl29(clock, valid & (~write | wmask[29]), write, addr, wdata[261 +: 9], rdata[261 +: 9]);
+Sram_1rw_256x9 u_bl30(clock, valid & (~write | wmask[30]), write, addr, wdata[270 +: 9], rdata[270 +: 9]);
+Sram_1rw_256x9 u_bl31(clock, valid & (~write | wmask[31]), write, addr, wdata[279 +: 9], rdata[279 +: 9]);
+
+endmodule  // Sram_1rwm_256x288
+
+module Sram_1rw_256x9(
+  input          clock,
+  input          valid,
+  input          write,
+  input  [7:0]   addr,
+  input  [8:0]   wdata,
+  output [8:0]   rdata
+);
+
+  reg [8:0] mem [0:255];
+  reg [7:0] raddr;
+
+  assign rdata = mem[raddr];
+
+  always @(posedge clock) begin
+    if (valid & write) begin
+      mem[addr] <= wdata;
+    end
+    if (valid & ~write) begin
+      raddr <= addr;
+    end
+  end
+endmodule  // Sram_1rw_256x9
+
+`endif  // FPGA
diff --git a/lib/BUILD b/lib/BUILD
new file mode 100644
index 0000000..79ff16e
--- /dev/null
+++ b/lib/BUILD
@@ -0,0 +1,46 @@
+load("@io_bazel_rules_scala//scala:scala.bzl", "scala_library")
+
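+# Bundles the Chisel 3 / FIRRTL toolchain jars behind one target; the same
+# jars are listed in deps and exports so dependents of :chisel_lib see them
+# transitively.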
+scala_library(
+    name = "chisel_lib",
+
+    deps = [
+        "@com_thoughtworks_paranamer//jar",
+        "@org_json4s_json4s_scalap//jar",
+        "@org_json4s_json4s_ast//jar",
+        "@org_json4s_json4s_core//jar",
+        "@org_json4s_json4s_native//jar",
+        "@org_apache_commons_commons_lang3//jar",
+        "@org_apache_commons_commons_text//jar",
+        "@edu_berkeley_cs_chisel3_plugin//jar",
+        "@com_github_scopt//jar",
+        "@net_jcazevedo_moultingyaml//jar",
+        "@edu_berkeley_cs_firrtl//jar",
+        "@edu_berkeley_cs_chisel3_core//jar",
+        "@edu_berkeley_cs_chisel3_macros//jar",
+        "@edu_berkeley_cs_chisel3//jar",
+    ],
+
+    exports = [
+        "@com_thoughtworks_paranamer//jar",
+        "@org_json4s_json4s_scalap//jar",
+        "@org_json4s_json4s_ast//jar",
+        "@org_json4s_json4s_core//jar",
+        "@org_json4s_json4s_native//jar",
+        "@org_apache_commons_commons_lang3//jar",
+        "@org_apache_commons_commons_text//jar",
+        "@edu_berkeley_cs_chisel3_plugin//jar",
+        "@com_github_scopt//jar",
+        "@net_jcazevedo_moultingyaml//jar",
+        "@edu_berkeley_cs_firrtl//jar",
+        "@edu_berkeley_cs_chisel3_core//jar",
+        "@edu_berkeley_cs_chisel3_macros//jar",
+        "@edu_berkeley_cs_chisel3//jar",
+    ],
+
+    visibility = ["//visibility:public"],
+
+    scalacopts = [
+        "-Xplugin:$(execpath @edu_berkeley_cs_chisel3_plugin//jar)",
+        "-P:chiselplugin:genBundleElements",
+    ],
+)
\ No newline at end of file
diff --git a/rules/BUILD b/rules/BUILD
new file mode 100644
index 0000000..ffd0fb0
--- /dev/null
+++ b/rules/BUILD
@@ -0,0 +1 @@
+package(default_visibility = ["//visibility:public"])
diff --git a/rules/chisel.bzl b/rules/chisel.bzl
new file mode 100644
index 0000000..5ce9315
--- /dev/null
+++ b/rules/chisel.bzl
@@ -0,0 +1,76 @@
+load("@io_bazel_rules_scala//scala:scala.bzl", "scala_binary", "scala_library")
+load("@rules_hdl//verilog:providers.bzl", "VerilogInfo", "verilog_library")
+load("@kelvin_hw//rules:verilator.bzl", "verilator_cc_library")
+
+def chisel_library(name,
+                   srcs = [],
+                   deps = [],
+                   visibility = None):
+    scala_library(
+        name = name,
+        srcs = srcs,
+        deps = [
+            "@kelvin_hw//lib:chisel_lib",
+            "@edu_berkeley_cs_chisel3_plugin//jar",
+        ] + deps,
+        scalacopts = [
+            "-Xplugin:$(execpath @edu_berkeley_cs_chisel3_plugin//jar)",
+            "-P:chiselplugin:genBundleElements",
+        ],
+        visibility = visibility,
+    )
+
+def chisel_binary(name,
+                  main_class,
+                  srcs = [],
+                  deps = [],
+                  visibility = None):
+    scala_binary(
+        name = name,
+        srcs = srcs,
+        main_class = main_class,
+        deps = [
+            "@kelvin_hw//lib:chisel_lib",
+            "@edu_berkeley_cs_chisel3_plugin//jar",
+        ] + deps,
+        scalacopts = [
+            "-Xplugin:$(execpath @edu_berkeley_cs_chisel3_plugin//jar)",
+            "-P:chiselplugin:genBundleElements",
+        ],
+        visibility = visibility,
+    )
+
+def chisel_cc_library(name,
+                      chisel_lib,
+                      emit_class,
+                      module_name,
+                      verilog_deps=[]):
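+    # Pipeline: build a Scala binary that emits <module_name>.v via the emit
+    # class, wrap the generated Verilog in a verilog_library, then verilate it
+    # into a SystemC/C++ library.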
+    gen_binary_name = name + "_emit_verilog_binary"
+    chisel_binary(
+        name = gen_binary_name,
+        deps = [ chisel_lib ],
+        main_class = emit_class,
+    )
+
+    native.genrule(
+        name = name + "_emit_verilog",
+        srcs = [],
+        outs = [module_name + ".v"],
+        cmd = "./$(location " + gen_binary_name + ") --target-dir $(RULEDIR)",
+        tools = [":{}".format(gen_binary_name)],
+    )
+
+    verilog_library(
+        name = name + "_verilog",
+        srcs = [module_name + ".v"],
+        deps = verilog_deps,
+    )
+
+    verilator_cc_library(
+        name = name,
+        module = ":{}_verilog".format(name),
+        module_top = module_name,
+        visibility = ["//visibility:public"],
+        # TODO(derekjchow): Re-enable the default -Wall?
+        vopts = [],
+    )
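+
+# Example use (illustrative; the ":core" library and emit class names are
+# assumed, but "core_cc_library" matches the target consumed by
+# //tests/verilator_sim):
+#
+#   chisel_cc_library(
+#       name = "core_cc_library",
+#       chisel_lib = ":core",
+#       emit_class = "kelvin.EmitCore",
+#       module_name = "Core",
+#   )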
diff --git a/rules/deps.bzl b/rules/deps.bzl
new file mode 100644
index 0000000..9e359b7
--- /dev/null
+++ b/rules/deps.bzl
@@ -0,0 +1,122 @@
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@io_bazel_rules_scala//scala:scala_maven_import_external.bzl",
+     "scala_maven_import_external")
+load("@io_bazel_rules_scala//scala:scala_cross_version.bzl",
+     "default_maven_server_urls")
+load("@rules_foreign_cc//foreign_cc:repositories.bzl",
+     "rules_foreign_cc_dependencies")
+load("@rules_hdl//dependency_support:dependency_support.bzl",
+     rules_hdl_dependency_support = "dependency_support")
+
+
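+# Sets up rules_foreign_cc / rules_hdl dependencies, SystemC for the Verilator
+# testbenches, and the Maven jars for the Chisel 3.5.1 / FIRRTL 1.5.1 toolchain
+# on Scala 2.13.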
+def kelvin_deps():
+    rules_foreign_cc_dependencies()
+    rules_hdl_dependency_support()
+
+    http_archive(
+        name = "accellera_systemc",
+        build_file = "systemc.BUILD",
+        sha256 = "bfb309485a8ad35a08ee78827d1647a451ec5455767b25136e74522a6f41e0ea",
+        strip_prefix = "systemc-2.3.4",
+        urls = [
+            "https://github.com/accellera-official/systemc/archive/refs/tags/2.3.4.tar.gz",
+        ],
+    )
+
+    # paranamer
+    scala_maven_import_external(
+        name = "com_thoughtworks_paranamer",
+        artifact = "com.thoughtworks.paranamer:paranamer:%s" % "2.8",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+
+    # json4s
+    scala_maven_import_external(
+        name = "org_json4s_json4s_ast",
+        artifact = "org.json4s:json4s-ast_2.13:%s" % "3.6.12",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+    scala_maven_import_external(
+        name = "org_json4s_json4s_scalap",
+        artifact = "org.json4s:json4s-scalap_2.13:%s" % "3.6.12",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+    scala_maven_import_external(
+        name = "org_json4s_json4s_core",
+        artifact = "org.json4s:json4s-core_2.13:%s" % "3.6.12",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+    scala_maven_import_external(
+        name = "org_json4s_json4s_native",
+        artifact = "org.json4s:json4s-native_2.13:%s" % "3.6.12",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+
+    # org.apache.commons
+    scala_maven_import_external(
+        name = "org_apache_commons_commons_lang3",
+        artifact = "org.apache.commons:commons-lang3:%s" % "3.11",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+    scala_maven_import_external(
+        name = "org_apache_commons_commons_text",
+        artifact = "org.apache.commons:commons-text:%s" % "1.9",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+
+    # scopt
+    scala_maven_import_external(
+        name = "com_github_scopt",
+        artifact = "com.github.scopt:scopt_2.13:%s" % "3.7.1",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+
+    # moultingyaml
+    scala_maven_import_external(
+        name = "net_jcazevedo_moultingyaml",
+        artifact = "net.jcazevedo:moultingyaml_2.13:%s" % "0.4.2",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+
+    # FIRRTL
+    scala_maven_import_external(
+        name = "edu_berkeley_cs_firrtl",
+        artifact = "edu.berkeley.cs:firrtl_2.13:%s" % "1.5.1",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+
+    # Chisel3
+    scala_maven_import_external(
+        name = "edu_berkeley_cs_chisel3",
+        artifact = "edu.berkeley.cs:chisel3_2.13:%s" % "3.5.1",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+    scala_maven_import_external(
+        name = "edu_berkeley_cs_chisel3_core",
+        artifact = "edu.berkeley.cs:chisel3-core_2.13:%s" % "3.5.1",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+    scala_maven_import_external(
+        name = "edu_berkeley_cs_chisel3_macros",
+        artifact = "edu.berkeley.cs:chisel3-macros_2.13:%s" % "3.5.1",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
+    scala_maven_import_external(
+        name = "edu_berkeley_cs_chisel3_plugin",
+        artifact = "edu.berkeley.cs:chisel3-plugin_2.13.6:%s" % "3.5.1",
+        server_urls = default_maven_server_urls(),
+        licenses = ["notice"],
+    )
diff --git a/rules/repos.bzl b/rules/repos.bzl
new file mode 100644
index 0000000..dcf8658
--- /dev/null
+++ b/rules/repos.bzl
@@ -0,0 +1,44 @@
+# Kelvin repositories
+#
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+def kelvin_repos():
+    http_archive(
+        name = "bazel_skylib",
+        sha256 = "b8a1527901774180afc798aeb28c4634bdccf19c4d98e7bdd1ce79d1fe9aaad7",
+        urls = [
+            "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.4.1/bazel-skylib-1.4.1.tar.gz",
+            "https://github.com/bazelbuild/bazel-skylib/releases/download/1.4.1/bazel-skylib-1.4.1.tar.gz",
+        ],
+    )
+
+    http_archive(
+        name = "rules_hdl",
+        sha256 = "223bce01f8375b29073a1475591c0c7e0d86c0d0b2ed73cbdb85f9e9dfa0dda3",
+        strip_prefix = "bazel_rules_hdl-b58d34add60108ae20d273ee480193b25e96d000",
+        urls = [
+            "https://github.com/hdl/bazel_rules_hdl/archive/b58d34add60108ae20d273ee480193b25e96d000.tar.gz",
+        ],
+        patches = [
+            "0001-Update-version-of-Googletest-for-bazel-compatitibili.patch",
+            "0002-SystemC-support-for-verilator.patch",
+        ],
+    )
+
+    # See https://github.com/bazelbuild/rules_scala/releases for up to date version information.
+    rules_scala_version = "c711b4d1f0d1cc386c63ef748c9df14d2f3a187e"
+    http_archive(
+        name = "io_bazel_rules_scala",
+        sha256 = "556677f505634da64efc41912d280895e61f5da109d82bdee41cde4120a190a1",
+        strip_prefix = "rules_scala-%s" % rules_scala_version,
+        type = "zip",
+        url = "https://github.com/bazelbuild/rules_scala/archive/%s.zip" % rules_scala_version,
+    )
+
+    http_archive(
+        name = "rules_foreign_cc",
+        sha256 = "2a4d07cd64b0719b39a7c12218a3e507672b82a97b98c6a89d38565894cf7c51",
+        strip_prefix = "rules_foreign_cc-0.9.0",
+        url = "https://github.com/bazelbuild/rules_foreign_cc/archive/refs/tags/0.9.0.tar.gz",
+    )
diff --git a/rules/verilator.bzl b/rules/verilator.bzl
new file mode 100644
index 0000000..834cd68
--- /dev/null
+++ b/rules/verilator.bzl
@@ -0,0 +1,223 @@
+# Modified from bazel_rules_hdl to use SystemC
+"""Functions for verilator."""
+
+load("@rules_hdl//verilog:providers.bzl", "VerilogInfo")
+load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
+
+def cc_compile_and_link_static_library(ctx, srcs, hdrs, deps, runfiles, includes = [], defines = []):
+    """Compile and link C++ source into a static library
+
+    Args:
+        ctx: Context for rule
+        srcs: The cpp sources generated by verilator.
+        hdrs: The headers generated by verilator.
+        deps: Library dependencies to build with.
+        runfiles: Data dependencies that are read at runtime.
+        includes: The includes for the verilator module to build.
+        defines: Cpp defines to build with.
+
+    Returns:
+        CcInfo with the compiled library.
+    """
+    cc_toolchain = find_cpp_toolchain(ctx)
+    feature_configuration = cc_common.configure_features(
+        ctx = ctx,
+        cc_toolchain = cc_toolchain,
+        requested_features = ctx.features,
+        unsupported_features = ctx.disabled_features,
+    )
+
+    compilation_contexts = [dep[CcInfo].compilation_context for dep in deps]
+    compilation_context, compilation_outputs = cc_common.compile(
+        name = ctx.label.name,
+        actions = ctx.actions,
+        feature_configuration = feature_configuration,
+        cc_toolchain = cc_toolchain,
+        srcs = srcs,
+        includes = includes,
+        defines = defines,
+        public_hdrs = hdrs,
+        compilation_contexts = compilation_contexts,
+    )
+
+    linking_contexts = [dep[CcInfo].linking_context for dep in deps]
+    linking_context, linking_output = cc_common.create_linking_context_from_compilation_outputs(
+        actions = ctx.actions,
+        feature_configuration = feature_configuration,
+        cc_toolchain = cc_toolchain,
+        compilation_outputs = compilation_outputs,
+        linking_contexts = linking_contexts,
+        name = ctx.label.name,
+        disallow_dynamic_library = True,
+    )
+
+    output_files = []
+    if linking_output.library_to_link.static_library != None:
+        output_files.append(linking_output.library_to_link.static_library)
+    if linking_output.library_to_link.dynamic_library != None:
+        output_files.append(linking_output.library_to_link.dynamic_library)
+
+    return [
+        DefaultInfo(files = depset(output_files), runfiles = ctx.runfiles(files = runfiles)),
+        CcInfo(
+            compilation_context = compilation_context,
+            linking_context = linking_context,
+        ),
+    ]
+
+_CPP_SRC = ["cc", "cpp", "cxx", "c++"]
+_HPP_SRC = ["h", "hh", "hpp"]
+_RUNFILES = ["dat", "mem"]
+
+def _only_cpp(f):
+    """Filter for just C++ source/headers"""
+    if f.extension in _CPP_SRC + _HPP_SRC:
+        return f.path
+    return None
+
+def _only_hpp(f):
+    """Filter for just C++ headers"""
+    if f.extension in _HPP_SRC:
+        return f.path
+    return None
+
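+# Shell helper for _copy_tree: the first argument is the destination
+# directory, the remaining arguments are the files to copy into it.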
+_COPY_TREE_SH = """
+OUT=$1; shift && mkdir -p "$OUT" && cp $* "$OUT"
+"""
+
+def _copy_tree(ctx, idir, odir, map_each = None, progress_message = None):
+    """Copy files from a TreeArtifact to a new directory"""
+    args = ctx.actions.args()
+    args.add(odir.path)
+    args.add_all([idir], map_each = map_each)
+    ctx.actions.run_shell(
+        arguments = [args],
+        command = _COPY_TREE_SH,
+        inputs = [idir],
+        outputs = [odir],
+        progress_message = progress_message,
+    )
+
+    return odir
+
+def _verilator_cc_library(ctx):
+    transitive_srcs = depset([], transitive = [ctx.attr.module[VerilogInfo].dag])
+    all_srcs = [verilog_info_struct.srcs for verilog_info_struct in transitive_srcs.to_list()]
+    all_files = [src for sub_tuple in all_srcs for src in sub_tuple]
+
+    # Filter out .dat files.
+    runfiles = []
+    verilog_files = []
+    for file in all_files:
+        if file.extension in _RUNFILES:
+            runfiles.append(file)
+        else:
+            verilog_files.append(file)
+
+    verilator_output = ctx.actions.declare_directory(ctx.label.name + "-gen")
+    verilator_output_cpp = ctx.actions.declare_directory(ctx.label.name + ".cpp")
+    verilator_output_hpp = ctx.actions.declare_directory(ctx.label.name + ".h")
+
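+    # Verilator writes everything into a single TreeArtifact; the generated
+    # sources and headers are then copied into separate trees so they can be
+    # passed to cc_common.compile as srcs and public_hdrs.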
+    prefix = "V" + ctx.attr.module_top
+
+    args = ctx.actions.args()
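+    # --sc emits a SystemC model rather than the plain C++ model used by
+    # upstream bazel_rules_hdl.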
+    args.add("--sc")
+    args.add("--pins-bv", "2")
+    args.add("--Mdir", verilator_output.path)
+    args.add("--top-module", ctx.attr.module_top)
+    args.add("--prefix", prefix)
+    if ctx.attr.trace:
+        args.add("--trace")
+    for verilog_file in verilog_files:
+        args.add(verilog_file.path)
+    args.add_all(ctx.attr.vopts, expand_directories = False)
+
+    ctx.actions.run(
+        arguments = [args],
+        executable = ctx.executable._verilator,
+        inputs = verilog_files,
+        outputs = [verilator_output],
+        progress_message = "[Verilator] Compiling {}".format(ctx.label),
+    )
+
+    _copy_tree(
+        ctx,
+        verilator_output,
+        verilator_output_cpp,
+        map_each = _only_cpp,
+        progress_message = "[Verilator] Extracting C++ source files",
+    )
+    _copy_tree(
+        ctx,
+        verilator_output,
+        verilator_output_hpp,
+        map_each = _only_hpp,
+        progress_message = "[Verilator] Extracting C++ header files",
+    )
+
+    # Do actual compile
+    defines = ["VM_TRACE"] if ctx.attr.trace else []
+    deps = [ctx.attr._verilator_lib, ctx.attr._zlib, ctx.attr._verilator_svdpi]
+
+    return cc_compile_and_link_static_library(
+        ctx,
+        srcs = [verilator_output_cpp],
+        hdrs = [verilator_output_hpp],
+        defines = defines,
+        runfiles = runfiles,
+        includes = [verilator_output_hpp.path],
+        deps = deps,
+    )
+
+verilator_cc_library = rule(
+    _verilator_cc_library,
+    attrs = {
+        "module": attr.label(
+            doc = "The top level module target to verilate.",
+            providers = [VerilogInfo],
+            mandatory = True,
+        ),
+        "module_top": attr.string(
+            doc = "The name of the verilog module to verilate.",
+            mandatory = True,
+        ),
+        "trace": attr.bool(
+            doc = "Enable tracing for Verilator",
+            default = True,
+        ),
+        "vopts": attr.string_list(
+            doc = "Additional command line options to pass to Verilator",
+            default = ["-Wall"],
+        ),
+        "_cc_toolchain": attr.label(
+            doc = "CC compiler.",
+            default = Label("@bazel_tools//tools/cpp:current_cc_toolchain"),
+        ),
+        "_verilator": attr.label(
+            doc = "Verilator binary.",
+            executable = True,
+            cfg = "exec",
+            default = Label("@verilator//:verilator_executable"),
+        ),
+        "_verilator_lib": attr.label(
+            doc = "Verilator library",
+            default = Label("@verilator//:libverilator"),
+        ),
+        "_verilator_svdpi": attr.label(
+            doc = "Verilator svdpi lib",
+            default = Label("@verilator//:svdpi"),
+        ),
+        "_zlib": attr.label(
+            doc = "zlib dependency",
+            default = Label("@net_zlib//:zlib"),
+        ),
+    },
+    provides = [
+        CcInfo,
+        DefaultInfo,
+    ],
+    toolchains = [
+        "@bazel_tools//tools/cpp:toolchain_type",
+    ],
+    fragments = ["cpp"],
+)
\ No newline at end of file
diff --git a/tests/verilator_sim/BUILD b/tests/verilator_sim/BUILD
new file mode 100644
index 0000000..409f100
--- /dev/null
+++ b/tests/verilator_sim/BUILD
@@ -0,0 +1,237 @@
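+# Common SystemC testbench scaffolding shared by the *_tb binaries below.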
+cc_library(
+    name = "sim_libs",
+    hdrs = [
+        "fifo.h",
+        "sysc_module.h",
+        "sysc_tb.h",
+    ],
+)
+
+cc_library(
+    name = "kelvin_if",
+    hdrs = [
+        "kelvin/core_if.h",
+        "kelvin/debug_if.h",
+        "kelvin/kelvin_cfg.h",
+        "kelvin/memory_if.h",
+    ],
+    defines = ["KELVIN_SIMD=256"],
+)
+
+cc_binary(
+    name = "core_sim",
+    srcs = [
+        "kelvin/core_tb.cc",
+    ],
+    deps = [
+        ":sim_libs",
+        ":kelvin_if",
+        "//hdl/chisel:core_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+cc_binary(
+    name = "dbus2axi_tb",
+    srcs = [
+        "kelvin/dbus2axi_tb.cc",
+    ],
+    deps = [
+        ":sim_libs",
+        "//hdl/chisel:dbus2axi_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+cc_binary(
+    name = "l1dcache_tb",
+    srcs = [
+        "kelvin/l1dcache_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        "//hdl/chisel:l1dcache_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+cc_binary(
+    name = "l1icache_tb",
+    srcs = [
+        "kelvin/l1icache_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        "//hdl/chisel:l1icache_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+# TODO(derekjchow): Add valu and valuint test benches
+
+cc_library(
+    name = "vdecode",
+    hdrs = [
+        "kelvin/vdecode.h",
+    ],
+    deps = [
+        ":vdecodeop",
+        ":vencodeop",
+    ],
+)
+
+cc_library(
+    name = "vdecodeop",
+    hdrs = [
+        "kelvin/vdecodeop.h",
+    ],
+)
+
+cc_library(
+    name = "vencodeop",
+    hdrs = [
+        "kelvin/vencodeop.h",
+    ],
+)
+
+cc_binary(
+    name = "vcmdq_tb",
+    srcs = [
+        "kelvin/vcmdq_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        ":vencodeop",
+        "//hdl/chisel:vcmdq_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+cc_binary(
+    name = "vconvalu_tb",
+    srcs = [
+        "kelvin/vconvalu_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        "//hdl/chisel:vconvalu_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+# TODO(derekjchow): Fix broken test
+cc_binary(
+    name = "vconvctrl_tb",
+    srcs = [
+        "kelvin/vconvctrl_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        ":vencodeop",
+        "//hdl/chisel:vconvctrl_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+# TODO(derekjchow): Fix broken test
+cc_binary(
+    name = "vdecodeinstruction_tb",
+    srcs = [
+        "kelvin/vdecodeinstruction_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        ":vdecode",
+        "//hdl/chisel:vdecodeinstruction_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+# TODO(derekjchow): Fix broken test
+cc_binary(
+    name = "vdecode_tb",
+    srcs = [
+        "kelvin/vdecode_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        ":vdecode",
+        "//hdl/chisel:vdecode_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+cc_binary(
+    name = "vldst_tb",
+    srcs = [
+        "kelvin/vldst_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        ":vencodeop",
+        "//hdl/chisel:vldst_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+cc_binary(
+    name = "vld_tb",
+    srcs = [
+        "kelvin/vld_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        ":vencodeop",
+        "//hdl/chisel:vld_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+cc_binary(
+    name = "vregfilesegment_tb",
+    srcs = [
+        "kelvin/vregfilesegment_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        "//hdl/chisel:vregfilesegment_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+cc_binary(
+    name = "vregfile_tb",
+    srcs = [
+        "kelvin/vregfile_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        "//hdl/chisel:vregfile_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
+
+cc_binary(
+    name = "vst_tb",
+    srcs = [
+        "kelvin/vst_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        ":vencodeop",
+        "//hdl/chisel:vst_cc_library",
+        "@accellera_systemc//:systemc",
+    ],
+)
diff --git a/tests/verilator_sim/fifo.h b/tests/verilator_sim/fifo.h
new file mode 100644
index 0000000..4e492ec
--- /dev/null
+++ b/tests/verilator_sim/fifo.h
@@ -0,0 +1,58 @@
+#ifndef TESTS_VERILATOR_SIM_FIFO_H_
+#define TESTS_VERILATOR_SIM_FIFO_H_
+
+// A SystemC CRT transaction queue.
+
+template <typename T>
+class fifo_t {
+ public:
+  bool empty() { return entries_.empty(); }
+
+  void write(T v) { entries_.emplace_back(v); }
+
+  bool read(T& v) {
+    if (entries_.empty()) return false;
+    v = entries_.at(0);
+    entries_.erase(entries_.begin());
+    return true;
+  }
+
+  bool next(T& v, int index = 0) {
+    if (index >= count()) return false;
+    v = entries_.at(index);
+    return true;
+  }
+
+  bool rand(T& v) {
+    if (entries_.empty()) return false;
+    int index = ::rand() % count();
+    v = entries_.at(index);
+    return true;
+  }
+
+  void clear() { entries_.clear(); }
+
+  bool remove(int index = 0) {
+    if (index >= count()) return false;
+    entries_.erase(entries_.begin() + index);
+    return true;
+  }
+
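+  // Pseudo-shuffle: moves a randomly chosen entry to the back once per
+  // element. The order is randomized but not uniformly distributed.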
+  void shuffle() {
+    const int count = entries_.size();
+    if (count < 2) return;
+    for (int i = 0; i < count; ++i) {
+      const int index = ::rand() % count;
+      T v = entries_.at(index);
+      entries_.erase(entries_.begin() + index);
+      entries_.emplace_back(v);
+    }
+  }
+
+  int count() { return entries_.size(); }
+
+ private:
+  std::vector<T> entries_;
+};
+
+#endif  // TESTS_VERILATOR_SIM_FIFO_H_
diff --git a/tests/verilator_sim/kelvin/core_if.h b/tests/verilator_sim/kelvin/core_if.h
new file mode 100644
index 0000000..9ec703c
--- /dev/null
+++ b/tests/verilator_sim/kelvin/core_if.h
@@ -0,0 +1,300 @@
+#ifndef TESTS_VERILATOR_SIM_KELVIN_CORE_IF_H_
+#define TESTS_VERILATOR_SIM_KELVIN_CORE_IF_H_
+
+#include "tests/verilator_sim/fifo.h"
+#include "tests/verilator_sim/kelvin/kelvin_cfg.h"
+#include "tests/verilator_sim/kelvin/memory_if.h"
+
+constexpr int kAxiWaitState = 3;
+
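+// Randomized handshake helpers; switch an '#if 1' below to 0 to force the
+// corresponding interface to be always ready when debugging.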
+static bool rand_bool() {
+  return rand() & 1;
+}
+
+static bool rand_bool_ibus() {
+#if 1
+  return rand_bool();
+#else
+  return true;
+#endif
+}
+
+static bool rand_bool_dbus() {
+#if 1
+  return rand_bool();
+#else
+  return true;
+#endif
+}
+
+static bool rand_bool_axi_w() {
+#if 1
+  return rand_bool();
+#else
+  return true;
+#endif
+}
+
+static bool rand_bool_axi_r() {
+#if 1
+  return rand_bool();
+#else
+  return true;
+#endif
+}
+
+// ScalarCore Memory Interface.
+struct Core_if : Memory_if {
+  sc_in<bool>         io_ibus_valid;
+  sc_out<bool>        io_ibus_ready;
+  sc_in<sc_bv<32> >   io_ibus_addr;
+  sc_out<sc_bv<256> > io_ibus_rdata;
+
+  sc_in<bool> io_dbus_valid;
+  sc_out<bool> io_dbus_ready;
+  sc_in<bool> io_dbus_write;
+  sc_in<sc_bv<32> > io_dbus_addr;
+  sc_in<sc_bv<32> > io_dbus_adrx;
+  sc_in<sc_bv<kDbusBits> > io_dbus_size;
+  sc_in<sc_bv<kVector> > io_dbus_wdata;
+  sc_in<sc_bv<kVector / 8> > io_dbus_wmask;
+  sc_out<sc_bv<kVector> > io_dbus_rdata;
+
+  sc_out<bool> io_axi0_write_addr_ready;
+  sc_in<bool> io_axi0_write_addr_valid;
+  sc_in<sc_bv<32> > io_axi0_write_addr_bits_addr;
+  sc_in<sc_bv<kUncId> > io_axi0_write_addr_bits_id;
+  sc_out<bool> io_axi0_write_data_ready;
+  sc_in<bool> io_axi0_write_data_valid;
+  sc_in<sc_bv<kUncBits> > io_axi0_write_data_bits_data;
+  sc_in<sc_bv<kUncStrb> > io_axi0_write_data_bits_strb;
+  sc_in<bool> io_axi0_write_resp_ready;
+  sc_out<bool> io_axi0_write_resp_valid;
+  sc_out<sc_bv<kUncId> > io_axi0_write_resp_bits_id;
+  sc_out<sc_bv<2> > io_axi0_write_resp_bits_resp;
+  sc_out<bool> io_axi0_read_addr_ready;
+  sc_in<bool> io_axi0_read_addr_valid;
+  sc_in<sc_bv<32> > io_axi0_read_addr_bits_addr;
+  sc_in<sc_bv<kUncId> > io_axi0_read_addr_bits_id;
+  sc_in<bool> io_axi0_read_data_ready;
+  sc_out<bool> io_axi0_read_data_valid;
+  sc_out<sc_bv<2> > io_axi0_read_data_bits_resp;
+  sc_out<sc_bv<kUncId> > io_axi0_read_data_bits_id;
+  sc_out<sc_bv<kUncBits> > io_axi0_read_data_bits_data;
+  sc_out<bool> io_axi1_write_addr_ready;
+  sc_in<bool> io_axi1_write_addr_valid;
+  sc_in<sc_bv<32> > io_axi1_write_addr_bits_addr;
+  sc_in<sc_bv<kUncId> > io_axi1_write_addr_bits_id;
+  sc_out<bool> io_axi1_write_data_ready;
+  sc_in<bool> io_axi1_write_data_valid;
+  sc_in<sc_bv<kUncBits> > io_axi1_write_data_bits_data;
+  sc_in<sc_bv<kUncStrb> > io_axi1_write_data_bits_strb;
+  sc_in<bool> io_axi1_write_resp_ready;
+  sc_out<bool> io_axi1_write_resp_valid;
+  sc_out<sc_bv<kUncId> > io_axi1_write_resp_bits_id;
+  sc_out<sc_bv<2> > io_axi1_write_resp_bits_resp;
+  sc_out<bool> io_axi1_read_addr_ready;
+  sc_in<bool> io_axi1_read_addr_valid;
+  sc_in<sc_bv<32> > io_axi1_read_addr_bits_addr;
+  sc_in<sc_bv<kUncId> > io_axi1_read_addr_bits_id;
+  sc_in<bool> io_axi1_read_data_ready;
+  sc_out<bool> io_axi1_read_data_valid;
+  sc_out<sc_bv<2> > io_axi1_read_data_bits_resp;
+  sc_out<sc_bv<kUncId> > io_axi1_read_data_bits_id;
+  sc_out<sc_bv<kUncBits> > io_axi1_read_data_bits_data;
+
+  Core_if(sc_module_name n, const char* bin) : Memory_if(n, bin) {
+    for (int i = 0; i < kUncBits / 32; ++i) {
+      runused_.set_word(i, 0);
+    }
+  }
+
+  void eval() {
+    if (reset) {
+      io_ibus_ready = false;
+      io_axi0_read_addr_ready = false;
+      io_axi0_read_data_valid = false;
+      io_axi0_write_addr_ready = false;
+      io_axi0_write_data_ready = false;
+      io_axi0_write_resp_valid = false;
+      io_axi1_read_addr_ready = false;
+      io_axi1_read_data_valid = false;
+      io_axi1_write_addr_ready = false;
+      io_axi1_write_data_ready = false;
+      io_axi1_write_resp_valid = false;
+    } else if (clock->posedge()) {
+      cycle_++;
+
+      const bool axi0_write_ready = rand_bool_axi_w();
+      const bool axi1_write_ready = rand_bool_axi_w();
+
+      io_ibus_ready = rand_bool_ibus();
+      io_dbus_ready = rand_bool_dbus();
+      io_axi0_read_addr_ready = true;
+      io_axi0_write_addr_ready = axi0_write_ready;
+      io_axi0_write_data_ready = axi0_write_ready;
+      io_axi0_write_resp_valid = false;
+      io_axi1_read_addr_ready = true;
+      io_axi1_write_addr_ready = axi1_write_ready;
+      io_axi1_write_data_ready = axi1_write_ready;
+      io_axi1_write_resp_valid = false;
+
+      // Instruction bus read.
+      if (io_ibus_valid && io_ibus_ready) {
+        sc_bv<256> rdata;
+        uint32_t addr = io_ibus_addr.read().get_word(0);
+        uint32_t words[256 / 32];
+        Read(addr, 256 / 8, (uint8_t*) words);
+
+        for (int i = 0; i < 256 / 32; ++i) {
+          rdata.set_word(i, words[i]);
+        }
+
+        io_ibus_rdata = rdata;
+      }
+
+      // Data bus read.
+      if (io_dbus_valid && io_dbus_ready && !io_dbus_write) {
+        sc_bv<kVector> rdata;
+        uint32_t addr = io_dbus_addr.read().get_word(0);
+        uint32_t words[kVector / 32] = {0};
+        memset(words, 0xcc, sizeof(words));
+        int bytes = io_dbus_size.read().get_word(0);
+        Read(addr, bytes, (uint8_t*) words);
+        ReadSwizzle(addr, kVector / 8, (uint8_t*) words);
+        for (int i = 0; i < kVector / 32; ++i) {
+          rdata.set_word(i, words[i]);
+        }
+        io_dbus_rdata = rdata;
+      }
+
+      // Data bus write.
+      if (io_dbus_valid && io_dbus_ready && io_dbus_write) {
+        sc_bv<kVector> wdata = io_dbus_wdata;
+        uint32_t addr = io_dbus_addr.read().get_word(0);
+        uint32_t words[kVector / 32];
+        int bytes = io_dbus_size.read().get_word(0);
+        for (int i = 0; i < kVector / 32; ++i) {
+          words[i] = wdata.get_word(i);
+        }
+        WriteSwizzle(addr, kVector / 8, (uint8_t*) words);
+        Write(addr, bytes, (uint8_t*) words);
+      }
+
+      rtcm_t tcm_read;
+      sc_bv<kUncBits> rdata;
+
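+      // AXI reads are queued with a cycle stamp and only returned after at
+      // least kAxiWaitState cycles, with randomized response timing.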
+      // axi0 read.
+      if (io_axi0_read_addr_valid && io_axi0_read_addr_ready) {
+        uint32_t addr = io_axi0_read_addr_bits_addr.read().get_word(0);
+        uint32_t words[kUncBits / 32];
+        Read(addr, kUncBits / 8, (uint8_t*) words);
+
+        tcm_read.cycle = cycle_;
+        tcm_read.id = io_axi0_read_addr_bits_id.read().get_word(0);
+        for (int i = 0; i < kUncBits / 32; ++i) {
+          tcm_read.data.set_word(i, words[i]);
+        }
+        rtcm_[0].write(tcm_read);
+      }
+
+      bool read0 = rand_bool_axi_r() && rtcm_[0].next(tcm_read);
+      if (read0 && (cycle_ - tcm_read.cycle) >= kAxiWaitState) {
+        assert(rtcm_[0].remove());
+        io_axi0_read_data_bits_id = tcm_read.id;
+        io_axi0_read_data_bits_data = tcm_read.data;
+      } else {
+        read0 = false;
+        io_axi0_read_data_bits_id = 0;
+        io_axi0_read_data_bits_data = runused_;
+      }
+      io_axi0_read_data_valid = read0;
+
+      // axi0 write.
+      if (io_axi0_write_addr_valid && io_axi0_write_addr_ready) {
+        assert(io_axi0_write_data_valid && io_axi0_write_data_ready);
+        uint8_t wdata[kUncBits / 8];
+        uint32_t addr = io_axi0_write_addr_bits_addr.read().get_word(0);
+        uint32_t* p_wdata = (uint32_t*) wdata;
+
+        for (int i = 0; i < kUncBits / 32; ++i) {
+          p_wdata[i] = io_axi0_write_data_bits_data.read().get_word(i);
+        }
+
+        for (int i = 0; i < kUncBits / 8; ++i) {
+          if (io_axi0_write_data_bits_strb.read().get_bit(i) != 0) {
+            Write(addr + i, 1, wdata + i);
+          }
+        }
+      }
+
+      if (io_axi0_write_addr_valid && io_axi0_write_addr_ready) {
+        io_axi0_write_resp_valid = true;
+        io_axi0_write_resp_bits_id = io_axi0_write_addr_bits_id;
+      }
+
+      // axi1 read.
+      if (io_axi1_read_addr_valid && io_axi1_read_addr_ready) {
+        uint32_t addr = io_axi1_read_addr_bits_addr.read().get_word(0);
+        uint32_t words[kUncBits / 32];
+        Read(addr, kUncBits / 8, (uint8_t*) words);
+
+        tcm_read.cycle = cycle_;
+        tcm_read.id = io_axi1_read_addr_bits_id.read().get_word(0);
+        for (int i = 0; i < kUncBits / 32; ++i) {
+          tcm_read.data.set_word(i, words[i]);
+        }
+        rtcm_[1].write(tcm_read);
+      }
+
+      bool read1 = rand_bool_axi_r() && rtcm_[1].next(tcm_read);
+      if (read1 && (cycle_ - tcm_read.cycle) >= kAxiWaitState) {
+        assert(rtcm_[1].remove());
+        io_axi1_read_data_bits_id = tcm_read.id;
+        io_axi1_read_data_bits_data = tcm_read.data;
+      } else {
+        read1 = false;
+        io_axi1_read_data_bits_id = 0;
+        io_axi1_read_data_bits_data = runused_;
+      }
+      io_axi1_read_data_valid = read1;
+
+      // axi1 write.
+      if (io_axi1_write_addr_valid && io_axi1_write_addr_ready) {
+        assert(io_axi1_write_data_valid && io_axi1_write_data_ready);
+        uint8_t wdata[kUncBits / 8];
+        uint32_t addr = io_axi1_write_addr_bits_addr.read().get_word(0);
+        uint32_t* p_wdata = (uint32_t*) wdata;
+
+        for (int i = 0; i < kUncBits / 32; ++i) {
+          p_wdata[i] = io_axi1_write_data_bits_data.read().get_word(i);
+        }
+
+        for (int i = 0; i < kUncBits / 8; ++i) {
+          if (io_axi1_write_data_bits_strb.read().get_bit(i) != 0) {
+            Write(addr + i, 1, wdata + i);
+          }
+        }
+      }
+
+      if (io_axi1_write_addr_valid && io_axi1_write_addr_ready) {
+        io_axi1_write_resp_valid = true;
+        io_axi1_write_resp_bits_id = io_axi1_write_addr_bits_id;
+      }
+    }
+  }
+
+private:
+  uint32_t cycle_ = 0;
+
+  struct rtcm_t {
+    uint32_t cycle;
+    uint32_t id : 7;
+    sc_bv<kUncBits> data;
+  };
+
+  fifo_t<rtcm_t> rtcm_[2];
+  sc_bv<kUncBits> runused_;
+};
+
+#endif  // TESTS_VERILATOR_SIM_KELVIN_CORE_IF_H_
diff --git a/tests/verilator_sim/kelvin/core_tb.cc b/tests/verilator_sim/kelvin/core_tb.cc
new file mode 100644
index 0000000..e2a6e10
--- /dev/null
+++ b/tests/verilator_sim/kelvin/core_tb.cc
@@ -0,0 +1,270 @@
+#include "tests/verilator_sim/sysc_tb.h"
+
+#include "VCore.h"
+
+#include "tests/verilator_sim/kelvin/core_if.h"
+#include "tests/verilator_sim/kelvin/debug_if.h"
+#include "tests/verilator_sim/kelvin/kelvin_cfg.h"
+
+
+struct Core_tb : Sysc_tb {
+  sc_in<bool> io_halted;
+  sc_in<bool> io_fault;
+
+  using Sysc_tb::Sysc_tb;  // constructor
+
+  void posedge() {
+    check(!io_fault, "io_fault");
+    if (io_halted) sc_stop();
+  }
+};
+
+static void Core_run(const char* name, const char* bin, const bool trace) {
+  VCore core(name);
+  Core_tb tb("Core_tb", 100000000, random);
+  Core_if mif("Core_if", bin);
+  Debug_if dbg("Debug_if", &mif);
+
+  sc_signal<bool> io_halted;
+  sc_signal<bool> io_fault;
+  sc_signal<bool> io_ibus_valid;
+  sc_signal<bool> io_ibus_ready;
+  sc_signal<bool> io_dbus_valid;
+  sc_signal<bool> io_dbus_ready;
+  sc_signal<bool> io_dbus_write;
+  sc_signal<bool> io_iflush_valid;
+  sc_signal<bool> io_iflush_ready;
+  sc_signal<bool> io_dflush_valid;
+  sc_signal<bool> io_dflush_ready;
+  sc_signal<bool> io_dflush_all;
+  sc_signal<bool> io_dflush_clean;
+  sc_signal<bool> io_slog_valid;
+  sc_signal<sc_bv<32> > io_csr_in_value_0;
+  sc_signal<sc_bv<32> > io_csr_in_value_1;
+  sc_signal<sc_bv<32> > io_csr_in_value_2;
+  sc_signal<sc_bv<32> > io_csr_in_value_3;
+  sc_signal<sc_bv<32> > io_csr_in_value_4;
+  sc_signal<sc_bv<32> > io_csr_in_value_5;
+  sc_signal<sc_bv<32> > io_csr_in_value_6;
+  sc_signal<sc_bv<32> > io_csr_in_value_7;
+  sc_signal<sc_bv<32> > io_csr_in_value_8;
+  sc_signal<sc_bv<32> > io_csr_in_value_9;
+  sc_signal<sc_bv<32> > io_csr_in_value_10;
+  sc_signal<sc_bv<32> > io_csr_in_value_11;
+  sc_signal<sc_bv<32> > io_csr_out_value_0;
+  sc_signal<sc_bv<32> > io_csr_out_value_1;
+  sc_signal<sc_bv<32> > io_csr_out_value_2;
+  sc_signal<sc_bv<32> > io_csr_out_value_3;
+  sc_signal<sc_bv<32> > io_csr_out_value_4;
+  sc_signal<sc_bv<32> > io_csr_out_value_5;
+  sc_signal<sc_bv<32> > io_csr_out_value_6;
+  sc_signal<sc_bv<32> > io_csr_out_value_7;
+  sc_signal<sc_bv<32> > io_ibus_addr;
+  sc_signal<sc_bv<256> > io_ibus_rdata;
+  sc_signal<sc_bv<32> > io_dbus_addr;
+  sc_signal<sc_bv<32> > io_dbus_adrx;
+  sc_signal<sc_bv<kDbusBits> > io_dbus_size;
+  sc_signal<sc_bv<kVector> > io_dbus_wdata;
+  sc_signal<sc_bv<kVector / 8> > io_dbus_wmask;
+  sc_signal<sc_bv<kVector> > io_dbus_rdata;
+  sc_signal<sc_bv<5> > io_slog_addr;
+  sc_signal<sc_bv<32> > io_slog_data;
+  sc_signal<sc_bv<4> > io_debug_en;
+  sc_signal<sc_bv<32> > io_debug_addr0;
+  sc_signal<sc_bv<32> > io_debug_addr1;
+  sc_signal<sc_bv<32> > io_debug_addr2;
+  sc_signal<sc_bv<32> > io_debug_addr3;
+  sc_signal<sc_bv<32> > io_debug_inst0;
+  sc_signal<sc_bv<32> > io_debug_inst1;
+  sc_signal<sc_bv<32> > io_debug_inst2;
+  sc_signal<sc_bv<32> > io_debug_inst3;
+  sc_signal<sc_bv<32> > io_debug_cycles;
+  sc_signal<bool> io_axi0_write_addr_ready;
+  sc_signal<bool> io_axi0_write_addr_valid;
+  sc_signal<sc_bv<32> > io_axi0_write_addr_bits_addr;
+  sc_signal<sc_bv<kUncId> > io_axi0_write_addr_bits_id;
+  sc_signal<bool> io_axi0_write_data_ready;
+  sc_signal<bool> io_axi0_write_data_valid;
+  sc_signal<sc_bv<kUncBits> > io_axi0_write_data_bits_data;
+  sc_signal<sc_bv<kUncStrb> > io_axi0_write_data_bits_strb;
+  sc_signal<bool> io_axi0_write_resp_ready;
+  sc_signal<bool> io_axi0_write_resp_valid;
+  sc_signal<sc_bv<kUncId> > io_axi0_write_resp_bits_id;
+  sc_signal<sc_bv<2> > io_axi0_write_resp_bits_resp;
+  sc_signal<bool> io_axi0_read_addr_ready;
+  sc_signal<bool> io_axi0_read_addr_valid;
+  sc_signal<sc_bv<32> > io_axi0_read_addr_bits_addr;
+  sc_signal<sc_bv<kUncId> > io_axi0_read_addr_bits_id;
+  sc_signal<bool> io_axi0_read_data_ready;
+  sc_signal<bool> io_axi0_read_data_valid;
+  sc_signal<sc_bv<2> > io_axi0_read_data_bits_resp;
+  sc_signal<sc_bv<kUncId> > io_axi0_read_data_bits_id;
+  sc_signal<sc_bv<kUncBits> > io_axi0_read_data_bits_data;
+  sc_signal<bool> io_axi1_write_addr_ready;
+  sc_signal<bool> io_axi1_write_addr_valid;
+  sc_signal<sc_bv<32> > io_axi1_write_addr_bits_addr;
+  sc_signal<sc_bv<kUncId> > io_axi1_write_addr_bits_id;
+  sc_signal<bool> io_axi1_write_data_ready;
+  sc_signal<bool> io_axi1_write_data_valid;
+  sc_signal<sc_bv<kUncBits> > io_axi1_write_data_bits_data;
+  sc_signal<sc_bv<kUncStrb> > io_axi1_write_data_bits_strb;
+  sc_signal<bool> io_axi1_write_resp_ready;
+  sc_signal<bool> io_axi1_write_resp_valid;
+  sc_signal<sc_bv<kUncId> > io_axi1_write_resp_bits_id;
+  sc_signal<sc_bv<2> > io_axi1_write_resp_bits_resp;
+  sc_signal<bool> io_axi1_read_addr_ready;
+  sc_signal<bool> io_axi1_read_addr_valid;
+  sc_signal<sc_bv<32> > io_axi1_read_addr_bits_addr;
+  sc_signal<sc_bv<kUncId> > io_axi1_read_addr_bits_id;
+  sc_signal<bool> io_axi1_read_data_ready;
+  sc_signal<bool> io_axi1_read_data_valid;
+  sc_signal<sc_bv<2> > io_axi1_read_data_bits_resp;
+  sc_signal<sc_bv<kUncId> > io_axi1_read_data_bits_id;
+  sc_signal<sc_bv<kUncBits> > io_axi1_read_data_bits_data;
+
+  io_iflush_ready = 1;
+  io_dflush_ready = 1;
+
+  tb.io_halted(io_halted);
+  tb.io_fault(io_fault);
+
+  core.clock(tb.clock);
+  core.reset(tb.reset);
+  core.io_halted(io_halted);
+  core.io_fault(io_fault);
+  core.io_ibus_valid(io_ibus_valid);
+  core.io_ibus_ready(io_ibus_ready);
+  core.io_dbus_valid(io_dbus_valid);
+  core.io_dbus_ready(io_dbus_ready);
+  core.io_dbus_write(io_dbus_write);
+  core.io_iflush_valid(io_iflush_valid);
+  core.io_iflush_ready(io_iflush_ready);
+  core.io_dflush_valid(io_dflush_valid);
+  core.io_dflush_ready(io_dflush_ready);
+  core.io_dflush_all(io_dflush_all);
+  core.io_dflush_clean(io_dflush_clean);
+  core.io_slog_valid(io_slog_valid);
+  core.io_csr_in_value_0(io_csr_in_value_0);
+  core.io_csr_in_value_1(io_csr_in_value_1);
+  core.io_csr_in_value_2(io_csr_in_value_2);
+  core.io_csr_in_value_3(io_csr_in_value_3);
+  core.io_csr_in_value_4(io_csr_in_value_4);
+  core.io_csr_in_value_5(io_csr_in_value_5);
+  core.io_csr_in_value_6(io_csr_in_value_6);
+  core.io_csr_in_value_7(io_csr_in_value_7);
+  core.io_csr_in_value_8(io_csr_in_value_8);
+  core.io_csr_in_value_9(io_csr_in_value_9);
+  core.io_csr_in_value_10(io_csr_in_value_10);
+  core.io_csr_in_value_11(io_csr_in_value_11);
+  core.io_csr_out_value_0(io_csr_out_value_0);
+  core.io_csr_out_value_1(io_csr_out_value_1);
+  core.io_csr_out_value_2(io_csr_out_value_2);
+  core.io_csr_out_value_3(io_csr_out_value_3);
+  core.io_csr_out_value_4(io_csr_out_value_4);
+  core.io_csr_out_value_5(io_csr_out_value_5);
+  core.io_csr_out_value_6(io_csr_out_value_6);
+  core.io_csr_out_value_7(io_csr_out_value_7);
+  core.io_ibus_addr(io_ibus_addr);
+  core.io_ibus_rdata(io_ibus_rdata);
+  core.io_dbus_addr(io_dbus_addr);
+  core.io_dbus_adrx(io_dbus_adrx);
+  core.io_dbus_size(io_dbus_size);
+  core.io_dbus_wdata(io_dbus_wdata);
+  core.io_dbus_wmask(io_dbus_wmask);
+  core.io_dbus_rdata(io_dbus_rdata);
+  core.io_slog_addr(io_slog_addr);
+  core.io_slog_data(io_slog_data);
+  core.io_debug_en(io_debug_en);
+  core.io_debug_addr0(io_debug_addr0);
+  core.io_debug_addr1(io_debug_addr1);
+  core.io_debug_addr2(io_debug_addr2);
+  core.io_debug_addr3(io_debug_addr3);
+  core.io_debug_inst0(io_debug_inst0);
+  core.io_debug_inst1(io_debug_inst1);
+  core.io_debug_inst2(io_debug_inst2);
+  core.io_debug_inst3(io_debug_inst3);
+  core.io_debug_cycles(io_debug_cycles);
+
+  mif.clock(tb.clock);
+  mif.reset(tb.reset);
+  mif.io_ibus_valid(io_ibus_valid);
+  mif.io_ibus_ready(io_ibus_ready);
+  mif.io_ibus_addr(io_ibus_addr);
+  mif.io_ibus_rdata(io_ibus_rdata);
+  mif.io_dbus_valid(io_dbus_valid);
+  mif.io_dbus_ready(io_dbus_ready);
+  mif.io_dbus_write(io_dbus_write);
+  mif.io_dbus_addr(io_dbus_addr);
+  mif.io_dbus_adrx(io_dbus_adrx);
+  mif.io_dbus_size(io_dbus_size);
+  mif.io_dbus_wdata(io_dbus_wdata);
+  mif.io_dbus_wmask(io_dbus_wmask);
+  mif.io_dbus_rdata(io_dbus_rdata);
+
+  dbg.clock(tb.clock);
+  dbg.reset(tb.reset);
+  dbg.io_slog_valid(io_slog_valid);
+  dbg.io_slog_addr(io_slog_addr);
+  dbg.io_slog_data(io_slog_data);
+
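+// Bind each AXI port to both the DUT and the memory model in one step.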
+#define BINDAXI(a) core.a(a); mif.a(a)
+  BINDAXI(io_axi0_write_addr_ready);
+  BINDAXI(io_axi0_write_addr_valid);
+  BINDAXI(io_axi0_write_addr_bits_addr);
+  BINDAXI(io_axi0_write_addr_bits_id);
+  BINDAXI(io_axi0_write_data_ready);
+  BINDAXI(io_axi0_write_data_valid);
+  BINDAXI(io_axi0_write_data_bits_data);
+  BINDAXI(io_axi0_write_data_bits_strb);
+  BINDAXI(io_axi0_write_resp_ready);
+  BINDAXI(io_axi0_write_resp_valid);
+  BINDAXI(io_axi0_write_resp_bits_id);
+  BINDAXI(io_axi0_write_resp_bits_resp);
+  BINDAXI(io_axi0_read_addr_ready);
+  BINDAXI(io_axi0_read_addr_valid);
+  BINDAXI(io_axi0_read_addr_bits_addr);
+  BINDAXI(io_axi0_read_addr_bits_id);
+  BINDAXI(io_axi0_read_data_ready);
+  BINDAXI(io_axi0_read_data_valid);
+  BINDAXI(io_axi0_read_data_bits_resp);
+  BINDAXI(io_axi0_read_data_bits_id);
+  BINDAXI(io_axi0_read_data_bits_data);
+  BINDAXI(io_axi1_write_addr_ready);
+  BINDAXI(io_axi1_write_addr_valid);
+  BINDAXI(io_axi1_write_addr_bits_addr);
+  BINDAXI(io_axi1_write_addr_bits_id);
+  BINDAXI(io_axi1_write_data_ready);
+  BINDAXI(io_axi1_write_data_valid);
+  BINDAXI(io_axi1_write_data_bits_data);
+  BINDAXI(io_axi1_write_data_bits_strb);
+  BINDAXI(io_axi1_write_resp_ready);
+  BINDAXI(io_axi1_write_resp_valid);
+  BINDAXI(io_axi1_write_resp_bits_id);
+  BINDAXI(io_axi1_write_resp_bits_resp);
+  BINDAXI(io_axi1_read_addr_ready);
+  BINDAXI(io_axi1_read_addr_valid);
+  BINDAXI(io_axi1_read_addr_bits_addr);
+  BINDAXI(io_axi1_read_addr_bits_id);
+  BINDAXI(io_axi1_read_data_ready);
+  BINDAXI(io_axi1_read_data_valid);
+  BINDAXI(io_axi1_read_data_bits_resp);
+  BINDAXI(io_axi1_read_data_bits_id);
+  BINDAXI(io_axi1_read_data_bits_data);
+
+  if (trace) {
+    tb.trace(core);
+  }
+
+  tb.start();
+}
+
+int sc_main(int argc, char *argv[]) {
+  if (argc <= 1) {
+    printf("Expected binary file argument\n");
+    return -1;
+  }
+
+  const char* path = argv[1];
+  Core_run(Sysc_tb::get_name(argv[0]), path, false);
+  return 0;
+}
diff --git a/tests/verilator_sim/kelvin/dbus2axi_tb.cc b/tests/verilator_sim/kelvin/dbus2axi_tb.cc
new file mode 100644
index 0000000..92855e7
--- /dev/null
+++ b/tests/verilator_sim/kelvin/dbus2axi_tb.cc
@@ -0,0 +1,343 @@
+#include "VDBus2Axi.h"
+#include "tests/verilator_sim/sysc_tb.h"
+
+struct DBus2Axi_tb : Sysc_tb {
+  sc_out<bool> io_dbus_valid;
+  sc_in<bool> io_dbus_ready;
+  sc_out<bool> io_dbus_write;
+  sc_out<bool> io_axi_write_addr_ready;
+  sc_in<bool> io_axi_write_addr_valid;
+  sc_out<bool> io_axi_write_data_ready;
+  sc_in<bool> io_axi_write_data_valid;
+  sc_in<bool> io_axi_write_resp_ready;
+  sc_out<bool> io_axi_write_resp_valid;
+  sc_out<bool> io_axi_read_addr_ready;
+  sc_in<bool> io_axi_read_addr_valid;
+  sc_in<bool> io_axi_read_data_ready;
+  sc_out<bool> io_axi_read_data_valid;
+  sc_out<sc_bv<32> > io_dbus_addr;
+  sc_out<sc_bv<32> > io_dbus_adrx;
+  sc_out<sc_bv<6> > io_dbus_size;
+  sc_out<sc_bv<256> > io_dbus_wdata;
+  sc_out<sc_bv<32> > io_dbus_wmask;
+  sc_in<sc_bv<256> > io_dbus_rdata;
+  sc_in<sc_bv<32> > io_axi_write_addr_bits_addr;
+  sc_in<sc_bv<6> > io_axi_write_addr_bits_id;
+  sc_in<sc_bv<256> > io_axi_write_data_bits_data;
+  sc_in<sc_bv<32> > io_axi_write_data_bits_strb;
+  sc_out<sc_bv<6> > io_axi_write_resp_bits_id;
+  sc_out<sc_bv<2> > io_axi_write_resp_bits_resp;
+  sc_in<sc_bv<32> > io_axi_read_addr_bits_addr;
+  sc_in<sc_bv<6> > io_axi_read_addr_bits_id;
+  sc_out<sc_bv<2> > io_axi_read_data_bits_resp;
+  sc_out<sc_bv<6> > io_axi_read_data_bits_id;
+  sc_out<sc_bv<256> > io_axi_read_data_bits_data;
+
+  using Sysc_tb::Sysc_tb;
+
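+  // Drive randomized dbus traffic, queue the expected AXI-side requests in
+  // reference FIFOs, and check every DUT handshake against them while
+  // returning random read data and write responses.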
+  void posedge() {
+    sc_bv<32> dbus_wmask;
+    sc_bv<256> dbus_wdata;
+    for (int i = 0; i < 8; ++i) dbus_wdata.set_word(i, rand_uint32());
+    dbus_wmask.set_word(0, rand_uint32());
+
+    if (!(io_dbus_valid && !io_dbus_ready && !io_dbus_write)) {
+      io_dbus_valid = rand_bool();
+      io_dbus_write = rand_bool();
+      io_dbus_addr = rand_uint32();
+      io_dbus_adrx = rand_uint32();
+      io_dbus_size = rand_uint32();
+      io_dbus_wdata = dbus_wdata;
+      io_dbus_wmask = dbus_wmask;
+    }
+
+    io_axi_read_addr_ready = rand_bool();
+
+    const bool write_ready = rand_bool();
+    io_axi_write_addr_ready = write_ready;
+    io_axi_write_data_ready = write_ready;
+
+    // *************************************************************************
+    // DBus Addr.
+    if (io_dbus_valid && !io_dbus_write && !dbus_read_ready_) {
+      dbus_read_ready_ = true;
+      axi_read_addr_t r;
+      r.addr = io_dbus_addr.read().get_word(0) & ~31;
+      r.id = 0x00;  // from RTL
+      axi_read_addr_.write(r);
+    }
+
+    if (io_dbus_valid && io_dbus_ready && io_dbus_write) {
+      axi_write_addr_t w;
+      w.addr = io_dbus_addr.read().get_word(0) & ~31;
+      w.id = 0x00;  // from RTL
+      w.strb = io_dbus_wmask;
+      w.data = io_dbus_wdata;
+      axi_write_addr_.write(w);
+    }
+
+    // *************************************************************************
+    // DBus Read Data.
+    if (dbus_read_active_) {
+      dbus_read_active_ = false;
+      dbus_read_data_t ref, dut;
+      check(dbus_read_data_.read(ref), "dbus read data");
+      dut.data = io_dbus_rdata;
+      if (ref != dut) {
+        ref.print("ref::dbus_read_addr");
+        dut.print("dut::dbus_read_addr");
+        check(false);
+      }
+    }
+
+    if (io_dbus_valid && io_dbus_ready && !io_dbus_write) {
+      dbus_read_ready_ = false;
+      dbus_read_active_ = true;
+    }
+
+    // *************************************************************************
+    // AXI Read Addr.
+    if (io_axi_read_addr_valid && io_axi_read_addr_ready) {
+      axi_read_addr_t dut, ref;
+      check(axi_read_addr_.read(ref), "axi read addr");
+      dut.addr = io_axi_read_addr_bits_addr.read().get_word(0);
+      dut.id = io_axi_read_addr_bits_id.read().get_word(0);
+      if (ref != dut) {
+        ref.print("ref::axi_read_addr");
+        dut.print("dut::axi_read_addr");
+        check(false);
+      }
+
+      sc_bv<256> data;
+      for (int i = 0; i < 8; ++i) data.set_word(i, rand_uint32());
+      axi_read_data_t raxi;
+      raxi.id = dut.id;
+      raxi.data = data;
+      raxi.resp = rand_int();
+      axi_read_data_.write(raxi);
+
+      dbus_read_data_t dbus;
+      dbus.data = data;
+      dbus_read_data_.write(dbus);
+    }
+
+    // *************************************************************************
+    // AXI Read Data.
+    if (io_axi_read_data_valid && io_axi_read_data_ready) {
+      check(axi_read_data_.remove(), "axi read data");
+    }
+
+    axi_read_data_t rdata;
+    bool read_data_valid = axi_read_data_.next(rdata);
+    io_axi_read_data_valid = read_data_valid && rand_bool();
+    io_axi_read_data_bits_id = rdata.id;
+    io_axi_read_data_bits_data = rdata.data;
+    io_axi_read_data_bits_resp = rdata.resp;
+
+    // *************************************************************************
+    // AXI Write Addr.
+    if (io_axi_write_addr_valid && io_axi_write_addr_ready) {
+      assert(io_axi_write_data_valid && io_axi_write_data_ready);
+      axi_write_addr_t dut, ref;
+      check(axi_write_addr_.read(ref), "axi write addr");
+      dut.addr = io_axi_write_addr_bits_addr.read().get_word(0);
+      dut.id = io_axi_write_addr_bits_id.read().get_word(0);
+      dut.data = io_axi_write_data_bits_data;
+      dut.strb = io_axi_write_data_bits_strb;
+      if (ref != dut) {
+        ref.print("ref::axi_write_addr");
+        dut.print("dut::axi_write_addr");
+        check(false);
+      }
+
+      axi_write_resp_t resp;
+      resp.id = dut.id;
+      resp.resp = rand_int();
+      axi_write_resp_.write(resp);
+    }
+
+    // *************************************************************************
+    // AXI Write Resp.
+    if (io_axi_write_resp_valid && io_axi_write_resp_ready) {
+      check(axi_write_resp_.remove(), "axi write resp");
+    }
+
+    axi_write_resp_t wresp;
+    bool write_resp_valid = axi_write_resp_.next(wresp);
+    io_axi_write_resp_valid = write_resp_valid;
+    io_axi_write_resp_bits_id = wresp.id;
+    io_axi_write_resp_bits_resp = wresp.resp;
+  }
+
+ private:
+   struct axi_read_addr_t {
+     uint32_t addr;
+     uint32_t id : 7;
+
+     bool operator!=(const axi_read_addr_t& rhs) const {
+       if (addr != rhs.addr) return true;
+       if (id != rhs.id) return true;
+       return false;
+     }
+
+     void print(const char* name) {
+       printf("[%s]: id=%x addr=%08x\n", name, id, addr);
+     }
+   };
+
+   struct axi_read_data_t {
+     uint32_t id : 7;
+     uint32_t resp : 7;
+     sc_bv<256> data;
+
+     bool operator!=(const axi_read_data_t& rhs) const {
+       if (id != rhs.id) return true;
+       if (data != rhs.data) return true;
+       return false;
+     }
+
+     void print(const char* name) {
+       printf("[%s]: id=%x data=", name, id);
+       for (int i = 0; i < 256 / 32; ++i) {
+         printf("%08x ", data.get_word(i));
+       }
+       printf("\n");
+     }
+   };
+
+   struct axi_write_addr_t {
+     uint32_t addr;
+     uint32_t id : 7;
+     sc_bv<256> data;
+     sc_bv<32> strb;
+
+     bool operator!=(const axi_write_addr_t& rhs) const {
+       if (addr != rhs.addr) return true;
+       if (id != rhs.id) return true;
+       if (strb != rhs.strb) return true;
+       if (data != rhs.data) return true;
+       return false;
+     }
+
+     void print(const char* name) {
+       printf("[%s]: id=%x addr=%08x strb=%08x data=", name, id, addr, strb.get_word(0));
+       for (int i = 0; i < 256 / 32; ++i) {
+         printf("%08x ", data.get_word(0));
+       }
+       printf("\n");
+     }
+   };
+
+   struct axi_write_resp_t {
+     uint32_t id : 7;
+     uint32_t resp : 2;
+   };
+
+   struct dbus_read_data_t {
+     sc_bv<256> data;
+
+     bool operator!=(const dbus_read_data_t& rhs) const {
+       if (data != rhs.data) return true;
+       return false;
+     }
+
+     void print(const char* name) {
+       printf("[%s]: data=", name);
+       for (int i = 0; i < 256 / 32; ++i) {
+         printf("%08x ", data.get_word(i));
+       }
+       printf("\n");
+     }
+   };
+
+   bool dbus_read_ready_ = false;
+   bool dbus_read_active_ = false;
+   fifo_t<axi_read_addr_t> axi_read_addr_;
+   fifo_t<axi_read_data_t> axi_read_data_;
+   fifo_t<axi_write_addr_t> axi_write_addr_;
+   fifo_t<axi_write_resp_t> axi_write_resp_;
+   fifo_t<dbus_read_data_t> dbus_read_data_;
+};
+
+static void DBus2Axi_test(char* name, int loops, bool trace) {
+  sc_signal<bool> io_dbus_valid;
+  sc_signal<bool> io_dbus_ready;
+  sc_signal<bool> io_dbus_write;
+  sc_signal<bool> io_axi_write_addr_ready;
+  sc_signal<bool> io_axi_write_addr_valid;
+  sc_signal<bool> io_axi_write_data_ready;
+  sc_signal<bool> io_axi_write_data_valid;
+  sc_signal<bool> io_axi_write_resp_ready;
+  sc_signal<bool> io_axi_write_resp_valid;
+  sc_signal<bool> io_axi_read_addr_ready;
+  sc_signal<bool> io_axi_read_addr_valid;
+  sc_signal<bool> io_axi_read_data_ready;
+  sc_signal<bool> io_axi_read_data_valid;
+  sc_signal<sc_bv<32> > io_dbus_addr;
+  sc_signal<sc_bv<32> > io_dbus_adrx;
+  sc_signal<sc_bv<6> > io_dbus_size;
+  sc_signal<sc_bv<256> > io_dbus_wdata;
+  sc_signal<sc_bv<32> > io_dbus_wmask;
+  sc_signal<sc_bv<256> > io_dbus_rdata;
+  sc_signal<sc_bv<32> > io_axi_write_addr_bits_addr;
+  sc_signal<sc_bv<6> > io_axi_write_addr_bits_id;
+  sc_signal<sc_bv<256> > io_axi_write_data_bits_data;
+  sc_signal<sc_bv<32> > io_axi_write_data_bits_strb;
+  sc_signal<sc_bv<6> > io_axi_write_resp_bits_id;
+  sc_signal<sc_bv<2> > io_axi_write_resp_bits_resp;
+  sc_signal<sc_bv<32> > io_axi_read_addr_bits_addr;
+  sc_signal<sc_bv<6> > io_axi_read_addr_bits_id;
+  sc_signal<sc_bv<2> > io_axi_read_data_bits_resp;
+  sc_signal<sc_bv<6> > io_axi_read_data_bits_id;
+  sc_signal<sc_bv<256> > io_axi_read_data_bits_data;
+
+  DBus2Axi_tb tb("DBus2Axi_tb", loops, true /*random*/);
+  VDBus2Axi d2a(name);
+
+  d2a.clock(tb.clock);
+  d2a.reset(tb.reset);
+  BIND2(tb, d2a, io_dbus_valid);
+  BIND2(tb, d2a, io_dbus_ready);
+  BIND2(tb, d2a, io_dbus_write);
+  BIND2(tb, d2a, io_axi_write_addr_ready);
+  BIND2(tb, d2a, io_axi_write_addr_valid);
+  BIND2(tb, d2a, io_axi_write_data_ready);
+  BIND2(tb, d2a, io_axi_write_data_valid);
+  BIND2(tb, d2a, io_axi_write_resp_ready);
+  BIND2(tb, d2a, io_axi_write_resp_valid);
+  BIND2(tb, d2a, io_axi_read_addr_ready);
+  BIND2(tb, d2a, io_axi_read_addr_valid);
+  BIND2(tb, d2a, io_axi_read_data_ready);
+  BIND2(tb, d2a, io_axi_read_data_valid);
+  BIND2(tb, d2a, io_dbus_addr);
+  BIND2(tb, d2a, io_dbus_adrx);
+  BIND2(tb, d2a, io_dbus_size);
+  BIND2(tb, d2a, io_dbus_wdata);
+  BIND2(tb, d2a, io_dbus_wmask);
+  BIND2(tb, d2a, io_dbus_rdata);
+  BIND2(tb, d2a, io_axi_write_addr_bits_addr);
+  BIND2(tb, d2a, io_axi_write_addr_bits_id);
+  BIND2(tb, d2a, io_axi_write_data_bits_data);
+  BIND2(tb, d2a, io_axi_write_data_bits_strb);
+  BIND2(tb, d2a, io_axi_write_resp_bits_id);
+  BIND2(tb, d2a, io_axi_write_resp_bits_resp);
+  BIND2(tb, d2a, io_axi_read_addr_bits_addr);
+  BIND2(tb, d2a, io_axi_read_addr_bits_id);
+  BIND2(tb, d2a, io_axi_read_data_bits_resp);
+  BIND2(tb, d2a, io_axi_read_data_bits_id);
+  BIND2(tb, d2a, io_axi_read_data_bits_data);
+
+  if (trace) {
+    tb.trace(d2a);
+  }
+
+  tb.start();
+}
+
+int sc_main(int argc, char* argv[]) {
+  DBus2Axi_test(Sysc_tb::get_name(argv[0]), 1000000, false);
+  return 0;
+}
diff --git a/tests/verilator_sim/kelvin/debug_if.h b/tests/verilator_sim/kelvin/debug_if.h
new file mode 100644
index 0000000..e35a27a
--- /dev/null
+++ b/tests/verilator_sim/kelvin/debug_if.h
@@ -0,0 +1,155 @@
+#ifndef TESTS_VERILATOR_SIM_KELVIN_DEBUG_IF_H_
+#define TESTS_VERILATOR_SIM_KELVIN_DEBUG_IF_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include <string>
+
+#include "tests/verilator_sim/sysc_module.h"
+#include "tests/verilator_sim/kelvin/memory_if.h"
+
+// A core debug model.
+struct Debug_if : Sysc_module {
+  sc_in<bool>       io_slog_valid;
+  sc_in<sc_bv<5> >  io_slog_addr;
+  sc_in<sc_bv<32> > io_slog_data;
+
+  Debug_if(sc_module_name n, Memory_if* mm) : Sysc_module(n), mm_(mm) {
+    gettimeofday(&start_, NULL);
+  }
+
+  ~Debug_if() {
+    gettimeofday(&stop_, NULL);
+    const float time_s =
+        static_cast<float>(stop_.tv_sec - start_.tv_sec) +
+        static_cast<float>(stop_.tv_usec - start_.tv_usec) / 1000000.0f;
+
+    // Integer with commas.
+    auto s = std::to_string(cycle_);
+    int n = s.length() - 3;
+    while (n > 0) {
+      s.insert(n, ",");
+      n -= 3;
+    }
+
+    printf("Info: %s cycles  @%.2fK/s\n", s.c_str(), cycle_ / time_s / 1000.0f);
+  }
+
+  void eval() {
+    if (reset) {
+      cycle_ = 0;
+    } else if (clock->posedge()) {
+      cycle_++;
+      if (io_slog_valid) {
+        Slog(io_slog_addr.read().get_word(0), io_slog_data.read().get_word(0));
+      }
+    }
+  }
+
+private:
+#ifndef TIME_DISABLE
+  const char* KNRM = "\x1B[0m";
+  const char* KRED = "\x1B[31m";
+  const char* KGRN = "\x1B[32m";
+  const char* KYEL = "\x1B[33m";
+  const char* KBLU = "\x1B[34m";
+  const char* KMAG = "\x1B[35m";
+  const char* KCYN = "\x1B[36m";
+  const char* KWHT = "\x1B[37m";
+  const char* KRST = "\033[0m";
+#endif  // TIME_DISABLE
+
+  static const int ARGMAX = 16;
+  static const int BUFFERLIMIT = 100;
+  int argpos_ = 0;
+  uint64_t arg_[ARGMAX];
+  uint8_t str_[ARGMAX][BUFFERLIMIT];
+  uint8_t pos_[ARGMAX] = {0};
+
+  struct timeval stop_, start_;
+
+  Memory_if* mm_;
+
+  bool newline_ = false;
+  int cycle_ = 0;
+
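+  // Slog command protocol as implemented below:
+  //   FLOG: `data` points at a printf-style format string in memory; print it
+  //         using the arguments accumulated so far, then reset the argument state.
+  //   SLOG: push `data` as the next 32-bit scalar argument.
+  //   CLOG: append the four bytes of `data` to the current string argument,
+  //         terminating it (and advancing) when a NUL byte is seen.
+  //   KLOG: `data` points at a NUL-terminated string in memory; read it into
+  //         the next string argument.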
+  void Slog(const uint8_t cmd, const uint32_t data) {
+    constexpr int FLOG = 0;
+    constexpr int SLOG = 1;
+    constexpr int CLOG = 2;
+    constexpr int KLOG = 3;
+
+    if (cmd == FLOG) {
+      char buf[BUFFERLIMIT];
+      char sbuf[ARGMAX * BUFFERLIMIT];
+
+      mm_->Read(data, BUFFERLIMIT, (uint8_t*) buf);
+      buf[sizeof(buf) - 1] = '\0';
+
+      sprintf(sbuf, buf, arg_[0], arg_[1], arg_[2], arg_[3],
+              arg_[4], arg_[5], arg_[6], arg_[7],
+              arg_[8], arg_[9], arg_[10], arg_[11],
+              arg_[12], arg_[13], arg_[14], arg_[15]);  // ARGMAX
+
+      int len = strlen(sbuf);
+#ifndef TIME_DISABLE
+      printf("%s", KGRN);
+#endif  // TIME_DISABLE
+      for (int i = 0; i < len; ++i) {
+        if (!newline_) {
+          newline_ = true;
+#ifndef TIME_DISABLE
+          printf("%s[%7d] %s", KCYN, cycle_, KGRN);
+#endif  // TIME_DISABLE
+        }
+        const char ch = sbuf[i];
+        putc(ch, stdout);
+        if (ch == '\n') {
+          newline_ = false;
+          fflush(stdout);
+        }
+      }
+#ifndef TIME_DISABLE
+      printf("%s", KRST);
+#endif  // TIME_DISABLE
+
+      memset(pos_, 0, sizeof(pos_));
+      argpos_ = 0;
+      return;
+    }
+
+    assert(argpos_ < ARGMAX);
+
+    if (cmd == SLOG) {
+      arg_[argpos_] = data;
+      argpos_++;
+    } else if (cmd == CLOG) {
+      arg_[argpos_] = (uint64_t) str_[argpos_];
+      const uint8_t *ptr = (const uint8_t*) &data;
+      uint8_t *buf = str_[argpos_];
+      for (int i = 0; i < 4; ++i) {
+        const int p = pos_[argpos_]++;
+        const char c = ptr[i];
+        assert(p + 1 < BUFFERLIMIT);
+        buf[p] = c;
+        buf[p + 1] = '\0';
+        if (!c) {
+          argpos_++;
+          break;
+        }
+      }
+    } else if (cmd == KLOG) {
+      arg_[argpos_] = (uint64_t) str_[argpos_];
+      uint8_t *buf = str_[argpos_];
+      mm_->Read(data, BUFFERLIMIT, buf);
+      buf[BUFFERLIMIT - 1] = '\0';  // Ensure termination, matching the FLOG path.
+      argpos_++;
+    } else {
+      printf("\n**error: RV32L SLOG unknown cmd=%d\n", cmd);
+      exit(-1);
+    }
+  }
+};
+
+#endif  // TESTS_VERILATOR_SIM_KELVIN_DEBUG_IF_H_
diff --git a/tests/verilator_sim/kelvin/kelvin_cfg.h b/tests/verilator_sim/kelvin/kelvin_cfg.h
new file mode 100644
index 0000000..ef2a2b5
--- /dev/null
+++ b/tests/verilator_sim/kelvin/kelvin_cfg.h
@@ -0,0 +1,66 @@
+#ifndef TESTS_VERILATOR_SIM_KELVIN_KELVIN_CFG_H_
+#define TESTS_VERILATOR_SIM_KELVIN_KELVIN_CFG_H_
+
+#include <stdint.h>
+
+#ifndef KELVIN_SIMD
+#error KELVIN_SIMD must be defined in Environment or Makefile.
+#elif KELVIN_SIMD == 128
+constexpr int kVector = 128;
+#elif KELVIN_SIMD == 256
+constexpr int kVector = 256;
+#elif KELVIN_SIMD == 512
+constexpr int kVector = 512;
+#else
+#error KELVIN_SIMD unsupported configuration.
+#endif
+
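+// Compile-time count-of-trailing-zeros for the power-of-two sizes used below,
+// e.g. ctz(32) == 5. Returns -1 for unsupported values.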
+constexpr int ctz(int a) {
+  if (a == 1) return 0;
+  if (a == 2) return 1;
+  if (a == 4) return 2;
+  if (a == 8) return 3;
+  if (a == 16) return 4;
+  if (a == 32) return 5;
+  if (a == 64) return 6;
+  if (a == 128) return 7;
+  if (a == 256) return 8;
+  return -1;
+}
+
+// ISS defines.
+constexpr uint32_t VLENB = kVector / 8;
+constexpr uint32_t VLENH = kVector / 16;
+constexpr uint32_t VLENW = kVector / 32;
+constexpr uint32_t SM = 4;
+
+constexpr int kDbusBits = ctz(kVector / 8) + 1;
+constexpr int kVlenBits = ctz(kVector / 8) + 1 + 2;
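+// e.g. with kVector == 256: VLENB == 32, so kDbusBits == 6 (wide enough to
+// encode a transfer size of 0..32 bytes) and kVlenBits == 8.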
+
+// [External] System AXI.
+constexpr int kAxiBits = 256;
+constexpr int kAxiStrb = kAxiBits / 8;
+constexpr int kAxiId = 7;
+
+// [Internal] L1I AXI.
+constexpr int kL1IAxiBits = 256;
+constexpr int kL1IAxiStrb = kL1IAxiBits / 8;
+constexpr int kL1IAxiId = 4;
+
+// [Internal] L1D AXI.
+constexpr int kL1DAxiBits = 256;
+constexpr int kL1DAxiStrb = kL1DAxiBits / 8;
+constexpr int kL1DAxiId = 4;
+
+// [Internal] Uncached AXI[Vector,Scalar].
+constexpr int kUncBits = kVector;
+constexpr int kUncStrb = kVector / 8;
+constexpr int kUncId = 6;
+
+// Transaction is uncached (and bus width aligned).
+static uint8_t is_uncached(const uint32_t addr) {
+  // bit31==1 (0x80000000)
+  return (addr & (1u << 31)) != 0;
+}
+
+constexpr int kAlignedLsb = ctz(kVector / 8);
+
+#endif  // TESTS_VERILATOR_SIM_KELVIN_KELVIN_CFG_H_
diff --git a/tests/verilator_sim/kelvin/l1dcache_tb.cc b/tests/verilator_sim/kelvin/l1dcache_tb.cc
new file mode 100644
index 0000000..299aedf
--- /dev/null
+++ b/tests/verilator_sim/kelvin/l1dcache_tb.cc
@@ -0,0 +1,510 @@
+#include "tests/verilator_sim/sysc_tb.h"
+
+#ifndef L1DCACHEBANK
+#include "VL1DCache.h"
+constexpr int kDBusBankAdj = 0;
+#else
+constexpr int kDBusBankAdj = 1;
+#endif
+
+#include "tests/verilator_sim/kelvin/kelvin_cfg.h"
+
+constexpr int kLineSize = kVector / 8;
+constexpr int kLineBase = ~(kLineSize - 1);
+constexpr int kLineOffset = kLineSize - 1;
+
+struct L1DCache_tb : Sysc_tb
+{
+  sc_out<bool> io_flush_valid;
+  sc_in<bool>  io_flush_ready;
+  sc_out<bool> io_flush_all;
+  sc_out<bool> io_flush_clean;
+
+  sc_out<bool> io_dbus_valid;
+  sc_in<bool> io_dbus_ready;
+  sc_out<bool> io_dbus_write;
+  sc_out<sc_bv<kDbusBits> > io_dbus_size;
+  sc_out<sc_bv<32 - kDBusBankAdj> > io_dbus_addr;
+  sc_out<sc_bv<32 - kDBusBankAdj> > io_dbus_adrx;
+  sc_in<sc_bv<kVector> > io_dbus_rdata;
+  sc_out<sc_bv<kVector> > io_dbus_wdata;
+  sc_out<sc_bv<kVector / 8> > io_dbus_wmask;
+
+  sc_in<bool> io_axi_read_addr_valid;
+  sc_out<bool> io_axi_read_addr_ready;
+  sc_in<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_read_addr_bits_id;
+  sc_in<sc_bv<32 - kDBusBankAdj> > io_axi_read_addr_bits_addr;
+
+  sc_out<bool> io_axi_read_data_valid;
+  sc_in<bool> io_axi_read_data_ready;
+  sc_out<sc_bv<2> > io_axi_read_data_bits_resp;
+  sc_out<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_read_data_bits_id;
+  sc_out<sc_bv<kL1DAxiBits> > io_axi_read_data_bits_data;
+
+  sc_in<bool> io_axi_write_addr_valid;
+  sc_out<bool> io_axi_write_addr_ready;
+  sc_in<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_write_addr_bits_id;
+  sc_in<sc_bv<32 - kDBusBankAdj> > io_axi_write_addr_bits_addr;
+
+  sc_in<bool> io_axi_write_data_valid;
+  sc_out<bool> io_axi_write_data_ready;
+  sc_in<sc_bv<kL1DAxiStrb> > io_axi_write_data_bits_strb;
+  sc_in<sc_bv<kL1DAxiBits> > io_axi_write_data_bits_data;
+
+  sc_out<bool> io_axi_write_resp_valid;
+  sc_in<bool> io_axi_write_resp_ready;
+  sc_out<sc_bv<2> > io_axi_write_resp_bits_resp;
+  sc_out<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_write_resp_bits_id;
+
+  using Sysc_tb::Sysc_tb;
+
+  void posedge() {
+    // dbus
+#ifdef L1DCACHEBANK
+    // Checks a bank cache line.
+    if (dbus_resp_pipeline_) {
+      dbus_resp_pipeline_ = false;
+      uint32_t addr = dbus_resp_addr_;
+      int size = dbus_resp_size_;
+      for (int i = 0; i < vlenb_ && size; ++i) {
+        uint8_t ref = dbus_resp_data_[i];
+        uint8_t dut = io_dbus_rdata.read().get_word(i / 4) >> (8 * (i % 4));
+        if (ref != dut) {
+          printf("DDD(%d) %08x : %02x %02x\n", i, (addr & ~(vlenb_ - 1)) + i, ref, dut);
+        }
+        check(ref == dut, "dbus read data");
+      }
+    }
+#else
+    if (dbus_resp_pipeline_) {
+      dbus_resp_pipeline_ = false;
+      uint32_t addr = dbus_resp_addr_;
+      int size = dbus_resp_size_;
+      for (int j = addr; j < addr + size; ++j) {
+        int i = j & (vlenb_ - 1);
+        uint8_t ref = dbus_resp_data_[i];
+        uint8_t dut = io_dbus_rdata.read().get_word(i / 4) >> (8 * (i % 4));
+        check(ref == dut, "dbus read data");
+      }
+    }
+#endif
+
+    if (io_dbus_valid && io_dbus_ready && !io_dbus_write) {
+      dbus_active_ = false;
+      dbus_resp_pipeline_ = true;
+      dbus_resp_addr_ = io_dbus_addr.read().get_word(0);
+      dbus_resp_size_ = io_dbus_size.read().get_word(0);
+#ifdef L1DCACHEBANK
+      ReadBus(dbus_resp_addr_ & kLineBase, vlenb_, dbus_resp_data_);
+#else
+      ReadBus(dbus_resp_addr_, vlenb_, dbus_resp_data_);
+#endif
+      history_t cmd({dbus_resp_addr_});
+      history_.write(cmd);
+      if (history_.count() > 16) {
+        history_.remove();
+      }
+    }
+
+    if (io_dbus_valid && io_dbus_ready && io_dbus_write) {
+      dbus_active_ = false;
+
+      uint32_t addr = io_dbus_addr.read().get_word(0);
+      int size = io_dbus_size.read().get_word(0);
+      uint8_t wdata[vlenb_];
+      uint32_t* p_wdata = (uint32_t*) wdata;
+      for (int i = 0; i < vlenw_; ++i) {
+        p_wdata[i] = io_dbus_wdata.read().get_word(i);
+      }
+      const uint32_t linemask = vlenb_ - 1;
+      const uint32_t linebase = addr & ~linemask;
+      for (int i = 0; i < size; ++i, ++addr) {
+        const uint32_t lineoffset = addr & linemask;
+        if (io_dbus_wmask.read().get_bit(lineoffset)) {
+#ifdef L1DCACHEBANK
+          WriteBus(linebase + lineoffset, wdata[lineoffset]);
+#else
+          WriteBus(addr, wdata[lineoffset]);
+#endif
+        }
+      }
+    }
+
+    if (io_flush_valid && io_flush_ready) {
+      flush_valid_ = false;
+      flush_all_ = false;
+      flush_clean_ = false;
+    }
+
+    if (++flush_count_ > 5000 && !dbus_active_ && !flush_valid_) {
+      // Flush controls must not change during handshake.
+      flush_count_ = 0;
+      flush_valid_ = true;
+      flush_all_   = rand_bool();
+      flush_clean_ = rand_bool();
+    }
+
+    io_flush_valid = flush_valid_;
+    io_flush_all   = flush_all_;
+    io_flush_clean = flush_clean_;
+
+    history_t dbus;
+    if (!io_dbus_valid || !dbus_active_) {  // latch transaction
+      bool valid = rand_bool() && !flush_valid_;
+      bool write = rand_int(0, 3) == 0;
+      bool newaddr = rand_int(0, 3) == 0 || !history_.rand(dbus);
+      uint32_t addr = newaddr ? rand_uint32() : (dbus.addr + rand_int(-vlenb_, vlenb_));
+      addr = std::min(0xffffff00u, addr);  // TODO: avoids a raxi() crash.
+      if (kDBusBankAdj) {
+        addr &= 0x7fffffff;
+      }
+      if (rand_int(0, 7) == 0) {
+        addr &= 0x3fff;
+      }
+#ifdef L1DCACHEBANK
+      int size = rand_int(1, vlenb_);
+#else
+      int size = rand_int(0, vlenb_);
+#endif
+      io_dbus_valid = valid;
+      io_dbus_write = write;
+      io_dbus_addr = addr;
+      io_dbus_adrx = addr + vlenb_;
+      io_dbus_size = size;
+      if (valid) {
+        dbus_active_ = true;
+        CheckAddr(addr, size);
+      }
+
+      sc_bv<kVector> wdata = 0;
+      sc_bv<kVector / 8> wmask = 0;
+
+      if (write) {
+        for (int i = 0; i < vlenw_; ++i) {
+          wdata.set_word(i, rand_uint32());
+        }
+        const uint32_t linemask = vlenb_ - 1;
+        const uint32_t lineoffset = addr & linemask;
+        const bool all = rand_bool();
+        for (int i = 0; i < size; ++i) {
+          if (all || rand_bool()) {
+            wmask.set_bit((i + lineoffset) & linemask, sc_dt::Log_1);
+          }
+        }
+      }
+
+      io_dbus_wdata.write(wdata);
+      io_dbus_wmask.write(wmask);
+    }
+
+    timeout_ = io_dbus_ready ? 0 : timeout_ + io_dbus_valid;
+    check(timeout_ < 10000, "dbus timeout");
+
+    // axi_read_addr
+    io_axi_read_addr_ready = rand_bool();
+
+    if (io_axi_read_addr_valid && io_axi_read_addr_ready) {
+      uint32_t id = io_axi_read_addr_bits_id.read().get_word(0);
+      uint32_t addr = io_axi_read_addr_bits_addr.read().get_word(0);
+      response_t resp({id, addr});
+      resp_.write(resp);
+    }
+
+    // axi_read_data
+    io_axi_read_data_valid = false;
+    io_axi_read_data_bits_id = 0;
+    io_axi_read_data_bits_data = 0;
+
+    if (io_axi_read_data_valid && io_axi_read_data_ready) {
+      check(resp_.remove(), "no response to erase");
+    }
+
+    response_t resp;
+    resp_.shuffle();
+    if (resp_.next(resp)) {
+      io_axi_read_data_valid = rand_bool();
+      io_axi_read_data_bits_id = resp.id;
+      uint32_t addr = resp.addr;
+      sc_bv<kL1DAxiBits> out;
+      for (int i = 0; i < axiw_; ++i) {
+        uint32_t data;
+        ReadAxi(addr, 4, (uint8_t*) &data);
+        out.set_word(i, data);
+        addr += 4;
+      }
+      io_axi_read_data_bits_data = out;
+    }
+
+    // axi_write_addr
+    bool writedataready = rand_bool();
+
+    io_axi_write_addr_ready = writedataready;
+
+    if (io_axi_write_addr_valid && io_axi_write_addr_ready) {
+      axiwaddr_t p;
+      p.id = io_axi_write_addr_bits_id.read().get_word(0);
+      p.addr = io_axi_write_addr_bits_addr.read().get_word(0);
+      waddr_.write(p);
+    }
+
+    // axi_write_data
+    io_axi_write_data_ready = writedataready;
+
+    if (io_axi_write_data_valid && io_axi_write_data_ready) {
+      axiwdata_t p;
+      uint32_t* ptr = (uint32_t*) p.data;
+      for (int i = 0; i < axiw_; ++i, ++ptr) {
+        ptr[0] = io_axi_write_data_bits_data.read().get_word(i);
+      }
+      for (int i = 0; i < axib_; ++i) {
+        p.mask[i] = io_axi_write_data_bits_strb.read().get_bit(i);
+      }
+      wdata_.write(p);
+    }
+
+    // axi_write_resp
+    if (io_axi_write_resp_valid && io_axi_write_resp_ready) {
+      wresp_.remove();
+    }
+
+    axiwaddr_t wr;
+    io_axi_write_resp_valid = rand_int(0, 4) == 0 && wresp_.next(wr);
+    io_axi_write_resp_bits_id = wr.id;
+
+    // Process axi data write, and populate response.
+    axiwaddr_t wa;
+    axiwdata_t wd;
+    if (waddr_.next(wa) && wdata_.next(wd)) {
+      waddr_.remove();
+      wdata_.remove();
+      wresp_.write(wa);
+
+      uint32_t addr = wa.addr;
+      for (int i = 0; i < axib_; ++i, ++addr) {
+        if (wd.mask[i]) {
+          WriteAxi(addr, wd.data[i]);
+        }
+      }
+    }
+  }
+
+private:
+  struct history_t {
+    uint32_t addr;
+  };
+
+  struct response_t {
+    uint32_t id;
+    uint32_t addr;
+  };
+
+  struct axiwaddr_t {
+    uint32_t id;
+    uint32_t addr;
+  };
+
+  struct axiwdata_t {
+    uint8_t data[kL1DAxiBits / 8];
+    bool    mask[kL1DAxiBits / 8];
+  };
+
+  const int vlenb_ = kVector / 8;
+  const int vlenw_ = kVector / 32;
+  const int axib_ = kL1DAxiBits / 8;
+  const int axiw_ = kL1DAxiBits / 32;
+
+  int timeout_ = 0;
+  int flush_count_ = 0;
+  bool flush_valid_ = false;
+  bool flush_all_ = false;
+  bool flush_clean_ = false;
+
+  bool dbus_active_ = false;
+  bool dbus_resp_pipeline_ = false;
+  uint32_t dbus_resp_addr_;
+  uint32_t dbus_resp_size_;
+  uint8_t dbus_resp_data_[kVector / 8];
+  fifo_t<response_t> resp_;
+  fifo_t<history_t> history_;
+  fifo_t<axiwaddr_t> waddr_;
+  fifo_t<axiwdata_t> wdata_;
+  fifo_t<axiwaddr_t> wresp_;
+
+private:
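+  // Two mirrored reference memories: mem_bus_ tracks the dbus-visible contents
+  // (including writes the cache may still hold dirty), while mem_axi_ tracks
+  // the memory behind the AXI port and is updated only by the DUT's
+  // write-backs. _CheckAddr() lazily seeds both with the same random line.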
+  std::map<uint32_t, uint8_t[kLineSize]> mem_bus_;
+  std::map<uint32_t, uint8_t[kLineSize]> mem_axi_;
+
+  void _CheckAddr(uint32_t addr, uint8_t size) {
+    const uint32_t paddr = addr & kLineBase;
+    if (mem_bus_.find(paddr) == mem_bus_.end()) {
+      uint8_t data[kLineSize];
+      uint32_t* p_data = (uint32_t*) data;
+      for (int i = 0; i < kLineSize / 4; ++i) {
+        p_data[i] = rand();
+        // p_data[i] = paddr + 4 * i;  // debug
+      }
+      memcpy(mem_bus_[paddr], data, kLineSize);
+      memcpy(mem_axi_[paddr], data, kLineSize);
+    }
+  }
+
+  void CheckAddr(uint32_t addr, uint8_t size) {
+    _CheckAddr(addr, size);
+    // if ((addr & kLineBase) == ((addr + size) & kLineBase)) return;
+    _CheckAddr(addr + kLineSize, size);
+  }
+
+  template<int outsz>
+  void _Read(uint32_t addr, uint8_t size, uint8_t* data,
+             std::map<uint32_t, uint8_t[kLineSize]>& m) {
+    const uint32_t laddr = addr & kLineBase;
+    const uint32_t loffset = addr & kLineOffset;
+    const uint32_t doffset = addr & (outsz - 1);
+    uint32_t start = addr;
+    uint32_t end = std::min(addr + size, laddr + kLineSize);
+    int size0 = end - start;
+    int size1 = size - size0;
+
+    memset(data, 0xCC, outsz);
+#ifdef L1DCACHEBANK
+    assert(doffset == 0);
+    memcpy(data + doffset, m[laddr] + loffset, outsz);
+#else
+    memcpy(data + doffset, m[laddr] + loffset, size0);
+    if (!size1) return;
+    memcpy(data, m[laddr + kLineSize], size1);
+#endif
+  }
+
+  void _Write(uint32_t addr, uint8_t data,
+              std::map<uint32_t, uint8_t[kLineSize]>& m) {
+    const uint32_t laddr = addr & kLineBase;
+    const uint32_t loffset = addr & kLineOffset;
+
+    m[laddr][loffset] = data;
+  }
+
+  void ReadBus(uint32_t addr, uint8_t size, uint8_t* data) {
+    _Read<kVector / 8>(addr, size, data, mem_bus_);
+  }
+
+  void ReadAxi(uint32_t addr, uint8_t size, uint8_t* data) {
+    _Read<4>(addr, size, data, mem_axi_);
+  }
+
+  void WriteBus(uint32_t addr, uint8_t data) {
+    _Write(addr, data, mem_bus_);
+  }
+
+  void WriteAxi(uint32_t addr, uint8_t data) {
+    _Write(addr, data, mem_axi_);
+  }
+};
+
+static void L1DCache_test(char* name, int loops, bool trace) {
+  sc_signal<bool> clock;
+  sc_signal<bool> reset;
+
+  sc_signal<bool> io_flush_valid;
+  sc_signal<bool> io_flush_ready;
+  sc_signal<bool> io_flush_all;
+  sc_signal<bool> io_flush_clean;
+
+  sc_signal<bool> io_dbus_valid;
+  sc_signal<bool> io_dbus_ready;
+  sc_signal<bool> io_dbus_write;
+  sc_signal<sc_bv<kDbusBits> > io_dbus_size;
+  sc_signal<sc_bv<32 - kDBusBankAdj> > io_dbus_addr;
+  sc_signal<sc_bv<32 - kDBusBankAdj> > io_dbus_adrx;
+  sc_signal<sc_bv<kVector> > io_dbus_rdata;
+  sc_signal<sc_bv<kVector> > io_dbus_wdata;
+  sc_signal<sc_bv<kVector / 8> > io_dbus_wmask;
+
+  sc_signal<bool> io_axi_read_addr_valid;
+  sc_signal<bool> io_axi_read_addr_ready;
+  sc_signal<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_read_addr_bits_id;
+  sc_signal<sc_bv<32 - kDBusBankAdj> > io_axi_read_addr_bits_addr;
+
+  sc_signal<bool> io_axi_read_data_valid;
+  sc_signal<bool> io_axi_read_data_ready;
+  sc_signal<sc_bv<2> > io_axi_read_data_bits_resp;
+  sc_signal<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_read_data_bits_id;
+  sc_signal<sc_bv<kL1DAxiBits> > io_axi_read_data_bits_data;
+
+  sc_signal<bool> io_axi_write_addr_valid;
+  sc_signal<bool> io_axi_write_addr_ready;
+  sc_signal<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_write_addr_bits_id;
+  sc_signal<sc_bv<32 - kDBusBankAdj> > io_axi_write_addr_bits_addr;
+
+  sc_signal<bool> io_axi_write_data_valid;
+  sc_signal<bool> io_axi_write_data_ready;
+  sc_signal<sc_bv<kL1DAxiStrb> > io_axi_write_data_bits_strb;
+  sc_signal<sc_bv<kL1DAxiBits> > io_axi_write_data_bits_data;
+
+  sc_signal<bool> io_axi_write_resp_valid;
+  sc_signal<bool> io_axi_write_resp_ready;
+  sc_signal<sc_bv<2> > io_axi_write_resp_bits_resp;
+  sc_signal<sc_bv<kL1DAxiId - kDBusBankAdj> > io_axi_write_resp_bits_id;
+
+  L1DCache_tb tb("L1DCache_tb", loops, true /*random*/);
+#ifdef L1DCACHEBANK
+  VL1DCacheBank l1dcache(name);
+#else
+  VL1DCache l1dcache(name);
+#endif
+
+  if (trace) {
+    tb.trace(l1dcache);
+  }
+
+  l1dcache.clock(tb.clock);
+  l1dcache.reset(tb.reset);
+
+  BIND2(tb, l1dcache, io_flush_valid);
+  BIND2(tb, l1dcache, io_flush_ready);
+  BIND2(tb, l1dcache, io_flush_all);
+  BIND2(tb, l1dcache, io_flush_clean);
+
+  BIND2(tb, l1dcache, io_dbus_valid);
+  BIND2(tb, l1dcache, io_dbus_ready);
+  BIND2(tb, l1dcache, io_dbus_write);
+  BIND2(tb, l1dcache, io_dbus_size);
+  BIND2(tb, l1dcache, io_dbus_addr);
+  BIND2(tb, l1dcache, io_dbus_adrx);
+  BIND2(tb, l1dcache, io_dbus_rdata);
+  BIND2(tb, l1dcache, io_dbus_wdata);
+  BIND2(tb, l1dcache, io_dbus_wmask);
+
+  BIND2(tb, l1dcache, io_axi_read_addr_valid);
+  BIND2(tb, l1dcache, io_axi_read_addr_ready);
+  BIND2(tb, l1dcache, io_axi_read_addr_bits_id);
+  BIND2(tb, l1dcache, io_axi_read_addr_bits_addr);
+
+  BIND2(tb, l1dcache, io_axi_read_data_valid);
+  BIND2(tb, l1dcache, io_axi_read_data_ready);
+  BIND2(tb, l1dcache, io_axi_read_data_bits_resp);
+  BIND2(tb, l1dcache, io_axi_read_data_bits_id);
+  BIND2(tb, l1dcache, io_axi_read_data_bits_data);
+
+  BIND2(tb, l1dcache, io_axi_write_addr_valid);
+  BIND2(tb, l1dcache, io_axi_write_addr_ready);
+  BIND2(tb, l1dcache, io_axi_write_addr_bits_id);
+  BIND2(tb, l1dcache, io_axi_write_addr_bits_addr);
+
+  BIND2(tb, l1dcache, io_axi_write_data_valid);
+  BIND2(tb, l1dcache, io_axi_write_data_ready);
+  BIND2(tb, l1dcache, io_axi_write_data_bits_strb);
+  BIND2(tb, l1dcache, io_axi_write_data_bits_data);
+
+  BIND2(tb, l1dcache, io_axi_write_resp_valid);
+  BIND2(tb, l1dcache, io_axi_write_resp_ready);
+  BIND2(tb, l1dcache, io_axi_write_resp_bits_resp);
+  BIND2(tb, l1dcache, io_axi_write_resp_bits_id);
+
+  tb.start();
+}
+
+int sc_main(int argc, char *argv[]) {
+  L1DCache_test(Sysc_tb::get_name(argv[0]), 1000000, false);
+  return 0;
+}
diff --git a/tests/verilator_sim/kelvin/l1dcachebank_tb.cc b/tests/verilator_sim/kelvin/l1dcachebank_tb.cc
new file mode 100644
index 0000000..4328333
--- /dev/null
+++ b/tests/verilator_sim/kelvin/l1dcachebank_tb.cc
@@ -0,0 +1,5 @@
+#include "VL1DCacheBank.h"
+
+#define L1DCACHEBANK
+
+#include "l1dcache_tb.cc"
diff --git a/tests/verilator_sim/kelvin/l1icache_tb.cc b/tests/verilator_sim/kelvin/l1icache_tb.cc
new file mode 100644
index 0000000..ced4d78
--- /dev/null
+++ b/tests/verilator_sim/kelvin/l1icache_tb.cc
@@ -0,0 +1,171 @@
+#include "tests/verilator_sim/sysc_tb.h"
+
+#include "VL1ICache.h"
+
+#include "tests/verilator_sim/kelvin/kelvin_cfg.h"
+
+struct L1ICache_tb : Sysc_tb
+{
+  sc_out<bool> io_flush_valid;
+  sc_in<bool> io_flush_ready;
+  sc_out<bool> io_ibus_valid;
+  sc_in<bool> io_ibus_ready;
+  sc_out<sc_bv<32> > io_ibus_addr;
+  sc_in<sc_bv<kL1IAxiBits> > io_ibus_rdata;
+  sc_in<bool> io_axi_read_addr_valid;
+  sc_out<bool> io_axi_read_addr_ready;
+  sc_in<sc_bv<kL1IAxiId> > io_axi_read_addr_bits_id;
+  sc_in<sc_bv<32> > io_axi_read_addr_bits_addr;
+  sc_out<bool> io_axi_read_data_valid;
+  sc_in<bool> io_axi_read_data_ready;
+  sc_out<sc_bv<2> > io_axi_read_data_bits_resp;
+  sc_out<sc_bv<kL1IAxiId> > io_axi_read_data_bits_id;
+  sc_out<sc_bv<kL1IAxiBits> > io_axi_read_data_bits_data;
+
+  using Sysc_tb::Sysc_tb;
+
+  void posedge() {
+    // flush
+    io_flush_valid = rand_int(0, 255) == 0;
+
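+    // The AXI read stub below returns the captured request address as the data
+    // pattern (incrementing by 4 per word), so the ibus read data is expected
+    // to equal the line-aligned address plus 4 * the word index.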
+    // ibus
+    if (ibus_resp_pipeline_) {
+      ibus_resp_pipeline_ = false;
+      for (int i = 0; i < ibusw_; ++i) {
+        uint32_t ref = ibus_resp_data_ + i * 4;
+        uint32_t dut = io_ibus_rdata.read().get_word(i);
+        check(ref == dut, "ibus read data");
+      }
+    }
+
+    if (io_ibus_valid && io_ibus_ready) {
+      ibus_resp_pipeline_ = true;
+      ibus_resp_data_ = io_ibus_addr.read().get_word(0) & ~(ibusb_ - 1);
+
+      command_t cmd({io_ibus_addr.read().get_word(0)});
+      history_.write(cmd);
+      if (history_.count() > 16) {
+        history_.remove();
+      }
+    }
+
+    if (!io_ibus_valid || io_ibus_ready) {  // latch transaction
+      command_t cmd;
+      bool newaddr = rand_int(0, 3) == 0 || !history_.rand(cmd);
+      uint32_t addr = newaddr ? rand_uint32() : cmd.addr;
+      if (rand_int(0, 7) == 0) {
+        addr &= 0x3fff;
+      }
+      io_ibus_valid = rand_bool();
+      io_ibus_addr = addr;
+    }
+
+    timeout_ = io_ibus_ready ? 0 : timeout_ + io_ibus_valid;
+    check(timeout_ < 100, "ibus timeout");
+
+    // axi_read_addr
+    io_axi_read_addr_ready = rand_bool();
+
+    if (io_axi_read_addr_valid && io_axi_read_addr_ready) {
+      uint32_t id = io_axi_read_addr_bits_id.read().get_word(0);
+      uint32_t addr = io_axi_read_addr_bits_addr.read().get_word(0);
+      response_t resp({id, addr});
+      resp_.write(resp);
+    }
+
+    // axi_read_data
+    io_axi_read_data_valid = false;
+    io_axi_read_data_bits_id = 0;
+    io_axi_read_data_bits_data = 0;
+
+    if (io_axi_read_data_valid && io_axi_read_data_ready) {
+      check(resp_.remove(), "no response to erase");
+      resp_.shuffle();
+    }
+
+    response_t resp;
+    if (resp_.next(resp)) {
+      io_axi_read_data_valid = rand_bool();
+      io_axi_read_data_bits_id = resp.id;
+      uint32_t data = resp.data;
+      sc_bv<kL1IAxiBits> out;
+      for (int i = 0; i < axiw_; ++i) {
+        out.set_word(i, data);
+        data += 4;
+      }
+      io_axi_read_data_bits_data = out;
+    }
+  }
+
+private:
+  struct command_t {
+    uint32_t addr;
+  };
+
+  struct response_t {
+    uint32_t id;
+    uint32_t data;
+  };
+
+  const int ibusb_ = kL1IAxiBits / 8;
+  const int ibusw_ = kL1IAxiBits / 32;
+  const int axib_ = kL1IAxiBits / 8;
+  const int axiw_ = kL1IAxiBits / 32;
+
+  int timeout_ = 0;
+
+  bool ibus_resp_pipeline_ = false;
+  uint32_t ibus_resp_data_;
+  fifo_t<command_t> history_;
+  fifo_t<response_t> resp_;
+};
+
+static void L1ICache_test(char* name, int loops, bool trace) {
+  sc_signal<bool> io_flush_valid;
+  sc_signal<bool> io_flush_ready;
+  sc_signal<bool> io_ibus_valid;
+  sc_signal<bool> io_ibus_ready;
+  sc_signal<sc_bv<32> > io_ibus_addr;
+  sc_signal<sc_bv<kL1IAxiBits> > io_ibus_rdata;
+  sc_signal<bool> io_axi_read_addr_valid;
+  sc_signal<bool> io_axi_read_addr_ready;
+  sc_signal<sc_bv<kL1IAxiId> > io_axi_read_addr_bits_id;
+  sc_signal<sc_bv<32> > io_axi_read_addr_bits_addr;
+  sc_signal<bool> io_axi_read_data_valid;
+  sc_signal<bool> io_axi_read_data_ready;
+  sc_signal<sc_bv<2> > io_axi_read_data_bits_resp;
+  sc_signal<sc_bv<kL1IAxiId> > io_axi_read_data_bits_id;
+  sc_signal<sc_bv<kL1IAxiBits> > io_axi_read_data_bits_data;
+
+  L1ICache_tb tb("L1ICache_tb", loops, true /*random*/);
+  VL1ICache l1icache(name);
+
+  if (trace) {
+    tb.trace(l1icache);
+  }
+
+  l1icache.clock(tb.clock);
+  l1icache.reset(tb.reset);
+  BIND2(tb, l1icache, io_flush_valid);
+  BIND2(tb, l1icache, io_flush_ready);
+  BIND2(tb, l1icache, io_ibus_valid);
+  BIND2(tb, l1icache, io_ibus_ready);
+  BIND2(tb, l1icache, io_ibus_addr);
+  BIND2(tb, l1icache, io_ibus_rdata);
+  BIND2(tb, l1icache, io_axi_read_addr_valid);
+  BIND2(tb, l1icache, io_axi_read_addr_ready);
+  BIND2(tb, l1icache, io_axi_read_addr_bits_id);
+  BIND2(tb, l1icache, io_axi_read_addr_bits_addr);
+  BIND2(tb, l1icache, io_axi_read_data_ready);
+  BIND2(tb, l1icache, io_axi_read_data_valid);
+  BIND2(tb, l1icache, io_axi_read_data_bits_data);
+  BIND2(tb, l1icache, io_axi_read_data_bits_id);
+  BIND2(tb, l1icache, io_axi_read_data_bits_resp);
+
+  tb.start();
+}
+
+int sc_main(int argc, char *argv[]) {
+  L1ICache_test(Sysc_tb::get_name(argv[0]), 1000000, false);
+  return 0;
+}
diff --git a/tests/verilator_sim/kelvin/memory_if.h b/tests/verilator_sim/kelvin/memory_if.h
new file mode 100644
index 0000000..ebe7fc1
--- /dev/null
+++ b/tests/verilator_sim/kelvin/memory_if.h
@@ -0,0 +1,177 @@
+#ifndef TESTS_VERILATOR_SIM_KELVIN_MEMORY_IF_H_
+#define TESTS_VERILATOR_SIM_KELVIN_MEMORY_IF_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <algorithm>
+#include <map>
+
+#include "tests/verilator_sim/sysc_module.h"
+
+// A memory model base class
+struct Memory_if : Sysc_module {
+  const int kPageSize = 4 * 1024;
+  const int kPageMask = ~(kPageSize - 1);
+
+  struct memory_page_t {
+    uint32_t addr;
+    uint8_t  data[4096];
+  };
+
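+  // Loads the flat binary image `bin` at address 0, one 4 KiB page at a time.
+  // A positive `limit` bounds the accepted image size.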
+  Memory_if(sc_module_name n, const char* bin, int limit = -1) :
+      Sysc_module(n) {
+    FILE *f = fopen(bin, "rb");
+    if (f == nullptr) {
+      printf("***ERROR Memory_if could not open '%s'\n", bin);
+      exit(-1);
+    }
+
+    fseek(f, 0, SEEK_END);
+    int64_t fsize = ftell(f);
+    fseek(f, 0, SEEK_SET);
+    uint8_t *fdata = new uint8_t[fsize];
+
+    if (fread(fdata, 1, fsize, f) != size_t(fsize)) {
+      printf("***ERROR Memory_if failed to read '%s'\n", bin);
+      exit(-1);
+    }
+    fclose(f);
+
+    if (limit > 0 && fsize > limit) {
+      printf("***ERROR Memory_if limit exceeded [%d > %d]\n", fsize, limit);
+      exit(-1);
+    }
+
+    for (int addr = 0; addr < fsize; addr += kPageSize) {
+      const int64_t size = std::min(fsize - addr, int64_t(kPageSize));
+      AddPage(addr, size, fdata + addr);
+    }
+
+    delete [] fdata;
+  }
+
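+  // Read()/Write() copy `bytes` to/from the model starting at `addr`,
+  // splitting accesses that cross a page boundary and allocating zeroed
+  // pages on demand.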
+  void Read(uint32_t addr, int bytes, uint8_t* data) {
+    while (bytes > 0) {
+      const uint32_t maddr = addr & kPageMask;
+      const uint32_t offset = addr - maddr;
+      const int limit = kPageSize - offset;
+      const int len = std::min(bytes, limit);
+
+      if (!HasPage(maddr)) {
+#ifdef PRINT_ADD_PAGE
+        printf("MemoryModel::Read add_page %08x\n", addr);
+#endif
+        AddPage(maddr, kPageSize);
+      }
+
+      auto& p = page_[maddr];
+      uint8_t* d = p.data;
+      memcpy(data, d + offset, len);
+#if 0
+      printf("READ  %08x", addr);
+      for (int i = 0; i < len; i++) {
+        printf(" %02x", data[i]);
+      }
+      printf("\n");
+#endif
+      addr += len;
+      data += len;
+      bytes -= len;
+      assert (bytes >= 0);
+    }
+  }
+
+  void Write(uint32_t addr, int bytes, const uint8_t* data) {
+    while (bytes > 0) {
+      const uint32_t maddr = addr & kPageMask;
+      const uint32_t offset = addr - maddr;
+      const int limit = kPageSize - offset;
+      const int len = std::min(bytes, limit);
+
+      if (!HasPage(maddr)) {
+#ifdef PRINT_ADD_PAGE
+        printf("MemoryModel::Write add_page %08x\n", addr);
+#endif
+        AddPage(maddr, kPageSize);
+      }
+
+      auto& p = page_[maddr];
+      uint8_t* d = p.data;
+      memcpy(d + offset, data, len);
+#if 0
+      printf("WRITE %08x", addr);
+      for (int i = 0; i < len; i++) {
+        printf(" %02x", data[i]);
+      }
+      printf("\n");
+#endif
+      addr += len;
+      data += len;
+      bytes -= len;
+      assert (bytes >= 0);
+    }
+  }
+
+protected:
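+  // ReadSwizzle/WriteSwizzle rotate a bus-width byte vector by the address
+  // offset within the bus (`bytes` must be a power of two); the two rotations
+  // are inverses, converting between memory byte order and bus lane order.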
+  void ReadSwizzle(const uint32_t addr, const int bytes, uint8_t* data) {
+    const int mask = bytes - 1;
+    const int alignment = (bytes - (addr & mask)) & mask;  // left shuffle
+    uint8_t tmp[512/8];
+
+    if (!alignment) return;
+
+    for (int i = 0; i < bytes; ++i) {
+      tmp[i] = data[i];
+    }
+
+    for (int i = 0; i < bytes; ++i) {
+      data[i] = tmp[(i + alignment) & mask];
+    }
+  }
+
+  void WriteSwizzle(const uint32_t addr, const int bytes, uint8_t* data) {
+    const int mask = bytes - 1;
+    const int alignment = addr & mask;  // right shuffle
+    uint8_t tmp[512/8];
+
+    if (!alignment) return;
+
+    for (int i = 0; i < bytes; ++i) {
+      tmp[i] = data[i];
+    }
+
+    for (int i = 0; i < bytes; ++i) {
+      data[i] = tmp[(i + alignment) & mask];
+    }
+  }
+
+private:
+  std::map<uint32_t, memory_page_t> page_;
+
+  bool HasPage(const uint32_t addr) {
+    return page_.find(addr) != page_.end();
+  }
+
+  void AddPage(const uint32_t addr, const int bytes,
+               const uint8_t* data = nullptr) {
+    const uint32_t addrbase = addr & kPageMask;
+    if (addr != addrbase) {
+      printf("AddPage(%08x, %d)\n", addr, bytes);
+      assert(false && "AddPage: address not page aligned");
+    }
+
+    if (HasPage(addr)) {
+      printf("AddPage(%08x, %d)\n", addr, bytes);
+      assert(false && "AddPage: address already populated");
+    }
+
+    auto& p = page_[addr];
+    uint8_t* d = p.data;
+
+    if (bytes < kPageSize || data == nullptr) {
+#if 1
+      // remove need for .bss  (hacky?)
+      memset(d, 0x00, kPageSize);
+#else
+      memset(d, 0xcc, kPageSize);
+#endif
+    }
+
+    if (data) {
+      memcpy(d, data, bytes);
+    }
+  }
+};
+
+#endif  // TESTS_VERILATOR_SIM_KELVIN_MEMORY_IF_H_
diff --git a/tests/verilator_sim/kelvin/valu.h b/tests/verilator_sim/kelvin/valu.h
new file mode 100644
index 0000000..4e68a53
--- /dev/null
+++ b/tests/verilator_sim/kelvin/valu.h
@@ -0,0 +1,1108 @@
+#ifndef TESTS_VERILATOR_SIM_KELVIN_VALU_H_
+#define TESTS_VERILATOR_SIM_KELVIN_VALU_H_
+
+#include "tools/iss/alu.h"  // Modified
+#include "tests/verilator_sim/kelvin/kelvin_cfg.h"
+#include "tests/verilator_sim/kelvin/vencodeop.h"
+
+constexpr int kLanes = kVector / 32;
+constexpr int kReadPorts = 7;
+constexpr int kWritePorts = 4;
+
+using namespace encode;
+
+struct valu_t {
+  uint8_t op : 7;
+  uint8_t f2 : 3;
+  uint8_t sz : 3;
+  struct {
+    uint32_t data[kLanes];
+  } in[kReadPorts];
+  struct {
+    uint32_t data[kLanes];
+  } out[kWritePorts];
+  struct {
+    uint32_t data;
+  } sv;
+  // Tracking the read/write/scalar controls.
+  struct {
+    bool valid;
+    uint8_t addr : 6;
+    uint8_t tag : 1;
+  } r[kReadPorts];
+  struct {
+    bool valid;
+    uint8_t addr : 6;
+  } w[kWritePorts];
+  struct {
+    bool valid;
+  } scalar;
+
+  bool operator!=(const valu_t& rhs) const {
+    if (w[0].valid != rhs.w[0].valid) return true;
+    if (w[1].valid != rhs.w[1].valid) return true;
+    if (w[0].valid && w[0].addr != rhs.w[0].addr) return true;
+    if (w[1].valid && w[1].addr != rhs.w[1].addr) return true;
+    for (int i = 0; i < kLanes; ++i) {
+      if (w[0].valid && out[0].data[i] != rhs.out[0].data[i]) return true;
+      if (w[1].valid && out[1].data[i] != rhs.out[1].data[i]) return true;
+    }
+    return false;
+  }
+
+  void print(const char* name, const bool inputs = false) {
+    printf("[%s] op=%d f2=%d sz=%d valid=[%d,%d]  waddr=%d", name, op, f2, sz,
+           w[0].valid, w[1].valid, w[0].valid ? w[0].addr : 0);
+    if (w[1].valid) {
+      printf(" {%d}", w[1].addr);
+    }
+    printf("  wdata =");
+    for (int i = 0; i < kLanes; ++i) {
+      printf(" %08x", w[0].valid ? out[0].data[i] : 0);
+    }
+    if (w[1].valid) {
+      printf(" : {");
+      for (int i = 0; i < kLanes; ++i) {
+        printf(" %08x", out[1].data[i]);
+      }
+      printf(" }");
+    }
+    printf("\n");
+    if (inputs) {
+      printf("\n");
+      for (int i = 0; i < kReadPorts; ++i) {
+        printf("                                               read%d =", i);
+        for (int j = 0; j < kLanes; ++j) {
+          printf(" %08x", in[i].data[j]);
+        }
+        printf("\n");
+      }
+    }
+  }
+};
+
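+// The VOP* macros below expand an element-wise operation `func` across a
+// 32-bit lane for each supported element size (sz == 1, 2 or 4 bytes):
+//   VOP1U / VOPXU apply func to each element of `a` / `b`.
+//   VOP2*         apply func to element pairs of (a, b); the _R forms pass a
+//                 rounding flag, and VOP2 selects signed/unsigned on f2_signed.
+//   The P ("paired") variants also produce a second result `y` from `c`,
+//   setting `w` alongside the `v` result-valid flag for `x`.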
+#define VOP1U(func)                                                  \
+  if (sz == 1) {                                                     \
+    v = 1;                                                           \
+    x = func(uint8_t(a)) | func(uint8_t(a >> 8)) << 8 |              \
+        func(uint8_t(a >> 16)) << 16 | func(uint8_t(a >> 24)) << 24; \
+  }                                                                  \
+  if (sz == 2) {                                                     \
+    v = 1;                                                           \
+    x = func(uint16_t(a)) | func(uint16_t(a >> 16)) << 16;           \
+  }                                                                  \
+  if (sz == 4) {                                                     \
+    v = 1;                                                           \
+    x = func(uint32_t(a));                                           \
+  }
+
+#define VOP1PU(func)                                                  \
+  if (sz == 1) {                                                     \
+    v = 1;                                                           \
+    w = 1;                                                           \
+    x = func(uint8_t(a)) | func(uint8_t(a >> 8)) << 8 |              \
+        func(uint8_t(a >> 16)) << 16 | func(uint8_t(a >> 24)) << 24; \
+    y = func(uint8_t(c)) | func(uint8_t(c >> 8)) << 8 |              \
+        func(uint8_t(c >> 16)) << 16 | func(uint8_t(c >> 24)) << 24; \
+  }                                                                  \
+  if (sz == 2) {                                                     \
+    v = 1;                                                           \
+    w = 1;                                                           \
+    x = func(uint16_t(a)) | func(uint16_t(a >> 16)) << 16;           \
+    y = func(uint16_t(c)) | func(uint16_t(c >> 16)) << 16;           \
+  }                                                                  \
+  if (sz == 4) {                                                     \
+    v = 1;                                                           \
+    w = 1;                                                           \
+    x = func(uint32_t(a));                                           \
+    y = func(uint32_t(c));                                           \
+  }
+
+#define VOPXU(func)                                                  \
+  if (sz == 1) {                                                     \
+    v = 1;                                                           \
+    x = func(uint8_t(b)) | func(uint8_t(b >> 8)) << 8 |              \
+        func(uint8_t(b >> 16)) << 16 | func(uint8_t(b >> 24)) << 24; \
+  }                                                                  \
+  if (sz == 2) {                                                     \
+    v = 1;                                                           \
+    x = func(uint16_t(b)) | func(uint16_t(b >> 16)) << 16;           \
+  }                                                                  \
+  if (sz == 4) {                                                     \
+    v = 1;                                                           \
+    x = func(uint32_t(b));                                           \
+  }
+
+#define VOP2S(func)                                               \
+  if (sz == 1) {                                                  \
+    v = 1;                                                        \
+    x = uint8_t(func(int8_t(a), int8_t(b))) |                     \
+        uint8_t(func(int8_t(a >> 8), int8_t(b >> 8))) << 8 |      \
+        uint8_t(func(int8_t(a >> 16), int8_t(b >> 16))) << 16 |   \
+        uint8_t(func(int8_t(a >> 24), int8_t(b >> 24))) << 24;    \
+  } else if (sz == 2) {                                           \
+    v = 1;                                                        \
+    x = uint16_t(func(int16_t(a), int16_t(b))) |                  \
+        uint16_t(func(int16_t(a >> 16), int16_t(b >> 16))) << 16; \
+  } else if (sz == 4) {                                           \
+    v = 1;                                                        \
+    x = uint32_t(func(int32_t(a), int32_t(b)));                   \
+  }
+
+#define VOP2U(func)                                       \
+  if (sz == 1) {                                          \
+    v = 1;                                                \
+    x = func(uint8_t(a), uint8_t(b)) |                    \
+        func(uint8_t(a >> 8), uint8_t(b >> 8)) << 8 |     \
+        func(uint8_t(a >> 16), uint8_t(b >> 16)) << 16 |  \
+        func(uint8_t(a >> 24), uint8_t(b >> 24)) << 24;   \
+  } else if (sz == 2) {                                   \
+    v = 1;                                                \
+    x = func(uint16_t(a), uint16_t(b)) |                  \
+        func(uint16_t(a >> 16), uint16_t(b >> 16)) << 16; \
+  } else if (sz == 4) {                                   \
+    v = 1;                                                \
+    x = func(uint32_t(a), uint32_t(b));                   \
+  }
+
+#define VOP2(func) \
+  if (f2_signed) { \
+    VOP2S(func)    \
+  } else {         \
+    VOP2U(func)    \
+  }
+
+#define VOP2S_R(func, r)                                             \
+  if (sz == 1) {                                                     \
+    v = 1;                                                           \
+    x = uint8_t(func(int8_t(a), int8_t(b), r)) |                     \
+        uint8_t(func(int8_t(a >> 8), int8_t(b >> 8), r)) << 8 |      \
+        uint8_t(func(int8_t(a >> 16), int8_t(b >> 16), r)) << 16 |   \
+        uint8_t(func(int8_t(a >> 24), int8_t(b >> 24), r)) << 24;    \
+  } else if (sz == 2) {                                              \
+    v = 1;                                                           \
+    x = uint16_t(func(int16_t(a), int16_t(b), r)) |                  \
+        uint16_t(func(int16_t(a >> 16), int16_t(b >> 16), r)) << 16; \
+  } else if (sz == 4) {                                              \
+    v = 1;                                                           \
+    x = uint32_t(func(int32_t(a), int32_t(b), r));                   \
+  }
+
+#define VOP2U_R(func, r)                                               \
+  if (sz == 1) {                                                       \
+    v = 1;                                                             \
+    x = uint8_t(func(uint8_t(a), uint8_t(b), r)) |                     \
+        uint8_t(func(uint8_t(a >> 8), uint8_t(b >> 8), r)) << 8 |      \
+        uint8_t(func(uint8_t(a >> 16), uint8_t(b >> 16), r)) << 16 |   \
+        uint8_t(func(uint8_t(a >> 24), uint8_t(b >> 24), r)) << 24;    \
+  } else if (sz == 2) {                                                \
+    v = 1;                                                             \
+    x = uint16_t(func(uint16_t(a), uint16_t(b), r)) |                  \
+        uint16_t(func(uint16_t(a >> 16), uint16_t(b >> 16), r)) << 16; \
+  } else if (sz == 4) {                                                \
+    v = 1;                                                             \
+    x = uint32_t(func(uint32_t(a), uint32_t(b), r));                   \
+  }
+
+#define VOP2_R(func, r) \
+  if (f2_signed) {      \
+    VOP2S_R(func, r)    \
+  } else {              \
+    VOP2U_R(func, r)    \
+  }
+
+#define VOP2PS(func)                                               \
+  if (sz == 1) {                                                  \
+    v = 1;                                                        \
+    w = 1;                                                        \
+    x = uint8_t(func(int8_t(a), int8_t(b))) |                     \
+        uint8_t(func(int8_t(a >> 8), int8_t(b >> 8))) << 8 |      \
+        uint8_t(func(int8_t(a >> 16), int8_t(b >> 16))) << 16 |   \
+        uint8_t(func(int8_t(a >> 24), int8_t(b >> 24))) << 24;    \
+    y = uint8_t(func(int8_t(c), int8_t(b))) |                     \
+        uint8_t(func(int8_t(c >> 8), int8_t(b >> 8))) << 8 |      \
+        uint8_t(func(int8_t(c >> 16), int8_t(b >> 16))) << 16 |   \
+        uint8_t(func(int8_t(c >> 24), int8_t(b >> 24))) << 24;    \
+  } else if (sz == 2) {                                           \
+    v = 1;                                                        \
+    w = 1;                                                        \
+    x = uint16_t(func(int16_t(a), int16_t(b))) |                  \
+        uint16_t(func(int16_t(a >> 16), int16_t(b >> 16))) << 16; \
+    y = uint16_t(func(int16_t(c), int16_t(b))) |                  \
+        uint16_t(func(int16_t(c >> 16), int16_t(b >> 16))) << 16; \
+  } else if (sz == 4) {                                           \
+    v = 1;                                                        \
+    w = 1;                                                        \
+    x = uint32_t(func(int32_t(a), int32_t(b)));                   \
+    y = uint32_t(func(int32_t(c), int32_t(b)));                   \
+  }
+
+#define VOP2PU(func)                                      \
+  if (sz == 1) {                                          \
+    v = 1;                                                \
+    w = 1;                                                \
+    x = func(uint8_t(a), uint8_t(b)) |                    \
+        func(uint8_t(a >> 8), uint8_t(b >> 8)) << 8 |     \
+        func(uint8_t(a >> 16), uint8_t(b >> 16)) << 16 |  \
+        func(uint8_t(a >> 24), uint8_t(b >> 24)) << 24;   \
+    y = func(uint8_t(c), uint8_t(b)) |                    \
+        func(uint8_t(c >> 8), uint8_t(b >> 8)) << 8 |     \
+        func(uint8_t(c >> 16), uint8_t(b >> 16)) << 16 |  \
+        func(uint8_t(c >> 24), uint8_t(b >> 24)) << 24;   \
+  } else if (sz == 2) {                                   \
+    v = 1;                                                \
+    w = 1;                                                \
+    x = func(uint16_t(a), uint16_t(b)) |                  \
+        func(uint16_t(a >> 16), uint16_t(b >> 16)) << 16; \
+    y = func(uint16_t(c), uint16_t(b)) |                  \
+        func(uint16_t(c >> 16), uint16_t(b >> 16)) << 16; \
+  } else if (sz == 4) {                                   \
+    v = 1;                                                \
+    w = 1;                                                \
+    x = func(uint32_t(a), uint32_t(b));                   \
+    y = func(uint32_t(c), uint32_t(b));                   \
+  }
+
+#define VOP2P(func) \
+  if (f2_signed) { \
+    VOP2PS(func)    \
+  } else {         \
+    VOP2PU(func)    \
+  }
+
+#define VOP2PS_R(func, r)                                            \
+  if (sz == 1) {                                                     \
+    v = 1;                                                           \
+    w = 1;                                                           \
+    x = uint8_t(func(int8_t(a), int8_t(b), r)) |                     \
+        uint8_t(func(int8_t(a >> 8), int8_t(b >> 8), r)) << 8 |      \
+        uint8_t(func(int8_t(a >> 16), int8_t(b >> 16), r)) << 16 |