Parameterize Kelvin over instructionLanes

- Many things in Kelvin were hard-coded to operate on 4 instruction lanes -- refactor those to be flexible based on the instructionLanes value in Parameters

Change-Id: I1957d87b6f355d815380a88c28d210c1c8eec737
diff --git a/hdl/chisel/src/common/BUILD b/hdl/chisel/src/common/BUILD
index 5bab0ba..2805f14 100644
--- a/hdl/chisel/src/common/BUILD
+++ b/hdl/chisel/src/common/BUILD
@@ -18,9 +18,9 @@
 chisel_library(
     name = "common",
     srcs = [
-        "Fifo4e.scala",
-        "Fifo4.scala",
-        "Fifo4x4.scala",
+        "FifoXe.scala",
+        "FifoX.scala",
+        "FifoIxO.scala",
         "Fifo.scala",
         "IDiv.scala",
         "Library.scala",
diff --git a/hdl/chisel/src/common/Fifo4.scala b/hdl/chisel/src/common/Fifo4.scala
deleted file mode 100644
index a01963f..0000000
--- a/hdl/chisel/src/common/Fifo4.scala
+++ /dev/null
@@ -1,192 +0,0 @@
-// Copyright 2023 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package common
-
-import chisel3._
-import chisel3.util._
-import _root_.circt.stage.ChiselStage
-
-object Fifo4 {
-  def apply[T <: Data](t: T, n: Int) = {
-    Module(new Fifo4(t, n))
-  }
-}
-
-// 4way decode, used for Fifo4 style input controls.
-object Fifo4Valid {
-  def apply(in: UInt): (UInt, UInt, UInt, UInt) = {
-    assert(in.getWidth == 4)
-
-    val in0 = Cat(in(3,0) === 8.U,  // 8
-                  in(2,0) === 4.U,  // 4, 12
-                  in(1,0) === 2.U,  // 2, 6, 10, 14
-                  in(0))            // 1, 3, 5, 7, 9, 11, 13, 15
-
-    val in1 = Cat(in(3,0) === 12.U ||
-                  in(3,0) === 10.U ||
-                  in(3,0) === 9.U,  // 9, 10, 12
-                  in(2,0) === 6.U ||
-                  in(2,0) === 5.U,  // 5, 6, 13, 14
-                  in(1,0) === 3.U,  // 3, 7, 11, 15
-                  false.B)
-
-    val in2 = Cat(in(3,0) === 14.U ||
-                  in(3,0) === 13.U ||
-                  in(3,0) === 11.U,  // 11, 13, 14
-                  in(2,0) === 15.U ||
-                  in(2,0) === 7.U,   // 7, 15
-                  false.B, false.B)
-
-    val in3 = Cat(in(3,0) === 15.U,  // 15
-                  false.B, false.B, false.B)
-
-    (in0.asUInt, in1.asUInt, in2.asUInt, in3.asUInt)
-  }
-}
-
-class Fifo4[T <: Data](t: T, n: Int) extends Module {
-  val io = IO(new Bundle {
-    val in  = Flipped(Decoupled(Vec(4, Valid(t))))
-    val out = Decoupled(t)
-    val count = Output(UInt(log2Ceil(n+1).W))
-  })
-
-  val m = n - 1  // n = Mem(n-1) + Slice
-
-  def Increment(a: UInt, b: UInt): UInt = {
-    val c = a +& b
-    val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
-    d
-  }
-
-  val mem = Mem(m, t)
-  val mslice = Slice(t, false, true)
-
-  val in0pos = RegInit(0.U(log2Ceil(m).W))
-  val in1pos = RegInit(1.U(log2Ceil(m).W))
-  val in2pos = RegInit(2.U(log2Ceil(m).W))
-  val in3pos = RegInit(3.U(log2Ceil(m).W))
-  val outpos = RegInit(0.U(log2Ceil(m).W))
-  val mcount = RegInit(0.U(log2Ceil(n+1).W))
-
-  io.count := mcount + io.out.valid
-
-  val ivalid = io.in.valid && io.in.ready
-  val ovalid = mslice.io.in.valid && mslice.io.in.ready
-
-  val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid,
-                    io.in.bits(1).valid, io.in.bits(0).valid).asUInt
-
-  val icount = PopCount(iactive)
-
-  // ---------------------------------------------------------------------------
-  // Fifo Control.
-  when (ivalid) {
-    in0pos := Increment(in0pos, icount)
-    in1pos := Increment(in1pos, icount)
-    in2pos := Increment(in2pos, icount)
-    in3pos := Increment(in3pos, icount)
-  }
-
-  when (ovalid) {
-    outpos := Increment(outpos, 1.U)
-  }
-
-  val inc = MuxOR(ivalid, icount)
-  val dec = mslice.io.in.valid && mslice.io.in.ready
-
-  when (ivalid || ovalid) {
-    mcount := mcount + inc - dec
-  }
-
-  // ---------------------------------------------------------------------------
-  // Fifo Input.
-  val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive)
-
-  for (i <- 0 until m) {
-    val valid = Cat(in0pos === i.U && in0valid(3) ||
-                    in1pos === i.U && in1valid(3) ||
-                    in2pos === i.U && in2valid(3) ||
-                    in3pos === i.U && in3valid(3),
-                    in0pos === i.U && in0valid(2) ||
-                    in1pos === i.U && in1valid(2) ||
-                    in2pos === i.U && in2valid(2),
-                    in0pos === i.U && in0valid(1) ||
-                    in1pos === i.U && in1valid(1),
-                    in0pos === i.U && in0valid(0))
-
-    when (ivalid) {
-      when (valid(0)) {
-        mem(i) := io.in.bits(0).bits
-      } .elsewhen (valid(1)) {
-        mem(i) := io.in.bits(1).bits
-      } .elsewhen (valid(2)) {
-        mem(i) := io.in.bits(2).bits
-      } .elsewhen (valid(3)) {
-        mem(i) := io.in.bits(3).bits
-      }
-    }
-  }
-
-  mslice.io.in.valid := false.B
-  mslice.io.in.bits := io.in.bits(0).bits  // defaults
-
-  when (mcount > 0.U) {
-    when (io.out.ready) {
-      mslice.io.in.valid := true.B
-    }
-  } .otherwise {
-    when (ivalid && iactive =/= 0.U) {
-      mslice.io.in.valid := true.B
-    }
-  }
-
-  when (mcount > 0.U) {
-    mslice.io.in.bits := mem(outpos)
-  } .elsewhen (ivalid) {
-    when (iactive(0)) {
-      mslice.io.in.bits := io.in.bits(0).bits
-    } .elsewhen (iactive(1)) {
-      mslice.io.in.bits := io.in.bits(1).bits
-    } .elsewhen (iactive(2)) {
-      mslice.io.in.bits := io.in.bits(2).bits
-    } .elsewhen (iactive(3)) {
-      mslice.io.in.bits := io.in.bits(3).bits
-    }
-  }
-
-  // ---------------------------------------------------------------------------
-  // Valid Entries.
-  val active = RegInit(0.U(m.W))
-
-  val activeSet = MuxOR(ivalid,
-      ((icount >= 1.U) << in0pos) | ((icount >= 2.U) << in1pos) |
-      ((icount >= 3.U) << in2pos) | ((icount >= 4.U) << in3pos))
-
-  val activeClr = MuxOR(mslice.io.in.valid && mslice.io.in.ready, 1.U << outpos)
-
-  active := (active | activeSet) & ~activeClr
-
-  // ---------------------------------------------------------------------------
-  // Interface.
-  io.in.ready := mcount <= (m.U - icount)
-  io.out <> mslice.io.out
-
-  assert(mcount <= m.U)
-}
-
-object EmitFifo4 extends App {
-  ChiselStage.emitSystemVerilogFile(new Fifo4(UInt(8.W), 11), args)
-}
diff --git a/hdl/chisel/src/common/Fifo4e.scala b/hdl/chisel/src/common/Fifo4e.scala
deleted file mode 100644
index 392e7ee..0000000
--- a/hdl/chisel/src/common/Fifo4e.scala
+++ /dev/null
@@ -1,147 +0,0 @@
-// Copyright 2023 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package common
-
-import chisel3._
-import chisel3.util._
-import _root_.circt.stage.ChiselStage
-
-// Fifo4 with entry output and no output registration stage.
-
-object Fifo4e {
-  def apply[T <: Data](t: T, n: Int) = {
-    Module(new Fifo4e(t, n))
-  }
-}
-
-class Fifo4e[T <: Data](t: T, n: Int) extends Module {
-  val io = IO(new Bundle {
-    val in  = Flipped(Decoupled(Vec(4, Valid(t))))
-    val out = Decoupled(t)
-    val count = Output(UInt(log2Ceil(n+1).W))
-    val entry = Output(Vec(n, Valid(t)))
-    val nempty = Output(Bool())
-  })
-
-  def Increment(a: UInt, b: UInt): UInt = {
-    val c = a +& b
-    val d = Mux(c < n.U, c, c - n.U)(a.getWidth - 1, 0)
-    d
-  }
-
-  val mem = Mem(n, t)
-
-  val in0pos = RegInit(0.U(log2Ceil(n).W))
-  val in1pos = RegInit(1.U(log2Ceil(n).W))
-  val in2pos = RegInit(2.U(log2Ceil(n).W))
-  val in3pos = RegInit(3.U(log2Ceil(n).W))
-  val outpos = RegInit(0.U(log2Ceil(n).W))
-  val mcount = RegInit(0.U(log2Ceil(n+1).W))
-  val nempty = RegInit(false.B)
-
-  io.count := mcount
-  io.nempty := nempty
-
-  val ivalid = io.in.valid && io.in.ready
-  val ovalid = io.out.valid && io.out.ready
-
-  val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid,
-                    io.in.bits(1).valid, io.in.bits(0).valid).asUInt
-
-  val icount = PopCount(iactive)
-
-  // ---------------------------------------------------------------------------
-  // Fifo Control.
-  when (ivalid) {
-    in0pos := Increment(in0pos, icount)
-    in1pos := Increment(in1pos, icount)
-    in2pos := Increment(in2pos, icount)
-    in3pos := Increment(in3pos, icount)
-  }
-
-  when (ovalid) {
-    outpos := Increment(outpos, 1.U)
-  }
-
-  val inc = MuxOR(ivalid, icount)
-  val dec = ovalid
-
-  when (ivalid || ovalid) {
-    val nxtcount = mcount + inc - dec
-    mcount := nxtcount
-    nempty := nxtcount =/= 0.U
-  }
-
-  // ---------------------------------------------------------------------------
-  // Fifo Input.
-  val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive)
-
-  for (i <- 0 until n) {
-    val valid = Cat(in0pos === i.U && in0valid(3) ||
-                    in1pos === i.U && in1valid(3) ||
-                    in2pos === i.U && in2valid(3) ||
-                    in3pos === i.U && in3valid(3),
-                    in0pos === i.U && in0valid(2) ||
-                    in1pos === i.U && in1valid(2) ||
-                    in2pos === i.U && in2valid(2),
-                    in0pos === i.U && in0valid(1) ||
-                    in1pos === i.U && in1valid(1),
-                    in0pos === i.U && in0valid(0))
-
-    when (ivalid) {
-      when (valid(0)) {
-        mem(i) := io.in.bits(0).bits
-      } .elsewhen (valid(1)) {
-        mem(i) := io.in.bits(1).bits
-      } .elsewhen (valid(2)) {
-        mem(i) := io.in.bits(2).bits
-      } .elsewhen (valid(3)) {
-        mem(i) := io.in.bits(3).bits
-      }
-    }
-  }
-
-  // ---------------------------------------------------------------------------
-  // Valid Entries.
-  val active = RegInit(0.U(n.W))
-
-  val activeSet = MuxOR(ivalid,
-      ((icount >= 1.U) << in0pos) | ((icount >= 2.U) << in1pos) |
-      ((icount >= 3.U) << in2pos) | ((icount >= 4.U) << in3pos))
-
-  val activeClr = MuxOR(io.out.valid && io.out.ready, 1.U << outpos)
-
-  when (io.in.valid && io.in.ready || io.out.valid && io.out.ready) {
-    active := (active | activeSet) & ~activeClr
-  }
-
-  // ---------------------------------------------------------------------------
-  // Interface.
-  io.in.ready := mcount <= (n.U - icount)
-
-  io.out.valid := mcount =/= 0.U
-  io.out.bits := mem(outpos)
-
-  assert(mcount <= n.U)
-
-  for (i <- 0 until n) {
-    io.entry(i).valid := active(i)
-    io.entry(i).bits := mem(i)
-  }
-}
-
-object EmitFifo4e extends App {
-  ChiselStage.emitSystemVerilogFile(new Fifo4e(UInt(8.W), 10), args)
-}
diff --git a/hdl/chisel/src/common/Fifo4x4.scala b/hdl/chisel/src/common/Fifo4x4.scala
deleted file mode 100644
index 064af4b..0000000
--- a/hdl/chisel/src/common/Fifo4x4.scala
+++ /dev/null
@@ -1,198 +0,0 @@
-// Copyright 2023 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package common
-
-import chisel3._
-import chisel3.util._
-import _root_.circt.stage.ChiselStage
-
-object Fifo4x4 {
-  def apply[T <: Data](t: T, n: Int) = {
-    Module(new Fifo4x4(t, n))
-  }
-}
-
-// Input accepted with a common handshake and per lane select.
-// Outputs are transacted independently, and ordered {[0], [0,1], [0,1,2], [0,1,2,3]}.
-// Outputs are not registered, assumes passes directly into shallow combinatorial.
-class Fifo4x4[T <: Data](t: T, n: Int) extends Module {
-  val io = IO(new Bundle {
-    val in  = Flipped(Decoupled(Vec(4, Valid(t))))
-    val out = Vec(4, Decoupled(t))
-    val count = Output(UInt(log2Ceil(n+1).W))
-    val nempty = Output(Bool())
-  })
-
-  val m = n
-
-  val mb  = log2Ceil(m)
-  val n1b = log2Ceil(n + 1)
-
-  def Increment(a: UInt, b: UInt): UInt = {
-    val c = a +& b
-    val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
-    d
-  }
-
-  val mem = Reg(Vec(n, t))
-
-  val inpos  = Reg(Vec(4, UInt(mb.W)))  // reset below
-  val outpos = Reg(Vec(4, UInt(mb.W)))  // reset below
-
-  val mcount = RegInit(0.U(n1b.W))
-  val nempty = RegInit(false.B)
-  val inready = RegInit(false.B)
-  val outvalid = RegInit(0.U(4.W))
-
-  val ivalid = io.in.valid && io.in.ready
-
-  val iactive = Cat(io.in.bits(3).valid, io.in.bits(2).valid,
-                    io.in.bits(1).valid, io.in.bits(0).valid).asUInt
-
-  val icount = (io.in.bits(0).valid +& io.in.bits(1).valid +&
-                io.in.bits(2).valid +& io.in.bits(3).valid)(2,0)
-
-  val oactiveBits = Cat(io.out(3).valid && io.out(3).ready,
-                        io.out(2).valid && io.out(2).ready,
-                        io.out(1).valid && io.out(1).ready,
-                        io.out(0).valid && io.out(0).ready)
-
-  val ovalid = oactiveBits =/= 0.U
-
-  val ocount = (oactiveBits(0) +& oactiveBits(1) +&
-                oactiveBits(2) +& oactiveBits(3))(2,0)
-
-  assert(!(oactiveBits(1) === 1.U && oactiveBits(0,0) =/= 1.U))
-  assert(!(oactiveBits(2) === 1.U && oactiveBits(1,0) =/= 3.U))
-  assert(!(oactiveBits(3) === 1.U && oactiveBits(2,0) =/= 7.U))
-
-  val ovalidBits = Cat(io.out(3).valid, io.out(2).valid,
-                       io.out(1).valid, io.out(0).valid)
-
-  assert(!(ovalidBits(1) === 1.U && ovalidBits(0,0) =/= 1.U))
-  assert(!(ovalidBits(2) === 1.U && ovalidBits(1,0) =/= 3.U))
-  assert(!(ovalidBits(3) === 1.U && ovalidBits(2,0) =/= 7.U))
-
-  val oreadyBits = Cat(io.out(3).ready, io.out(2).ready,
-                       io.out(1).ready, io.out(0).ready)
-
-  assert(!(oreadyBits(1) === 1.U && oreadyBits(0,0) =/= 1.U))
-  assert(!(oreadyBits(2) === 1.U && oreadyBits(1,0) =/= 3.U))
-  assert(!(oreadyBits(3) === 1.U && oreadyBits(2,0) =/= 7.U))
-
-  // ---------------------------------------------------------------------------
-  // Fifo Control.
-  when (reset.asBool) {
-    for (i <- 0 until 4) {
-      inpos(i) := i.U
-    }
-  } .elsewhen (ivalid) {
-    for (i <- 0 until 4) {
-      inpos(i) := Increment(inpos(i), icount)
-    }
-  }
-
-  when (reset.asBool) {
-    for (i <- 0 until 4) {
-      outpos(i) := i.U
-    }
-  } .elsewhen (ovalid) {
-    for (i <- 0 until 4) {
-      outpos(i) := Increment(outpos(i), ocount)
-    }
-  }
-
-  val inc = MuxOR(ivalid, icount)
-  val dec = MuxOR(ovalid, ocount)
-
-  when (ivalid || ovalid) {
-    val nxtmcount = mcount + inc - dec
-    inready := nxtmcount <= (m.U - 4.U)
-    mcount := nxtmcount
-    nempty := nxtmcount =/= 0.U
-    outvalid := Cat(nxtmcount >= 4.U,
-                    nxtmcount >= 3.U,
-                    nxtmcount >= 2.U,
-                    nxtmcount >= 1.U)
-  } .otherwise {
-    inready := mcount <= (m.U - 4.U)
-    outvalid := Cat(mcount >= 4.U,
-                    mcount >= 3.U,
-                    mcount >= 2.U,
-                    mcount >= 1.U)
-  }
-
-  // ---------------------------------------------------------------------------
-  // Fifo Input.
-  val (in0valid, in1valid, in2valid, in3valid) = Fifo4Valid(iactive)
-
-  for (i <- 0 until m) {
-    val valid = Cat(inpos(0) === i.U && in0valid(3) ||
-                    inpos(1) === i.U && in1valid(3) ||
-                    inpos(2) === i.U && in2valid(3) ||
-                    inpos(3) === i.U && in3valid(3),
-
-                    inpos(0) === i.U && in0valid(2) ||
-                    inpos(1) === i.U && in1valid(2) ||
-                    inpos(2) === i.U && in2valid(2),
-
-                    inpos(0) === i.U && in0valid(1) ||
-                    inpos(1) === i.U && in1valid(1),
-
-                    inpos(0) === i.U && in0valid(0))
-
-    if (true) {
-      val data = MuxOR(valid(0), io.in.bits(0).bits.asUInt) |
-                 MuxOR(valid(1), io.in.bits(1).bits.asUInt) |
-                 MuxOR(valid(2), io.in.bits(2).bits.asUInt) |
-                 MuxOR(valid(3), io.in.bits(3).bits.asUInt)
-
-      when (ivalid && valid =/= 0.U) {
-        mem(i) := data.asTypeOf(t)
-      }
-    } else {
-      when (ivalid) {
-        when (valid(0)) {
-          mem(i) := io.in.bits(0).bits
-        } .elsewhen (valid(1)) {
-          mem(i) := io.in.bits(1).bits
-        } .elsewhen (valid(2)) {
-          mem(i) := io.in.bits(2).bits
-        } .elsewhen (valid(3)) {
-          mem(i) := io.in.bits(3).bits
-        }
-      }
-    }
-  }
-
-  // ---------------------------------------------------------------------------
-  // Interface.
-  io.in.ready := inready
-
-  for (i <- 0 until 4) {
-    io.out(i).valid := outvalid(i)
-    io.out(i).bits := mem(outpos(i))  // TODO: VecAt()
-  }
-
-  io.count := mcount
-
-  io.nempty := nempty
-
-  assert(io.count <= m.U)
-}
-
-object EmitFifo4x4 extends App {
-  ChiselStage.emitSystemVerilogFile(new Fifo4x4(UInt(32.W), 24), args)
-}
diff --git a/hdl/chisel/src/common/FifoIxO.scala b/hdl/chisel/src/common/FifoIxO.scala
new file mode 100644
index 0000000..5be7dd6
--- /dev/null
+++ b/hdl/chisel/src/common/FifoIxO.scala
@@ -0,0 +1,172 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package common
+
+import chisel3._
+import chisel3.util._
+import _root_.circt.stage.ChiselStage
+
+object FifoIxO {  // Factory: wraps construction in Module(...).
+  def apply[T <: Data](t: T, i: Int, o: Int, n: Int): FifoIxO[T] = {
+    Module(new FifoIxO(t, i, o, n))
+  }
+}
+
+// Inputs are accepted with a common handshake and a per-lane valid select.
+// Outputs are transacted independently, and ordered {[0], [0,1], ..., [0,1,...,o-1]}.
+// Outputs are not registered; they are assumed to feed shallow combinational logic.
+class FifoIxO[T <: Data](t: T, i: Int, o: Int, n: Int /* depth */) extends Module {
+  val io = IO(new Bundle {
+    val in  = Flipped(Decoupled(Vec(i, Valid(t))))  // i lanes sharing one ready/valid handshake
+    val out = Vec(o, Decoupled(t))                  // o outputs, each with its own handshake
+    val count = Output(UInt(log2Ceil(n+1).W))       // registered occupancy
+    val nempty = Output(Bool())                     // registered "occupancy =/= 0"
+  })
+
+  val m = n
+
+  val mb  = log2Ceil(m)      // width of a storage index
+  val n1b = log2Ceil(n + 1)  // width of an occupancy count
+
+  def Increment(a: UInt, b: UInt): UInt = {  // a + b, wrapped once at m, truncated to a's width
+    val c = a +& b
+    val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
+    d
+  }
+
+  val mem = Reg(Vec(n, t))
+
+  val inpos  = Reg(Vec(i, UInt(mb.W)))  // reset below
+  val outpos = Reg(Vec(o, UInt(mb.W)))  // reset below
+
+  val mcount = RegInit(0.U(n1b.W))
+  val nempty = RegInit(false.B)
+  val inready = RegInit(false.B)
+  val outvalid = RegInit(0.U(o.W))
+
+  val ivalid = io.in.valid && io.in.ready
+
+  val iactive = Cat((0 until i).reverse.map(x => io.in.bits(x).valid)).asUInt
+
+  val icount = (io.in.bits.map(x => x.valid.asUInt).reduce(_ +& _))(log2Ceil(i),0)
+
+  val oactiveBits = Cat((0 until o).reverse.map(x => io.out(x).valid && io.out(x).ready))
+
+  val ovalid = oactiveBits =/= 0.U
+
+  val ocount = (0 until o).map(x => oactiveBits(x).asUInt).reduce(_ +& _)(log2Ceil(o),0)
+
+  for (k <- 1 until o) {  // output k may only transact when outputs 0..k-1 all transact
+    assert(!(oactiveBits(k) === 1.U && oactiveBits(k - 1,0) =/= ((1 << k) - 1).U))
+  }
+
+  val ovalidBits = Cat((0 until o).reverse.map(x => io.out(x).valid))
+
+  for (k <- 1 until o) {  // valid must be asserted on a contiguous run starting at output 0
+    assert(!(ovalidBits(k) === 1.U && ovalidBits(k - 1, 0) =/= ((1 << k) - 1).U))
+  }
+
+  val oreadyBits = Cat((0 until o).reverse.map(x => io.out(x).ready))
+
+  for (k <- 1 until o) {  // ready must be asserted on a contiguous run starting at output 0
+    assert(!(oreadyBits(k) === 1.U && oreadyBits(k - 1, 0) =/= ((1 << k) - 1).U))
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Control.
+  when (reset.asBool) {
+    for (lane <- 0 until i) {
+      inpos(lane) := lane.U
+    }
+  } .elsewhen (ivalid) {
+    for (lane <- 0 until i) {
+      inpos(lane) := Increment(inpos(lane), icount)
+    }
+  }
+
+  when (reset.asBool) {
+    for (port <- 0 until o) {
+      outpos(port) := port.U
+    }
+  } .elsewhen (ovalid) {
+    for (port <- 0 until o) {
+      outpos(port) := Increment(outpos(port), ocount)
+    }
+  }
+
+  val inc = MuxOR(ivalid, icount)
+  val dec = MuxOR(ovalid, ocount)
+
+  when (ivalid || ovalid) {
+    val nxtmcount = mcount + inc - dec
+    inready := nxtmcount <= (m.U - i.U)  // ready only with room for all i lanes
+    mcount := nxtmcount
+    nempty := nxtmcount =/= 0.U
+    outvalid := Cat((0 until o).reverse.map(x => nxtmcount >= (x + 1).U))
+  } .otherwise {
+    inready := mcount <= (m.U - i.U)
+    outvalid := Cat((0 until o).reverse.map(x => mcount >= (x + 1).U))
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Input.
+  val inxvalid = FifoXValid(iactive)
+
+  for (q <- 0 until m) {
+    val valid = Cat(  // bit x: input lane x is written to entry q this cycle
+      (0 until i).reverse.map(x =>
+        if (x == 0) { inpos(0) === q.U && inxvalid(0)(0) } else {
+          (0 to x).map(y =>
+            inpos(y) === q.U && inxvalid(y)(x)
+          ).reduce(_ || _)
+        }
+      )
+    )
+
+    if (true) {  // Scala-level constant: only this branch is elaborated
+      val data = (0 until i).map(x => MuxOR(valid(x), io.in.bits(x).bits.asUInt)).reduce(_ | _)
+
+      when (ivalid && valid =/= 0.U) {
+        mem(q) := data.asTypeOf(t)
+      }
+    } else {
+      when (ivalid) {
+        when(PopCount(valid) >= 1.U) {
+          val idx = PriorityEncoder(valid)
+          mem(q) := io.in.bits(idx).bits
+        }
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Interface.
+  io.in.ready := inready
+
+  for (port <- 0 until o) {
+    io.out(port).valid := outvalid(port)
+    io.out(port).bits := mem(outpos(port))  // TODO: VecAt()
+  }
+
+  io.count := mcount
+
+  io.nempty := nempty
+
+  assert(io.count <= m.U)
+}
+
+object EmitFifoIxO extends App {  // Emits SystemVerilog for a sample 4-in/4-out, depth-24 config.
+  ChiselStage.emitSystemVerilogFile(new FifoIxO(UInt(32.W), 4, 4, 24), args)
+}
diff --git a/hdl/chisel/src/common/FifoX.scala b/hdl/chisel/src/common/FifoX.scala
new file mode 100644
index 0000000..ee3f041
--- /dev/null
+++ b/hdl/chisel/src/common/FifoX.scala
@@ -0,0 +1,163 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package common
+
+import chisel3._
+import chisel3.util._
+import _root_.circt.stage.ChiselStage
+
+object FifoX {  // Factory: wraps construction in Module(...).
+  def apply[T <: Data](t: T, x: Int, n: Int): FifoX[T] = {
+    Module(new FifoX(t, x, n))
+  }
+}
+
+// Xway decode, used for FifoX style input controls.
+object FifoXValid {
+  // Returns one mask per write slot: bit x of element i is set when in(x) is
+  // set and is the (i+1)-th set bit counting up from bit 0.
+  def apply(in: UInt): Seq[UInt] = {
+    val lanes = in.getWidth
+    // Seq.tabulate builds the immutable result directly (no mutable Array, no implicit copy).
+    Seq.tabulate(lanes) { i =>
+      Cat(
+        (0 until lanes).reverse.map { x =>
+          // Bits below i can never be the (i+1)-th set bit.
+          if (x < i) false.B else (PopCount(in(x, 0)) === (i + 1).U) && in(x)
+        }
+      )
+    }
+  }
+}
+
+class FifoX[T <: Data](t: T, x: Int, n: Int) extends Module {
+  val io = IO(new Bundle {
+    val in  = Flipped(Decoupled(Vec(x, Valid(t))))  // x lanes sharing one ready/valid handshake
+    val out = Decoupled(t)                          // single output, driven by the slice
+    val count = Output(UInt(log2Ceil(n+1).W))       // mcount plus the slice's output valid
+  })
+
+  val m = n - 1  // n = Mem(n-1) + Slice
+
+  def Increment(a: UInt, b: UInt): UInt = {  // a + b, wrapped once at m, truncated to a's width
+    val c = a +& b
+    val d = Mux(c < m.U, c, c - m.U)(a.getWidth - 1, 0)
+    d
+  }
+
+  val mem = Mem(m, t)
+  val mslice = Slice(t, false, true)
+
+  val inxpos = RegInit(VecInit((0 until x).map(lane => lane.U(log2Ceil(m).W))))
+  val outpos = RegInit(0.U(log2Ceil(m).W))
+  val mcount = RegInit(0.U(log2Ceil(n+1).W))
+
+  io.count := mcount + io.out.valid
+
+  val ivalid = io.in.valid && io.in.ready
+  val ovalid = mslice.io.in.valid && mslice.io.in.ready
+
+  val iactive = Cat((0 until x).reverse.map(lane => io.in.bits(lane).valid))
+
+  val icount = PopCount(iactive)
+
+  // ---------------------------------------------------------------------------
+  // Fifo Control.
+  when (ivalid) {
+    for (i <- 0 until x) {
+      inxpos(i) := Increment(inxpos(i), icount)
+    }
+  }
+
+  when (ovalid) {
+    outpos := Increment(outpos, 1.U)
+  }
+
+  val inc = MuxOR(ivalid, icount)
+  val dec = mslice.io.in.valid && mslice.io.in.ready
+
+  when (ivalid || ovalid) {
+    mcount := mcount + inc - dec
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Input.
+  val inxvalid = FifoXValid(iactive)
+
+  for (i <- 0 until m) {
+    val valid = Cat(  // bit q: input lane q is written to entry i this cycle
+      (0 until x).reverse.map(q =>
+        if (q == 0) { inxpos(0) === i.U && inxvalid(0)(0) } else {
+          (0 to q).map(y =>
+            inxpos(y) === i.U && inxvalid(y)(q)
+          ).reduce(_ || _)
+        }
+      )
+    )
+
+    when (ivalid) {
+      when (valid =/= 0.U) {
+        val idx = PriorityEncoder(valid)
+        mem(i) := io.in.bits(idx).bits
+      }
+    }
+  }
+
+  mslice.io.in.valid := false.B
+  mslice.io.in.bits := io.in.bits(0).bits  // defaults
+
+  when (mcount > 0.U) {
+    when (io.out.ready) {
+      mslice.io.in.valid := true.B
+    }
+  } .otherwise {
+    when (ivalid && iactive =/= 0.U) {
+      mslice.io.in.valid := true.B
+    }
+  }
+
+  when (mcount > 0.U) {
+    mslice.io.in.bits := mem(outpos)
+  } .elsewhen (ivalid) {
+    assert(PopCount(iactive) >= 1.U)  // bypass path: an accepted push must carry a lane
+    when (iactive =/= 0.U) {
+      val idx = PriorityEncoder(iactive)
+      mslice.io.in.bits := io.in.bits(idx).bits
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Valid Entries.
+  val active = RegInit(0.U(m.W))  // NOTE(review): not read by any output of this module
+
+  val activeSet = MuxOR(ivalid,
+    (0 until x).map(i => (icount >= (i + 1).U) << inxpos(i)).reduce(_ | _)
+  )
+
+  val activeClr = MuxOR(mslice.io.in.valid && mslice.io.in.ready, 1.U << outpos)
+
+  active := (active | activeSet) & ~activeClr
+
+  // ---------------------------------------------------------------------------
+  // Interface.
+  io.in.ready := mcount <= (m.U - icount)
+  io.out <> mslice.io.out
+
+  assert(mcount <= m.U)
+}
+
+object EmitFifoX extends App {  // Emits SystemVerilog for a sample 4-lane, depth-11 config.
+  ChiselStage.emitSystemVerilogFile(new FifoX(UInt(8.W), 4, 11), args)
+}
diff --git a/hdl/chisel/src/common/FifoXe.scala b/hdl/chisel/src/common/FifoXe.scala
new file mode 100644
index 0000000..587be62
--- /dev/null
+++ b/hdl/chisel/src/common/FifoXe.scala
@@ -0,0 +1,136 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package common
+
+import chisel3._
+import chisel3.util._
+import _root_.circt.stage.ChiselStage
+
+// FifoX with entry output and no output registration stage.
+
+object FifoXe {  // Factory: wraps construction in Module(...).
+  def apply[T <: Data](t: T, x: Int, n: Int): FifoXe[T] = {
+    Module(new FifoXe(t, x, n))
+  }
+}
+
+class FifoXe[T <: Data](t: T, x: Int, n: Int) extends Module {
+  val io = IO(new Bundle {
+    val in  = Flipped(Decoupled(Vec(x, Valid(t))))  // x lanes sharing one ready/valid handshake
+    val out = Decoupled(t)                          // unregistered read of mem(outpos)
+    val count = Output(UInt(log2Ceil(n+1).W))       // registered occupancy
+    val entry = Output(Vec(n, Valid(t)))            // every storage slot with its active bit
+    val nempty = Output(Bool())                     // registered "occupancy =/= 0"
+  })
+
+  def Increment(a: UInt, b: UInt): UInt = {  // a + b, wrapped once at n, truncated to a's width
+    val c = a +& b
+    val d = Mux(c < n.U, c, c - n.U)(a.getWidth - 1, 0)
+    d
+  }
+
+  val mem = Mem(n, t)
+
+  val inxpos = RegInit(VecInit((0 until x).map(lane => lane.U((log2Ceil(n) + 1).W))))
+  val outpos = RegInit(0.U(log2Ceil(n).W))  // NOTE(review): inxpos is one bit wider -- confirm
+  val mcount = RegInit(0.U(log2Ceil(n+1).W))
+  val nempty = RegInit(false.B)
+
+  io.count := mcount
+  io.nempty := nempty
+
+  val ivalid = io.in.valid && io.in.ready
+  val ovalid = io.out.valid && io.out.ready
+
+  val iactive = Cat((0 until x).reverse.map(lane => io.in.bits(lane).valid))
+
+  val icount = PopCount(iactive)
+
+  // ---------------------------------------------------------------------------
+  // Fifo Control.
+  when (ivalid) {
+    for (i <- 0 until x) {
+      inxpos(i) := Increment(inxpos(i), icount)
+    }
+  }
+
+  when (ovalid) {
+    outpos := Increment(outpos, 1.U)
+  }
+
+  val inc = MuxOR(ivalid, icount)
+  val dec = ovalid
+
+  when (ivalid || ovalid) {
+    val nxtcount = mcount + inc - dec
+    mcount := nxtcount
+    nempty := nxtcount =/= 0.U
+  }
+
+  // ---------------------------------------------------------------------------
+  // Fifo Input.
+  val inxvalid = FifoXValid(iactive)
+
+  for (i <- 0 until n) {
+    val valid = Cat(  // bit q: input lane q is written to entry i this cycle
+      (0 until x).reverse.map(q =>
+        if (q == 0) { inxpos(0) === i.U && inxvalid(0)(0) } else {
+          (0 to q).map(y =>
+            inxpos(y) === i.U && inxvalid(y)(q)
+          ).reduce(_ || _)
+        }
+      )
+    )
+
+    when (ivalid) {
+      when (valid =/= 0.U) {
+        val idx = PriorityEncoder(valid)
+        mem(i) := io.in.bits(idx).bits
+      }
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Valid Entries.
+  val active = RegInit(0.U(n.W))  // bit k set while entry k holds a queued element
+
+  val activeSet = MuxOR(ivalid,
+    (0 until x).map(i => (icount >= (i + 1).U) << inxpos(i)).reduce(_ | _)
+  )
+
+  val activeClr = MuxOR(io.out.valid && io.out.ready, 1.U << outpos)
+
+  when (io.in.valid && io.in.ready || io.out.valid && io.out.ready) {
+    active := (active | activeSet) & ~activeClr
+  }
+
+  // ---------------------------------------------------------------------------
+  // Interface.
+  io.in.ready := mcount <= (n.U - icount)
+
+  io.out.valid := mcount =/= 0.U
+  io.out.bits := mem(outpos)
+
+  assert(mcount <= n.U)
+
+  for (i <- 0 until n) {
+    io.entry(i).valid := active(i)
+    io.entry(i).bits := mem(i)
+  }
+}
+
+object EmitFifoXe extends App {  // Emits SystemVerilog for a sample 4-lane, depth-10 config.
+  ChiselStage.emitSystemVerilogFile(new FifoXe(UInt(8.W), 4, 10), args)
+}
diff --git a/hdl/chisel/src/kelvin/Parameters.scala b/hdl/chisel/src/kelvin/Parameters.scala
index 5b4b14e..35b4929 100644
--- a/hdl/chisel/src/kelvin/Parameters.scala
+++ b/hdl/chisel/src/kelvin/Parameters.scala
@@ -56,6 +56,12 @@
 
   val vectorCountBits = log2Ceil(vectorBits / 8) + 1 + 2  // +2 stripmine
 
+  val vectorAluCount = 2
+  val vectorReadPorts = (vectorAluCount * 3) + 1
+  val vectorWritePorts = 6
+  val vectorWhintPorts = 4
+  val vectorScalarPorts = 2
+
   // Vector queue.
   val vectorFifoDepth = 16
 
diff --git a/hdl/chisel/src/kelvin/scalar/Debug.scala b/hdl/chisel/src/kelvin/scalar/Debug.scala
index 4181680..d2123c6 100644
--- a/hdl/chisel/src/kelvin/scalar/Debug.scala
+++ b/hdl/chisel/src/kelvin/scalar/Debug.scala
@@ -21,13 +21,7 @@
 // Debug signals for HDL development.
 class DebugIO(p: Parameters) extends Bundle {
   val en = Output(UInt(4.W))
-  val addr0 = Output(UInt(32.W))
-  val addr1 = Output(UInt(32.W))
-  val addr2 = Output(UInt(32.W))
-  val addr3 = Output(UInt(32.W))
-  val inst0 = Output(UInt(32.W))
-  val inst1 = Output(UInt(32.W))
-  val inst2 = Output(UInt(32.W))
-  val inst3 = Output(UInt(32.W))
+  val addr = Output(Vec(p.instructionLanes, UInt(32.W)))  // one 32-bit entry per instruction lane
+  val inst = Output(Vec(p.instructionLanes, UInt(32.W)))  // one 32-bit entry per instruction lane
   val cycles = Output(UInt(32.W))
 }
diff --git a/hdl/chisel/src/kelvin/scalar/Fetch.scala b/hdl/chisel/src/kelvin/scalar/Fetch.scala
index cb21c9d..d6de873 100644
--- a/hdl/chisel/src/kelvin/scalar/Fetch.scala
+++ b/hdl/chisel/src/kelvin/scalar/Fetch.scala
@@ -56,7 +56,7 @@
     val csr = new CsrInIO(p)
     val ibus = new IBusIO(p)
     val inst = new FetchIO(p)
-    val branch = Flipped(Vec(4, new BranchTakenIO(p)))
+    val branch = Flipped(Vec(p.instructionLanes, new BranchTakenIO(p)))
     val linkPort = Flipped(new RegfileLinkPortIO)
     val iflush = Flipped(new IFlushIO(p))
   })
@@ -103,9 +103,9 @@
   val l0data  = Reg(Vec(indices, UInt(p.fetchDataBits.W)))
 
   // Instruction outputs.
-  val instValid = RegInit(VecInit(Seq.fill(4)(false.B)))
-  val instAddr  = Reg(Vec(4, UInt(p.instructionBits.W)))
-  val instBits  = Reg(Vec(4, UInt(p.instructionBits.W)))
+  val instValid = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+  val instAddr  = Reg(Vec(p.instructionLanes, UInt(p.instructionBits.W)))
+  val instBits  = Reg(Vec(p.instructionLanes, UInt(p.instructionBits.W)))
 
   val instAligned0 = Cat(instAddr(0)(31, indexLsb), 0.U(indexLsb.W))
   val instAligned1 = instAligned0 + Cat(1.U, 0.U(indexLsb.W))
@@ -135,71 +135,48 @@
     (jal, target)
   }
 
-  val (preBranchTaken0, preBranchTarget0) =
-      Predecode(instAddr(0), instBits(0))
-  val (preBranchTaken1, preBranchTarget1) =
-      Predecode(instAddr(1), instBits(1))
-  val (preBranchTaken2, preBranchTarget2) =
-      Predecode(instAddr(2), instBits(2))
-  val (preBranchTaken3, preBranchTarget3) =
-      Predecode(instAddr(3), instBits(3))
+  val preBranch = (0 until p.instructionLanes).map(x => Predecode(instAddr(x), instBits(x)))
+  val preBranchTakens = preBranch.map { case (taken, target) => taken }
+  val preBranchTargets = preBranch.map { case (taken, target) => target }
 
-  val preBranchTaken = io.inst.lanes(0).valid && preBranchTaken0 ||
-                       io.inst.lanes(1).valid && preBranchTaken1 ||
-                       io.inst.lanes(2).valid && preBranchTaken2 ||
-                       io.inst.lanes(3).valid && preBranchTaken3
+  val preBranchTaken = (0 until p.instructionLanes).map(i =>
+    io.inst.lanes(i).valid && preBranchTakens(i)).reduce(_ || _)
 
-  val preBranchTarget = Mux(preBranchTaken0, preBranchTarget0,
-                        Mux(preBranchTaken1, preBranchTarget1,
-                        Mux(preBranchTaken2, preBranchTarget2,
-                            preBranchTarget3)))
+  val preBranchTarget = MuxCase(
+    preBranchTargets(p.instructionLanes - 1),
+    (0 until p.instructionLanes - 1).map(i => preBranchTakens(i) -> preBranchTargets(i))
+  )
 
   val preBranchTag = preBranchTarget(tagMsb, tagLsb)
   val preBranchIndex = preBranchTarget(indexMsb, indexLsb)
 
-  val branchTag0 = io.branch(0).value(tagMsb, tagLsb)
-  val branchTag1 = io.branch(1).value(tagMsb, tagLsb)
-  val branchTag2 = io.branch(2).value(tagMsb, tagLsb)
-  val branchTag3 = io.branch(3).value(tagMsb, tagLsb)
-  val branchIndex0 = io.branch(0).value(indexMsb, indexLsb)
-  val branchIndex1 = io.branch(1).value(indexMsb, indexLsb)
-  val branchIndex2 = io.branch(2).value(indexMsb, indexLsb)
-  val branchIndex3 = io.branch(3).value(indexMsb, indexLsb)
+  val branchTags = io.branch.map(x => x.value(tagMsb, tagLsb))
+  val branchIndices = io.branch.map(x => x.value(indexMsb, indexLsb))
 
-  val l0validB0 = l0valid(branchIndex0)
-  val l0validB1 = l0valid(branchIndex1)
-  val l0validB2 = l0valid(branchIndex2)
-  val l0validB3 = l0valid(branchIndex3)
+  val l0valids = (0 until p.instructionLanes).map(x => l0valid(branchIndices(x)))
   val l0validP  = l0valid(preBranchIndex)
 
-  val l0tagB0 = VecAt(l0tag, branchIndex0)
-  val l0tagB1 = VecAt(l0tag, branchIndex1)
-  val l0tagB2 = VecAt(l0tag, branchIndex2)
-  val l0tagB3 = VecAt(l0tag, branchIndex3)
+  val l0tags = (0 until p.instructionLanes).map(x => VecAt(l0tag, branchIndices(x)))
   val l0tagP  = VecAt(l0tag, preBranchIndex)
 
-  val reqB0 = io.branch(0).valid && !l0req(branchIndex0) &&
-      (branchTag0 =/= l0tagB0 || !l0validB0)
-  val reqB1 = io.branch(1).valid && !l0req(branchIndex1) &&
-      (branchTag1 =/= l0tagB1 || !l0validB1) &&
-      !io.branch(0).valid
-  val reqB2 = io.branch(2).valid && !l0req(branchIndex2) &&
-      (branchTag2 =/= l0tagB2 || !l0validB2) &&
-      !io.branch(0).valid && !io.branch(1).valid
-  val reqB3 = io.branch(3).valid && !l0req(branchIndex3) &&
-      (branchTag3 =/= l0tagB3 || !l0validB3) &&
-      !io.branch(0).valid && !io.branch(1).valid && !io.branch(2).valid
+  val reqBValid = (0 until p.instructionLanes).map(x =>
+      io.branch(x).valid && !l0req(branchIndices(x)) &&
+      (branchTags(x) =/= l0tags(x) || !l0valids(x)))
+  val prevValid = io.branch.map(_.valid).scan(false.B)(_||_)
+  val reqs = (0 until p.instructionLanes).map(x => reqBValid(x) && !prevValid(x))
+
   val reqP = preBranchTaken && !l0req(preBranchIndex) && (preBranchTag =/= l0tagP || !l0validP)
   val req0 = !match0 && !l0req(instIndex0)
   val req1 = !match1 && !l0req(instIndex1)
 
-  aslice.io.in.valid := (reqB0 || reqB1 || reqB2 || reqB3 || reqP || req0 || req1) && !io.iflush.valid
-  aslice.io.in.bits  := Mux(reqB0, Cat(io.branch(0).value(31,indexLsb), 0.U(indexLsb.W)),
-                        Mux(reqB1, Cat(io.branch(1).value(31,indexLsb), 0.U(indexLsb.W)),
-                        Mux(reqB2, Cat(io.branch(2).value(31,indexLsb), 0.U(indexLsb.W)),
-                        Mux(reqB3, Cat(io.branch(3).value(31,indexLsb), 0.U(indexLsb.W)),
-                        Mux(reqP,  Cat(preBranchTarget(31,indexLsb), 0.U(indexLsb.W)),
-                        Mux(req0, instAligned0, instAligned1))))))
+  aslice.io.in.valid := (reqs ++ Seq(reqP, req0, req1)).reduce(_ || _) && !io.iflush.valid
+  aslice.io.in.bits := MuxCase(instAligned1,
+    (0 until p.instructionLanes).map(x => reqs(x) -> Cat(io.branch(x).value(31,indexLsb), 0.U(indexLsb.W))) ++
+    Array(
+      reqP -> Cat(preBranchTarget(31,indexLsb), 0.U(indexLsb.W)),
+      req0 -> instAligned0,
+    )
+  )
 
   when (readAddrEn) {
     readAddr := io.ibus.addr
@@ -253,25 +230,27 @@
   // creates excessive timing pressure. We know that the match is either on
   // the old line or the next line, so can late mux on lookups of prior.
   // Widen the arithmetic paths and select from results.
-  val fetchEn = Wire(Vec(4, Bool()))
+  val fetchEn = Wire(Vec(p.instructionLanes, Bool()))
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     fetchEn(i) := io.inst.lanes(i).valid && io.inst.lanes(i).ready
   }
 
-  val fsel = Cat(fetchEn(3),
-                 fetchEn(2) && !fetchEn(3),
-                 fetchEn(1) && !fetchEn(2) && !fetchEn(3),
-                 fetchEn(0) && !fetchEn(1) && !fetchEn(2) && !fetchEn(3),
-                 !fetchEn(0) && !fetchEn(1) && !fetchEn(2) && !fetchEn(3))
+  val fsela = Cat((0 until p.instructionLanes).reverse.map(x =>
+    (x until p.instructionLanes).map(y =>
+      (if (y == x) { fetchEn(y) } else { !fetchEn(y) })
+    ).reduce(_ && _)
+  ))
+  val fselb = (0 until p.instructionLanes).map(x => !fetchEn(x)).reduce(_ && _)
+  val fsel = Cat(fsela, fselb)
 
-  val nxtInstAddrOffset = instAddr.map(x => x) ++ instAddr.map(x => x + 16.U)
-  val nxtInstAddr = (0 until 4).map(i =>
-      (0 until 5).map(
+  val nxtInstAddrOffset = instAddr.map(x => x) ++ instAddr.map(x => x + (p.instructionLanes * 4).U)
+  val nxtInstAddr = (0 until p.instructionLanes).map(i =>
+      (0 until (p.instructionLanes + 1)).map(
           j => MuxOR(fsel(j), nxtInstAddrOffset(j + i))).reduce(_|_))
 
   val nxtInstIndex0 = nxtInstAddr(0)(indexMsb, indexLsb)
-  val nxtInstIndex1 = nxtInstAddr(3)(indexMsb, indexLsb)
+  val nxtInstIndex1 = nxtInstAddr(p.instructionLanes - 1)(indexMsb, indexLsb)
 
   val readFwd0 =
       readDataEn && readAddr(31,indexLsb) === instAligned0(31,indexLsb)
@@ -286,7 +265,7 @@
   val nxtMatch1 =
       Mux(instIndex0(0) === nxtInstIndex1(0), nxtMatch0Fwd, nxtMatch1Fwd)
 
-  val nxtInstValid = Wire(Vec(4, Bool()))
+  val nxtInstValid = Wire(Vec(p.instructionLanes, Bool()))
 
   val nxtInstBits0 = Mux(readFwd0, readData, VecAt(l0data, instIndex0))
   val nxtInstBits1 = Mux(readFwd1, readData, VecAt(l0data, instIndex1))
@@ -301,23 +280,18 @@
   def BranchMatchDe(valid: Bool, value: UInt):
       (Bool, UInt, Vec[UInt], Vec[UInt]) = {
 
-    val addr = VecInit(value,
-                       value + 4.U,
-                       value + 8.U,
-                       value + 12.U)
+    val addr = VecInit((0 until p.instructionLanes).map(x => value + (x * 4).U))
 
     val match0 = l0valid(addr(0)(indexMsb,indexLsb)) &&
         addr(0)(tagMsb,tagLsb) === VecAt(l0tag, addr(0)(indexMsb,indexLsb))
-    val match1 = l0valid(addr(3)(indexMsb,indexLsb)) &&
-        addr(3)(tagMsb,tagLsb) === VecAt(l0tag, addr(3)(indexMsb,indexLsb))
+    val match1 = l0valid(addr(p.instructionLanes - 1)(indexMsb,indexLsb)) &&
+        addr(p.instructionLanes - 1)(tagMsb,tagLsb) === VecAt(l0tag, addr(p.instructionLanes - 1)(indexMsb,indexLsb))
 
-    val vvalid = VecInit(Mux(addr(0)(4,2) <= 7.U, match0, match1),
-                         Mux(addr(0)(4,2) <= 6.U, match0, match1),
-                         Mux(addr(0)(4,2) <= 5.U, match0, match1),
-                         Mux(addr(0)(4,2) <= 4.U, match0, match1))
+    val vvalid = VecInit((0 until p.instructionLanes).map(x =>
+      Mux(addr(0)(4,2) <= (7 - x).U, match0, match1)))  // lane x is still in lane 0's 8-word line iff word index + x <= 7
 
     val muxbits0 = VecAt(l0data, addr(0)(indexMsb,indexLsb))
-    val muxbits1 = VecAt(l0data, addr(3)(indexMsb,indexLsb))
+    val muxbits1 = VecAt(l0data, addr(p.instructionLanes - 1)(indexMsb,indexLsb))
     val muxbits = Wire(Vec(16, UInt(p.instructionBits.W)))
 
     for (i <- 0 until 8) {
@@ -326,8 +300,8 @@
       muxbits(i + 8) := muxbits1(31 + offset, offset)
     }
 
-    val bits = Wire(Vec(4, UInt(p.instructionBits.W)))
-    for (i <- 0 until 4) {
+    val bits = Wire(Vec(p.instructionLanes, UInt(p.instructionBits.W)))
+    for (i <- 0 until p.instructionLanes) {
       val idx = Cat(addr(0)(5) =/= addr(i)(5), addr(i)(4,2))
       bits(i) := VecAt(muxbits, idx)
     }
@@ -337,38 +311,26 @@
 
   def BranchMatchEx(branch: Vec[BranchTakenIO]):
       (Bool, UInt, Vec[UInt], Vec[UInt]) = {
-    val valid = branch(0).valid || branch(1).valid ||
-                branch(2).valid || branch(3).valid
+    val valid = branch.map(x => x.valid).reduce(_ || _)
 
-    val addr = VecInit(Mux(branch(0).valid, branch(0).value,
-                       Mux(branch(1).valid, branch(1).value,
-                       Mux(branch(2).valid, branch(2).value,
-                                            branch(3).value))),
-                       Mux(branch(0).valid, branch(0).value + 4.U,
-                       Mux(branch(1).valid, branch(1).value + 4.U,
-                       Mux(branch(2).valid, branch(2).value + 4.U,
-                                            branch(3).value + 4.U))),
-                       Mux(branch(0).valid, branch(0).value + 8.U,
-                       Mux(branch(1).valid, branch(1).value + 8.U,
-                       Mux(branch(2).valid, branch(2).value + 8.U,
-                                            branch(3).value + 8.U))),
-                       Mux(branch(0).valid, branch(0).value + 12.U,
-                       Mux(branch(1).valid, branch(1).value + 12.U,
-                       Mux(branch(2).valid, branch(2).value + 12.U,
-                                            branch(3).value + 12.U))))
+
+    val addr = VecInit((0 until branch.length).map(x =>
+      MuxCase(branch(branch.length - 1).value + (x * 4).U, (
+        (0 until branch.length - 1).map(y =>
+          branch(y).valid -> (branch(y).value + (x * 4).U)
+        )
+      ))))
 
     val match0 = l0valid(addr(0)(indexMsb,indexLsb)) &&
         addr(0)(tagMsb,tagLsb) === VecAt(l0tag, addr(0)(indexMsb,indexLsb))
-    val match1 = l0valid(addr(3)(indexMsb,indexLsb)) &&
-        addr(3)(tagMsb,tagLsb) === VecAt(l0tag, addr(3)(indexMsb,indexLsb))
+    val match1 = l0valid(addr(branch.length - 1)(indexMsb,indexLsb)) &&
+        addr(branch.length - 1)(tagMsb,tagLsb) === VecAt(l0tag, addr(branch.length - 1)(indexMsb,indexLsb))
 
-    val vvalid = VecInit(Mux(addr(0)(4,2) <= 7.U, match0, match1),
-                         Mux(addr(0)(4,2) <= 6.U, match0, match1),
-                         Mux(addr(0)(4,2) <= 5.U, match0, match1),
-                         Mux(addr(0)(4,2) <= 4.U, match0, match1))
+    val vvalid = VecInit((0 until branch.length).map(x =>
+      Mux(addr(0)(4,2) <= (7 - x).U, match0, match1)))  // lane x is still in lane 0's 8-word line iff word index + x <= 7
 
     val muxbits0 = VecAt(l0data, addr(0)(indexMsb,indexLsb))
-    val muxbits1 = VecAt(l0data, addr(3)(indexMsb,indexLsb))
+    val muxbits1 = VecAt(l0data, addr(branch.length - 1)(indexMsb,indexLsb))
     val muxbits = Wire(Vec(16, UInt(p.instructionBits.W)))
 
     for (i <- 0 until 8) {
@@ -377,8 +339,8 @@
       muxbits(i + 8) := muxbits1(31 + offset, offset)
     }
 
-    val bits = Wire(Vec(4, UInt(p.instructionBits.W)))
-    for (i <- 0 until 4) {
+    val bits = Wire(Vec(branch.length, UInt(p.instructionBits.W)))
+    for (i <- 0 until branch.length) {
       val idx = Cat(addr(0)(5) =/= addr(i)(5), addr(i)(4,2))
       bits(i) := VecAt(muxbits, idx)
     }
@@ -399,21 +361,17 @@
     (jal || ret || bxx, target)
   }
 
-  val (brchTakenDe0, brchTargetDe0) = PredecodeDe(instAddr(0), instBits(0))
-  val (brchTakenDe1, brchTargetDe1) = PredecodeDe(instAddr(1), instBits(1))
-  val (brchTakenDe2, brchTargetDe2) = PredecodeDe(instAddr(2), instBits(2))
-  val (brchTakenDe3, brchTargetDe3) = PredecodeDe(instAddr(3), instBits(3))
+  val brchDe = (0 until p.instructionLanes).map(x => PredecodeDe(instAddr(x), instBits(x)))
+  val brchTakensDe = brchDe.map { case (taken, target) => taken }
+  val brchTargetsDe = brchDe.map { case (taken, target) => target }
 
-  val brchTakenDeOr =
-      io.inst.lanes(0).valid && io.inst.lanes(0).ready && brchTakenDe0 ||
-      io.inst.lanes(1).valid && io.inst.lanes(1).ready && brchTakenDe1 ||
-      io.inst.lanes(2).valid && io.inst.lanes(2).ready && brchTakenDe2 ||
-      io.inst.lanes(3).valid && io.inst.lanes(3).ready && brchTakenDe3
+  val brchTakenDeOr = (0 until p.instructionLanes).map(x =>
+    io.inst.lanes(x).ready && io.inst.lanes(x).valid && brchTakensDe(x)
+  ).reduce(_ || _)
 
-  val brchTargetDe = Mux(brchTakenDe0, brchTargetDe0,
-                     Mux(brchTakenDe1, brchTargetDe1,
-                     Mux(brchTakenDe2, brchTargetDe2,
-                         brchTargetDe3)))
+  val brchTargetDe = MuxCase(brchTargetsDe(p.instructionLanes - 1),
+    (0 until p.instructionLanes - 1).map(x => brchTakensDe(x) -> brchTargetsDe(x))
+  )
 
   val (brchTakenDe, brchValidDe, brchAddrDe, brchBitsDe) =
       BranchMatchDe(brchTakenDeOr, brchTargetDe)
@@ -421,21 +379,27 @@
   val (brchTakenEx, brchValidEx, brchAddrEx, brchBitsEx) =
       BranchMatchEx(io.branch)
 
+
   val brchValidDeMask =
-      Cat(!brchTakenDe0 && !brchTakenDe1 && !brchTakenDe2,
-          !brchTakenDe0 && !brchTakenDe1,
-          !brchTakenDe0,
-          true.B)
+      Cat((0 until p.instructionLanes).reverse.map(x =>
+        if (x == 0) { true.B } else {
+          (0 until x).map(y =>
+            !brchTakensDe(y)
+          ).reduce(_ && _)
+        }
+      ))
 
-  val brchFwd = Cat(
-      brchTakenDe3 && !brchTakenDe0 && !brchTakenDe1 && !brchTakenDe2,
-      brchTakenDe2 && !brchTakenDe0 && !brchTakenDe1,
-      brchTakenDe1 && !brchTakenDe0,
-      brchTakenDe0)
+  val brchFwd =
+    Cat((0 until p.instructionLanes).reverse.map(x =>
+      brchTakensDe(x) && (if (x == 0) { true.B } else { (0 until x).map(y => !brchTakensDe(y)).reduce(_ && _) })
+    ))
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     // 1, 11, 111, ...
-    nxtInstValid(i) := Mux(nxtInstAddr(0)(4,2) <= (7 - i).U, nxtMatch0, nxtMatch1)
+    nxtInstValid(i) := Mux(
+      nxtInstAddr(0)(4,2) <= (7 - i).U,
+      nxtMatch0,
+      nxtMatch1)
 
     val nxtInstValidUInt = nxtInstValid.asUInt
     instValid(i) := Mux(brchTakenEx, brchValidEx(i,0) === ~0.U((i+1).W),
@@ -457,14 +421,11 @@
   // This pattern of separate when() blocks requires resets after the data.
   when (reset.asBool) {
     val addr = Cat(io.csr.value(0)(31,2), 0.U(2.W))
-    instAddr(0) := addr
-    instAddr(1) := addr + 4.U
-    instAddr(2) := addr + 8.U
-    instAddr(3) := addr + 12.U
+    instAddr := (0 until p.instructionLanes).map(i => addr + (4 * i).U)
   }
 
   // Outputs
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     io.inst.lanes(i).valid := instValid(i) & brchValidDeMask(i)
     io.inst.lanes(i).addr  := instAddr(i)
     io.inst.lanes(i).inst  := instBits(i)
@@ -472,23 +433,19 @@
   }
 
   // Assertions.
-  assert(instAddr(0) + 4.U === instAddr(1))
-  assert(instAddr(0) + 8.U === instAddr(2))
-  assert(instAddr(0) + 12.U === instAddr(3))
+  for (i <- 1 until p.instructionLanes) {
+    assert(instAddr(0) + (4 * i).U === instAddr(i))
+  }
 
-  assert(fsel.getWidth == 5)
+  assert(fsel.getWidth == (p.instructionLanes + 1))
   assert(PopCount(fsel) <= 1.U)
 
   val instValidUInt = instValid.asUInt
-  assert(!(!instValidUInt(0) && (instValidUInt(3,1) =/= 0.U)))
-  assert(!(!instValidUInt(1) && (instValidUInt(3,2) =/= 0.U)))
-  assert(!(!instValidUInt(2) && (instValidUInt(3,3) =/= 0.U)))
-
-  val instLanesReady = Cat(io.inst.lanes(3).ready, io.inst.lanes(2).ready,
-                           io.inst.lanes(1).ready, io.inst.lanes(0).ready)
-  assert(!(!instLanesReady(0) && (instLanesReady(3,1) =/= 0.U)))
-  assert(!(!instLanesReady(1) && (instLanesReady(3,2) =/= 0.U)))
-  assert(!(!instLanesReady(2) && (instLanesReady(3,3) =/= 0.U)))
+  val instLanesReady = Cat((0 until p.instructionLanes).reverse.map(x => io.inst.lanes(x).ready))
+  for (i <- 0 until p.instructionLanes - 1) {
+    assert(!(!instValidUInt(i) && (instValidUInt(p.instructionLanes - 1, i + 1) =/= 0.U)))
+    assert(!(!instLanesReady(i) && (instLanesReady(p.instructionLanes - 1, i + 1) =/= 0.U)))
+  }
 }
 
 object EmitFetch extends App {
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 60aa158..b13364d 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -92,8 +92,8 @@
 class Lsu(p: Parameters) extends Module {
   val io = IO(new Bundle {
     // Decode cycle.
-    val req = Vec(4, new LsuIO(p))
-    val busPort = Flipped(new RegfileBusPortIO)
+    val req = Vec(p.instructionLanes, new LsuIO(p))
+    val busPort = Flipped(new RegfileBusPortIO(p))
 
     // Execute cycle(s).
     val rd = Flipped(new RegfileWriteDataIO)
@@ -115,16 +115,18 @@
 
   // AXI Queues.
   val n = 8
-  val ctrl = Fifo4(new LsuCtrl(p), n)
+  val ctrl = FifoX(new LsuCtrl(p), p.instructionLanes, n)
   val data = Slice(new LsuReadData(p), true, true)
 
   // Match and mask.
-  val ctrlready = Cat(ctrl.io.count <= (n - 4).U,
-                      ctrl.io.count <= (n - 3).U,
-                      ctrl.io.count <= (n - 2).U,
-                      ctrl.io.count <= (n - 1).U)
+  // ctrlready(i) gates io.req(i).ready: lane i requires
+  // ctrl.io.count <= n - (i + 1), i.e. lane 0 has the loosest bound.
+  // This is a Seq indexed by lane (not a Cat'd UInt, which lists its MSB
+  // first), so it is built in ascending lane order without a reverse.
+  val ctrlready = (1 to p.instructionLanes).map(
+    x => ctrl.io.count <= (n - x).U)
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     io.req(i).ready := ctrlready(i) && data.io.in.ready
   }
 
@@ -137,7 +139,7 @@
   ctrl.io.in.valid := io.req.map(_.valid).reduce(_||_)
 
   val uncacheable = p.m.filter(x => !x.cacheable)
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     val uncached = io.busPort.addr(i)(31) ||
       (if (uncacheable.length > 0) uncacheable.map(x => (io.busPort.addr(i) >= x.memStart.U) && (io.busPort.addr(i) < (x.memStart + x.memSize).U)).reduce(_||_) else false.B)
 
diff --git a/hdl/chisel/src/kelvin/scalar/Mlu.scala b/hdl/chisel/src/kelvin/scalar/Mlu.scala
index 349104d..b7ad953 100644
--- a/hdl/chisel/src/kelvin/scalar/Mlu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Mlu.scala
@@ -47,11 +47,11 @@
 class Mlu(p: Parameters) extends Module {
   val io = IO(new Bundle {
     // Decode cycle.
-    val req = Vec(4, new MluIO(p))
+    val req = Vec(p.instructionLanes, new MluIO(p))
 
     // Execute cycle.
-    val rs1 = Vec(4, Flipped(new RegfileReadDataIO))
-    val rs2 = Vec(4, Flipped(new RegfileReadDataIO))
+    val rs1 = Vec(p.instructionLanes, Flipped(new RegfileReadDataIO))
+    val rs2 = Vec(p.instructionLanes, Flipped(new RegfileReadDataIO))
     val rd  = Flipped(new RegfileWriteDataIO)
   })
 
@@ -62,41 +62,25 @@
   val valid2 = RegInit(false.B)
   val addr1 = Reg(UInt(5.W))
   val addr2 = Reg(UInt(5.W))
-  val sel = Reg(UInt(4.W))
+  val sel = Reg(UInt(p.instructionLanes.W))
 
+  val valids = io.req.map(_.valid)
+  assert(valids.length == p.instructionLanes)
   valid1 := io.req.map(_.valid).reduce(_||_)
   valid2 := valid1
 
-  when (io.req(0).valid) {
-    op := io.req(0).op
-    addr1 := io.req(0).addr
-    sel := 1.U
-  } .elsewhen (io.req(1).valid) {
-    op := io.req(1).op
-    addr1 := io.req(1).addr
-    sel := 2.U
-  } .elsewhen (io.req(2).valid) {
-    op := io.req(2).op
-    addr1 := io.req(2).addr
-    sel := 4.U
-  } .elsewhen (io.req(3).valid) {
-    op := io.req(3).op
-    addr1 := io.req(3).addr
-    sel := 8.U
+  when (valids.reduce(_||_)) {
+    val idx = PriorityEncoder(valids)
+    op := io.req(idx).op
+    addr1 := io.req(idx).addr
+    sel := (1.U << idx)
   } .otherwise {
     op := 0.U
     sel := 0.U
   }
 
-  val rs1 = MuxOR(valid1 & sel(0), io.rs1(0).data) |
-            MuxOR(valid1 & sel(1), io.rs1(1).data) |
-            MuxOR(valid1 & sel(2), io.rs1(2).data) |
-            MuxOR(valid1 & sel(3), io.rs1(3).data)
-
-  val rs2 = MuxOR(valid1 & sel(0), io.rs2(0).data) |
-            MuxOR(valid1 & sel(1), io.rs2(1).data) |
-            MuxOR(valid1 & sel(2), io.rs2(2).data) |
-            MuxOR(valid1 & sel(3), io.rs2(3).data)
+  val rs1 = (0 until p.instructionLanes).map(x => MuxOR(valid1 & sel(x), io.rs1(x).data)).reduce(_ | _)
+  val rs2 = (0 until p.instructionLanes).map(x => MuxOR(valid1 & sel(x), io.rs2(x).data)).reduce(_ | _)
 
   // Multiplier has a registered output.
   val mul2 = Reg(UInt(32.W))
@@ -142,7 +126,7 @@
   io.rd.data  := mul2 + round2
 
   // Assertions.
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     assert(!(valid1 && sel(i) && !io.rs1(i).valid))
     assert(!(valid1 && sel(i) && !io.rs2(i).valid))
   }
diff --git a/hdl/chisel/src/kelvin/scalar/Regfile.scala b/hdl/chisel/src/kelvin/scalar/Regfile.scala
index 6dfdd00..7397bb7 100644
--- a/hdl/chisel/src/kelvin/scalar/Regfile.scala
+++ b/hdl/chisel/src/kelvin/scalar/Regfile.scala
@@ -62,9 +62,9 @@
   val immed = Input(UInt(32.W))
 }
 
-class RegfileBusPortIO extends Bundle {
-  val addr = Output(Vec(4, UInt(32.W)))
-  val data = Output(Vec(4, UInt(32.W)))
+class RegfileBusPortIO(p: Parameters) extends Bundle {
+  val addr = Output(Vec(p.instructionLanes, UInt(32.W)))
+  val data = Output(Vec(p.instructionLanes, UInt(32.W)))
 }
 
 class RegfileLinkPortIO extends Bundle {
@@ -79,18 +79,18 @@
 class Regfile(p: Parameters) extends Module {
   val io = IO(new Bundle {
     // Decode cycle.
-    val readAddr = Vec(8, new RegfileReadAddrIO)
-    val readSet  = Vec(8, new RegfileReadSetIO)
-    val writeAddr = Vec(4, new RegfileWriteAddrIO)
-    val busAddr = Vec(4, new RegfileBusAddrIO)
-    val target = Vec(4, new RegfileBranchTargetIO)
+    val readAddr = Vec(p.instructionLanes * 2, new RegfileReadAddrIO)
+    val readSet  = Vec(p.instructionLanes * 2, new RegfileReadSetIO)
+    val writeAddr = Vec(p.instructionLanes, new RegfileWriteAddrIO)
+    val busAddr = Vec(p.instructionLanes, new RegfileBusAddrIO)
+    val target = Vec(p.instructionLanes, new RegfileBranchTargetIO)
     val linkPort = new RegfileLinkPortIO
-    val busPort = new RegfileBusPortIO
+    val busPort = new RegfileBusPortIO(p)
 
     // Execute cycle.
-    val readData = Vec(8, new RegfileReadDataIO)
-    val writeData = Vec(6, new RegfileWriteDataIO)
-    val writeMask = Vec(4, new Bundle {val valid = Input(Bool())})
+    val readData = Vec(p.instructionLanes * 2, new RegfileReadDataIO)
+    val writeData = Vec(p.instructionLanes + 2, new RegfileWriteDataIO)
+    val writeMask = Vec(p.instructionLanes, new Bundle {val valid = Input(Bool())})
     val scoreboard = new Bundle {
       val regd = Output(UInt(32.W))
       val comb = Output(UInt(32.W))
@@ -130,11 +130,11 @@
   // ***************************************************************************
   // The read port response.
   // ***************************************************************************
-  val readDataReady = RegInit(VecInit(Seq.fill(8){false.B}))
-  val readDataBits  = Reg(Vec(8, UInt(32.W)))
-  val nxtReadDataBits = Wire(Vec(8, UInt(32.W)))
+  val readDataReady = RegInit(VecInit(Seq.fill(p.instructionLanes * 2){false.B}))
+  val readDataBits  = Reg(Vec(p.instructionLanes * 2, UInt(32.W)))
+  val nxtReadDataBits = Wire(Vec(p.instructionLanes * 2, UInt(32.W)))
 
-  for (i <- 0 until 8) {
+  for (i <- 0 until (p.instructionLanes * 2)) {
     io.readData(i).valid := readDataReady(i)
     io.readData(i).data  := readDataBits(i)
   }
@@ -149,18 +149,13 @@
   writeData(0)  := 0.U     // regfile(0) is optimized away
 
   for (i <- 1 until 32) {
-    val valid = Cat(io.writeData(5).valid && io.writeData(5).addr === i.U,
-                    io.writeData(4).valid && io.writeData(4).addr === i.U,
-                    io.writeData(3).valid && io.writeData(3).addr === i.U &&
-                      !io.writeMask(3).valid,
-                    io.writeData(2).valid && io.writeData(2).addr === i.U &&
-                      !io.writeMask(2).valid,
-                    io.writeData(1).valid && io.writeData(1).addr === i.U &&
-                      !io.writeMask(1).valid,
-                    io.writeData(0).valid && io.writeData(0).addr === i.U &&
-                      !io.writeMask(0).valid)
+    val valid = Cat(
+      Array(io.writeData(p.instructionLanes + 1).valid && io.writeData(p.instructionLanes + 1).addr === i.U,
+            io.writeData(p.instructionLanes).valid && io.writeData(p.instructionLanes).addr === i.U) ++
+            (0 until p.instructionLanes).reverse.map(x => io.writeData(x).valid && io.writeData(x).addr === i.U && !io.writeMask(x).valid)
+    )
 
-    val data  = (0 until 6).map(x => MuxOR(valid(x), io.writeData(x).data)).reduce(_|_)
+    val data  = (0 until p.instructionLanes + 2).map(x => MuxOR(valid(x), io.writeData(x).data)).reduce(_|_)
 
     writeValid(i) := valid =/= 0.U
     writeData(i)  := data
@@ -177,21 +172,21 @@
   // We care if someone tried to write x0 (e.g. nop is encoded this way), but want
   // it separate for above mentioned optimization.
   val x0 =
-    (0 until 4).map(x =>
+    (0 until p.instructionLanes).map(x =>
       io.writeData(x).valid &&
       io.writeData(x).addr === 0.U &&
       !io.writeMask(x).valid) ++
-    (4 until 6).map(x => io.writeData(x).valid && io.writeData(x).addr === 0.U)
+    (p.instructionLanes until p.instructionLanes + 2).map(x => io.writeData(x).valid && io.writeData(x).addr === 0.U)
 
   io.rfwriteCount := PopCount(writeValid) - writeValid(0) + PopCount(x0)
 
   // ***************************************************************************
   // Read ports with write forwarding.
   // ***************************************************************************
-  val rdata = Wire(Vec(8, UInt(32.W)))
-  val wdata = Wire(Vec(8, UInt(32.W)))
-  val rwdata = Wire(Vec(8, UInt(32.W)))
-  for (i <- 0 until 8) {
+  val rdata = Wire(Vec((p.instructionLanes * 2), UInt(32.W)))
+  val wdata = Wire(Vec((p.instructionLanes * 2), UInt(32.W)))
+  val rwdata = Wire(Vec((p.instructionLanes * 2), UInt(32.W)))
+  for (i <- 0 until (p.instructionLanes * 2)) {
     val idx = io.readAddr(i).addr
     val write = VecAt(writeValid, idx)
     rdata(i) := VecAt(regfile, idx)
@@ -199,7 +194,7 @@
     rwdata(i) := Mux(write, wdata(i), rdata(i))
   }
 
-  for (i <- 0 until 8) {
+  for (i <- 0 until (p.instructionLanes * 2)) {
     val setValid = io.readSet(i).valid
     val setValue = io.readSet(i).value
 
@@ -215,23 +210,22 @@
   }
 
   // Bus port priority encoded address.
-  val busAddr = Wire(Vec(4, UInt(32.W)))
-  val busValid = Cat(io.busAddr(3).valid, io.busAddr(2).valid,
-                     io.busAddr(1).valid, io.busAddr(0).valid)
+  val busAddr = Wire(Vec(p.instructionLanes, UInt(32.W)))
+  val busValid = Cat((0 until p.instructionLanes).reverse.map(x => io.busAddr(x).valid))
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     busAddr(i) := Mux(io.busAddr(i).bypass, rwdata(2 * i),
                   Mux(io.busAddr(i).immen, rdata(2 * i) + io.busAddr(i).immed,
                       rdata(2 * i)))
   }
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     io.busPort.addr(i) := busAddr(i)
     io.busPort.data(i) := nxtReadDataBits(2 * i + 1)
   }
 
   // Branch target address combinatorial.
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     io.target(i).data := busAddr(i)
   }
 
@@ -244,12 +238,12 @@
   // ***************************************************************************
   // Assertions.
   // ***************************************************************************
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     assert(busAddr(i).getWidth == p.lsuAddrBits)
   }
 
-  for (i <- 0 until 6) {
-    for (j <- (i+1) until 6) {
+  for (i <- 0 until p.instructionLanes + 2) {
+    for (j <- (i + 1) until p.instructionLanes + 2) {
       // Delay the failure a cycle for debugging purposes.
       val write_fail = RegInit(false.B)
       write_fail := io.writeData(i).valid && io.writeData(j).valid &&
diff --git a/hdl/chisel/src/kelvin/scalar/SCore.scala b/hdl/chisel/src/kelvin/scalar/SCore.scala
index 3f0f678..d9e2c32 100644
--- a/hdl/chisel/src/kelvin/scalar/SCore.scala
+++ b/hdl/chisel/src/kelvin/scalar/SCore.scala
@@ -50,9 +50,9 @@
   // The functional units that make up the core.
   val regfile = Regfile(p)
   val fetch = Fetch(p)
-  val decode = Seq(Decode(p, 0), Decode(p, 1), Decode(p, 2), Decode(p, 3))
-  val alu = Seq.fill(4)(Alu(p))
-  val bru = Seq.fill(4)(Bru(p))
+  val decode = (0 until p.instructionLanes).map(x => Decode(p, x))  // one decoder per lane, in lane order
+  val alu = Seq.fill(p.instructionLanes)(Alu(p))
+  val bru = Seq.fill(p.instructionLanes)(Bru(p))
   val csr = Csr(p)
   val lsu = Lsu(p)
   val mlu = Mlu(p)
@@ -77,15 +77,15 @@
   io.dflush.clean := lsu.io.flush.clean
   lsu.io.flush.ready := io.dflush.ready
 
-  assert(!bru(1).io.iflush)
-  assert(!bru(2).io.iflush)
-  assert(!bru(3).io.iflush)
+  for (i <- 1 until p.instructionLanes) {
+    assert(!bru(i).io.iflush)
+  }
 
   // ---------------------------------------------------------------------------
   // Fetch
   fetch.io.csr := io.csr.in
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     fetch.io.branch(i) := bru(i).io.taken
   }
 
@@ -97,7 +97,7 @@
   // Decode
   val mask = VecInit(decode.map(_.io.inst.ready).scan(true.B)(_ && _))
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     decode(i).io.inst.valid := fetch.io.inst.lanes(i).valid && mask(i)
     fetch.io.inst.lanes(i).ready := decode(i).io.inst.ready && mask(i)
     decode(i).io.inst.addr := fetch.io.inst.lanes(i).addr
@@ -110,31 +110,31 @@
 
   // Interlock based on regfile write port dependencies.
   decode(0).io.interlock := bru(0).io.interlock
-  decode(1).io.interlock := decode(0).io.interlock
-  decode(2).io.interlock := decode(1).io.interlock
-  decode(3).io.interlock := decode(2).io.interlock
+  for (i <- 1 until p.instructionLanes) {
+    decode(i).io.interlock := decode(i - 1).io.interlock
+  }
 
   // Serialize opcodes with only one pipeline.
   decode(0).io.serializeIn.defaults()
-  decode(1).io.serializeIn := decode(0).io.serializeOut
-  decode(2).io.serializeIn := decode(1).io.serializeOut
-  decode(3).io.serializeIn := decode(2).io.serializeOut
+  for (i <- 1 until p.instructionLanes) {
+    decode(i).io.serializeIn := decode(i - 1).io.serializeOut
+  }
 
   // In decode update multi-issue scoreboard state.
   val scoreboard_spec = decode.map(_.io.scoreboard.spec).scan(0.U)(_|_)
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     decode(i).io.scoreboard.comb := regfile.io.scoreboard.comb | scoreboard_spec(i)
     decode(i).io.scoreboard.regd := regfile.io.scoreboard.regd | scoreboard_spec(i)
   }
 
   decode(0).io.mactive := io.vcore.mactive
-  decode(1).io.mactive := false.B
-  decode(2).io.mactive := false.B
-  decode(3).io.mactive := false.B
+  for (i <- 1 until p.instructionLanes) {
+    decode(i).io.mactive := false.B
+  }
 
   // ---------------------------------------------------------------------------
   // ALU
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     alu(i).io.req := decode(i).io.alu
     alu(i).io.rs1 := regfile.io.readData(2 * i + 0)
     alu(i).io.rs2 := regfile.io.readData(2 * i + 1)
@@ -142,7 +142,7 @@
 
   // ---------------------------------------------------------------------------
   // Branch Unit
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     bru(i).io.req := decode(i).io.bru
     bru(i).io.rs1 := regfile.io.readData(2 * i + 0)
     bru(i).io.rs2 := regfile.io.readData(2 * i + 1)
@@ -150,9 +150,9 @@
   }
 
   bru(0).io.csr <> csr.io.bru
-  bru(1).io.csr.defaults()
-  bru(2).io.csr.defaults()
-  bru(3).io.csr.defaults()
+  for (i <- 1 until p.instructionLanes) {
+    bru(i).io.csr.defaults()
+  }
 
   io.iflush.valid := iflush
 
@@ -181,7 +181,7 @@
   // Load/Store Unit
   lsu.io.busPort := regfile.io.busPort
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     lsu.io.req(i).valid := decode(i).io.lsu.valid
     lsu.io.req(i).store := decode(i).io.lsu.store
     lsu.io.req(i).addr  := decode(i).io.lsu.addr
@@ -191,7 +191,7 @@
 
   // ---------------------------------------------------------------------------
   // Multiplier Unit
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     mlu.io.req(i) := decode(i).io.mlu
     mlu.io.rs1(i) := regfile.io.readData(2 * i)
     mlu.io.rs2(i) := regfile.io.readData((2 * i) + 1)
@@ -205,13 +205,13 @@
   dvu.io.rd.ready := !mlu.io.rd.valid
 
   // TODO: make port conditional on pipeline index.
-  for (i <- 1 until 4) {
+  for (i <- 1 until p.instructionLanes) {
     decode(i).io.dvu.ready := false.B
   }
 
   // ---------------------------------------------------------------------------
   // Register File
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     regfile.io.readAddr(2 * i + 0) := decode(i).io.rs1Read
     regfile.io.readAddr(2 * i + 1) := decode(i).io.rs2Read
     regfile.io.readSet(2 * i + 0) := decode(i).io.rs1Set
@@ -245,27 +245,29 @@
             io.vcore.rd(i).valid) <= 1.U)
   }
 
-  regfile.io.writeData(4).valid := mlu.io.rd.valid || dvu.io.rd.valid
-  regfile.io.writeData(4).addr := Mux(mlu.io.rd.valid, mlu.io.rd.addr, dvu.io.rd.addr)
-  regfile.io.writeData(4).data := Mux(mlu.io.rd.valid, mlu.io.rd.data, dvu.io.rd.data)
+  val mluDvuOffset = p.instructionLanes
+  regfile.io.writeData(mluDvuOffset).valid := mlu.io.rd.valid || dvu.io.rd.valid
+  regfile.io.writeData(mluDvuOffset).addr := Mux(mlu.io.rd.valid, mlu.io.rd.addr, dvu.io.rd.addr)
+  regfile.io.writeData(mluDvuOffset).data := Mux(mlu.io.rd.valid, mlu.io.rd.data, dvu.io.rd.data)
   assert(!(mlu.io.rd.valid && (dvu.io.rd.valid && dvu.io.rd.ready)))  // TODO: stall dvu on mlu write
 
-  regfile.io.writeData(5).valid := lsu.io.rd.valid
-  regfile.io.writeData(5).addr  := lsu.io.rd.addr
-  regfile.io.writeData(5).data  := lsu.io.rd.data
+  val lsuOffset = p.instructionLanes + 1
+  regfile.io.writeData(lsuOffset).valid := lsu.io.rd.valid
+  regfile.io.writeData(lsuOffset).addr  := lsu.io.rd.addr
+  regfile.io.writeData(lsuOffset).data  := lsu.io.rd.data
 
   val writeMask = bru.map(_.io.taken.valid).scan(false.B)(_||_)
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     regfile.io.writeMask(i).valid := writeMask(i)
   }
 
   // ---------------------------------------------------------------------------
   // Vector Extension
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     io.vcore.vinst(i) <> decode(i).io.vinst
   }
 
-  for (i <- 0 until 8) {
+  for (i <- 0 until p.instructionLanes * 2) {
     io.vcore.rs(i) := regfile.io.readData(i)
   }
 
@@ -301,36 +303,23 @@
   cycles := cycles + 1.U
   io.debug.cycles := cycles
 
-  val debugEn = RegInit(0.U(4.W))
-  val debugAddr = Reg(Vec(4, UInt(32.W)))
-  val debugInst = Reg(Vec(4, UInt(32.W)))
+  val debugEn = RegInit(0.U(p.instructionLanes.W))
+  val debugAddr = Reg(Vec(p.instructionLanes, UInt(32.W)))
+  val debugInst = Reg(Vec(p.instructionLanes, UInt(32.W)))
 
-  val debugBrch =
-    Cat(bru(0).io.taken.valid || bru(1).io.taken.valid || bru(2).io.taken.valid,
-        bru(0).io.taken.valid || bru(1).io.taken.valid,
-        bru(0).io.taken.valid,
-        false.B)
+  val debugBrch = Cat(bru.map(_.io.taken.valid).scan(false.B)(_ || _).init.reverse)  // bit i = branch taken in a lane below i
 
-  debugEn := Cat(fetch.io.inst.lanes(3).valid && fetch.io.inst.lanes(3).ready && !branchTaken,
-                 fetch.io.inst.lanes(2).valid && fetch.io.inst.lanes(2).ready && !branchTaken,
-                 fetch.io.inst.lanes(1).valid && fetch.io.inst.lanes(1).ready && !branchTaken,
-                 fetch.io.inst.lanes(0).valid && fetch.io.inst.lanes(0).ready && !branchTaken)
+  debugEn := Cat(fetch.io.inst.lanes.map(x => x.valid && x.ready && !branchTaken).reverse)  // Cat is MSB-first: lane 0 -> bit 0
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     debugAddr(i) := fetch.io.inst.lanes(i).addr
     debugInst(i) := fetch.io.inst.lanes(i).inst
   }
 
   io.debug.en := debugEn & ~debugBrch
 
-  io.debug.addr0 := debugAddr(0)
-  io.debug.addr1 := debugAddr(1)
-  io.debug.addr2 := debugAddr(2)
-  io.debug.addr3 := debugAddr(3)
-  io.debug.inst0 := debugInst(0)
-  io.debug.inst1 := debugInst(1)
-  io.debug.inst2 := debugInst(2)
-  io.debug.inst3 := debugInst(3)
+  io.debug.addr <> debugAddr
+  io.debug.inst <> debugInst
 }
 
 object EmitSCore extends App {
diff --git a/hdl/chisel/src/kelvin/vector/VAlu.scala b/hdl/chisel/src/kelvin/vector/VAlu.scala
index 03eae95..03f6f36 100644
--- a/hdl/chisel/src/kelvin/vector/VAlu.scala
+++ b/hdl/chisel/src/kelvin/vector/VAlu.scala
@@ -30,15 +30,15 @@
 class VAlu(p: Parameters) extends Module {
   val io = IO(new Bundle {
     // Instructions.
-    val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+    val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits))))
     val active = Output(UInt(64.W))
 
     // VRegfile.
     val vrfsb = Input(UInt(128.W))
-    val read  = Vec(7, new VRegfileReadIO(p))
-    val write = Vec(4, new VRegfileWriteIO(p))
-    val whint = Vec(4, new VRegfileWhintIO(p))
-    val scalar = Vec(2, new VRegfileScalarIO(p))
+    val read  = Vec(p.vectorReadPorts, new VRegfileReadIO(p))
+    val write = Vec(p.vectorWritePorts - 2, new VRegfileWriteIO(p))
+    val whint = Vec(p.vectorWhintPorts, new VRegfileWhintIO(p))
+    val scalar = Vec(p.vectorScalarPorts, new VRegfileScalarIO(p))
 
     // Testbench signals.
     val read_0_ready = Output(Bool())
@@ -56,26 +56,26 @@
 
   // ---------------------------------------------------------------------------
   // Tie-offs.
-  for (i <- 0 until 7) {
+  for (i <- 0 until io.read.length) {
     io.read(i).valid := false.B
     io.read(i).addr := 0.U
     io.read(i).tag  := 0.U
   }
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until io.write.length) {
     io.write(i).valid := false.B
     io.write(i).addr := 0.U
     io.write(i).data := 0.U
   }
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until io.whint.length) {
     io.whint(i).valid := false.B
     io.whint(i).addr := 0.U
   }
 
   // ---------------------------------------------------------------------------
   // Opcode checks.
-  for (i <- 0 until 4) {
+  for (i <- 0 until io.in.bits.length) {
     when (io.in.valid && io.in.ready) {
       when (io.in.bits(i).valid) {
         val op = io.in.bits(i).bits.op
@@ -254,8 +254,8 @@
     active
   }
 
-  val q0 = VCmdq(cmdqDepth, new VAluCmdq, Fin0, Fout, Factive)
-  val q1 = VCmdq(cmdqDepth, new VAluCmdq, Fin1, Fout, Factive)
+  val q0 = VCmdq(p, cmdqDepth, new VAluCmdq, Fin0, Fout, Factive)
+  val q1 = VCmdq(p, cmdqDepth, new VAluCmdq, Fin1, Fout, Factive)
 
   q0.io.in.valid := io.in.valid && q1.io.in.ready
   q1.io.in.valid := io.in.valid && q0.io.in.ready
@@ -278,20 +278,19 @@
   // ---------------------------------------------------------------------------
   // ALU Selection interleaving.
   val alureg = RegInit(false.B)
-  val alusel = Wire(Vec(5, Bool()))
+  val alusel = Wire(Vec(p.instructionLanes + 1, Bool()))
 
   // Toggle if previous was valid and was not a synchronized dual command.
   alusel(0) := alureg
-  alusel(1) := Mux(io.in.bits(0).valid && !io.in.bits(0).bits.cmdsync, !alusel(0), alusel(0))
-  alusel(2) := Mux(io.in.bits(1).valid && !io.in.bits(1).bits.cmdsync, !alusel(1), alusel(1))
-  alusel(3) := Mux(io.in.bits(2).valid && !io.in.bits(2).bits.cmdsync, !alusel(2), alusel(2))
-  alusel(4) := Mux(io.in.bits(3).valid && !io.in.bits(3).bits.cmdsync, !alusel(3), alusel(3))
-
-  when (io.in.valid && io.in.ready) {
-    alureg := alusel(4)
+  for (i <- 0 until p.instructionLanes) {
+    alusel(i + 1) := Mux(io.in.bits(i).valid && !io.in.bits(i).bits.cmdsync, !alusel(i), alusel(i))
   }
 
-  for (i <- 0 until 4) {
+  when (io.in.valid && io.in.ready) {
+    alureg := alusel(alusel.length - 1)
+  }
+
+  for (i <- 0 until p.instructionLanes) {
     q0.io.in.bits(i).valid := io.in.bits(i).valid && (alusel(i) === 0.U || io.in.bits(i).bits.cmdsync)
     q1.io.in.bits(i).valid := io.in.bits(i).valid && (alusel(i) === 1.U || io.in.bits(i).bits.cmdsync)
   }
diff --git a/hdl/chisel/src/kelvin/vector/VCmdq.scala b/hdl/chisel/src/kelvin/vector/VCmdq.scala
index 20e29b3..261ba63 100644
--- a/hdl/chisel/src/kelvin/vector/VCmdq.scala
+++ b/hdl/chisel/src/kelvin/vector/VCmdq.scala
@@ -27,14 +27,14 @@
 // <factive> returns the activation status for decode dependencies.
 
 object VCmdq {
-  def apply[T <: Data](n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) = {
-    Module(new VCmdq(n, t, fin, fout, factive))
+  def apply[T <: Data](p: Parameters, n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) = {
+    Module(new VCmdq(p, n, t, fin, fout, factive))
   }
 }
 
-class VCmdq[T <: Data](n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) extends Module {
+class VCmdq[T <: Data](p: Parameters, n: Int, t: T, fin: (VDecodeBits) => T, fout: (T, Bool, UInt, Bool) => (T, Bool), factive: (T, Bool, UInt) => UInt) extends Module {
   val io = IO(new Bundle {
-    val in  = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+    val in  = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits))))
     val out = Decoupled(t)
     val active = Output(UInt(64.W))
     val nempty = Output(Bool())
@@ -45,7 +45,7 @@
     val m = Output(Bool())  // stripmine
   }
 
-  val f = Fifo4e(new VCmdqWrapper, n)
+  val f = FifoXe(new VCmdqWrapper, p.instructionLanes, n)
 
   val active = RegInit(0.U(64.W))
 
@@ -65,7 +65,7 @@
   f.io.in.valid := io.in.valid
   io.in.ready := f.io.in.ready
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     f.io.in.bits(i).valid := io.in.bits(i).valid
     f.io.in.bits(i).bits.tin := fin(io.in.bits(i).bits)
     f.io.in.bits(i).bits.m := io.in.bits(i).bits.m
@@ -118,14 +118,10 @@
 
   when (io.in.valid && io.in.ready || io.out.valid && io.out.ready) {
     val fvalid = MuxOR(f.io.in.valid && f.io.in.ready,
-                 Cat(f.io.in.bits(3).valid, f.io.in.bits(2).valid,
-                     f.io.in.bits(1).valid, f.io.in.bits(0).valid))
+                 Cat((0 until p.instructionLanes).reverse.map(x => f.io.in.bits(x).valid)))
 
-    active :=
-      MuxOR(fvalid(0), factive(f.io.in.bits(0).bits.tin, f.io.in.bits(0).bits.m, step0)) |
-      MuxOR(fvalid(1), factive(f.io.in.bits(1).bits.tin, f.io.in.bits(1).bits.m, step0)) |
-      MuxOR(fvalid(2), factive(f.io.in.bits(2).bits.tin, f.io.in.bits(2).bits.m, step0)) |
-      MuxOR(fvalid(3), factive(f.io.in.bits(3).bits.tin, f.io.in.bits(3).bits.m, step0)) |
+    active := (0 until p.instructionLanes).map(x =>
+      MuxOR(fvalid(x), factive(f.io.in.bits(x).bits.tin, f.io.in.bits(x).bits.m, step0))).reduce(_|_) |
       ValueActive()
   }
 
@@ -180,5 +176,6 @@
     active
   }
 
-  ChiselStage.emitSystemVerilogFile(new VCmdq(8, new VCmdqTestBundle, VCmdqTestFin, VCmdqTestFout, VCmdqTestFactive), args)
+  val p = kelvin.Parameters()
+  ChiselStage.emitSystemVerilogFile(new VCmdq(p, 8, new VCmdqTestBundle, VCmdqTestFin, VCmdqTestFout, VCmdqTestFactive), args)
 }
diff --git a/hdl/chisel/src/kelvin/vector/VConvCtrl.scala b/hdl/chisel/src/kelvin/vector/VConvCtrl.scala
index 1e017a4..ebea853 100644
--- a/hdl/chisel/src/kelvin/vector/VConvCtrl.scala
+++ b/hdl/chisel/src/kelvin/vector/VConvCtrl.scala
@@ -30,7 +30,7 @@
 class VConvCtrl(p: Parameters) extends Module {
   val io = IO(new Bundle {
     // Instructions.
-    val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+    val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits))))
     val active = Output(UInt(64.W))
 
     // RegisterFile.
@@ -160,7 +160,7 @@
     active
   }
 
-  val q = VCmdq(cmdqDepth, new VConvCtrlCmdq, Fin, Fout, Factive)
+  val q = VCmdq(p, cmdqDepth, new VConvCtrlCmdq, Fin, Fout, Factive)
 
   q.io.in <> io.in
 
diff --git a/hdl/chisel/src/kelvin/vector/VCore.scala b/hdl/chisel/src/kelvin/vector/VCore.scala
index 58bbab6..919cb27 100644
--- a/hdl/chisel/src/kelvin/vector/VCore.scala
+++ b/hdl/chisel/src/kelvin/vector/VCore.scala
@@ -28,11 +28,11 @@
 
 class VCoreIO(p: Parameters) extends Bundle {
   // Decode cycle.
-  val vinst = Vec(4, new VInstIO)
+  val vinst = Vec(p.instructionLanes, new VInstIO)
 
   // Execute cycle.
-  val rs = Vec(8, Flipped(new RegfileReadDataIO))
-  val rd = Vec(4, Flipped(new RegfileWriteDataIO))
+  val rs = Vec(p.instructionLanes * 2, Flipped(new RegfileReadDataIO))
+  val rd = Vec(p.instructionLanes, Flipped(new RegfileWriteDataIO))
 
   // Status.
   val mactive = Output(Bool())
@@ -97,7 +97,7 @@
 
   vinst.io.out.stall := vdec.io.stall  // decode backpressure
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     vdec.io.in.bits(i) := vinst.io.out.lane(i)
   }
 
@@ -105,24 +105,24 @@
 
   // ---------------------------------------------------------------------------
   // VRegfile.
-  for (i <- 0 until 7) {
+  for (i <- 0 until vrf.readPorts) {
     vrf.io.read(i).valid := false.B
     vrf.io.read(i).addr := 0.U
     vrf.io.read(i).tag := 0.U
   }
 
-  for (i <- 0 until 6) {
+  for (i <- 0 until vrf.writePorts) {
     vrf.io.write(i).valid := false.B
     vrf.io.write(i).addr := 0.U
     vrf.io.write(i).data := 0.U
   }
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until vrf.whintPorts) {
     vrf.io.whint(i).valid := false.B
     vrf.io.whint(i).addr := 0.U
   }
 
-  for (i <- 0 until 2) {
+  for (i <- 0 until vrf.scalarPorts) {
     vrf.io.scalar(i).valid := false.B
     vrf.io.scalar(i).data := 0.U
   }
@@ -133,43 +133,38 @@
 
   // ---------------------------------------------------------------------------
   // VALU.
-  val aluvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).alu,
-                     vdec.io.out(2).valid && vdec.io.cmdq(2).alu,
-                     vdec.io.out(1).valid && vdec.io.cmdq(1).alu,
-                     vdec.io.out(0).valid && vdec.io.cmdq(0).alu)
+  val aluvalid = (0 until p.instructionLanes).map(x => vdec.io.out(x).valid && vdec.io.cmdq(x).alu)
+  val aluready = (0 until p.instructionLanes).map(x => valu.io.in.ready && vdec.io.cmdq(x).alu)
 
-  val aluready = Cat(valu.io.in.ready && vdec.io.cmdq(3).alu,
-                     valu.io.in.ready && vdec.io.cmdq(2).alu,
-                     valu.io.in.ready && vdec.io.cmdq(1).alu,
-                     valu.io.in.ready && vdec.io.cmdq(0).alu)
+  valu.io.in.valid := aluvalid.reduce(_ || _)
 
-  valu.io.in.valid := aluvalid =/= 0.U
-
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     valu.io.in.bits(i).valid := aluvalid(i)
     valu.io.in.bits(i).bits := vdec.io.out(i).bits
   }
 
-  for (i <- 0 until 7) {
+  for (i <- 0 until vrf.readPorts) {
     vrf.io.read(i).valid := valu.io.read(i).valid
     vrf.io.read(i).addr := valu.io.read(i).addr
     vrf.io.read(i).tag  := valu.io.read(i).tag
   }
 
-  for (i <- 0 until 7) {
+  for (i <- 0 until vrf.readPorts) {
     valu.io.read(i).data := vrf.io.read(i).data
   }
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until vrf.writePorts - 2) {
     vrf.io.write(i).valid := valu.io.write(i).valid
     vrf.io.write(i).addr := valu.io.write(i).addr
     vrf.io.write(i).data := valu.io.write(i).data
+  }
 
+  for (i <- 0 until vrf.whintPorts) {
     vrf.io.whint(i).valid := valu.io.whint(i).valid
     vrf.io.whint(i).addr := valu.io.whint(i).addr
   }
 
-  for (i <- 0 until 2) {
+  for (i <- 0 until vrf.scalarPorts) {
     vrf.io.scalar(i).valid := valu.io.scalar(i).valid
     vrf.io.scalar(i).data := valu.io.scalar(i).data
   }
@@ -178,19 +173,12 @@
 
   // ---------------------------------------------------------------------------
   // VCONV.
-  val convvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).conv,
-                      vdec.io.out(2).valid && vdec.io.cmdq(2).conv,
-                      vdec.io.out(1).valid && vdec.io.cmdq(1).conv,
-                      vdec.io.out(0).valid && vdec.io.cmdq(0).conv)
+  val convvalid = (0 until p.instructionLanes).map(x => vdec.io.out(x).valid && vdec.io.cmdq(x).conv)
+  val convready = (0 until p.instructionLanes).map(x => vconv.io.in.ready && vdec.io.cmdq(x).conv)
 
-  val convready = Cat(vconv.io.in.ready && vdec.io.cmdq(3).conv,
-                      vconv.io.in.ready && vdec.io.cmdq(2).conv,
-                      vconv.io.in.ready && vdec.io.cmdq(1).conv,
-                      vconv.io.in.ready && vdec.io.cmdq(0).conv)
+  vconv.io.in.valid := convvalid.reduce(_ || _)
 
-  vconv.io.in.valid := convvalid =/= 0.U
-
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     vconv.io.in.bits(i).valid := convvalid(i)
     vconv.io.in.bits(i).bits := vdec.io.out(i).bits
   }
@@ -201,25 +189,18 @@
 
   // ---------------------------------------------------------------------------
   // VLdSt.
-  val ldstvalid = Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).ldst,
-                      vdec.io.out(2).valid && vdec.io.cmdq(2).ldst,
-                      vdec.io.out(1).valid && vdec.io.cmdq(1).ldst,
-                      vdec.io.out(0).valid && vdec.io.cmdq(0).ldst)
+  val ldstvalid = (0 until p.instructionLanes).map(x => vdec.io.out(x).valid && vdec.io.cmdq(x).ldst)
+  val ldstready = (0 until p.instructionLanes).map(x => vldst.io.in.ready && vdec.io.cmdq(x).ldst)
 
-  val ldstready = Cat(vldst.io.in.ready && vdec.io.cmdq(3).ldst,
-                      vldst.io.in.ready && vdec.io.cmdq(2).ldst,
-                      vldst.io.in.ready && vdec.io.cmdq(1).ldst,
-                      vldst.io.in.ready && vdec.io.cmdq(0).ldst)
+  vldst.io.in.valid := ldstvalid.reduce(_ || _)
 
-  vldst.io.in.valid := ldstvalid =/= 0.U
-
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     vldst.io.in.bits(i).valid := ldstvalid(i)
     vldst.io.in.bits(i).bits := vdec.io.out(i).bits
   }
 
   vldst.io.read.ready := !vst.io.read.valid
-  vldst.io.read.data := vrf.io.read(6).data
+  vldst.io.read.data := vrf.io.read(vrf.readPorts - 1).data
 
   vldst.io.vrfsb := vrf.io.vrfsb.data
 
@@ -228,22 +209,12 @@
 
   // ---------------------------------------------------------------------------
   // VLd.
-  val ldvalid = Wire(UInt(4.W))
-  val ldready = Wire(UInt(4.W))
+  val ldvalid = (0 until p.instructionLanes).map(x => vdec.io.cmdq(x).ld && vdec.io.out(x).valid)
+  val ldready = (0 until p.instructionLanes).map(x => vdec.io.cmdq(x).ld && vld.io.in.ready)
 
-  ldvalid := Cat(vdec.io.cmdq(3).ld && vdec.io.out(3).valid,
-                 vdec.io.cmdq(2).ld && vdec.io.out(2).valid,
-                 vdec.io.cmdq(1).ld && vdec.io.out(1).valid,
-                 vdec.io.cmdq(0).ld && vdec.io.out(0).valid)
+  vld.io.in.valid := ldvalid.reduce(_ || _)
 
-  ldready := Cat(vdec.io.cmdq(3).ld && vld.io.in.ready,
-                 vdec.io.cmdq(2).ld && vld.io.in.ready,
-                 vdec.io.cmdq(1).ld && vld.io.in.ready,
-                 vdec.io.cmdq(0).ld && vld.io.in.ready)
-
-  vld.io.in.valid := ldvalid =/= 0.U
-
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     vld.io.in.bits(i).valid := ldvalid(i)
     vld.io.in.bits(i).bits := vdec.io.out(i).bits
   }
@@ -252,22 +223,12 @@
 
   // ---------------------------------------------------------------------------
   // VSt.
-  val stvalid = Wire(UInt(4.W))
-  val stready = Wire(UInt(4.W))
+  val stvalid = (0 until p.instructionLanes).map(x => vdec.io.out(x).valid && vdec.io.cmdq(x).st)
+  val stready = (0 until p.instructionLanes).map(x => vst.io.in.ready && vdec.io.cmdq(x).st)
 
-  stvalid := Cat(vdec.io.out(3).valid && vdec.io.cmdq(3).st,
-                 vdec.io.out(2).valid && vdec.io.cmdq(2).st,
-                 vdec.io.out(1).valid && vdec.io.cmdq(1).st,
-                 vdec.io.out(0).valid && vdec.io.cmdq(0).st)
+  vst.io.in.valid := stvalid.reduce(_ || _)
 
-  stready := Cat(vst.io.in.ready && vdec.io.cmdq(3).st,
-                 vst.io.in.ready && vdec.io.cmdq(2).st,
-                 vst.io.in.ready && vdec.io.cmdq(1).st,
-                 vst.io.in.ready && vdec.io.cmdq(0).st)
-
-  vst.io.in.valid := stvalid =/= 0.U
-
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     vst.io.in.bits(i).valid := stvalid(i)
     vst.io.in.bits(i).bits := vdec.io.out(i).bits
   }
@@ -277,29 +238,29 @@
   vst.io.vrfsb := vrf.io.vrfsb.data
 
   vst.io.read.ready := true.B
-  vst.io.read.data := vrf.io.read(6).data
+  vst.io.read.data := vrf.io.read(vrf.readPorts - 1).data
 
   // ---------------------------------------------------------------------------
   // Load write.
-  vrf.io.write(4).valid := vldst.io.write.valid
-  vrf.io.write(4).addr := vldst.io.write.addr
-  vrf.io.write(4).data := vldst.io.write.data
+  vrf.io.write(vrf.writePorts - 2).valid := vldst.io.write.valid  // second-to-last write port
+  vrf.io.write(vrf.writePorts - 2).addr := vldst.io.write.addr
+  vrf.io.write(vrf.writePorts - 2).data := vldst.io.write.data
 
-  vrf.io.write(5).valid := vld.io.write.valid
-  vrf.io.write(5).addr := vld.io.write.addr
-  vrf.io.write(5).data := vld.io.write.data
+  vrf.io.write(vrf.writePorts - 1).valid := vld.io.write.valid  // last write port
+  vrf.io.write(vrf.writePorts - 1).addr := vld.io.write.addr
+  vrf.io.write(vrf.writePorts - 1).data := vld.io.write.data
 
   // ---------------------------------------------------------------------------
   // Store read.
-  vrf.io.read(6).valid := vst.io.read.valid || vldst.io.read.valid
-  vrf.io.read(6).addr := Mux(vst.io.read.valid, vst.io.read.addr,
+  vrf.io.read(vrf.readPorts - 1).valid := vst.io.read.valid || vldst.io.read.valid
+  vrf.io.read(vrf.readPorts - 1).addr := Mux(vst.io.read.valid, vst.io.read.addr,
                              vldst.io.read.addr)
-  vrf.io.read(6).tag := Mux(vst.io.read.valid, vst.io.read.tag,
+  vrf.io.read(vrf.readPorts - 1).tag := Mux(vst.io.read.valid, vst.io.read.tag,
                             vldst.io.read.tag)
 
   // ---------------------------------------------------------------------------
   // VDecode.
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     vdec.io.out(i).ready := aluready(i) || convready(i) || ldstready(i) ||
                             ldready(i) || stready(i)
   }
diff --git a/hdl/chisel/src/kelvin/vector/VDecode.scala b/hdl/chisel/src/kelvin/vector/VDecode.scala
index fa48723..44d6afc 100644
--- a/hdl/chisel/src/kelvin/vector/VDecode.scala
+++ b/hdl/chisel/src/kelvin/vector/VDecode.scala
@@ -18,7 +18,7 @@
 
 import chisel3._
 import chisel3.util._
-import common.Fifo4x4
+import common.FifoIxO
 import _root_.circt.stage.ChiselStage
 
 object VDecode {
@@ -29,10 +29,10 @@
 
 class VDecode(p: Parameters) extends Module {
   val io = IO(new Bundle {
-    val in = Flipped(Decoupled(Vec(4, Valid(new VectorInstructionLane))))
-    val out = Vec(4, Decoupled(new VDecodeBits))
-    val cmdq = Vec(4, Output(new VDecodeCmdq))
-    val actv = Vec(4, Output(new VDecodeActive))  // used in testbench
+    val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VectorInstructionLane))))
+    val out = Vec(p.instructionLanes, Decoupled(new VDecodeBits))
+    val cmdq = Vec(p.instructionLanes, Output(new VDecodeCmdq))
+    val actv = Vec(p.instructionLanes, Output(new VDecodeActive))  // used in testbench
     val stall = Output(Bool())
     val active = Input(UInt(64.W))
     val vrfsb = new VRegfileScoreboardIO
@@ -45,27 +45,24 @@
 
   val enc = new VEncodeOp()
 
-  val f = Fifo4x4(new VectorInstructionLane, depth)
+  val f = FifoIxO(new VectorInstructionLane, p.instructionLanes, p.instructionLanes, depth)
 
-  val d = Seq(Module(new VDecodeInstruction(p)),
-              Module(new VDecodeInstruction(p)),
-              Module(new VDecodeInstruction(p)),
-              Module(new VDecodeInstruction(p)))
+  val d = Seq.fill(p.instructionLanes)(Module(new VDecodeInstruction(p)))
 
-  val e = Wire(Vec(4, new VDecodeBits))
+  val e = Wire(Vec(p.instructionLanes, new VDecodeBits))
 
-  val valid = RegInit(VecInit(Seq.fill(4)(false.B)))
-  val data = Reg(Vec(4, new VDecodeBits))
-  val cmdq = Reg(Vec(4, new VDecodeCmdq))
-  val actv = Wire(Vec(4, new VDecodeActive))
-  val actv2 = Reg(Vec(4, new VDecodeActive2))
-  val dataNxt = Wire(Vec(4, new VDecodeBits))
-  val cmdqNxt = Wire(Vec(4, new VDecodeCmdq))
-  val actvNxt = Wire(Vec(4, new VDecodeActive2))
+  val valid = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+  val data = Reg(Vec(p.instructionLanes, new VDecodeBits))
+  val cmdq = Reg(Vec(p.instructionLanes, new VDecodeCmdq))
+  val actv = Wire(Vec(p.instructionLanes, new VDecodeActive))
+  val actv2 = Reg(Vec(p.instructionLanes, new VDecodeActive2))
+  val dataNxt = Wire(Vec(p.instructionLanes, new VDecodeBits))
+  val cmdqNxt = Wire(Vec(p.instructionLanes, new VDecodeCmdq))
+  val actvNxt = Wire(Vec(p.instructionLanes, new VDecodeActive2))
 
   // ---------------------------------------------------------------------------
   // Decode.
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     d(i).io.in := f.io.out(i).bits
   }
 
@@ -75,24 +72,11 @@
   // write the read usage is occurring on.
   val tagReg = RegInit(0.U(64.W))
 
-  val tag0 = tagReg
-  val tag1 = tag0 ^ d(0).io.actv.wactive
-  val tag2 = tag1 ^ d(1).io.actv.wactive
-  val tag3 = tag2 ^ d(2).io.actv.wactive
-  val tag4 = tag3 ^ d(3).io.actv.wactive
-
-  val tags = Seq(tag0, tag1, tag2, tag3, tag4)
+  val tags = (0 until p.instructionLanes).map(x => d(x).io.actv.wactive).scan(tagReg)(_ ^ _)
+  assert(tags.length == p.instructionLanes + 1)
 
   // f.io.out is ordered, so can use a priority tree.
-  when(f.io.out(3).valid && f.io.out(3).ready) {
-    tagReg := tag4
-  } .elsewhen(f.io.out(2).valid && f.io.out(2).ready) {
-    tagReg := tag3
-  } .elsewhen(f.io.out(1).valid && f.io.out(1).ready) {
-    tagReg := tag2
-  } .elsewhen(f.io.out(0).valid && f.io.out(0).ready) {
-    tagReg := tag1
-  }
+  tagReg := MuxCase(tags(0), (0 until p.instructionLanes).reverse.map(x => (f.io.out(x).valid && f.io.out(x).ready) -> tags(x + 1)))
 
   def TagAddr(tag: UInt, v: VAddrTag): VAddrTag = {
     assert(tag.getWidth == 64)
@@ -111,7 +95,7 @@
     r
   }
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     e(i) := d(i).io.out
     e(i).vs := TagAddr(tags(i), d(i).io.out.vs)
     e(i).vt := TagAddr(tags(i), d(i).io.out.vt)
@@ -123,34 +107,27 @@
 
   // ---------------------------------------------------------------------------
   // Undef.  (io.in.ready ignored to signal as early as possible)
-  io.undef := io.in.valid && (d(0).io.undef || d(1).io.undef || d(2).io.undef || d(3).io.undef)
+  io.undef := io.in.valid && d.map(x => x.io.undef).reduce(_ || _)
 
   // ---------------------------------------------------------------------------
   // Fifo.
   f.io.in <> io.in
 
-  val icount = MuxOR(io.in.valid, PopCount(Cat(io.in.bits(0).valid, io.in.bits(1).valid, io.in.bits(2).valid, io.in.bits(3).valid)))
-  assert(icount.getWidth == 3)
+  val icount = MuxOR(io.in.valid,
+    PopCount(io.in.bits.map(_.valid))
+  )
 
-  val ocount = PopCount(Cat(valid(0) && !(io.out(0).valid && io.out(0).ready),
-                            valid(1) && !(io.out(1).valid && io.out(1).ready),
-                            valid(2) && !(io.out(2).valid && io.out(2).ready),
-                            valid(3) && !(io.out(3).valid && io.out(3).ready)))
-  assert(ocount.getWidth == 3)
+  val ocount = PopCount((0 until p.instructionLanes).map(x => valid(x) && !(io.out(x).valid && io.out(x).ready)))
 
-  for (i <- 0 until 4) {
-    f.io.out(i).ready := (i.U + ocount) < 4.U
+  for (i <- 0 until p.instructionLanes) {
+    f.io.out(i).ready := (i.U +& ocount) < p.instructionLanes.U  // +& keeps the carry for any lane count
   }
 
   // ---------------------------------------------------------------------------
   // Valid.
-  val fcount = PopCount(Cat(f.io.out(0).valid && f.io.out(0).ready,
-                            f.io.out(1).valid && f.io.out(1).ready,
-                            f.io.out(2).valid && f.io.out(2).ready,
-                            f.io.out(3).valid && f.io.out(3).ready))
-  assert(fcount.getWidth == 3)
+  val fcount = PopCount(f.io.out.map(x => x.valid && x.ready))
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     valid(i) := (ocount + fcount) > i.U
   }
 
@@ -159,41 +136,30 @@
   io.stall := (f.io.count + icount) > (depth - guard).U
 
   // ---------------------------------------------------------------------------
-  // Dependencies.
-  val depends = Wire(Vec(4, Bool()))
-
   // Writes must not proceed past any outstanding reads or writes,
   // or past any dispatching writes.
-  val wactive0 = io.vrfsb.data(63, 0) | io.vrfsb.data(127, 64) | io.active
-  val wactive1 = actv(0).ractive | actv(0).wactive | wactive0
-  val wactive2 = actv(1).ractive | actv(1).wactive | wactive1
-  val wactive3 = actv(2).ractive | actv(2).wactive | wactive2
-  val wactive = VecInit(wactive0, wactive1, wactive2, wactive3)
+  val wactive = VecInit((0 until p.instructionLanes).map(x => actv(x).ractive | actv(x).wactive).scan(io.vrfsb.data(63,0) | io.vrfsb.data(127,64) | io.active)(_ | _))
 
   // Reads must not proceed past any dispatching writes.
-  val ractive0 = 0.U(64.W)
-  val ractive1 = actv(0).wactive | ractive0
-  val ractive2 = actv(1).wactive | ractive1
-  val ractive3 = actv(2).wactive | ractive2
-  val ractive = VecInit(ractive0, ractive1, ractive2, ractive3)
+  val ractive = VecInit((0 until p.instructionLanes).map(x => actv(x).wactive).scan(0.U(64.W))(_ | _))
 
-  for (i <- 0 until 4) {
-    depends(i) := (wactive(i) & actv(i).wactive) =/= 0.U ||
-                  (ractive(i) & actv(i).ractive) =/= 0.U
-  }
+  // Dependencies.
+  val depends = VecInit((0 until p.instructionLanes).map(i =>
+    (wactive(i) & actv(i).wactive) =/= 0.U ||
+    (ractive(i) & actv(i).ractive) =/= 0.U
+  ))
 
   // ---------------------------------------------------------------------------
   // Data.
-  val fvalid = VecInit(f.io.out(0).valid, f.io.out(1).valid,
-                       f.io.out(2).valid, f.io.out(3).valid).asUInt
-  assert(!(fvalid(1) && fvalid(0,0) =/= 1.U))
-  assert(!(fvalid(2) && fvalid(1,0) =/= 3.U))
-  assert(!(fvalid(3) && fvalid(2,0) =/= 7.U))
+  val fvalid = VecInit(f.io.out.map(_.valid)).asUInt
+  for (i <- 0 until p.instructionLanes) {
+    assert(!(fvalid(i) && PopCount(fvalid(i,0)) =/= (i + 1).U))
+  }
 
   // Register is updated when fifo has state or contents are active.
   val dataEn = fvalid(0) || valid.asUInt =/= 0.U
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     when (dataEn) {
       data(i) := dataNxt(i)
       cmdq(i) := cmdqNxt(i)
@@ -201,14 +167,14 @@
     }
   }
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     actv(i).ractive := actv2(i).ractive
     actv(i).wactive := actv2(i).wactive(63, 0) | actv2(i).wactive(127, 64)
   }
 
   // Tag the decode wactive.
-  val dactv = Wire(Vec(4, new VDecodeActive2))
-  for (i <- 0 until 4) {
+  val dactv = Wire(Vec(p.instructionLanes, new VDecodeActive2))
+  for (i <- 0 until p.instructionLanes) {
     val w0 = d(i).io.actv.wactive & ~tags(i + 1)
     val w1 = d(i).io.actv.wactive &  tags(i + 1)
     dactv(i).ractive := d(i).io.actv.ractive
@@ -216,155 +182,51 @@
   }
 
   // Data multiplexor of current values and fifo+decode output.
-  val dataMux = VecInit(data(0), data(1), data(2), data(3),
-                        e(0), e(1), e(2), e(3))
+  val dataMux = VecInit(data ++ e)
+  val cmdqMux = VecInit(cmdq ++ d.map(x => x.io.cmdq))
+  val actvMux = VecInit(actv2 ++ dactv)
 
-  val cmdqMux = VecInit(cmdq(0), cmdq(1), cmdq(2), cmdq(3),
-                        d(0).io.cmdq, d(1).io.cmdq, d(2).io.cmdq, d(3).io.cmdq)
-
-  val actvMux = VecInit(actv2(0), actv2(1), actv2(2), actv2(3),
-                        dactv(0), dactv(1), dactv(2), dactv(3))
-
+  def GenerateMarked(start: Int, count: Int): Seq[UInt] = {
+    (0 until count).map(x => Wire(UInt((start + x).W)))
+  }
   // Mark the multiplexor entries that need to be kept.
-  val marked0 = Wire(UInt(5.W))
-  val marked1 = Wire(UInt(6.W))
-  val marked2 = Wire(UInt(7.W))
+  val marked = GenerateMarked((p.instructionLanes + 1), p.instructionLanes - 1)
+  val output = Cat((0 until p.instructionLanes).reverse.map(x => io.out(x).valid && io.out(x).ready))
+  val validNotOutput = (0 until (p.instructionLanes * 2) - 1).map(x =>
+    if (x < valid.length) { valid(x) && !output(x) } else { true.B })
+  val prevMarked = (0 until p.instructionLanes).map(x =>
+    if (x == 0) { None } else { Some(marked(x - 1)) }
+  )
 
-  assert((marked1 & marked0) === marked0)
-  assert((marked2 & marked0) === marked0)
-  assert((marked2 & marked1) === marked1)
-
-  val output = Cat(io.out(3).valid && io.out(3).ready,
-                   io.out(2).valid && io.out(2).ready,
-                   io.out(1).valid && io.out(1).ready,
-                   io.out(0).valid && io.out(0).ready)
-
-  when (valid(0) && !output(0)) {
-    dataNxt(0) := dataMux(0)
-    cmdqNxt(0) := cmdqMux(0)
-    actvNxt(0) := actvMux(0)
-    marked0 := 0x01.U
-  } .elsewhen (valid(1) && !output(1)) {
-    dataNxt(0) := dataMux(1)
-    cmdqNxt(0) := cmdqMux(1)
-    actvNxt(0) := actvMux(1)
-    marked0 := 0x03.U
-  } .elsewhen (valid(2) && !output(2)) {
-    dataNxt(0) := dataMux(2)
-    cmdqNxt(0) := cmdqMux(2)
-    actvNxt(0) := actvMux(2)
-    marked0 := 0x07.U
-  } .elsewhen (valid(3) && !output(3)) {
-    dataNxt(0) := dataMux(3)
-    cmdqNxt(0) := cmdqMux(3)
-    actvNxt(0) := actvMux(3)
-    marked0 := 0x0f.U
-  } .otherwise {
-    dataNxt(0) := dataMux(4)
-    cmdqNxt(0) := cmdqMux(4)
-    actvNxt(0) := actvMux(4)
-    marked0 := 0x1f.U
-  }
-
-  when (!marked0(1) && valid(1) && !output(1)) {
-    dataNxt(1) := dataMux(1)
-    cmdqNxt(1) := cmdqMux(1)
-    actvNxt(1) := actvMux(1)
-    marked1 := 0x03.U
-  } .elsewhen (!marked0(2) && valid(2) && !output(2)) {
-    dataNxt(1) := dataMux(2)
-    cmdqNxt(1) := cmdqMux(2)
-    actvNxt(1) := actvMux(2)
-    marked1 := 0x07.U
-  } .elsewhen (!marked0(3) && valid(3) && !output(3)) {
-    dataNxt(1) := dataMux(3)
-    cmdqNxt(1) := cmdqMux(3)
-    actvNxt(1) := actvMux(3)
-    marked1 := 0x0f.U
-  } .elsewhen (!marked0(4)) {
-    dataNxt(1) := dataMux(4)
-    cmdqNxt(1) := cmdqMux(4)
-    actvNxt(1) := actvMux(4)
-    marked1 := 0x1f.U
-  } .otherwise {
-    dataNxt(1) := dataMux(5)
-    cmdqNxt(1) := cmdqMux(5)
-    actvNxt(1) := actvMux(5)
-    marked1 := 0x3f.U
-  }
-
-  when (!marked1(2) && valid(2) && !output(2)) {
-    dataNxt(2) := dataMux(2)
-    cmdqNxt(2) := cmdqMux(2)
-    actvNxt(2) := actvMux(2)
-    marked2 := 0x07.U
-  } .elsewhen (!marked1(3) && valid(3) && !output(3)) {
-    dataNxt(2) := dataMux(3)
-    cmdqNxt(2) := cmdqMux(3)
-    actvNxt(2) := actvMux(3)
-    marked2 := 0x0f.U
-  } .elsewhen (!marked1(4)) {
-    dataNxt(2) := dataMux(4)
-    cmdqNxt(2) := cmdqMux(4)
-    actvNxt(2) := actvMux(4)
-    marked2 := 0x1f.U
-  } .elsewhen (!marked1(5)) {
-    dataNxt(2) := dataMux(5)
-    cmdqNxt(2) := cmdqMux(5)
-    actvNxt(2) := actvMux(5)
-    marked2 := 0x3f.U
-  } .otherwise {
-    dataNxt(2) := dataMux(6)
-    cmdqNxt(2) := cmdqMux(6)
-    actvNxt(2) := actvMux(6)
-    marked2 := 0x7f.U
-  }
-
-  when (!marked2(3) && valid(3) && !output(3)) {
-    dataNxt(3) := dataMux(3)
-    cmdqNxt(3) := cmdqMux(3)
-    actvNxt(3) := actvMux(3)
-  } .elsewhen (!marked2(4)) {
-    dataNxt(3) := dataMux(4)
-    cmdqNxt(3) := cmdqMux(4)
-    actvNxt(3) := actvMux(4)
-  } .elsewhen (!marked2(5)) {
-    dataNxt(3) := dataMux(5)
-    cmdqNxt(3) := cmdqMux(5)
-    actvNxt(3) := actvMux(5)
-  } .elsewhen (!marked2(6)) {
-    dataNxt(3) := dataMux(6)
-    cmdqNxt(3) := cmdqMux(6)
-    actvNxt(3) := actvMux(6)
-  } .otherwise {
-    dataNxt(3) := dataMux(7)
-    cmdqNxt(3) := cmdqMux(7)
-    actvNxt(3) := actvMux(7)
+  for (i <- 0 until p.instructionLanes) {
+    val idx = MuxCase((i + p.instructionLanes).U, (i until p.instructionLanes + i).map(x =>
+      (!prevMarked(i).getOrElse(false.B)(x) && validNotOutput(x)) -> (x).U
+    ))
+    dataNxt(i) := dataMux(idx)
+    cmdqNxt(i) := cmdqMux(idx)
+    actvNxt(i) := actvMux(idx)
+    if (i < marked.length) {
+      val width = marked(i).getWidth
+      marked(i) := ~0.U(width.W) >> ((width - 1).U - idx)
+    }
   }
 
   // ---------------------------------------------------------------------------
   // Scoreboard.
-  io.vrfsb.set.valid := output(0) || output(1) || output(2) || output(3)
+  // Set is valid when any lane's output handshake (valid && ready) completes.
+  io.vrfsb.set.valid := output =/= 0.U
 
-  io.vrfsb.set.bits := (MuxOR(output(0), actv2(0).wactive) |
-                        MuxOR(output(1), actv2(1).wactive) |
-                        MuxOR(output(2), actv2(2).wactive) |
-                        MuxOR(output(3), actv2(3).wactive))
+  io.vrfsb.set.bits := (0 until p.instructionLanes).map(x => MuxOR(output(x), actv2(x).wactive)).reduce(_ | _)
 
   assert((io.vrfsb.set.bits(63, 0) & io.vrfsb.set.bits(127, 64)) === 0.U)
   assert(((io.vrfsb.data(63, 0) | io.vrfsb.data(127, 64)) & (io.vrfsb.set.bits(63, 0) | io.vrfsb.set.bits(127, 64))) === 0.U)
 
   // ---------------------------------------------------------------------------
   // Outputs.
-  val outvalid = Wire(Vec(4, Bool()))
-  val cmdsync = Wire(Vec(4, Bool()))
+  val outvalid = VecInit((0 until p.instructionLanes).map(i => valid(i) && !depends(i)))
+  val cmdsync = VecInit((0 until p.instructionLanes).map(i => data(i).cmdsync))
 
-  for (i <- 0 until 4) {
-    outvalid(i) := valid(i) && !depends(i)
-    cmdsync(i) := data(i).cmdsync
-  }
-
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     // Synchronize commands at cmdsync instance or if found in history.
     // Note: {vdwinit, vdwconv, vdmulh}, vdmulh must not issue before vdwconv.
     val synchronize = cmdsync.asUInt(i,0) =/= 0.U
diff --git a/hdl/chisel/src/kelvin/vector/VInst.scala b/hdl/chisel/src/kelvin/vector/VInst.scala
index 8a1b42e..8757cea 100644
--- a/hdl/chisel/src/kelvin/vector/VInst.scala
+++ b/hdl/chisel/src/kelvin/vector/VInst.scala
@@ -44,11 +44,11 @@
   val op = Input(UInt(new VInstOp().Entries.W))
 }
 
-class VectorInstructionIO extends Bundle {
+class VectorInstructionIO(p: Parameters) extends Bundle {
   val valid = Output(Bool())
   val ready = Input(Bool())
   val stall = Input(Bool())
-  val lane = Vec(4, Valid(new VectorInstructionLane))
+  val lane = Vec(p.instructionLanes, Valid(new VectorInstructionLane))
 }
 
 class VectorInstructionLane extends Bundle {
@@ -68,14 +68,14 @@
 class VInst(p: Parameters) extends Module {
   val io = IO(new Bundle {
     // Decode cycle.
-    val in = Vec(4, new VInstIO)
+    val in = Vec(p.instructionLanes, new VInstIO)
 
     // Execute cycle.
-    val rs = Vec(8, Flipped(new RegfileReadDataIO))
-    val rd = Vec(4, Flipped(new RegfileWriteDataIO))
+    val rs = Vec(p.instructionLanes * 2, Flipped(new RegfileReadDataIO))
+    val rd = Vec(p.instructionLanes, Flipped(new RegfileWriteDataIO))
 
     // Vector interface.
-    val out = new VectorInstructionIO
+    val out = new VectorInstructionIO(p)
 
     // Status.
     val nempty = Output(Bool())
@@ -91,41 +91,34 @@
   val maxvlwm = (p.vectorBits * 4 / 32).U(p.vectorCountBits.W)
   assert(maxvlw >= 4.U)
 
-  val slice = Slice(Vec(4, new Bundle {
+  val slice = Slice(Vec(p.instructionLanes, new Bundle {
     val vld = Output(Bool())
     val vst = Output(Bool())
     val lane = Valid(new VectorInstructionLane)
   }), true)
 
-  val reqvalid = VecInit(io.in(0).valid && io.in(0).ready,
-                         io.in(1).valid && io.in(1).ready,
-                         io.in(2).valid && io.in(2).ready,
-                         io.in(3).valid && io.in(3).ready)
-
-  val reqaddr = VecInit(io.in(0).inst(19,15),
-                        io.in(1).inst(19,15),
-                        io.in(2).inst(19,15),
-                        io.in(3).inst(19,15))
+  val reqvalid = VecInit(io.in.map(x => x.valid && x.ready))
+  val reqaddr = VecInit(io.in.map(x => x.inst(19,15)))
 
   // ---------------------------------------------------------------------------
   // Response to Decode.
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     io.in(i).ready := !io.out.stall
   }
 
   // ---------------------------------------------------------------------------
   // Controls.
-  val vld_o = RegInit(VecInit(Seq.fill(4)(false.B)))
-  val vld_u = RegInit(VecInit(Seq.fill(4)(false.B)))
-  val vst_o = RegInit(VecInit(Seq.fill(4)(false.B)))
-  val vst_u = RegInit(VecInit(Seq.fill(4)(false.B)))
-  val vst_q = RegInit(VecInit(Seq.fill(4)(false.B)))
-  val getvl = RegInit(VecInit(Seq.fill(4)(false.B)))
-  val getmaxvl = RegInit(VecInit(Seq.fill(4)(false.B)))
+  val vld_o = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+  val vld_u = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+  val vst_o = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+  val vst_u = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+  val vst_q = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+  val getvl = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+  val getmaxvl = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
 
-  val rdAddr = Reg(Vec(4, UInt(5.W)))
+  val rdAddr = Reg(Vec(p.instructionLanes, UInt(5.W)))
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     when (reqvalid(i)) {
       rdAddr(i) := io.in(i).addr
     }
@@ -134,13 +127,13 @@
   // ---------------------------------------------------------------------------
   // Vector Interface.
   val vvalid = RegInit(false.B)
-  val vinstValid = RegInit(VecInit(Seq.fill(4)(false.B)))
-  val vinstInst = Reg(Vec(4, UInt(32.W)))
-  val nxtVinstValid = Wire(Vec(4, Bool()))
+  val vinstValid = RegInit(VecInit(Seq.fill(p.instructionLanes)(false.B)))
+  val vinstInst = Reg(Vec(p.instructionLanes, UInt(32.W)))
+  val nxtVinstValid = Wire(Vec(p.instructionLanes, Bool()))
 
   vvalid := nxtVinstValid.asUInt =/= 0.U
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     nxtVinstValid(i) := reqvalid(i) && (io.in(i).op(vinst.VLD) ||
                                         io.in(i).op(vinst.VST) ||
                                         io.in(i).op(vinst.VIOP))
@@ -148,7 +141,7 @@
     vinstInst(i) := io.in(i).inst
   }
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     val p = io.in(i).inst(28)  // func2
     val q = io.in(i).inst(30)  // func2
     vld_o(i) := reqvalid(i) && io.in(i).op(vinst.VLD) && !p
@@ -162,11 +155,11 @@
 
   // ---------------------------------------------------------------------------
   // Register write port.
-  val lsuAdder = Wire(Vec(4, UInt(32.W)))
-  val getvlValue = Wire(Vec(4, UInt(p.vectorCountBits.W)))  // bytes
-  val getmaxvlValue = Wire(Vec(4, UInt(p.vectorCountBits.W)))  // bytes
+  val lsuAdder = Wire(Vec(p.instructionLanes, UInt(32.W)))
+  val getvlValue = Wire(Vec(p.instructionLanes, UInt(p.vectorCountBits.W)))  // bytes
+  val getmaxvlValue = Wire(Vec(p.instructionLanes, UInt(p.vectorCountBits.W)))  // bytes
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     val rs1 = io.rs(2 * i + 0).data
     val rs2 = io.rs(2 * i + 1).data
     val m  = vinstInst(i)(5)
@@ -220,7 +213,7 @@
     lsuAdder(i) := rs1 + offset
   }
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     val len = Wire(UInt(p.vectorCountBits.W))  // bytes
     val rs1 = io.rs(2 * i + 0).data
     val rs2 = io.rs(2 * i + 1).data
@@ -247,7 +240,7 @@
     getmaxvlValue(i) := maxvl
   }
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     io.rd(i).valid := getvl(i) || getmaxvl(i) || vld_u(i) || vst_u(i) || vst_q(i)
     io.rd(i).addr := rdAddr(i)
 
@@ -267,7 +260,7 @@
   // Resolve back-pressure with stall to io.in in decode.
   assert(!(slice.io.in.valid && !slice.io.in.ready))
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     slice.io.in.bits(i).vld := vld_o(i) || vld_u(i)
     slice.io.in.bits(i).vst := vst_o(i) || vst_u(i) || vst_q(i)
     slice.io.in.bits(i).lane.valid := vinstValid(i)
@@ -276,7 +269,7 @@
     slice.io.in.bits(i).lane.bits.data := io.rs(2 * i + 1).data
   }
 
-  for (i <- 0 until 4) {
+  for (i <- 0 until p.instructionLanes) {
     io.out.lane(i) := slice.io.out.bits(i).lane
   }
 
@@ -290,8 +283,7 @@
   val nempty = RegInit(false.B)
 
   // Simple implementation, will overlap downstream units redundantly.
-  nempty := io.in(0).valid || io.in(1).valid || io.in(2).valid ||
-            io.in(3).valid || vvalid || io.out.valid
+  nempty := io.in.map(x => x.valid).reduce(_ || _) || vvalid || io.out.valid
 
   io.nempty := nempty
 }
diff --git a/hdl/chisel/src/kelvin/vector/VLd.scala b/hdl/chisel/src/kelvin/vector/VLd.scala
index 88b4d8d..bfbda33 100644
--- a/hdl/chisel/src/kelvin/vector/VLd.scala
+++ b/hdl/chisel/src/kelvin/vector/VLd.scala
@@ -30,7 +30,7 @@
 class VLd(p: Parameters) extends Module {
   val io = IO(new Bundle {
     // Instructions.
-    val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+    val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits))))
 
     // VRegfile.
     val write = new VRegfileWriteIO(p)
@@ -131,7 +131,7 @@
     0.U
   }
 
-  val q = VCmdq(cmdqDepth, new VLdCmdq, Fin, Fout, Factive)
+  val q = VCmdq(p, cmdqDepth, new VLdCmdq, Fin, Fout, Factive)
 
   q.io.in <> io.in
 
diff --git a/hdl/chisel/src/kelvin/vector/VLdSt.scala b/hdl/chisel/src/kelvin/vector/VLdSt.scala
index d2d9853..1aa3ee2 100644
--- a/hdl/chisel/src/kelvin/vector/VLdSt.scala
+++ b/hdl/chisel/src/kelvin/vector/VLdSt.scala
@@ -30,7 +30,7 @@
 class VLdSt(p: Parameters) extends Module {
   val io = IO(new Bundle {
     // Instructions.
-    val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+    val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits))))
     val active = Output(UInt(64.W))
 
     // VRegfile.
@@ -180,7 +180,7 @@
     active
   }
 
-  val q = VCmdq(cmdqDepth, new VLdStCmdq, Fin, Fout, Factive)
+  val q = VCmdq(p, cmdqDepth, new VLdStCmdq, Fin, Fout, Factive)
 
   q.io.in <> io.in
 
diff --git a/hdl/chisel/src/kelvin/vector/VRegfile.scala b/hdl/chisel/src/kelvin/vector/VRegfile.scala
index ac67ff0..fa75b05 100644
--- a/hdl/chisel/src/kelvin/vector/VRegfile.scala
+++ b/hdl/chisel/src/kelvin/vector/VRegfile.scala
@@ -113,13 +113,14 @@
 }
 
 class VRegfile(p: Parameters) extends Module {
-  val readPorts = 7
-  val writePorts = 6
-  val whintPorts = 4
+  val readPorts = p.vectorReadPorts
+  val scalarPorts = p.vectorScalarPorts
+  val writePorts = p.vectorWritePorts
+  val whintPorts = p.vectorWhintPorts
 
   val io = IO(new Bundle {
     val read = Vec(readPorts, Flipped(new VRegfileReadIO(p)))
-    val scalar = Vec(readPorts / 3, Flipped(new VRegfileScalarIO(p)))
+    val scalar = Vec(scalarPorts, Flipped(new VRegfileScalarIO(p)))
     val write = Vec(writePorts, Flipped(new VRegfileWrintIO(p)))
     val whint = Vec(whintPorts, Flipped(new VRegfileWhintIO(p)))
     val conv = Flipped(new VRegfileConvIO(p))
diff --git a/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala b/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala
index 90a4935..38451d7 100644
--- a/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala
+++ b/hdl/chisel/src/kelvin/vector/VRegfileSegment.scala
@@ -21,8 +21,8 @@
 import _root_.circt.stage.ChiselStage
 
 class VRegfileSegment(p: Parameters) extends Module {
-  val readPorts = 7
-  val writePorts = 6
+  val readPorts = p.vectorReadPorts
+  val writePorts = p.vectorWritePorts
   val tcnt = 16.min(p.vectorBits / 32)
 
   val io = IO(new Bundle {
diff --git a/hdl/chisel/src/kelvin/vector/VSt.scala b/hdl/chisel/src/kelvin/vector/VSt.scala
index f730fec..638f709 100644
--- a/hdl/chisel/src/kelvin/vector/VSt.scala
+++ b/hdl/chisel/src/kelvin/vector/VSt.scala
@@ -30,7 +30,7 @@
 class VSt(p: Parameters) extends Module {
   val io = IO(new Bundle {
     // Instructions.
-    val in = Flipped(Decoupled(Vec(4, Valid(new VDecodeBits))))
+    val in = Flipped(Decoupled(Vec(p.instructionLanes, Valid(new VDecodeBits))))
     val active = Output(UInt(64.W))
 
     // VRegfile.
@@ -182,7 +182,7 @@
     val strb = UInt((p.lsuDataBits / 8).W)
   }
 
-  val q = VCmdq(cmdqDepth, new VStCmdq, Fin, Fout, Factive)
+  val q = VCmdq(p, cmdqDepth, new VStCmdq, Fin, Fout, Factive)
 
   val ctrl = Slice(new Ctrl, false, true)
   val data = Slice(new Data, false, true, true)
diff --git a/tests/verilator_sim/kelvin/core_tb.cc b/tests/verilator_sim/kelvin/core_tb.cc
index 969e361..73396ab 100644
--- a/tests/verilator_sim/kelvin/core_tb.cc
+++ b/tests/verilator_sim/kelvin/core_tb.cc
@@ -195,14 +195,14 @@
   core.io_slog_addr(io_slog_addr);
   core.io_slog_data(io_slog_data);
   core.io_debug_en(io_debug_en);
-  core.io_debug_addr0(io_debug_addr0);
-  core.io_debug_addr1(io_debug_addr1);
-  core.io_debug_addr2(io_debug_addr2);
-  core.io_debug_addr3(io_debug_addr3);
-  core.io_debug_inst0(io_debug_inst0);
-  core.io_debug_inst1(io_debug_inst1);
-  core.io_debug_inst2(io_debug_inst2);
-  core.io_debug_inst3(io_debug_inst3);
+  core.io_debug_addr_0(io_debug_addr0);
+  core.io_debug_addr_1(io_debug_addr1);
+  core.io_debug_addr_2(io_debug_addr2);
+  core.io_debug_addr_3(io_debug_addr3);
+  core.io_debug_inst_0(io_debug_inst0);
+  core.io_debug_inst_1(io_debug_inst1);
+  core.io_debug_inst_2(io_debug_inst2);
+  core.io_debug_inst_3(io_debug_inst3);
   core.io_debug_cycles(io_debug_cycles);
 
   mif.clock(tb.clock);