feat(bus): Add TileLink-UL primitives

This commit introduces a collection of primitive modules for building
TileLink-UL interconnects, including FIFOs, sockets, and a width
bridge.

The new modules are:
- TlulFifoSync: A synchronous TileLink FIFO with optional spare
  side-channels.
- TlulFifoAsync: An asynchronous TileLink FIFO for clock domain
  crossing, built on the rocket-chip AsyncQueue.
- TlulSocket1N: A 1-to-N socket for steering requests from a single
  host to one of N devices.
- TlulSocketM1: An M-to-1 socket that arbitrates requests from M
  hosts to a single device using a round-robin arbiter.
- TlulWidthBridge: A bridge for connecting TileLink-UL buses of
  different widths.

Each of these modules is accompanied by a comprehensive cocotb test
suite to ensure its correctness.

Change-Id: I2ca34caad9332b0621a68957c043a91deee45999
diff --git a/hdl/chisel/src/bus/BUILD b/hdl/chisel/src/bus/BUILD
index 3e2be71..d793ef6 100644
--- a/hdl/chisel/src/bus/BUILD
+++ b/hdl/chisel/src/bus/BUILD
@@ -37,12 +37,18 @@
         "KelvinToTlul.scala",
         "SecdedEncoderTestbench.scala",
         "TileLinkUL.scala",
+        "TlulFifoAsync.scala",
+        "TlulFifoSync.scala",
         "TlulIntegrity.scala",
         "TlulIntegrityTestbench.scala",
+        "TlulSocket1N.scala",
+        "TlulSocketM1.scala",
+        "TlulWidthBridge.scala",
     ],
     deps = [
-        "//hdl/chisel/src/kelvin:kelvin_params",
         "//hdl/chisel/src/common",
+        "//hdl/chisel/src/kelvin:kelvin_params",
+        "@chipsalliance_rocket_chip//:asyncqueue",
     ],
 )
 
@@ -60,6 +66,65 @@
     module_name = "TLUL2Axi",
 )
 
+chisel_cc_library(
+    name = "tlul_fifo_async_128_cc_library",
+    chisel_lib = ":bus",
+    emit_class = "bus.TlulFifoAsync128Emitter",
+    module_name = "TlulFifoAsync128",
+)
+
+chisel_cc_library(
+    name = "tlul_socket_1n_128_cc_library",
+    chisel_lib = ":bus",
+    emit_class = "bus.TlulSocket1N_128Emitter",
+    module_name = "TlulSocket1N_128",
+)
+
+chisel_cc_library(
+    name = "tlul_fifo_sync_cc_library",
+    chisel_lib = ":bus",
+    emit_class = "bus.TlulFifoSyncEmitter",
+    module_name = "TlulFifoSync",
+)
+
+chisel_cc_library(
+    name = "tlul_socket_m1_2_128_cc_library",
+    chisel_lib = ":bus",
+    emit_class = "bus.TlulSocketM1_2_128Emitter",
+    module_name = "TlulSocketM1_2_128",
+)
+
+chisel_cc_library(
+    name = "tlul_socket_m1_3_128_cc_library",
+    chisel_lib = ":bus",
+    emit_class = "bus.TlulSocketM1_3_128Emitter",
+    module_name = "TlulSocketM1_3_128",
+)
+
+verilator_cocotb_model(
+    name = "tlul_socket_m1_2_128_model",
+    cflags = VERILATOR_BUILD_ARGS,
+    hdl_toplevel = "TlulSocketM1_2_128",
+    trace = True,
+    verilog_source = "//hdl/chisel/src/bus:TlulSocketM1_2_128.sv",
+)
+
+verilator_cocotb_model(
+    name = "tlul_fifo_async_128_model",
+    cflags = VERILATOR_BUILD_ARGS,
+    hdl_toplevel = "TlulFifoAsync128",
+    trace = True,
+    verilog_source = "//hdl/chisel/src/bus:TlulFifoAsync128.sv",
+)
+
+verilator_cocotb_model(
+    name = "tlul_fifo_sync_model",
+    cflags = VERILATOR_BUILD_ARGS,
+    hdl_toplevel = "TlulFifoSync",
+    trace = True,
+    verilog_source = "//hdl/chisel/src/bus:TlulFifoSync.sv",
+)
+
 verilator_cocotb_model(
     name = "axi2tlul_model",
     cflags = VERILATOR_BUILD_ARGS,
@@ -91,6 +156,14 @@
     verilog_source = "//hdl/chisel/src/bus:TlulIntegrityTestbench.sv",
 )
 
+verilator_cocotb_model(
+    name = "tlul_socket_1n_128_model",
+    cflags = VERILATOR_BUILD_ARGS,
+    hdl_toplevel = "TlulSocket1N_128",
+    trace = True,
+    verilog_source = "//hdl/chisel/src/bus:TlulSocket1N_128.sv",
+)
+
 chisel_cc_library(
     name = "secded_encoder_testbench_cc_library",
     chisel_lib = ":bus",
diff --git a/hdl/chisel/src/bus/TlulFifoAsync.scala b/hdl/chisel/src/bus/TlulFifoAsync.scala
new file mode 100644
index 0000000..98ad167
--- /dev/null
+++ b/hdl/chisel/src/bus/TlulFifoAsync.scala
@@ -0,0 +1,62 @@
+package bus
+
+import chisel3._
+import freechips.rocketchip.util.{AsyncQueue, AsyncQueueParams}
+import kelvin.Parameters
+
+class TlulFifoAsync(
+    p: TLULParameters,
+    reqDepth: Int = 4,
+    rspDepth: Int = 4,
+    moduleName: String = "TlulFifoAsync"
+) extends RawModule {
+  override val desiredName = moduleName
+
+  val io = IO(new Bundle {
+    val clk_h_i = Input(Clock())
+    val rst_h_i = Input(Bool())
+    val clk_d_i = Input(Clock())
+    val rst_d_i = Input(Bool())
+    val tl_h = Flipped(new OpenTitanTileLink.Host2Device(p))
+    val tl_d = new OpenTitanTileLink.Host2Device(p)
+  })
+
+  val req_queue = Module(new AsyncQueue(new OpenTitanTileLink.A_Channel(p), AsyncQueueParams(depth = reqDepth)))
+  req_queue.io.enq_clock := io.clk_h_i
+  req_queue.io.enq_reset := io.rst_h_i
+  req_queue.io.deq_clock := io.clk_d_i
+  req_queue.io.deq_reset := io.rst_d_i
+  req_queue.io.enq <> io.tl_h.a
+  io.tl_d.a <> req_queue.io.deq
+
+  val rsp_queue = Module(new AsyncQueue(new OpenTitanTileLink.D_Channel(p), AsyncQueueParams(depth = rspDepth)))
+  rsp_queue.io.enq_clock := io.clk_d_i
+  rsp_queue.io.enq_reset := io.rst_d_i
+  rsp_queue.io.deq_clock := io.clk_h_i
+  rsp_queue.io.deq_reset := io.rst_h_i
+  rsp_queue.io.enq <> io.tl_d.d
+  io.tl_h.d <> rsp_queue.io.deq
+}
+
+import _root_.circt.stage.{ChiselStage, FirtoolOption}
+import chisel3.stage.ChiselGeneratorAnnotation
+import scala.annotation.nowarn
+
+@nowarn
+object TlulFifoAsync128Emitter extends App {
+  val p = new Parameters
+  p.lsuDataBits = 128
+  (new ChiselStage).execute(
+    Array("--target", "systemverilog") ++ args,
+    Seq(
+      ChiselGeneratorAnnotation(() =>
+        new TlulFifoAsync(
+          p = new bus.TLULParameters(p),
+          reqDepth = 1,
+          rspDepth = 1,
+          moduleName = "TlulFifoAsync128"
+        )
+      )
+    ) ++ Seq(FirtoolOption("-enable-layers=Verification"))
+  )
+}
diff --git a/hdl/chisel/src/bus/TlulFifoSync.scala b/hdl/chisel/src/bus/TlulFifoSync.scala
new file mode 100644
index 0000000..92b45ba
--- /dev/null
+++ b/hdl/chisel/src/bus/TlulFifoSync.scala
@@ -0,0 +1,121 @@
+package bus
+
+import chisel3._
+import chisel3.util._
+
+class TlulFifoSync(
+    p: TLULParameters,
+    reqDepth: Int = 2,
+    rspDepth: Int = 2,
+    reqPass: Boolean = true, // Equivalent to flow=true in Queue
+    rspPass: Boolean = true, // Equivalent to flow=true in Queue
+    spareReqW: Int = 1,
+    spareRspW: Int = 1,
+    moduleName: String = "TlulFifoSync"
+) extends Module {
+  require(reqDepth > 0 || reqPass, "reqDepth cannot be 0 if reqPass is false")
+  require(rspDepth > 0 || rspPass, "rspDepth cannot be 0 if rspPass is false")
+
+  override val desiredName = moduleName
+  val io = IO(new Bundle {
+    // Host-facing interface
+    val host = Flipped(new OpenTitanTileLink.Host2Device(p))
+
+    // Device-facing interface
+    val device = new OpenTitanTileLink.Host2Device(p)
+
+    // Spare side channels
+    val spare_req_i = Input(UInt(spareReqW.W))
+    val spare_req_o = Output(UInt(spareReqW.W))
+    val spare_rsp_i = Input(UInt(spareRspW.W))
+    val spare_rsp_o = Output(UInt(spareRspW.W))
+  })
+
+  // A bundle to hold the TileLink A channel data plus the spare bits
+  class AChannelWithSpare extends Bundle {
+    val a = new OpenTitanTileLink.A_Channel(p)
+    val spare = UInt(spareReqW.W)
+  }
+
+  // A bundle to hold the TileLink D channel data plus the spare bits
+  class DChannelWithSpare extends Bundle {
+    val d = new OpenTitanTileLink.D_Channel(p)
+    val spare = UInt(spareRspW.W)
+  }
+
+  // Request FIFO (Host to Device)
+  if (reqDepth > 0) {
+    val reqFifo = Module(new Queue(new AChannelWithSpare, reqDepth, flow = reqPass))
+    reqFifo.io.enq.valid := io.host.a.valid
+    io.host.a.ready := reqFifo.io.enq.ready
+    reqFifo.io.enq.bits.a := io.host.a.bits
+    reqFifo.io.enq.bits.spare := io.spare_req_i
+
+    io.device.a.valid := reqFifo.io.deq.valid
+    reqFifo.io.deq.ready := io.device.a.ready
+    io.device.a.bits := reqFifo.io.deq.bits.a
+    io.spare_req_o := reqFifo.io.deq.bits.spare
+  } else {
+    io.device.a.valid := io.host.a.valid
+    io.host.a.ready := io.device.a.ready
+    io.device.a.bits := io.host.a.bits
+    io.spare_req_o := io.spare_req_i
+  }
+
+  // Response FIFO (Device to Host)
+  val device_d_bits_sanitized = Wire(chiselTypeOf(io.device.d.bits))
+  device_d_bits_sanitized := io.device.d.bits
+  device_d_bits_sanitized.data := Mux(
+    io.device.d.bits.opcode === TLULOpcodesD.AccessAckData.asUInt,
+    io.device.d.bits.data,
+    0.U
+  )
+
+  if (rspDepth > 0) {
+    val rspFifo =
+      Module(new Queue(new DChannelWithSpare, rspDepth, flow = rspPass))
+    rspFifo.io.enq.valid := io.device.d.valid
+    io.device.d.ready := rspFifo.io.enq.ready
+    rspFifo.io.enq.bits.d := device_d_bits_sanitized
+    rspFifo.io.enq.bits.spare := io.spare_rsp_i
+
+    io.host.d.valid := rspFifo.io.deq.valid
+    rspFifo.io.deq.ready := io.host.d.ready
+    io.host.d.bits := rspFifo.io.deq.bits.d
+    io.spare_rsp_o := rspFifo.io.deq.bits.spare
+  } else {
+    io.host.d.valid := io.device.d.valid
+    io.device.d.ready := io.host.d.ready
+    io.host.d.bits := device_d_bits_sanitized
+    io.spare_rsp_o := io.spare_rsp_i
+  }
+}
+
+import _root_.circt.stage.{ChiselStage, FirtoolOption}
+import chisel3.stage.ChiselGeneratorAnnotation
+import scala.annotation.nowarn
+
+@nowarn
+object TlulFifoSyncEmitter extends App {
+  val p = new kelvin.Parameters
+  (new ChiselStage).execute(
+    Array("--target", "systemverilog") ++ args,
+    Seq(ChiselGeneratorAnnotation(() => new TlulFifoSync(new bus.TLULParameters(p)))) ++ Seq(FirtoolOption("-enable-layers=Verification"))
+  )
+}
+
+@nowarn
+object EmitTlulFifoSyncDepth0 extends App {
+  val p = new kelvin.Parameters
+  p.lsuDataBits = 128
+  (new ChiselStage).execute(
+    Array("--target", "systemverilog") ++ args,
+    Seq(ChiselGeneratorAnnotation(() => new TlulFifoSync(
+      p = new bus.TLULParameters(p),
+      reqDepth = 0,
+      rspDepth = 0,
+      spareReqW = 4,
+      moduleName = "TlulFifoSync_Depth0"
+    ))) ++ Seq(FirtoolOption("-enable-layers=Verification"))
+  )
+}
diff --git a/hdl/chisel/src/bus/TlulSocket1N.scala b/hdl/chisel/src/bus/TlulSocket1N.scala
new file mode 100644
index 0000000..046b3fd
--- /dev/null
+++ b/hdl/chisel/src/bus/TlulSocket1N.scala
@@ -0,0 +1,219 @@
+package bus
+
+import chisel3._
+import chisel3.util._
+import common.MakeInvalid
+import kelvin.Parameters
+
+// A simple error responder that immediately generates an error response
+// for any incoming request.
+class TlulErrorResponder(p: TLULParameters) extends Module {
+  val io = IO(new Bundle {
+    val tl_h = Flipped(new OpenTitanTileLink.Host2Device(p))
+  })
+
+  io.tl_h.a.ready := true.B
+
+  val d = RegInit(MakeInvalid(new OpenTitanTileLink.D_Channel(p)))
+
+  d.valid := io.tl_h.a.fire
+  d.bits.size := Mux(io.tl_h.a.fire, io.tl_h.a.bits.size, d.bits.size)
+  d.bits.source := Mux(io.tl_h.a.fire, io.tl_h.a.bits.source, d.bits.source)
+  d.bits.opcode := TLULOpcodesD.AccessAck.asUInt
+  d.bits.param := 0.U
+  d.bits.sink := 0.U
+  d.bits.data := 0.U
+  d.bits.error := true.B
+  d.bits.user.rsp_intg := 0.U
+  d.bits.user.data_intg := 0.U
+
+  io.tl_h.d.valid := d.valid
+  io.tl_h.d.bits := d.bits
+}
+
+class TlulSocket1N(
+    p: TLULParameters,
+    N: Int = 4,
+    HReqPass: Boolean = true,
+    HRspPass: Boolean = true,
+    DReqPass: Seq[Boolean] = Nil,
+    DRspPass: Seq[Boolean] = Nil,
+    HReqDepth: Int = 1,
+    HRspDepth: Int = 1,
+    DReqDepth: Seq[Int] = Nil,
+    DRspDepth: Seq[Int] = Nil,
+    ExplicitErrs: Boolean = true,
+    moduleName: String = "TlulSocket1N"
+) extends Module {
+  val DReqPass_ = if (DReqPass.isEmpty) Seq.fill(N)(true) else DReqPass
+  val DRspPass_ = if (DRspPass.isEmpty) Seq.fill(N)(true) else DRspPass
+  val DReqDepth_ = if (DReqDepth.isEmpty) Seq.fill(N)(1) else DReqDepth
+  val DRspDepth_ = if (DRspDepth.isEmpty) Seq.fill(N)(1) else DRspDepth
+  override val desiredName = moduleName
+  val NWD = if (ExplicitErrs) log2Ceil(N + 1) else log2Ceil(N)
+
+  val io = IO(new Bundle {
+    val tl_h = Flipped(new OpenTitanTileLink.Host2Device(p))
+    val tl_d = Vec(N, new OpenTitanTileLink.Host2Device(p))
+    val dev_select_i = Input(UInt(NWD.W))
+  })
+
+  // Host-side FIFO
+  val fifo_h = Module(
+    new TlulFifoSync(
+      p,
+      reqDepth = HReqDepth,
+      rspDepth = HRspDepth,
+      reqPass = HReqPass,
+      rspPass = HRspPass,
+      spareReqW = NWD
+    )
+  )
+
+  fifo_h.io.host <> io.tl_h
+  fifo_h.io.spare_req_i := io.dev_select_i
+  fifo_h.io.spare_rsp_i := 0.U // Tie off unused spare port
+  val dev_select_t = fifo_h.io.spare_req_o
+
+  // Outstanding request tracking
+  val maxOutstanding = 1 << p.o
+  val outstandingW = log2Ceil(maxOutstanding + 1)
+  val num_req_outstanding = RegInit(0.U(outstandingW.W))
+  val dev_select_outstanding = RegInit(0.U(NWD.W))
+  val accept_t_req = fifo_h.io.device.a.fire
+  val accept_t_rsp = fifo_h.io.device.d.fire
+
+  when(accept_t_req) {
+    dev_select_outstanding := dev_select_t
+    when(!accept_t_rsp) {
+      num_req_outstanding := num_req_outstanding + 1.U
+    }
+  }.elsewhen(accept_t_rsp) {
+    num_req_outstanding := num_req_outstanding - 1.U
+  }
+
+  val hold_all_requests =
+    (num_req_outstanding =/= 0.U) && (dev_select_t =/= dev_select_outstanding)
+
+  // Device-side FIFOs and steering logic
+  val tl_u_o = Wire(Vec(N + 1, new OpenTitanTileLink.Host2Device(p)))
+  val tl_u_i = Wire(Vec(N + 1, new OpenTitanTileLink.Host2Device(p)))
+
+  val blanked_auser = Wire(new OpenTitanTileLink_A_User)
+  blanked_auser.rsvd := fifo_h.io.device.a.bits.user.rsvd
+  blanked_auser.instr_type := fifo_h.io.device.a.bits.user.instr_type
+  blanked_auser.cmd_intg := 0.U // Simplified for now
+  blanked_auser.data_intg := 0.U // Simplified for now
+
+  for (i <- 0 until N) {
+    val dev_select = (dev_select_t === i.U) && !hold_all_requests
+
+    tl_u_o(i).a.valid := fifo_h.io.device.a.valid && dev_select
+    tl_u_o(i).a.bits := fifo_h.io.device.a.bits
+    tl_u_o(i).a.bits.user := Mux(
+      dev_select,
+      fifo_h.io.device.a.bits.user,
+      blanked_auser
+    )
+    tl_u_o(i).d.ready := fifo_h.io.device.d.ready
+
+    val fifo_d = Module(
+      new TlulFifoSync(
+        p,
+        reqDepth = DReqDepth_(i),
+        rspDepth = DRspDepth_(i),
+        reqPass = DReqPass_(i),
+        rspPass = DRspPass_(i)
+      )
+    )
+    fifo_d.io.host.a <> tl_u_o(i).a
+    io.tl_d(i).a <> fifo_d.io.device.a
+    tl_u_i(i).a := fifo_d.io.device.a
+
+    tl_u_o(i).d <> fifo_d.io.host.d
+    io.tl_d(i).d <> fifo_d.io.device.d
+    tl_u_i(i).d := fifo_d.io.device.d
+
+    fifo_d.io.spare_req_i := 0.U
+    fifo_d.io.spare_rsp_i := 0.U
+  }
+
+  // Error responder instantiation
+  if (ExplicitErrs && (1 << NWD) > N) {
+    val err_resp = Module(new TlulErrorResponder(p))
+    tl_u_o(N).a.valid := fifo_h.io.device.a.valid && (dev_select_t >= N.U) && !hold_all_requests
+    tl_u_o(N).a.bits := fifo_h.io.device.a.bits
+    tl_u_o(N).d.ready := fifo_h.io.device.d.ready
+    err_resp.io.tl_h.a <> tl_u_o(N).a
+    tl_u_o(N).d <> err_resp.io.tl_h.d
+
+    tl_u_i(N).a.ready := err_resp.io.tl_h.a.ready
+    tl_u_i(N).d <> err_resp.io.tl_h.d
+    tl_u_i(N).d.ready := true.B
+
+    // Tie off unused outputs of the wire to prevent "not fully initialized" errors
+    tl_u_i(N).a.valid := false.B
+    tl_u_i(N).a.bits := 0.U.asTypeOf(new OpenTitanTileLink.A_Channel(p))
+  } else {
+    tl_u_o(N).a.valid := false.B
+    tl_u_o(N).a.bits := DontCare
+    tl_u_o(N).d.ready := false.B
+    tl_u_i(N).a.ready := false.B
+    tl_u_i(N).d.valid := false.B
+    tl_u_i(N).d.bits := DontCare
+    tl_u_i(N).d.ready := false.B
+  }
+
+  // Response path selection
+  val hfifo_reqready = Mux(
+    hold_all_requests,
+    false.B,
+    MuxCase(
+      // Default to error responder ready if it exists
+      if (ExplicitErrs && (1 << NWD) > N) tl_u_o(N).a.ready else true.B,
+      (0 until N).map(i => (dev_select_t === i.U) -> tl_u_o(i).a.ready)
+    )
+  )
+  fifo_h.io.device.a.ready := fifo_h.io.device.a.valid && hfifo_reqready
+
+  val tl_t_p = MuxCase(
+    // Default to error responder if it exists
+    tl_u_i(N).d.bits,
+    (0 until N).map(i =>
+      (dev_select_outstanding === i.U) -> tl_u_i(i).d.bits
+    )
+  )
+  val d_valid = MuxCase(
+    tl_u_i(N).d.valid,
+    (0 until N).map(i => (dev_select_outstanding === i.U) -> tl_u_i(i).d.valid)
+  )
+
+  fifo_h.io.device.d.valid := d_valid
+  fifo_h.io.device.d.bits := tl_t_p
+}
+
+import _root_.circt.stage.{ChiselStage, FirtoolOption}
+import chisel3.stage.ChiselGeneratorAnnotation
+import scala.annotation.nowarn
+
+@nowarn
+object TlulSocket1N_128Emitter extends App {
+  val p = new Parameters
+  p.lsuDataBits = 128
+  (new ChiselStage).execute(
+    Array("--target", "systemverilog") ++ args,
+    Seq(
+      ChiselGeneratorAnnotation(() =>
+        new TlulSocket1N(
+          p = new bus.TLULParameters(p),
+          N = 4, // Default value, will be overridden at instantiation
+          DReqPass = Seq.fill(4)(true),
+          DRspPass = Seq.fill(4)(true),
+          DReqDepth = Seq.fill(4)(1),
+          DRspDepth = Seq.fill(4)(1),
+          moduleName = "TlulSocket1N_128"
+        )
+      )
+    ) ++ Seq(FirtoolOption("-enable-layers=Verification"))
+  )
+}
diff --git a/hdl/chisel/src/bus/TlulSocketM1.scala b/hdl/chisel/src/bus/TlulSocketM1.scala
new file mode 100644
index 0000000..a00789d
--- /dev/null
+++ b/hdl/chisel/src/bus/TlulSocketM1.scala
@@ -0,0 +1,155 @@
+package bus
+
+import chisel3._
+import chisel3.util._
+import common.KelvinRRArbiter
+import kelvin.Parameters
+
+
+class TlulFifoSync_(p: TLULParameters,
+                    reqDepth: Int,
+                    rspDepth: Int,
+                    reqPass: Boolean,
+                    rspPass: Boolean,
+                    socketName: String)
+  extends TlulFifoSync(p, reqDepth, rspDepth, reqPass, rspPass) {
+  override val desiredName = s"${socketName}_TlulFifoSync_d${reqDepth}r${rspDepth}"
+}
+
+
+
+class TlulSocketM1(
+    p: TLULParameters,
+    M: Int = 4,
+    HReqPass: Seq[Boolean] = Nil,
+    HRspPass: Seq[Boolean] = Nil,
+    HReqDepth: Seq[Int] = Nil,
+    HRspDepth: Seq[Int] = Nil,
+    DReqPass: Boolean = true,
+    DRspPass: Boolean = true,
+    DReqDepth: Int = 1,
+    DRspDepth: Int = 1,
+    moduleName: String = "TlulSocketM1"
+) extends Module {
+  val HReqPass_ = if (HReqPass.isEmpty) Seq.fill(M)(true) else HReqPass
+  val HRspPass_ = if (HRspPass.isEmpty) Seq.fill(M)(true) else HRspPass
+  val HReqDepth_ = if (HReqDepth.isEmpty) Seq.fill(M)(1) else HReqDepth
+  val HRspDepth_ = if (HRspDepth.isEmpty) Seq.fill(M)(1) else HRspDepth
+  override val desiredName = moduleName
+  val StIdW = log2Ceil(M)
+
+  val io = IO(new Bundle {
+    val tl_h = Flipped(Vec(M, new OpenTitanTileLink.Host2Device(p)))
+    val tl_d = new OpenTitanTileLink.Host2Device(p)
+  })
+
+  // Host-side FIFOs
+  val hreq_fifo_o = Wire(Vec(M, Decoupled(new OpenTitanTileLink.A_Channel(p))))
+  val hrsp_fifo_i = Wire(Vec(M, Flipped(Decoupled(new OpenTitanTileLink.D_Channel(p)))))
+
+  for (i <- 0 until M) {
+    val hreq_fifo_i = Wire(new OpenTitanTileLink.A_Channel(p))
+    hreq_fifo_i := io.tl_h(i).a.bits
+    hreq_fifo_i.source := Cat(io.tl_h(i).a.bits.source, i.U(StIdW.W))
+
+    val fifo = Module(new TlulFifoSync_(
+      p,
+      reqDepth = HReqDepth_(i),
+      rspDepth = HRspDepth_(i),
+      reqPass = HReqPass_(i),
+      rspPass = HRspPass_(i),
+      socketName = moduleName
+    ))
+    fifo.io.host.a.valid := io.tl_h(i).a.valid
+    fifo.io.host.a.bits := hreq_fifo_i
+    io.tl_h(i).a.ready := fifo.io.host.a.ready
+
+    hreq_fifo_o(i) <> fifo.io.device.a
+
+    io.tl_h(i).d <> fifo.io.host.d
+    fifo.io.device.d <> hrsp_fifo_i(i)
+
+    fifo.io.spare_req_i := 0.U
+    fifo.io.spare_rsp_i := 0.U
+  }
+
+  // Arbiter
+  val arb = Module(new KelvinRRArbiter(new OpenTitanTileLink.A_Channel(p), M, moduleName = Some(s"${moduleName}_KelvinRRArbiter_${M}")))
+  for (i <- 0 until M) {
+    arb.io.in(i) <> hreq_fifo_o(i)
+  }
+
+  // Device-side FIFO
+  val dfifo = Module(new TlulFifoSync_(
+    p,
+    reqDepth = DReqDepth,
+    rspDepth = DRspDepth,
+    reqPass = DReqPass,
+    rspPass = DRspPass,
+    socketName = moduleName
+  ))
+
+  dfifo.io.host.a <> arb.io.out
+  io.tl_d.a <> dfifo.io.device.a
+  dfifo.io.device.d <> io.tl_d.d
+  dfifo.io.spare_req_i := 0.U
+  dfifo.io.spare_rsp_i := 0.U
+
+  // Response steering
+  val rsp_arb_grant = Mux(io.tl_d.d.valid, UIntToOH(io.tl_d.d.bits.source(StIdW - 1, 0)), 0.U(M.W))
+  for (i <- 0 until M) {
+    hrsp_fifo_i(i).valid := io.tl_d.d.valid && rsp_arb_grant(i)
+    hrsp_fifo_i(i).bits := io.tl_d.d.bits
+    hrsp_fifo_i(i).bits.source := io.tl_d.d.bits.source >> StIdW
+  }
+  io.tl_d.d.ready := (VecInit(hrsp_fifo_i.map(_.ready)).asUInt & rsp_arb_grant).orR
+  dfifo.io.host.d.ready := (VecInit(hrsp_fifo_i.map(_.ready)).asUInt & rsp_arb_grant).orR
+}
+
+import _root_.circt.stage.{ChiselStage, FirtoolOption}
+import chisel3.stage.ChiselGeneratorAnnotation
+import scala.annotation.nowarn
+
+@nowarn
+object TlulSocketM1_2_128Emitter extends App {
+  val p = new Parameters
+  p.lsuDataBits = 128
+  (new ChiselStage).execute(
+    Array("--target", "systemverilog") ++ args,
+    Seq(
+      ChiselGeneratorAnnotation(() =>
+        new TlulSocketM1(
+          p = new bus.TLULParameters(p),
+          M = 2,
+          HReqDepth = Seq.fill(2)(0),
+          HRspDepth = Seq.fill(2)(0),
+          DReqDepth = 0,
+          DRspDepth = 0,
+          moduleName = "TlulSocketM1_2_128"
+        )
+      )
+    ) ++ Seq(FirtoolOption("-enable-layers=Verification"))
+  )
+}
+
+@nowarn
+object TlulSocketM1_3_128Emitter extends App {
+  val p = new Parameters
+  p.lsuDataBits = 128
+  (new ChiselStage).execute(
+    Array("--target", "systemverilog") ++ args,
+    Seq(
+      ChiselGeneratorAnnotation(() =>
+        new TlulSocketM1(
+          p = new bus.TLULParameters(p),
+          M = 3,
+          HReqDepth = Seq.fill(3)(0),
+          HRspDepth = Seq.fill(3)(0),
+          DReqDepth = 0,
+          DRspDepth = 0,
+          moduleName = "TlulSocketM1_3_128"
+        )
+      )
+    ) ++ Seq(FirtoolOption("-enable-layers=Verification"))
+  )
+}
diff --git a/hdl/chisel/src/bus/TlulWidthBridge.scala b/hdl/chisel/src/bus/TlulWidthBridge.scala
new file mode 100644
index 0000000..4feecee
--- /dev/null
+++ b/hdl/chisel/src/bus/TlulWidthBridge.scala
@@ -0,0 +1,218 @@
+package bus
+
+import chisel3._
+import chisel3.util._
+import common.FifoX
+
+class TlulWidthBridge(val host_p: TLULParameters, val device_p: TLULParameters) extends RawModule {
+  val io = IO(new Bundle {
+    val clk_i = Input(Clock())
+    val rst_ni = Input(Reset())
+
+    val tl_h = Flipped(new OpenTitanTileLink.Host2Device(host_p))
+    val tl_d = new OpenTitanTileLink.Host2Device(device_p)
+
+    val fault_a_o = Output(Bool())
+    val fault_d_o = Output(Bool())
+  })
+
+  withClockAndReset(io.clk_i, !io.rst_ni.asBool) {
+    // ==========================================================================
+    // Parameters and Constants
+    // ==========================================================================
+    val hostWidth = host_p.w * 8
+    val deviceWidth = device_p.w * 8
+
+    // Default fault outputs
+    io.fault_a_o := false.B
+    io.fault_d_o := false.B
+
+    // ==========================================================================
+    // Wide to Narrow Path (e.g., 128-bit host to 32-bit device)
+    // ==========================================================================
+    if (hostWidth > deviceWidth) {
+      val ratio = hostWidth / deviceWidth
+      val narrowBytes = deviceWidth / 8
+      val hostBytes = hostWidth / 8
+
+      // ------------------------------------------------------------------------
+      // Response Path (D Channel): Assemble narrow responses into a wide one
+      // ------------------------------------------------------------------------
+      val d_data_reg = RegInit(VecInit(Seq.fill(ratio)(0.U(deviceWidth.W))))
+      val d_resp_reg = RegInit(0.U.asTypeOf(new OpenTitanTileLink.D_Channel(host_p)))
+      val d_valid_reg = RegInit(false.B)
+      val beat_count = RegInit(0.U(log2Ceil(ratio+1).W))
+      val d_fault_reg = RegInit(false.B)
+
+      val d_check = Module(new ResponseIntegrityCheck(device_p))
+      d_check.io.d_i := io.tl_d.d.bits
+      io.fault_d_o := d_fault_reg
+
+      val d_gen = Module(new ResponseIntegrityGen(host_p))
+      val wide_resp = Wire(new OpenTitanTileLink.D_Channel(host_p))
+      wide_resp := d_resp_reg
+
+      val req_info_q = Module(new Queue(new Bundle {
+        val source = UInt(host_p.o.W)
+        val beats = UInt(log2Ceil(ratio+1).W)
+        val offset = UInt(log2Ceil(hostBytes).W)
+        val size = UInt(host_p.z.W)
+      }, 2))
+
+      wide_resp.source := req_info_q.io.deq.bits.source
+      wide_resp.size := req_info_q.io.deq.bits.size
+      d_gen.io.d_i := wide_resp
+
+      io.tl_d.d.ready := !d_valid_reg
+      io.tl_h.d.valid := d_valid_reg
+      io.tl_h.d.bits := d_gen.io.d_o
+      io.tl_h.d.bits.data := (d_data_reg.asUInt >> (req_info_q.io.deq.bits.offset << 3.U)).asUInt
+      io.tl_h.d.bits.error := d_resp_reg.error || d_fault_reg
+
+      when(io.tl_d.d.fire) {
+        // On the first beat, clear any fault and check for a new one.
+        // On subsequent beats, make the fault sticky.
+        when(beat_count === 0.U) {
+          d_fault_reg := d_check.io.fault
+        }.otherwise {
+          when(d_check.io.fault) {
+            d_fault_reg := true.B
+          }
+        }
+
+        val beat_index = (io.tl_d.d.bits.source - req_info_q.io.deq.bits.source)(log2Ceil(ratio)-1, 0)
+        d_data_reg(beat_index) := io.tl_d.d.bits.data
+        d_resp_reg := io.tl_d.d.bits
+        d_resp_reg.size := req_info_q.io.deq.bits.size
+        beat_count := beat_count + 1.U
+        when(beat_count === (req_info_q.io.deq.bits.beats - 1.U)) {
+          d_valid_reg := true.B
+        }
+      }
+
+      when(io.tl_h.d.fire) {
+        d_valid_reg := false.B
+        d_fault_reg := false.B
+        beat_count := 0.U
+        req_info_q.io.deq.ready := true.B
+      }.otherwise {
+        req_info_q.io.deq.ready := false.B
+      }
+
+      // ------------------------------------------------------------------------
+      // Request Path (A Channel): Split wide request into multiple narrow ones
+      // ------------------------------------------------------------------------
+      val a_check = Module(new RequestIntegrityCheck(host_p))
+      a_check.io.a_i := io.tl_h.a.bits
+      io.fault_a_o := a_check.io.fault
+
+      val req_fifo = Module(new FifoX(new OpenTitanTileLink.A_Channel(device_p), ratio, ratio + 1))
+
+      val beats = Wire(Vec(ratio, Valid(new OpenTitanTileLink.A_Channel(device_p))))
+      req_fifo.io.in.bits := beats
+
+      val is_write = io.tl_h.a.bits.opcode === TLULOpcodesA.PutFullData.asUInt ||
+                     io.tl_h.a.bits.opcode === TLULOpcodesA.PutPartialData.asUInt
+
+      val align_mask = (~(hostBytes - 1).U(host_p.a.W))
+      val aligned_address = io.tl_h.a.bits.address & align_mask
+      val address_offset = io.tl_h.a.bits.address(log2Ceil(hostBytes) - 1, 0)
+
+      val size_in_bytes = 1.U << io.tl_h.a.bits.size
+      val read_mask = (((1.U << size_in_bytes) - 1.U) << address_offset)(hostBytes - 1, 0)
+      val effective_mask = Mux(is_write, io.tl_h.a.bits.mask, read_mask)
+
+      for (i <- 0 until ratio) {
+        val req_gen = Module(new RequestIntegrityGen(device_p))
+
+        val narrow_req = Wire(new OpenTitanTileLink.A_Channel(device_p))
+        narrow_req.opcode := Mux(is_write, TLULOpcodesA.PutPartialData.asUInt, io.tl_h.a.bits.opcode)
+        narrow_req.param   := io.tl_h.a.bits.param
+        narrow_req.size    := log2Ceil(device_p.w).U
+        narrow_req.source  := io.tl_h.a.bits.source + i.U
+        narrow_req.address := aligned_address + (i * narrowBytes).U
+        val narrow_mask = (effective_mask >> (i * narrowBytes)).asUInt(narrowBytes-1, 0)
+        narrow_req.mask    := narrow_mask
+        narrow_req.data    := (io.tl_h.a.bits.data >> (i * deviceWidth)).asUInt
+        narrow_req.user    := io.tl_h.a.bits.user
+
+        req_gen.io.a_i := narrow_req
+        beats(i).bits := req_gen.io.a_o
+        beats(i).valid := narrow_mask =/= 0.U
+      }
+
+      io.tl_d.a <> req_fifo.io.out
+      req_fifo.io.in.valid := io.tl_h.a.valid && !a_check.io.fault && req_info_q.io.enq.ready
+      io.tl_h.a.ready := req_fifo.io.in.ready && !a_check.io.fault && req_info_q.io.enq.ready
+
+      val total_beats = PopCount(beats.map(_.valid))
+
+      req_info_q.io.enq.valid := io.tl_h.a.fire
+      req_info_q.io.enq.bits.source := io.tl_h.a.bits.source
+      req_info_q.io.enq.bits.beats := total_beats
+      req_info_q.io.enq.bits.offset := address_offset
+      req_info_q.io.enq.bits.size := io.tl_h.a.bits.size
+      assert(!req_info_q.io.enq.valid || req_info_q.io.enq.ready)
+
+    // ==========================================================================
+    // Narrow to Wide Path (e.g., 32-bit host to 128-bit device)
+    // ==========================================================================
+    } else if (hostWidth < deviceWidth) {
+      val wideBytes = deviceWidth / 8
+      val numSourceIds = 1 << host_p.i
+      val addr_lsb_width = log2Ceil(wideBytes)
+      val index_width = log2Ceil(numSourceIds)
+      val addr_lsb_regs = RegInit(VecInit(Seq.fill(numSourceIds)(0.U(addr_lsb_width.W))))
+
+      val req_addr_lsb = io.tl_h.a.bits.address(addr_lsb_width - 1, 0)
+
+      when (io.tl_h.a.fire) {
+        addr_lsb_regs(io.tl_h.a.bits.source(index_width-1, 0)) := req_addr_lsb
+      }
+
+      val a_check = Module(new RequestIntegrityCheck(host_p))
+      a_check.io.a_i := io.tl_h.a.bits
+      io.fault_a_o := a_check.io.fault
+
+      val a_gen = Module(new RequestIntegrityGen(device_p))
+      val wide_req = Wire(new OpenTitanTileLink.A_Channel(device_p))
+      wide_req.opcode  := io.tl_h.a.bits.opcode
+      wide_req.param   := io.tl_h.a.bits.param
+      wide_req.size    := io.tl_h.a.bits.size
+      wide_req.source  := io.tl_h.a.bits.source
+      wide_req.address := io.tl_h.a.bits.address
+      wide_req.user    := io.tl_h.a.bits.user
+      wide_req.mask    := (io.tl_h.a.bits.mask.asUInt << req_addr_lsb).asUInt
+      wide_req.data    := (io.tl_h.a.bits.data.asUInt << (req_addr_lsb << 3.U)).asUInt
+      a_gen.io.a_i := wide_req
+
+      io.tl_d.a.valid := io.tl_h.a.valid && !a_check.io.fault
+      io.tl_d.a.bits := a_gen.io.a_o
+      io.tl_h.a.ready := io.tl_d.a.ready && !a_check.io.fault
+
+      val d_check = Module(new ResponseIntegrityCheck(device_p))
+      d_check.io.d_i := io.tl_d.d.bits
+      io.fault_d_o := d_check.io.fault
+
+      val d_gen = Module(new ResponseIntegrityGen(host_p))
+      val narrow_resp = Wire(new OpenTitanTileLink.D_Channel(host_p))
+      val resp_addr_lsb = addr_lsb_regs(io.tl_d.d.bits.source(index_width-1, 0))
+      narrow_resp := io.tl_d.d.bits
+      narrow_resp.source := io.tl_d.d.bits.source
+      narrow_resp.data := (io.tl_d.d.bits.data >> (resp_addr_lsb << 3.U)).asUInt
+      narrow_resp.error := io.tl_d.d.bits.error || d_check.io.fault
+      d_gen.io.d_i := narrow_resp
+
+      io.tl_h.d.valid := io.tl_d.d.valid
+      io.tl_h.d.bits := d_gen.io.d_o
+      io.tl_d.d.ready := io.tl_h.d.ready
+
+    // ==========================================================================
+    // Equal Widths Path
+    // ==========================================================================
+    } else {
+      // Widths are equal, just pass through
+      io.tl_d <> io.tl_h
+    }
+  }
+}
diff --git a/hdl/chisel/src/common/KelvinArbiter.scala b/hdl/chisel/src/common/KelvinArbiter.scala
index f3271ae..bf878ba 100644
--- a/hdl/chisel/src/common/KelvinArbiter.scala
+++ b/hdl/chisel/src/common/KelvinArbiter.scala
@@ -49,4 +49,6 @@
     when(validMask(i)) { choice := i.asUInt }
 }
 
-class KelvinRRArbiter[T <: Data](val gen: T, val n: Int) extends InitedLockingRRArbiter[T](gen, n, 1)
+class KelvinRRArbiter[T <: Data](val gen: T, val n: Int, moduleName: Option[String] = None) extends InitedLockingRRArbiter[T](gen, n, 1) {
+  override val desiredName = moduleName.getOrElse(super.desiredName)
+}
diff --git a/tests/cocotb/tlul/BUILD b/tests/cocotb/tlul/BUILD
index 4740777..33c324b 100644
--- a/tests/cocotb/tlul/BUILD
+++ b/tests/cocotb/tlul/BUILD
@@ -91,6 +91,111 @@
     vcs_defines = VCS_DEFINES,
 )
 
+# BEGIN_TESTCASES_FOR_tlul_fifo_async_128_cocotb_test
+TLUL_FIFO_ASYNC_TESTCASES = [
+    "test_async_crossing",
+]
+# END_TESTCASES_FOR_tlul_fifo_async_128_cocotb_test
+
+cocotb_test_suite(
+    name = "tlul_fifo_async_128_cocotb_test",
+    simulators = ["verilator", "vcs"],
+    testcases = TLUL_FIFO_ASYNC_TESTCASES,
+    testcases_vname = "TLUL_FIFO_ASYNC_TESTCASES",
+    tests_kwargs = {
+        "hdl_toplevel": "TlulFifoAsync128",
+        "test_module": ["test_tlul_fifo_async.py"],
+        "deps": [
+            "//kelvin_test_utils:TileLinkULInterface",
+        ],
+        "waves": True,
+    },
+    verilator_model = "//hdl/chisel/src/bus:tlul_fifo_async_128_model",
+    vcs_verilog_sources = ["//hdl/chisel/src/bus:tlul_fifo_async_128_cc_library_verilog"],
+    vcs_build_args = VCS_BUILD_ARGS,
+    vcs_test_args = VCS_TEST_ARGS,
+    vcs_defines = VCS_DEFINES,
+)
+
+# BEGIN_TESTCASES_FOR_tlul_fifo_sync_cocotb_test
+TLUL_FIFO_SYNC_TESTCASES = [
+    "test_passthrough_with_spare",
+]
+# END_TESTCASES_FOR_tlul_fifo_sync_cocotb_test
+
+cocotb_test_suite(
+    name = "tlul_fifo_sync_cocotb_test",
+    simulators = ["verilator", "vcs"],
+    testcases = TLUL_FIFO_SYNC_TESTCASES,
+    testcases_vname = "TLUL_FIFO_SYNC_TESTCASES",
+    tests_kwargs = {
+        "hdl_toplevel": "TlulFifoSync",
+        "test_module": ["test_tlul_fifo_sync.py"],
+        "deps": [
+            "//kelvin_test_utils:TileLinkULInterface",
+        ],
+        "waves": True,
+    },
+    verilator_model = "//hdl/chisel/src/bus:tlul_fifo_sync_model",
+    vcs_verilog_sources = ["//hdl/chisel/src/bus:tlul_fifo_sync_cc_library_verilog"],
+    vcs_build_args = VCS_BUILD_ARGS,
+    vcs_test_args = VCS_TEST_ARGS,
+    vcs_defines = VCS_DEFINES,
+)
+
+# BEGIN_TESTCASES_FOR_tlul_socket_1n_128_cocotb_test
+TLUL_SOCKET_1N_TESTCASES = [
+    "test_steering",
+    "test_error_response",
+]
+# END_TESTCASES_FOR_tlul_socket_1n_128_cocotb_test
+
+cocotb_test_suite(
+    name = "tlul_socket_1n_128_cocotb_test",
+    simulators = ["verilator", "vcs"],
+    testcases = TLUL_SOCKET_1N_TESTCASES,
+    testcases_vname = "TLUL_SOCKET_1N_TESTCASES",
+    tests_kwargs = {
+        "hdl_toplevel": "TlulSocket1N_128",
+        "test_module": ["test_tlul_socket_1n.py"],
+        "deps": [
+            "//kelvin_test_utils:TileLinkULInterface",
+        ],
+        "waves": True,
+    },
+    verilator_model = "//hdl/chisel/src/bus:tlul_socket_1n_128_model",
+    vcs_verilog_sources = ["//hdl/chisel/src/bus:tlul_socket_1n_128_cc_library_verilog"],
+    vcs_build_args = VCS_BUILD_ARGS,
+    vcs_test_args = VCS_TEST_ARGS,
+    vcs_defines = VCS_DEFINES,
+)
+
+# BEGIN_TESTCASES_FOR_tlul_socket_m1_2_128_cocotb_test
+TLUL_SOCKET_M1_2_TESTCASES = [
+    "test_arbitration",
+]
+# END_TESTCASES_FOR_tlul_socket_m1_2_128_cocotb_test
+
+cocotb_test_suite(
+    name = "tlul_socket_m1_2_128_cocotb_test",
+    simulators = ["verilator", "vcs"],
+    testcases = TLUL_SOCKET_M1_2_TESTCASES,
+    testcases_vname = "TLUL_SOCKET_M1_2_TESTCASES",
+    tests_kwargs = {
+        "hdl_toplevel": "TlulSocketM1_2_128",
+        "test_module": ["test_tlul_socket_m1.py"],
+        "deps": [
+            "//kelvin_test_utils:TileLinkULInterface",
+        ],
+        "waves": True,
+    },
+    verilator_model = "//hdl/chisel/src/bus:tlul_socket_m1_2_128_model",
+    vcs_verilog_sources = ["//hdl/chisel/src/bus:tlul_socket_m1_2_128_cc_library_verilog"],
+    vcs_build_args = VCS_BUILD_ARGS,
+    vcs_test_args = VCS_TEST_ARGS,
+    vcs_defines = VCS_DEFINES,
+)
+
 # BEGIN_TESTCASES_FOR_tlul_integrity_cocotb_test
 TLUL_INTEGRITY_TESTCASES = [
     "test_request_integrity_gen",
@@ -121,8 +226,6 @@
     vcs_defines = VCS_DEFINES,
 )
 
-
-
 # BEGIN_TESTCASES_FOR_secded_encoder_cocotb_test
 SECDED_ENCODER_TESTCASES = [
     "test_secded_encoder",
diff --git a/tests/cocotb/tlul/test_tlul_fifo_async.py b/tests/cocotb/tlul/test_tlul_fifo_async.py
new file mode 100644
index 0000000..e4972e0
--- /dev/null
+++ b/tests/cocotb/tlul/test_tlul_fifo_async.py
@@ -0,0 +1,77 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cocotb
+from cocotb.clock import Clock
+from cocotb.triggers import RisingEdge, ClockCycles
+
+from kelvin_test_utils.TileLinkULInterface import TileLinkULInterface, create_a_channel_req
+
+
+async def setup_dut(dut):
+    """Common setup for all tests."""
+    h_clock = Clock(dut.io_clk_h_i, 10)
+    d_clock = Clock(dut.io_clk_d_i, 13)  # Asymmetric clocks
+    cocotb.start_soon(h_clock.start())
+    cocotb.start_soon(d_clock.start())
+
+    dut.io_rst_h_i.value = 1
+    dut.io_rst_d_i.value = 1
+    await ClockCycles(dut.io_clk_h_i, 2)
+    await ClockCycles(dut.io_clk_d_i, 2)
+    dut.io_rst_h_i.value = 0
+    dut.io_rst_d_i.value = 0
+    await RisingEdge(dut.io_clk_h_i)
+    await RisingEdge(dut.io_clk_d_i)
+
+
+@cocotb.test()
+async def test_async_crossing(dut):
+    """Verify requests are arbitrated and responses are routed correctly."""
+    await setup_dut(dut)
+
+    host_if = TileLinkULInterface(dut,
+                                  host_if_name="io_tl_h",
+                                  clock_name="io_clk_h_i",
+                                  reset_name="io_rst_h_i")
+    device_if = TileLinkULInterface(dut,
+                                    device_if_name="io_tl_d",
+                                    clock_name="io_clk_d_i",
+                                    reset_name="io_rst_d_i")
+
+    req = create_a_channel_req(address=0x1000,
+                               data=0x11223344,
+                               mask=0xF,
+                               source=1)
+
+    # Start a concurrent task to handle the device-side interaction
+    async def device_responder():
+        req_seen = await device_if.device_get_request()
+        assert req_seen["source"] == req["source"]
+        await device_if.device_respond(opcode=0,
+                                       param=0,
+                                       size=req_seen["size"],
+                                       source=req_seen["source"])
+
+    device_task = cocotb.start_soon(device_responder())
+
+    # Send the request from the host
+    await host_if.host_put(req)
+
+    # Wait for the response on the host side
+    response = await host_if.host_get_response()
+    assert response["source"] == req["source"]
+
+    # Wait for the device task to complete
+    await device_task
diff --git a/tests/cocotb/tlul/test_tlul_fifo_sync.py b/tests/cocotb/tlul/test_tlul_fifo_sync.py
new file mode 100644
index 0000000..a71844b
--- /dev/null
+++ b/tests/cocotb/tlul/test_tlul_fifo_sync.py
@@ -0,0 +1,104 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cocotb
+from cocotb.clock import Clock
+from cocotb.triggers import RisingEdge, ClockCycles, Event
+
+from kelvin_test_utils.TileLinkULInterface import TileLinkULInterface, create_a_channel_req
+
+
+async def setup_dut(dut):
+    """Common setup for all tests."""
+    cocotb.start_soon(Clock(dut.clock, 10, unit="us").start())
+
+    dut.reset.value = 1
+    await ClockCycles(dut.clock, 2)
+    dut.reset.value = 0
+    await RisingEdge(dut.clock)
+
+
+@cocotb.test()
+async def test_passthrough_with_spare(dut):
+    """Test basic data transfer and spare channels through the FIFO."""
+    await setup_dut(dut)
+    host_if = TileLinkULInterface(dut, host_if_name="io_host")
+    device_if = TileLinkULInterface(dut, device_if_name="io_device")
+
+    # Create a simple PutFullData request
+    a_data = create_a_channel_req(address=0x1000,
+                                  data=0x11223344,
+                                  mask=0xF,
+                                  width=32)
+    spare_req_val = 1
+    spare_rsp_val = 0
+
+    # Create a concurrent task that acts as the device model
+    async def device_model():
+        # Wait for the request from the DUT (coming from the host)
+        req = await device_if.device_get_request()
+
+        # Verify the request is what we expect
+        assert req["opcode"] == a_data["opcode"], f"Request opcode mismatch"
+        assert req["param"] == a_data["param"], f"Request param mismatch"
+        assert req["size"] == a_data["size"], f"Request size mismatch"
+        assert req["source"] == a_data["source"], f"Request source mismatch"
+        assert req["address"] == a_data["address"], f"Request address mismatch"
+        assert req["mask"] == a_data["mask"], f"Request mask mismatch"
+        assert req["data"] == a_data["data"], f"Request data mismatch"
+        for field, value in a_data["user"].items():
+            assert req["user"][
+                field] == value, f"Request user.{field} mismatch"
+
+        # Check spare request channel
+        assert dut.io_spare_req_o.value == spare_req_val, "Spare request data mismatch"
+
+        # Drive spare response channel before sending the main response
+        dut.io_spare_rsp_i.value = spare_rsp_val
+
+        # Send a simple AccessAck response
+        await device_if.device_respond(
+            opcode=0,  # AccessAck
+            param=0,
+            size=req["size"],
+            source=req["source"])
+
+    # Start the device model task
+    device_task = cocotb.start_soon(device_model())
+
+    # Drive spare request channel before sending the main request
+    dut.io_spare_req_i.value = spare_req_val
+
+    # Drive the transaction from the host side
+    await host_if.host_put(a_data)
+
+    # Wait for the response on the host side
+    response = await host_if.host_get_response()
+
+    # Verify the response
+    assert response["opcode"] == 0, "Response opcode mismatch"
+    assert response["param"] == 0, "Response param mismatch"
+    assert response["size"] == a_data["size"], "Response size mismatch"
+    assert response["source"] == a_data["source"], "Response source mismatch"
+    assert response["sink"] == 0, "Response sink mismatch"
+    assert response["data"] == 0, "Response data mismatch"
+    assert response["error"] == 0, "Response error mismatch"
+    assert response["user"]["rsp_intg"] != 0, "Response user.rsp_intg should not be zero"
+    assert response["user"]["data_intg"] != 0, "Response user.data_intg should not be zero"
+
+    # Check spare response channel
+    assert dut.io_spare_rsp_o.value == spare_rsp_val, "Spare response data mismatch"
+
+    # Ensure the device model task completed successfully
+    await device_task
diff --git a/tests/cocotb/tlul/test_tlul_socket_1n.py b/tests/cocotb/tlul/test_tlul_socket_1n.py
new file mode 100644
index 0000000..80ab150
--- /dev/null
+++ b/tests/cocotb/tlul/test_tlul_socket_1n.py
@@ -0,0 +1,92 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cocotb
+from cocotb.clock import Clock
+from cocotb.triggers import FallingEdge, RisingEdge, ClockCycles, with_timeout
+import random
+
+from kelvin_test_utils.TileLinkULInterface import TileLinkULInterface, create_a_channel_req
+
+
+async def setup_dut(dut):
+    """Common setup for all tests."""
+    clock = Clock(dut.clock, 10)
+    cocotb.start_soon(clock.start())
+    dut.reset.value = 1
+    await ClockCycles(dut.clock, 2)
+    dut.reset.value = 0
+    await RisingEdge(dut.clock)
+
+
+@cocotb.test()
+async def test_steering(dut):
+    """Verify requests are steered to the correct device port."""
+    await setup_dut(dut)
+
+    N = 4  # This is hardcoded in the Chisel emitter for now
+    host_if = TileLinkULInterface(dut, host_if_name="io_tl_h")
+    device_ifs = [
+        TileLinkULInterface(dut, device_if_name=f"io_tl_d_{i}")
+        for i in range(N)
+    ]
+
+    async def device_responder(device_if, i):
+        req_seen = await device_if.device_get_request()
+        await device_if.device_respond(opcode=0,
+                                       param=0,
+                                       size=req_seen["size"],
+                                       source=req_seen["source"])
+
+    # Start all device responders
+    for i in range(N):
+        cocotb.start_soon(device_responder(device_ifs[i], i))
+
+    for i in range(N):
+        dut.io_dev_select_i.value = i
+        req = create_a_channel_req(address=0x1000 + i * 0x100,
+                                   data=0x11223344 + i,
+                                   mask=0xF,
+                                   source=i)
+
+        await host_if.host_put(req)
+        response = await host_if.host_get_response()
+
+        assert response["source"] == i
+        # TODO(atv): Can we do this better?
+        # Allow some time for the device responder to process the request
+        await ClockCycles(dut.clock, 5)
+
+
+@cocotb.test()
+async def test_error_response(dut):
+    """Verify error response for out-of-bounds dev_select."""
+    await setup_dut(dut)
+
+    N = 4  # This is hardcoded in the Chisel emitter for now
+    host_if = TileLinkULInterface(dut, host_if_name="io_tl_h")
+
+    # dev_select_i is NWD bits wide, where NWD = ceil(log2(N+1))
+    # So, a value of N should be out of bounds and trigger an error
+    dut.io_dev_select_i.value = N
+    req = create_a_channel_req(address=0xBAD,
+                               data=0xBAD,
+                               mask=0xF,
+                               source=(1 << 6) - 1)
+
+    await host_if.host_put(req)
+    response = await host_if.host_get_response()
+
+    assert response["error"] == 1
+    assert response["source"] == req["source"]
diff --git a/tests/cocotb/tlul/test_tlul_socket_m1.py b/tests/cocotb/tlul/test_tlul_socket_m1.py
new file mode 100644
index 0000000..6061ef2
--- /dev/null
+++ b/tests/cocotb/tlul/test_tlul_socket_m1.py
@@ -0,0 +1,81 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cocotb
+from cocotb.clock import Clock
+from cocotb.triggers import RisingEdge, ClockCycles, with_timeout
+import math
+import random
+
+from kelvin_test_utils.TileLinkULInterface import TileLinkULInterface, create_a_channel_req
+
+
+async def setup_dut(dut):
+    """Common setup for all tests."""
+    clock = Clock(dut.clock, 10)
+    cocotb.start_soon(clock.start())
+    dut.reset.value = 1
+    await ClockCycles(dut.clock, 2)
+    dut.reset.value = 0
+    await RisingEdge(dut.clock)
+
+
+@cocotb.test()
+async def test_arbitration(dut):
+    """Verify requests are arbitrated and responses are routed correctly."""
+    await setup_dut(dut)
+
+    M = 0
+    while hasattr(dut, f"io_tl_h_{M}_a_valid"):
+        M += 1
+
+    StIdW = math.ceil(math.log2(M))
+
+    host_ifs = [
+        TileLinkULInterface(dut, host_if_name=f"io_tl_h_{i}") for i in range(M)
+    ]
+    device_if = TileLinkULInterface(dut, device_if_name="io_tl_d")
+
+    reqs = {
+        i:
+        create_a_channel_req(address=0x1000 + i * 0x100,
+                             data=0x11223344 + i,
+                             mask=0xF,
+                             source=i)
+        for i in range(M)
+    }
+    received_reqs = {}
+
+    async def device_responder():
+        while len(received_reqs) < M:
+            req_seen = await device_if.device_get_request()
+            host_index = req_seen["source"].to_unsigned() & ((1 << StIdW) - 1)
+            assert req_seen["source"].to_unsigned(
+            ) >> StIdW == reqs[host_index]["source"]
+            received_reqs[host_index] = req_seen
+            await device_if.device_respond(opcode=0,
+                                           param=0,
+                                           size=req_seen["size"],
+                                           source=req_seen["source"])
+
+    device_task = cocotb.start_soon(device_responder())
+
+    for i in range(M):
+        await host_ifs[i].host_put(reqs[i])
+
+    for i in range(M):
+        response = await host_ifs[i].host_get_response()
+        assert response["source"] == reqs[i]["source"]
+
+    await with_timeout(device_task, 1000)