feat(hdl): Add Chisel TL-UL <-> AXI bridges and CoreTlul

Change-Id: I2ffc39a7d559eb64074c214c18e5f46e30f84aa1
diff --git a/hdl/chisel/src/bus/Axi2TLUL.scala b/hdl/chisel/src/bus/Axi2TLUL.scala
new file mode 100644
index 0000000..f0a8e57
--- /dev/null
+++ b/hdl/chisel/src/bus/Axi2TLUL.scala
@@ -0,0 +1,99 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package bus
+
+import chisel3._
+import chisel3.util._
+
+import kelvin.Parameters
+
+/**
+  * Axi2TLUL: A Chisel module that serves as a bridge between an AXI4 master
+  * and a TileLink-UL slave.
+  *
+  * AXI reads are issued as TileLink Get and AXI writes as PutFullData; D-channel
+  * responses are steered back to the AXI R or B channel based on their opcode.
+  * Only single-beat AXI transactions (len=0) are supported; bursts trip an assert.
+  *
+  * @param p The Kelvin parameters.
+  * @param userAGen Generator for the A-channel user field (always driven to zero).
+  * @param userDGen Generator for the D-channel user field (not consumed here).
+  */
+class Axi2TLUL[A_USER <: Data, D_USER <: Data](p: Parameters, userAGen: () => A_USER, userDGen: () => D_USER) extends Module {
+  val tlul_p = new TLULParameters(p)
+  val io = IO(new Bundle {
+    val axi = Flipped(new AxiMasterIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits))
+    val tl_a = Decoupled(new TileLink_A_ChannelBase(tlul_p, userAGen)) // TileLink Output
+    val tl_d = Flipped(Decoupled(new TileLink_D_ChannelBase(tlul_p, userDGen))) // TileLink Input
+  })
+
+  assert(io.axi.read.addr.bits.len === 0.U || !io.axi.read.addr.valid, "Axi2TLUL: AXI read bursts not supported")
+  assert(io.axi.write.addr.bits.len === 0.U || !io.axi.write.addr.valid, "Axi2TLUL: AXI write bursts not supported")
+
+  // Two-entry queues buffer the AXI request channels ahead of the TileLink A channel.
+  val read_addr_q = Queue(io.axi.read.addr, entries = 2)
+  val write_addr_q = Queue(io.axi.write.addr, entries = 2)
+  val write_data_q = Queue(io.axi.write.data, entries = 2)
+
+  // A write can be issued only once both its address and its data beat are queued.
+  val is_write = write_addr_q.valid && write_data_q.valid
+  val is_read = read_addr_q.valid
+  // Reads are prioritized: a write is presented only when no read is pending.
+  val do_write = is_write && !is_read
+  io.tl_a.valid := is_read || is_write
+  // Pop only the queue(s) backing the request on tl_a, so a lone AW or W beat is never dropped.
+  read_addr_q.ready := is_read && io.tl_a.ready
+  write_addr_q.ready := do_write && io.tl_a.ready
+  write_data_q.ready := do_write && io.tl_a.ready
+
+  io.tl_a.bits.opcode := Mux(is_read, TLULOpcodesA.Get.asUInt, TLULOpcodesA.PutFullData.asUInt)
+  io.tl_a.bits.param := 0.U
+  io.tl_a.bits.address := Mux(is_read, read_addr_q.bits.addr, write_addr_q.bits.addr)
+  io.tl_a.bits.source := Mux(is_read, read_addr_q.bits.id, write_addr_q.bits.id)
+  io.tl_a.bits.size := Mux(is_read, read_addr_q.bits.size, write_addr_q.bits.size)
+  io.tl_a.bits.mask := Mux(is_read, 0.U, write_data_q.bits.strb)
+  io.tl_a.bits.data := Mux(is_read, 0.U, write_data_q.bits.data)
+  io.tl_a.bits.user      := 0.U.asTypeOf(io.tl_a.bits.user)
+
+  val d_is_write = io.tl_d.bits.opcode === TLULOpcodesD.AccessAck.asUInt
+  val d_is_read = io.tl_d.bits.opcode === TLULOpcodesD.AccessAckData.asUInt
+  // A TileLink error is reported as AXI SLVERR (0b10) on both response channels.
+  val d_resp = Mux(io.tl_d.bits.error, "b10".U, "b00".U)
+
+  io.axi.write.resp.valid := io.tl_d.valid && d_is_write
+  io.axi.write.resp.bits.id := io.tl_d.bits.source
+  io.axi.write.resp.bits.resp := d_resp
+
+  io.axi.read.data.valid := io.tl_d.valid && d_is_read
+  io.axi.read.data.bits.id := io.tl_d.bits.source
+  io.axi.read.data.bits.data := io.tl_d.bits.data
+  io.axi.read.data.bits.resp := d_resp
+  io.axi.read.data.bits.last := true.B
+
+  io.tl_d.ready := Mux(d_is_read, io.axi.read.data.ready, io.axi.write.resp.ready)
+}
+
+import _root_.circt.stage.{ChiselStage,FirtoolOption}
+import chisel3.stage.ChiselGeneratorAnnotation
+import scala.annotation.nowarn
+
+@nowarn
+object EmitAxi2TLUL extends App {
+  val p = Parameters()
+  // Elaborate the bridge with empty user fields; extra command-line args are forwarded to ChiselStage.
+  (new ChiselStage).execute(
+    Array("--target", "systemverilog") ++ args,
+    Seq(ChiselGeneratorAnnotation(() => new Axi2TLUL(p, () => new NoUser, () => new NoUser)), FirtoolOption("-enable-layers=Verification")))
+}
diff --git a/hdl/chisel/src/bus/BUILD b/hdl/chisel/src/bus/BUILD
index cbc3eff..9a6f1e3 100644
--- a/hdl/chisel/src/bus/BUILD
+++ b/hdl/chisel/src/bus/BUILD
@@ -15,7 +15,10 @@
 load(
     "@kelvin_hw//rules:chisel.bzl",
     "chisel_library",
+    "chisel_cc_library",
 )
+load("@kelvin_hw//rules:coco_tb.bzl", "cocotb_test_suite", "verilator_cocotb_model")
+load("@kelvin_hw//third_party/python:requirements.bzl", "requirement")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -23,11 +26,101 @@
     name = "bus",
     srcs = [
         "Axi.scala",
+        "TLUL2Axi.scala",
+        "Axi2TLUL.scala",
         "KelvinMemIO.scala",
         "KelvinToTlul.scala",
         "TileLinkUL.scala",
     ],
     deps = [
         "//hdl/chisel/src/kelvin:kelvin_params",
+        "//hdl/chisel/src/common",
     ],
 )
+
+# TLUL2Axi bridge: SystemVerilog emission, Verilator model and cocotb tests.
+chisel_cc_library(
+    name = "tlul2axi_cc_library",
+    chisel_lib = ":bus",
+    emit_class = "bus.EmitTLUL2Axi",
+    module_name = "TLUL2Axi",
+)
+
+verilator_cocotb_model(
+    name = "tlul2axi_model",
+    hdl_toplevel = "TLUL2Axi",
+    verilog_source = "//hdl/chisel/src/bus:TLUL2Axi.sv",
+    cflags = [],
+    trace = True,
+)
+
+# BEGIN_TESTCASES_FOR_tlul2axi_cocotb_test
+TLUL2AXI_TESTCASES = [
+    "test_put_request",
+    "test_get_request",
+    "test_backpressure",
+    "test_put_then_get",
+]
+# END_TESTCASES_FOR_tlul2axi_cocotb_test
+
+cocotb_test_suite(
+    name = "tlul2axi_cocotb_test",
+    simulators = ["verilator", "vcs"],
+    testcases = TLUL2AXI_TESTCASES,
+    testcases_vname = "TLUL2AXI_TESTCASES",
+    tests_kwargs = {
+        "hdl_toplevel": "TLUL2Axi",
+        "test_module": ["tlul2axi_cocotb_test.py"],
+        "size": "large",
+        "deps": [
+            "@bazel_tools//tools/python/runfiles",
+            requirement("tqdm"),
+        ],
+        "waves": True,
+    },
+    vcs_verilog_sources = ["//hdl/chisel/src/bus:tlul2axi_cc_library_verilog"],
+    verilator_model = ":tlul2axi_model",
+)
+
+# Axi2TLUL bridge: SystemVerilog emission, Verilator model and cocotb tests.
+chisel_cc_library(
+    name = "axi2tlul_cc_library",
+    chisel_lib = ":bus",
+    emit_class = "bus.EmitAxi2TLUL",
+    module_name = "Axi2TLUL",
+)
+
+verilator_cocotb_model(
+    name = "axi2tlul_model",
+    hdl_toplevel = "Axi2TLUL",
+    verilog_source = "//hdl/chisel/src/bus:Axi2TLUL.sv",
+    cflags = [],
+    trace = True,
+)
+
+# BEGIN_TESTCASES_FOR_axi2tlul_cocotb_test
+AXI2TLUL_TESTCASES = [
+    "test_write_request",
+    "test_read_request",
+    "test_read_error",
+]
+# END_TESTCASES_FOR_axi2tlul_cocotb_test
+
+cocotb_test_suite(
+    name = "axi2tlul_cocotb_test",
+    simulators = ["verilator", "vcs"],
+    testcases = AXI2TLUL_TESTCASES,
+    testcases_vname = "AXI2TLUL_TESTCASES",
+    tests_kwargs = {
+        "hdl_toplevel": "Axi2TLUL",
+        "test_module": ["axi2tlul_cocotb_test.py"],
+        "size": "large",
+        "deps": [
+            "@bazel_tools//tools/python/runfiles",
+            requirement("tqdm"),
+        ],
+        "waves": True,
+    },
+    vcs_verilog_sources = ["//hdl/chisel/src/bus:axi2tlul_cc_library_verilog"],
+    verilator_model = ":axi2tlul_model",
+)
\ No newline at end of file
diff --git a/hdl/chisel/src/bus/TLUL2Axi.scala b/hdl/chisel/src/bus/TLUL2Axi.scala
new file mode 100644
index 0000000..4bfb5b7
--- /dev/null
+++ b/hdl/chisel/src/bus/TLUL2Axi.scala
@@ -0,0 +1,160 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package bus
+
+import chisel3._
+import chisel3.util._
+import kelvin.Parameters
+import common.KelvinRRArbiter
+import _root_.circt.stage.{ChiselStage,FirtoolOption}
+import chisel3.stage.ChiselGeneratorAnnotation
+import scala.annotation.nowarn
+
+/**
+  * TLUL2Axi: A Chisel module that serves as a bridge between a TileLink-UL master
+  * and an AXI4 slave.
+  *
+  * This module translates TileLink Get and Put operations into AXI read and write
+  * transactions, respectively. It uses a dataflow approach with queues and an
+  * arbiter to manage the protocol conversion.
+  *
+  * @param p The Kelvin parameters.
+  * @param userAGen Generator for the A-channel user field (not consumed here).
+  * @param userDGen Generator for the D-channel user field (always driven to zero).
+  */
+class TLUL2Axi[A_USER <: Data, D_USER <: Data](p: Parameters, userAGen: () => A_USER, userDGen: () => D_USER) extends Module {
+  val tlul_p = new TLULParameters(p)
+  val io = IO(new Bundle {
+    val tl_a = Flipped(Decoupled(new TileLink_A_ChannelBase(tlul_p, userAGen))) // TileLink Input
+    val tl_d = Decoupled(new TileLink_D_ChannelBase(tlul_p, userDGen))          // TileLink Output
+    val axi = new AxiMasterIO(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits)
+  })
+  // --- Queue for incoming TileLink A-Channel requests ---
+  val tl_a_q = Queue(io.tl_a, 2)
+
+  val is_get = tl_a_q.bits.opcode === TLULOpcodesA.Get.asUInt
+  val is_put = tl_a_q.bits.opcode === TLULOpcodesA.PutFullData.asUInt ||
+               tl_a_q.bits.opcode === TLULOpcodesA.PutPartialData.asUInt
+
+  // --- Response bookkeeping: source/size of issued requests, consumed in FIFO order ---
+  class TxInfo extends Bundle {
+    val source = UInt(tlul_p.o.W)
+    val size = UInt(tlul_p.z.W)
+  }
+  val read_tx_info_q = Module(new Queue(new TxInfo, entries = 2))
+  val write_tx_info_q = Module(new Queue(new TxInfo, entries = 2))
+
+  // --- AXI Channel Generation ---
+  // TODO: Consider gating these signals (on get/put)? Especially address.
+  val aw_q = Module(new Queue(new AxiAddress(p.axi2AddrBits, p.axi2DataBits, p.axi2IdBits), 1))
+  val w_q = Module(new Queue(new AxiWriteData(p.axi2DataBits, p.axi2IdBits), 1))
+
+  // Issue a request only when every queue that must record it has room, so AW, W and tx-info enqueue together.
+  val put_issue = is_put && aw_q.io.enq.ready && w_q.io.enq.ready && write_tx_info_q.io.enq.ready
+  val get_issue = is_get && read_tx_info_q.io.enq.ready
+
+  // Drive AXI write channels for Put requests
+  aw_q.io.enq.valid := tl_a_q.valid && put_issue
+  aw_q.io.enq.bits.addr := tl_a_q.bits.address
+  aw_q.io.enq.bits.id := tl_a_q.bits.source
+  aw_q.io.enq.bits.len := 0.U
+  aw_q.io.enq.bits.size := tl_a_q.bits.size
+  aw_q.io.enq.bits.burst := AxiBurstType.INCR.asUInt
+  aw_q.io.enq.bits.prot := 0.U
+  aw_q.io.enq.bits.lock := 0.U
+  aw_q.io.enq.bits.cache := 0.U
+  aw_q.io.enq.bits.qos := 0.U
+  aw_q.io.enq.bits.region := 0.U
+
+  w_q.io.enq.valid := tl_a_q.valid && put_issue
+  w_q.io.enq.bits.data := tl_a_q.bits.data
+  w_q.io.enq.bits.strb := tl_a_q.bits.mask
+  w_q.io.enq.bits.last := true.B
+
+  io.axi.write.addr <> aw_q.io.deq
+  io.axi.write.data <> w_q.io.deq
+
+  // Drive AXI read channel for Get requests
+  io.axi.read.addr.valid := tl_a_q.valid && get_issue
+  io.axi.read.addr.bits.addr := tl_a_q.bits.address
+  io.axi.read.addr.bits.id   := tl_a_q.bits.source
+  io.axi.read.addr.bits.len  := 0.U // No bursting
+  io.axi.read.addr.bits.size := tl_a_q.bits.size
+  io.axi.read.addr.bits.burst := AxiBurstType.INCR.asUInt // Doesn't matter
+  io.axi.read.addr.bits.prot := 0.U // Default protection
+  io.axi.read.addr.bits.lock := 0.U
+  io.axi.read.addr.bits.cache := 0.U
+  io.axi.read.addr.bits.qos := 0.U
+  io.axi.read.addr.bits.region := 0.U
+
+  // Dequeue from TileLink queue when AXI transaction is accepted
+  tl_a_q.ready := (get_issue && io.axi.read.addr.ready) || put_issue
+
+  read_tx_info_q.io.enq.valid := tl_a_q.valid && get_issue && io.axi.read.addr.ready
+  read_tx_info_q.io.enq.bits.source := tl_a_q.bits.source
+  read_tx_info_q.io.enq.bits.size := tl_a_q.bits.size
+
+  write_tx_info_q.io.enq.valid := tl_a_q.valid && put_issue
+  write_tx_info_q.io.enq.bits.source := tl_a_q.bits.source
+  write_tx_info_q.io.enq.bits.size := tl_a_q.bits.size
+
+  // --- TileLink D-Channel (Response) Generation ---
+  val read_response = Wire(Decoupled(new TileLink_D_ChannelBase(tlul_p, userDGen)))
+  val write_response = Wire(Decoupled(new TileLink_D_ChannelBase(tlul_p, userDGen)))
+
+  // AXI Read Response -> TileLink AccessAckData
+  read_response.valid := io.axi.read.data.valid && read_tx_info_q.io.deq.valid
+  read_response.bits.opcode := TLULOpcodesD.AccessAckData.asUInt
+  read_response.bits.param := 0.U
+  read_response.bits.size := read_tx_info_q.io.deq.bits.size
+  read_response.bits.source := read_tx_info_q.io.deq.bits.source
+  read_response.bits.sink := 0.U
+  read_response.bits.data := io.axi.read.data.bits.data
+  read_response.bits.error := io.axi.read.data.bits.resp =/= 0.U
+  read_response.bits.user := 0.U.asTypeOf(read_response.bits.user)
+
+  // AXI Write Response -> TileLink AccessAck
+  write_response.valid := io.axi.write.resp.valid && write_tx_info_q.io.deq.valid
+  write_response.bits.opcode := TLULOpcodesD.AccessAck.asUInt
+  write_response.bits.param := 0.U
+  write_response.bits.size := write_tx_info_q.io.deq.bits.size
+  write_response.bits.source := write_tx_info_q.io.deq.bits.source
+  write_response.bits.sink := 0.U
+  write_response.bits.data := 0.U
+  write_response.bits.error := io.axi.write.resp.bits.resp =/= 0.U
+  write_response.bits.user := 0.U.asTypeOf(write_response.bits.user)
+
+  // Arbitrate between read and write responses for the D-channel
+  val d_channel_arb = Module(new KelvinRRArbiter(new TileLink_D_ChannelBase(tlul_p, userDGen), 2))
+  d_channel_arb.io.in(0) <> read_response
+  d_channel_arb.io.in(1) <> write_response
+  io.tl_d <> Queue(d_channel_arb.io.out, 2)
+
+  // Drive ready signals: an AXI response beat and its tx-info entry are always popped together.
+  io.axi.read.data.ready := d_channel_arb.io.in(0).ready && read_tx_info_q.io.deq.valid
+  read_tx_info_q.io.deq.ready := d_channel_arb.io.in(0).ready && io.axi.read.data.valid
+
+  io.axi.write.resp.ready := d_channel_arb.io.in(1).ready && write_tx_info_q.io.deq.valid
+  write_tx_info_q.io.deq.ready := d_channel_arb.io.in(1).ready && io.axi.write.resp.valid
+}
+
+@nowarn
+object EmitTLUL2Axi extends App {
+  val p = Parameters()
+  // Elaborate the bridge with OpenTitan user fields; extra command-line args are forwarded to ChiselStage.
+  (new ChiselStage).execute(
+    Array("--target", "systemverilog") ++ args,
+    Seq(ChiselGeneratorAnnotation(() => new TLUL2Axi(p, () => new OpenTitanTileLink_A_User, () => new OpenTitanTileLink_D_User)), FirtoolOption("-enable-layers=Verification")))
+}
\ No newline at end of file
diff --git a/hdl/chisel/src/bus/TileLinkUL.scala b/hdl/chisel/src/bus/TileLinkUL.scala
index 8570ab1..aaaab69 100644
--- a/hdl/chisel/src/bus/TileLinkUL.scala
+++ b/hdl/chisel/src/bus/TileLinkUL.scala
@@ -17,13 +17,14 @@
 import chisel3._
 import chisel3.util._
 
+import kelvin.Parameters
 import kelvin.MemoryRegion
 
-case class TLULParameters() {
-  val w = 32
-  val a = 32
-  val z = 6
-  val o = 10
+class TLULParameters(p: Parameters) {
+  val w = p.axi2DataBits / 8 // Data width in bytes: the mask is w bits wide, the data 8 * w bits.
+  val a = p.axi2AddrBits // Width of the address field.
+  val z = log2Ceil(w) // Width of the size field.
+  val o = p.axi2IdBits // Width of the source field.
   val i = 1
 }
 
@@ -40,6 +41,64 @@
   val End = Value(7.U(3.W))
 }
 
+class OpenTitanTileLink_A_User extends Bundle {
+  val rsvd = UInt(5.W) // NOTE(review): presumably reserved/unused bits — confirm.
+  val instr_type = UInt(4.W) // mubi4_t
+  val cmd_intg = UInt(7.W) // NOTE(review): looks like command integrity bits — confirm.
+  val data_intg = UInt(7.W) // NOTE(review): looks like data integrity bits — confirm.
+}
+
+class OpenTitanTileLink_D_User extends Bundle {
+  val rsp_intg = UInt(7.W) // NOTE(review): looks like response integrity bits — confirm.
+  val data_intg = UInt(7.W) // NOTE(review): looks like data integrity bits — confirm.
+}
+
+class NoUser extends Bundle {} // Empty user payload for channels that carry no user bits.
+
+class TileLink_A_ChannelBase[T <: Data](p: TLULParameters, val userGen: () => T) extends Bundle {
+  val opcode = UInt(3.W) // Request type; the bridges drive/compare TLULOpcodesA values here.
+  val param = UInt(3.W) // Driven to zero by Axi2TLUL.
+  val size = UInt(p.z.W)
+  val source = UInt(p.o.W) // Carries the AXI id in the bridges.
+  val address = UInt(p.a.W)
+  val mask = UInt(p.w.W) // One bit per data byte (p.w bits for 8 * p.w data bits).
+  val data = UInt((8 * p.w).W)
+  val user = userGen() // Caller-selected user payload type.
+}
+
+class TileLink_D_ChannelBase[T <: Data](p: TLULParameters, val userGen: () => T) extends Bundle {
+  val opcode = UInt(3.W) // Response type; the bridges drive/compare TLULOpcodesD values here.
+  val param = UInt(3.W)
+  val size = UInt(p.z.W)
+  val source = UInt(p.o.W) // Returned to AXI as the response id by Axi2TLUL.
+  val sink = UInt(p.i.W)
+  val data = UInt((8 * p.w).W)
+  val user = userGen() // Caller-selected user payload type.
+  val error = Bool() // Mapped to a non-zero AXI read response by Axi2TLUL.
+}
+
+class TileLink_A_Channel(p: TLULParameters) extends TileLink_A_ChannelBase(p, () => new NoUser) {} // A channel with an empty user field.
+class TileLink_D_Channel(p: TLULParameters) extends TileLink_D_ChannelBase(p, () => new NoUser) {} // D channel with an empty user field.
+
+class TLULHost2Device[A_USER <: Data, D_USER <: Data](p: TLULParameters, userAGen: () => A_USER, userDGen: () => D_USER) extends Bundle {
+  val a = Decoupled(new TileLink_A_ChannelBase(p, userAGen)) // A channel, produced by this side.
+  val d = Flipped(Decoupled(new TileLink_D_ChannelBase(p, userDGen))) // D channel, consumed by this side.
+}
+
+class TLULDevice2Host[A_USER <: Data, D_USER <: Data](p: TLULParameters, userAGen: () => A_USER, userDGen: () => D_USER) extends Bundle {
+  val a = Flipped(Decoupled(new TileLink_A_ChannelBase(p, userAGen))) // A channel, consumed by this side.
+  val d = Decoupled(new TileLink_D_ChannelBase(p, userDGen)) // D channel, produced by this side.
+}
+
+object OpenTitanTileLink { // Channel and link bundles specialized with the OpenTitan user field types.
+  class A_Channel(p: TLULParameters) extends TileLink_A_ChannelBase(p, () => new OpenTitanTileLink_A_User) {}
+  class D_Channel(p: TLULParameters) extends TileLink_D_ChannelBase(p, () => new OpenTitanTileLink_D_User) {}
+  class Host2Device(p: TLULParameters) extends TLULHost2Device(p, () => new OpenTitanTileLink_A_User, () => new OpenTitanTileLink_D_User) {}
+  class Device2Host(p: TLULParameters) extends TLULDevice2Host(p, () => new OpenTitanTileLink_A_User, () => new OpenTitanTileLink_D_User) {}
+}
+
+// NB: Stuff below here is for ChAI -- it's not likely that you want
+// to use these for new development.
 class TileLinkULIO_H2D(p: TLULParameters) extends Bundle {
   val a_valid = (Bool())
   val a_opcode = (UInt(3.W))
@@ -124,4 +183,4 @@
     host_d :<>= device_d
     device_a :<>= host_a
   }
-}
+}
\ No newline at end of file
diff --git a/hdl/chisel/src/bus/axi2tlul_cocotb_test.py b/hdl/chisel/src/bus/axi2tlul_cocotb_test.py
new file mode 100644
index 0000000..e6b721c
--- /dev/null
+++ b/hdl/chisel/src/bus/axi2tlul_cocotb_test.py
@@ -0,0 +1,263 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cocotb
+import enum
+import random
+
+from cocotb.clock import Clock
+from cocotb.triggers import ClockCycles, RisingEdge
+
+class TLUL_OpcodeA(enum.IntEnum):  # A-channel opcode values compared against io_tl_a_bits_opcode.
+    PutFullData    = 0  # Expected on the A channel for an AXI write.
+    PutPartialData = 1
+    Get            = 4  # Expected on the A channel for an AXI read.
+
+class TLUL_OpcodeD(enum.IntEnum):  # D-channel opcode values driven onto io_tl_d_bits_opcode.
+    AccessAck     = 0  # Response the tests send for a write.
+    AccessAckData = 1  # Response the tests send for a read.
+
+async def reset_dut(dut):
+    """Pulses reset for two cycles with the AXI-side inputs parked low, then waits two more cycles."""
+    dut.reset.value = 1
+    # Park every AXI valid/ready input the testbench drives at 0.
+    idle_inputs = ("io_axi_read_addr_valid", "io_axi_write_addr_valid", "io_axi_write_data_valid",
+                   "io_axi_write_resp_ready", "io_axi_read_data_ready")
+    for port in idle_inputs:
+        getattr(dut, port).value = 0
+    await ClockCycles(dut.clock, 2)
+    dut.reset.value = 0
+    await ClockCycles(dut.clock, 2)
+
+async def axi_send_write(dut, address, source, size, data, strb, timeout_cycles=1000):
+    """Drives one AXI write (AW and W together) and holds both valids until both are accepted.
+
+    Raises RuntimeError if the write address and write data readys are not seen high
+    on the same clock edge within timeout_cycles rising edges.
+    """
+    dut.io_axi_write_addr_bits_addr.value = address
+    dut.io_axi_write_addr_bits_id.value = source
+    dut.io_axi_write_addr_bits_size.value = size
+    dut.io_axi_write_data_bits_data.value = data
+    dut.io_axi_write_data_bits_strb.value = strb
+    dut.io_axi_write_addr_valid.value = 1
+    dut.io_axi_write_data_valid.value = 1
+    for _ in range(timeout_cycles):
+        await RisingEdge(dut.clock)
+        if dut.io_axi_write_addr_ready.value == 1 and dut.io_axi_write_data_ready.value == 1:
+            dut.io_axi_write_addr_valid.value = 0
+            dut.io_axi_write_data_valid.value = 0
+            return
+    raise RuntimeError("Timeout waiting for AXI write ready")
+
+async def axi_send_read(dut, address, source, size, timeout_cycles=1000):
+    """Drives one AXI read address beat and holds its valid until the DUT accepts it.
+
+    Raises RuntimeError if the read address ready is not seen high within timeout_cycles rising edges.
+    """
+    dut.io_axi_read_addr_bits_addr.value = address
+    dut.io_axi_read_addr_bits_id.value = source
+    dut.io_axi_read_addr_bits_size.value = size
+    dut.io_axi_read_addr_valid.value = 1
+    for _ in range(timeout_cycles):
+        await RisingEdge(dut.clock)
+        if dut.io_axi_read_addr_ready.value == 1:
+            dut.io_axi_read_addr_valid.value = 0
+            return
+    raise RuntimeError("Timeout waiting for AXI read ready")
+
+@cocotb.test()
+async def test_write_request(dut):
+    """Checks that an AXI write becomes a TileLink PutFullData and that the AccessAck returns as an AXI write response."""
+    clock = Clock(dut.clock, 10, unit="us")
+    cocotb.start_soon(clock.start())
+
+    await reset_dut(dut)
+    # Hold the TileLink side idle: A channel not ready, no D response offered.
+    dut.io_tl_a_ready.value = 0
+    dut.io_tl_d_valid.value = 0
+    # NOTE(review): widths assumed to match the DUT elaborated from Parameters() — confirm.
+    addr_width = 32
+    source_width = 6
+    data_width_bytes = 32
+    timeout_cycles = 1000
+    # The AXI size field is log2(bytes); pick a random transfer of 1..32 bytes.
+    size_power = random.randint(0, 5)
+    test_size = size_power
+    num_bytes = 2**size_power
+    # Random request fields; the strobe has its low num_bytes bits set.
+    test_addr = random.randint(0, (2**addr_width) - 1)
+    test_source = random.randint(0, (2**source_width) - 1)
+    test_data = random.randint(0, (2**(data_width_bytes*8)) - 1)
+    test_strb = (1 << num_bytes) - 1
+    # Present AW and W together and wait until the DUT has accepted both.
+    await axi_send_write(dut, address=test_addr, source=test_source, size=test_size, data=test_data, strb=test_strb, timeout_cycles=timeout_cycles)
+    # One clock edge later the request is expected on the A channel.
+    await RisingEdge(dut.clock)
+    assert dut.io_tl_a_valid.value, "TL A_VALID should be high"
+    assert dut.io_tl_a_bits_opcode.value == TLUL_OpcodeA.PutFullData, "TL A_OPCODE should be PutFullData"
+    assert dut.io_tl_a_bits_address.value == test_addr, "TL A_ADDRESS is incorrect"
+    assert dut.io_tl_a_bits_source.value == test_source, "TL A_SOURCE is incorrect"
+    assert dut.io_tl_a_bits_size.value == test_size, "TL A_SIZE is incorrect"
+    assert dut.io_tl_a_bits_data.value == test_data, "TL A_DATA is incorrect"
+    assert dut.io_tl_a_bits_mask.value == test_strb, "TL A_MASK is incorrect"
+    # Accept the A-channel request for exactly one clock edge.
+    dut.io_tl_a_ready.value = 1
+    await RisingEdge(dut.clock)
+    dut.io_tl_a_ready.value = 0
+    # Offer an AccessAck with the same source and open the AXI write response channel.
+    dut.io_axi_write_resp_ready.value = 1
+    dut.io_tl_d_valid.value = 1
+    dut.io_tl_d_bits_opcode.value = TLUL_OpcodeD.AccessAck
+    dut.io_tl_d_bits_source.value = test_source
+    # Wait for the D channel to be accepted and check the forwarded AXI write response.
+    for _ in range(timeout_cycles):
+        await RisingEdge(dut.clock)
+        if dut.io_tl_d_ready.value:
+            assert dut.io_axi_write_resp_valid.value, "AXI BVALID should be high"
+            assert dut.io_axi_write_resp_bits_id.value == test_source, "AXI BID is incorrect"
+            assert dut.io_axi_write_resp_bits_resp.value == 0, "AXI BRESP is incorrect"
+            dut.io_axi_write_resp_ready.value = 0
+            break
+    else:
+        raise RuntimeError("Timeout waiting for io_tl_d_ready")
+    # Withdraw the D response one clock edge later.
+    await RisingEdge(dut.clock)
+    dut.io_tl_d_valid.value = 0
+    # Let the DUT run idle for a few cycles before the test ends.
+    await ClockCycles(dut.clock, 5)
+
+@cocotb.test()
+async def test_read_request(dut):
+    """Checks that an AXI read becomes a TileLink Get and that an error-free AccessAckData returns as an OKAY read beat."""
+    clock = Clock(dut.clock, 10, unit="us")
+    cocotb.start_soon(clock.start())
+
+    await reset_dut(dut)
+    # Hold the TileLink side idle: A channel not ready, no D response offered.
+    dut.io_tl_a_ready.value = 0
+    dut.io_tl_d_valid.value = 0
+    # NOTE(review): widths assumed to match the DUT elaborated from Parameters() — confirm.
+    addr_width = 32
+    source_width = 6
+    data_width_bytes = 32
+    timeout_cycles = 1000
+    # The AXI size field is log2(bytes); pick a random transfer of 1..32 bytes.
+    size_power = random.randint(0, 5)
+    test_size = size_power
+    # Random request fields and the data the fake TileLink device will return.
+    test_addr = random.randint(0, (2**addr_width) - 1)
+    test_source = random.randint(0, (2**source_width) - 1)
+    test_data = random.randint(0, (2**(data_width_bytes*8)) - 1)
+    # Present the read address and wait until the DUT has accepted it.
+    await axi_send_read(dut, address=test_addr, source=test_source, size=test_size, timeout_cycles=timeout_cycles)
+    # One clock edge later the request is expected on the A channel.
+    await RisingEdge(dut.clock)
+    assert dut.io_tl_a_valid.value, "TL A_VALID should be high"
+    assert dut.io_tl_a_bits_opcode.value == TLUL_OpcodeA.Get, "TL A_OPCODE should be Get"
+    assert dut.io_tl_a_bits_address.value == test_addr, "TL A_ADDRESS is incorrect"
+    assert dut.io_tl_a_bits_source.value == test_source, "TL A_SOURCE is incorrect"
+    assert dut.io_tl_a_bits_size.value == test_size, "TL A_SIZE is incorrect"
+    # Accept the A-channel request for exactly one clock edge.
+    dut.io_tl_a_ready.value = 1
+    await RisingEdge(dut.clock)
+    dut.io_tl_a_ready.value = 0
+    # Offer the AccessAckData; error is driven low explicitly because RRESP is derived from it.
+    await RisingEdge(dut.clock)
+    dut.io_tl_d_valid.value = 1
+    dut.io_tl_d_bits_opcode.value = TLUL_OpcodeD.AccessAckData
+    dut.io_tl_d_bits_source.value = test_source
+    dut.io_tl_d_bits_data.value = test_data
+    dut.io_tl_d_bits_error.value = 0
+    dut.io_axi_read_data_ready.value = 1
+    # Wait for the D channel to be accepted and check the forwarded AXI read beat.
+    for _ in range(timeout_cycles):
+        await RisingEdge(dut.clock)
+        if dut.io_tl_d_ready.value:
+            assert dut.io_axi_read_data_valid.value, "AXI RVALID should be high"
+            assert dut.io_axi_read_data_bits_id.value == test_source, "AXI RID is incorrect"
+            assert dut.io_axi_read_data_bits_data.value == test_data, "AXI RDATA is incorrect"
+            assert dut.io_axi_read_data_bits_resp.value == 0, "AXI RRESP is incorrect"
+            dut.io_axi_read_data_ready.value = 0
+            break
+    else:
+        raise RuntimeError("Timeout waiting for io_tl_d_ready")
+    # Withdraw the D response one clock edge later.
+    await RisingEdge(dut.clock)
+    dut.io_tl_d_valid.value = 0
+    # Let the DUT run idle for a few cycles before the test ends.
+    await ClockCycles(dut.clock, 5)
+
+
+@cocotb.test()
+async def test_read_error(dut):
+    """Checks that an AccessAckData with error set comes back as an AXI read beat with RRESP == 2."""
+    clock = Clock(dut.clock, 10, unit="us")
+    cocotb.start_soon(clock.start())
+
+    await reset_dut(dut)
+    # Hold the TileLink side idle: A channel not ready, no D response offered.
+    dut.io_tl_a_ready.value = 0
+    dut.io_tl_d_valid.value = 0
+    # NOTE(review): widths assumed to match the DUT elaborated from Parameters() — confirm.
+    addr_width = 32
+    source_width = 6
+    data_width_bytes = 32
+    timeout_cycles = 1000
+    # The AXI size field is log2(bytes); pick a random transfer of 1..32 bytes.
+    size_power = random.randint(0, 5)
+    test_size = size_power
+    # Random request fields and the data the fake TileLink device will return.
+    test_addr = random.randint(0, (2**addr_width) - 1)
+    test_source = random.randint(0, (2**source_width) - 1)
+    test_data = random.randint(0, (2**(data_width_bytes*8)) - 1)
+    # Present the read address and wait until the DUT has accepted it.
+    await axi_send_read(dut, address=test_addr, source=test_source, size=test_size, timeout_cycles=timeout_cycles)
+    # One clock edge later the request is expected on the A channel.
+    await RisingEdge(dut.clock)
+    assert dut.io_tl_a_valid.value, "TL A_VALID should be high"
+    assert dut.io_tl_a_bits_opcode.value == TLUL_OpcodeA.Get, "TL A_OPCODE should be Get"
+    assert dut.io_tl_a_bits_address.value == test_addr, "TL A_ADDRESS is incorrect"
+    assert dut.io_tl_a_bits_source.value == test_source, "TL A_SOURCE is incorrect"
+    assert dut.io_tl_a_bits_size.value == test_size, "TL A_SIZE is incorrect"
+    # Accept the A-channel request for exactly one clock edge.
+    dut.io_tl_a_ready.value = 1
+    await RisingEdge(dut.clock)
+    dut.io_tl_a_ready.value = 0
+    # Offer an AccessAckData response with the error flag raised.
+    await RisingEdge(dut.clock)
+    dut.io_tl_d_valid.value = 1
+    dut.io_tl_d_bits_opcode.value = TLUL_OpcodeD.AccessAckData
+    dut.io_tl_d_bits_source.value = test_source
+    dut.io_tl_d_bits_data.value = test_data
+    dut.io_tl_d_bits_error.value = 1
+    # Open the AXI read data channel so the response can be forwarded.
+    dut.io_axi_read_data_ready.value = 1
+    # Wait for the D channel to be accepted and check that the error reached RRESP.
+    for _ in range(timeout_cycles):
+        await RisingEdge(dut.clock)
+        if dut.io_tl_d_ready.value:
+            assert dut.io_axi_read_data_valid.value, "AXI RVALID should be high"
+            assert dut.io_axi_read_data_bits_id.value == test_source, "AXI RID is incorrect"
+            assert dut.io_axi_read_data_bits_data.value == test_data, "AXI RDATA is incorrect"
+            assert dut.io_axi_read_data_bits_resp.value == 2, "AXI RRESP is incorrect"
+            dut.io_axi_read_data_ready.value = 0
+            break
+    else:
+        raise RuntimeError("Timeout waiting for io_tl_d_ready")
+    # Withdraw the D response one clock edge later.
+    await RisingEdge(dut.clock)
+    dut.io_tl_d_valid.value = 0
+    # Let the DUT run idle for a few cycles before the test ends.
+    await ClockCycles(dut.clock, 5)
diff --git a/hdl/chisel/src/bus/tlul2axi_cocotb_test.py b/hdl/chisel/src/bus/tlul2axi_cocotb_test.py
new file mode 100644
index 0000000..e50b25b
--- /dev/null
+++ b/hdl/chisel/src/bus/tlul2axi_cocotb_test.py
@@ -0,0 +1,420 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cocotb
+import enum
+import random
+
+from cocotb.clock import Clock
+from cocotb.triggers import ClockCycles, RisingEdge
+
+class TLUL_OpcodeA(enum.IntEnum):  # TL-UL channel A (request) opcodes
+    PutFullData    = 0  # write; driven by tl_send_put()
+    PutPartialData = 1  # not driven by any helper in this file
+    Get            = 4  # read; driven by tl_send_get()
+
+class TLUL_OpcodeD(enum.IntEnum):  # TL-UL channel D (response) opcodes
+    AccessAck     = 0  # response the tests expect for a Put
+    AccessAckData = 1  # response the tests expect for a Get
+
+async def reset_dut(dut):
+    """Applies reset to the DUT."""
+    dut.reset.value = 1
+    dut.io_tl_a_valid.value = 0
+    dut.io_tl_d_ready.value = 0
+    dut.io_tl_a_bits_opcode.value = 0
+    dut.io_tl_a_bits_param.value = 0
+    dut.io_tl_a_bits_size.value = 0
+    dut.io_tl_a_bits_source.value = 0
+    dut.io_tl_a_bits_address.value = 0
+    dut.io_tl_a_bits_mask.value = 0
+    dut.io_tl_a_bits_data.value = 0
+    await ClockCycles(dut.clock, 2)
+    dut.reset.value = 0
+    await ClockCycles(dut.clock, 2)
+
+async def wait_for_signal(clock, signal, timeout_cycles=1000, message=None):
+    """Waits for a signal to be asserted."""
+    if message is None:
+        message = f"Timeout waiting for {signal._name}"
+
+    for _ in range(timeout_cycles):
+        await RisingEdge(clock)
+        if signal.value:
+            return
+    else:
+        raise RuntimeError(message)
+
+async def tl_send_get(dut, address, source, size, timeout_cycles=1000):
+    """Sends a TileLink Get request."""
+    dut.io_tl_a_valid.value = 1
+    dut.io_tl_a_bits_opcode.value = TLUL_OpcodeA.Get
+    dut.io_tl_a_bits_address.value = address
+    dut.io_tl_a_bits_source.value = source
+    dut.io_tl_a_bits_size.value = size
+    dut.io_tl_a_bits_mask.value = 0  # Mask is ignored for Get
+    dut.io_tl_a_bits_data.value = 0  # Data is ignored for Get
+
+    await wait_for_signal(dut.clock, dut.io_tl_a_ready, timeout_cycles)
+
+    dut.io_tl_a_valid.value = 0
+
+
+async def tl_send_put(dut, address, source, size, data, mask, timeout_cycles=1000):
+    """Sends a TileLink PutFullData request."""
+    dut.io_tl_a_valid.value = 1
+    dut.io_tl_a_bits_opcode.value = TLUL_OpcodeA.PutFullData
+    dut.io_tl_a_bits_address.value = address
+    dut.io_tl_a_bits_source.value = source
+    dut.io_tl_a_bits_size.value = size
+    dut.io_tl_a_bits_data.value = data
+    dut.io_tl_a_bits_mask.value = mask
+
+    await wait_for_signal(dut.clock, dut.io_tl_a_ready, timeout_cycles)
+
+    dut.io_tl_a_valid.value = 0
+
+
+@cocotb.test()
+async def test_put_request(dut):
+    """Tests a simple Put request."""
+    clock = Clock(dut.clock, 10, unit="us")
+    cocotb.start_soon(clock.start())
+
+    # Reset (reset_dut also idles every TileLink input)
+    await reset_dut(dut)
+
+    # AXI slave initial state: accept nothing, present no responses
+    dut.io_axi_read_addr_ready.value = 0
+    dut.io_axi_write_addr_ready.value = 0
+    dut.io_axi_write_data_ready.value = 0
+    dut.io_axi_write_resp_valid.value = 0
+    dut.io_axi_read_data_valid.value = 0
+
+    # Test parameters (presumably these match the DUT's configured widths -- TODO confirm)
+    addr_width = 32
+    source_width = 6
+    data_width_bytes = 32 # Corresponds to 256 bits
+    timeout_cycles = 1000
+
+    size_power = random.randint(0, 5) # 2**5 = 32 bytes
+    test_size = size_power
+    num_bytes = 2**size_power
+    # NOTE(review): test_addr is drawn without aligning it to num_bytes -- confirm that is intended.
+    test_addr = random.randint(0, (2**addr_width) - 1)
+    test_source = random.randint(0, (2**source_width) - 1)
+    test_data = random.randint(0, (2**(data_width_bytes*8)) - 1)
+    test_mask = (1 << num_bytes) - 1  # one bit per byte, low num_bytes lanes set
+
+    # Drive TL Put request (returns once a_ready has been observed)
+    await tl_send_put(dut, address=test_addr, source=test_source, size=test_size, data=test_data, mask=test_mask, timeout_cycles=timeout_cycles)
+
+    # Check AXI Write Address and Data Channels:
+    # AW and W must both be valid and carry the TL address, source, size,
+    # data and mask unchanged.
+    await wait_for_signal(dut.clock, dut.io_axi_write_addr_valid, timeout_cycles, "Timeout waiting for AXI AWVALID for Put")
+
+    assert dut.io_axi_write_addr_valid.value, "AXI AWVALID should be high"
+    assert dut.io_axi_write_data_valid.value, "AXI WVALID should be high"
+    assert dut.io_axi_write_addr_bits_addr.value == test_addr, "AXI AWADDR is incorrect"
+    assert dut.io_axi_write_addr_bits_id.value == test_source, "AXI AWID is incorrect"
+    assert dut.io_axi_write_addr_bits_size.value == test_size, "AXI AWSIZE is incorrect"
+    assert dut.io_axi_write_data_bits_data.value == test_data, "AXI WDATA is incorrect"
+    assert dut.io_axi_write_data_bits_strb.value == test_mask, "AXI WSTRB is incorrect"
+
+    # AXI slave accepts the request (AW and W ready for one clock)
+    dut.io_axi_write_addr_ready.value = 1
+    dut.io_axi_write_data_ready.value = 1
+    await RisingEdge(dut.clock)
+    dut.io_axi_write_addr_ready.value = 0
+    dut.io_axi_write_data_ready.value = 0
+
+    # AXI slave provides write response:
+    # one idle clock, then B is presented with the request's id and an
+    # OKAY response until the DUT raises BREADY.
+    await RisingEdge(dut.clock)
+    dut.io_axi_write_resp_valid.value = 1
+    dut.io_axi_write_resp_bits_id.value = test_source
+    dut.io_axi_write_resp_bits_resp.value = 0  # OKAY
+
+    await wait_for_signal(dut.clock, dut.io_axi_write_resp_ready, timeout_cycles)
+    # NOTE(review): BVALID stays high for one more clock after BREADY is seen -- confirm no second beat is taken.
+    await RisingEdge(dut.clock)
+    dut.io_axi_write_resp_valid.value = 0
+
+    # Check TileLink D Channel:
+    # the response must be an error-free AccessAck carrying the
+    # source of the original Put.
+    dut.io_tl_d_ready.value = 1
+    await wait_for_signal(dut.clock, dut.io_tl_d_valid, timeout_cycles, "Timeout waiting for TL D_VALID for Put")
+
+    assert dut.io_tl_d_valid.value, "TL D_VALID should be high"
+    assert dut.io_tl_d_bits_opcode.value == TLUL_OpcodeD.AccessAck, "TL D_OPCODE should be AccessAck"
+    assert dut.io_tl_d_bits_source.value == test_source, "TL D_SOURCE is incorrect"
+    assert not dut.io_tl_d_bits_error.value, "TL D_ERROR should be low"
+    dut.io_tl_d_ready.value = 0
+
+    await ClockCycles(dut.clock, 5)
+
+@cocotb.test()
+async def test_get_request(dut):
+    """Tests a simple Get request."""
+    clock = Clock(dut.clock, 10, unit="us")
+    cocotb.start_soon(clock.start())
+
+    # Reset (reset_dut also idles every TileLink input)
+    await reset_dut(dut)
+
+    # AXI slave initial state: accept nothing, present no responses
+    dut.io_axi_read_addr_ready.value = 0
+    dut.io_axi_write_addr_ready.value = 0
+    dut.io_axi_write_data_ready.value = 0
+    dut.io_axi_write_resp_valid.value = 0
+    dut.io_axi_read_data_valid.value = 0
+
+    # Test parameters (presumably these match the DUT's configured widths -- TODO confirm)
+    addr_width = 32
+    source_width = 6
+    data_width_bytes = 32
+    timeout_cycles = 1000
+
+    size_power = random.randint(0, 5)
+    test_size = size_power
+    # NOTE(review): test_addr is drawn without aligning it to 2**test_size -- confirm that is intended.
+    test_addr = random.randint(0, (2**addr_width) - 1)
+    test_source = random.randint(0, (2**source_width) - 1)
+    test_data = random.randint(0, (2**(data_width_bytes*8)) - 1)
+
+    # Drive TL Get request (returns once a_ready has been observed)
+    await tl_send_get(dut, address=test_addr, source=test_source, size=test_size, timeout_cycles=timeout_cycles)
+
+    # Check AXI Read Address Channel:
+    # ARVALID is expected exactly one clock after the Get was accepted
+    # (no polling here), with the TL address, source and size unchanged.
+    await RisingEdge(dut.clock)
+    assert dut.io_axi_read_addr_valid.value, "AXI ARVALID should be high"
+    assert dut.io_axi_read_addr_bits_addr.value == test_addr, "AXI ARADDR is incorrect"
+    assert dut.io_axi_read_addr_bits_id.value == test_source, "AXI ARID is incorrect"
+    assert dut.io_axi_read_addr_bits_size.value == test_size, "AXI ARSIZE is incorrect"
+
+    # AXI slave accepts the request (ARREADY high for one clock)
+    dut.io_axi_read_addr_ready.value = 1
+    await RisingEdge(dut.clock)
+    dut.io_axi_read_addr_ready.value = 0
+
+    # AXI slave provides read data:
+    # one idle clock, then R is presented with the request's id, the
+    # random payload and an OKAY response until the DUT raises RREADY.
+    await RisingEdge(dut.clock)
+    dut.io_axi_read_data_valid.value = 1
+    dut.io_axi_read_data_bits_data.value = test_data
+    dut.io_axi_read_data_bits_id.value = test_source
+    dut.io_axi_read_data_bits_resp.value = 0  # OKAY
+
+    await wait_for_signal(dut.clock, dut.io_axi_read_data_ready, timeout_cycles)
+    # NOTE(review): RVALID stays high for one more clock after RREADY is seen -- confirm no second beat is taken.
+    await RisingEdge(dut.clock)
+    dut.io_axi_read_data_valid.value = 0
+
+    # Check TileLink D Channel:
+    # D_VALID is sampled after a single clock (no polling); the response
+    # must be an error-free AccessAckData with the request's source and data.
+    dut.io_tl_d_ready.value = 1
+    await RisingEdge(dut.clock)
+
+    assert dut.io_tl_d_valid.value, "TL D_VALID should be high"
+    assert dut.io_tl_d_bits_opcode.value == TLUL_OpcodeD.AccessAckData, "TL D_OPCODE should be AccessAckData"
+    assert dut.io_tl_d_bits_source.value == test_source, "TL D_SOURCE is incorrect"
+    assert dut.io_tl_d_bits_data.value == test_data, "TL D_DATA is incorrect"
+    assert not dut.io_tl_d_bits_error.value, "TL D_ERROR should be low"
+    dut.io_tl_d_ready.value = 0
+
+    await ClockCycles(dut.clock, 5)
+
+
+@cocotb.test()
+async def test_backpressure(dut):
+    """Tests backpressure from the AXI slave."""
+    clock = Clock(dut.clock, 10, unit="us")
+    cocotb.start_soon(clock.start())
+
+    # Reset (reset_dut also idles every TileLink input)
+    await reset_dut(dut)
+
+    # AXI slave initial state: accept nothing, present no responses
+    dut.io_axi_read_addr_ready.value = 0
+    dut.io_axi_write_addr_ready.value = 0
+    dut.io_axi_write_data_ready.value = 0
+    dut.io_axi_write_resp_valid.value = 0
+    dut.io_axi_read_data_valid.value = 0
+
+    # Test parameters (presumably these match the DUT's configured widths -- TODO confirm)
+    addr_width = 32
+    source_width = 6
+    data_width_bytes = 32 # Corresponds to 256 bits
+    timeout_cycles = 1000
+
+    size_power = random.randint(0, 5) # 2**5 = 32 bytes
+    test_size = size_power
+    num_bytes = 2**size_power
+    # NOTE(review): test_addr is drawn without aligning it to num_bytes -- confirm that is intended.
+    test_addr = random.randint(0, (2**addr_width) - 1)
+    test_source = random.randint(0, (2**source_width) - 1)
+    test_data = random.randint(0, (2**(data_width_bytes*8)) - 1)
+    test_mask = (1 << num_bytes) - 1  # one bit per byte, low num_bytes lanes set
+
+    # Drive TL Put request (returns once a_ready has been observed)
+    await tl_send_put(dut, address=test_addr, source=test_source, size=test_size, data=test_data, mask=test_mask, timeout_cycles=timeout_cycles)
+
+    # Check AXI Write Address and Data Channels:
+    # both AW and W must be presented before any backpressure is applied
+    # (payload contents are not re-checked in this test).
+    await wait_for_signal(dut.clock, dut.io_axi_write_addr_valid, timeout_cycles, "Timeout waiting for AXI AWVALID for Put")
+
+    assert dut.io_axi_write_addr_valid.value, "AXI AWVALID should be high"
+    assert dut.io_axi_write_data_valid.value, "AXI WVALID should be high"
+
+    # Apply backpressure to address channel only: W is accepted while AW is stalled
+    dut.io_axi_write_addr_ready.value = 0
+    dut.io_axi_write_data_ready.value = 1
+
+    await ClockCycles(dut.clock, 10)
+
+    # Address channel should be stalled, data channel should have cleared
+    assert dut.io_axi_write_addr_valid.value, "AXI AWVALID should remain high"
+    assert not dut.io_axi_write_data_valid.value, "AXI WVALID should be low"
+
+    # Release backpressure: accept AW for one clock, then drop both readies
+    dut.io_axi_write_addr_ready.value = 1
+    await RisingEdge(dut.clock)
+    dut.io_axi_write_addr_ready.value = 0
+    dut.io_axi_write_data_ready.value = 0
+
+    # AXI slave provides write response:
+    # one idle clock, then B is presented with the request's id and an
+    # OKAY response until the DUT raises BREADY.
+    await RisingEdge(dut.clock)
+    dut.io_axi_write_resp_valid.value = 1
+    dut.io_axi_write_resp_bits_id.value = test_source
+    dut.io_axi_write_resp_bits_resp.value = 0  # OKAY
+
+    await wait_for_signal(dut.clock, dut.io_axi_write_resp_ready, timeout_cycles)
+    # NOTE(review): BVALID stays high for one more clock after BREADY is seen -- confirm no second beat is taken.
+    await RisingEdge(dut.clock)
+    dut.io_axi_write_resp_valid.value = 0
+
+    # Check TileLink D Channel:
+    # the stalled Put must still complete with an error-free AccessAck
+    # carrying the original source.
+    dut.io_tl_d_ready.value = 1
+    await wait_for_signal(dut.clock, dut.io_tl_d_valid, timeout_cycles, "Timeout waiting for TL D_VALID for Put")
+
+    assert dut.io_tl_d_valid.value, "TL D_VALID should be high"
+    assert dut.io_tl_d_bits_opcode.value == TLUL_OpcodeD.AccessAck, "TL D_OPCODE should be AccessAck"
+    assert dut.io_tl_d_bits_source.value == test_source, "TL D_SOURCE is incorrect"
+    assert not dut.io_tl_d_bits_error.value, "TL D_ERROR should be low"
+    dut.io_tl_d_ready.value = 0
+
+    await ClockCycles(dut.clock, 5)
+
+
+@cocotb.test()
+async def test_put_then_get(dut):
+    """Tests a Put request followed by a Get request."""
+    clock = Clock(dut.clock, 10, unit="us")
+    cocotb.start_soon(clock.start())
+
+    # Reset
+    await reset_dut(dut)
+
+    # AXI slave initial state
+    dut.io_axi_read_addr_ready.value = 0
+    dut.io_axi_write_addr_ready.value = 0
+    dut.io_axi_write_data_ready.value = 0
+    dut.io_axi_write_resp_valid.value = 0
+    dut.io_axi_read_data_valid.value = 0
+
+    # Test parameters
+    addr_width = 32
+    source_width = 6
+    data_width_bytes = 32
+    timeout_cycles = 1000
+
+    # Put parameters
+    put_size_power = random.randint(0, 5)
+    put_size = put_size_power
+    put_num_bytes = 2**put_size_power
+    put_addr = random.randint(0, (2**addr_width) - 1)
+    put_source = random.randint(0, (2**source_width) - 1)
+    put_data = random.randint(0, (2**(data_width_bytes*8)) - 1)
+    put_mask = (1 << put_num_bytes) - 1
+
+    # Get parameters
+    get_size_power = random.randint(0, 5)
+    get_size = get_size_power
+    get_addr = random.randint(0, (2**addr_width) - 1)
+    get_source = random.randint(0, (2**source_width) - 1)
+    get_data = random.randint(0, (2**(data_width_bytes*8)) - 1)
+    
+    #
+    # Complete Put Transaction
+    #
+    await tl_send_put(dut, address=put_addr, source=put_source, size=put_size, data=put_data, mask=put_mask, timeout_cycles=timeout_cycles)
+    
+    await wait_for_signal(dut.clock, dut.io_axi_write_addr_valid, timeout_cycles, "Timeout waiting for AXI AWVALID for Put")
+    assert dut.io_axi_write_addr_valid.value, "AXI AWVALID should be high for Put"
+    assert dut.io_axi_write_data_valid.value, "AXI WVALID should be high for Put"
+    
+    dut.io_axi_write_addr_ready.value = 1
+    dut.io_axi_write_data_ready.value = 1
+    await RisingEdge(dut.clock)
+    dut.io_axi_write_addr_ready.value = 0
+    dut.io_axi_write_data_ready.value = 0
+
+    await RisingEdge(dut.clock)
+    dut.io_axi_write_resp_valid.value = 1
+    await wait_for_signal(dut.clock, dut.io_axi_write_resp_ready, timeout_cycles)
+    dut.io_axi_write_resp_valid.value = 0
+
+    dut.io_tl_d_ready.value = 1
+    await RisingEdge(dut.clock)
+    assert dut.io_tl_d_valid.value, "TL D_VALID should be high for Put"
+    assert dut.io_tl_d_bits_opcode.value == TLUL_OpcodeD.AccessAck, "TL D_OPCODE should be AccessAck for Put"
+    dut.io_tl_d_ready.value = 0
+
+    #
+    # Complete Get Transaction
+    #
+    await tl_send_get(dut, address=get_addr, source=get_source, size=get_size, timeout_cycles=timeout_cycles)
+    
+    await RisingEdge(dut.clock)
+    assert dut.io_axi_read_addr_valid.value, "AXI ARVALID should be high for Get"
+    
+    dut.io_axi_read_addr_ready.value = 1
+    await RisingEdge(dut.clock)
+    dut.io_axi_read_addr_ready.value = 0
+
+    await RisingEdge(dut.clock)
+    dut.io_axi_read_data_valid.value = 1
+    dut.io_axi_read_data_bits_data.value = get_data
+    await wait_for_signal(dut.clock, dut.io_axi_read_data_ready, timeout_cycles)
+    dut.io_axi_read_data_valid.value = 0
+
+    dut.io_tl_d_ready.value = 1
+    await RisingEdge(dut.clock)
+    assert dut.io_tl_d_valid.value, "TL D_VALID should be high for Get"
+    assert dut.io_tl_d_bits_opcode.value == TLUL_OpcodeD.AccessAckData, "TL D_OPCODE should be AccessAckData for Get"
+    dut.io_tl_d_ready.value = 0
+
+    await ClockCycles(dut.clock, 5)
\ No newline at end of file
diff --git a/hdl/chisel/src/chai/ChAI.scala b/hdl/chisel/src/chai/ChAI.scala
index 27fec09..aed720a 100644
--- a/hdl/chisel/src/chai/ChAI.scala
+++ b/hdl/chisel/src/chai/ChAI.scala
@@ -80,13 +80,13 @@
   io.fault := u_kelvin.fault
 
   withClockAndReset(io.clk_i, rst_i) {
-    val tlul_p = new TLULParameters()
+    val tlul_p = new TLULParameters(kelvin_p)
     val kelvin_to_tlul = KelvinToTlul(tlul_p, kelvin_p)
     kelvin_to_tlul.io.kelvin <> u_kelvin.mem
 
     val tlul_sram =
       SRAM(p.sramDataEntries(), UInt(p.sramDataBits.W), p.sramReadPorts, p.sramWritePorts, p.sramReadWritePorts)
-    val tlul_adapter_sram = Module(new chai.TlulAdapterSram())
+    val tlul_adapter_sram = Module(new chai.TlulAdapterSram(tlul_p))
     tlul_adapter_sram.io.clk_i := io.clk_i
     tlul_adapter_sram.io.rst_ni := io.rst_ni
     tlul_adapter_sram.io.en_ifetch_i := 9.U // MuBi4False
diff --git a/hdl/chisel/src/chai/TlulAdapterSram.scala b/hdl/chisel/src/chai/TlulAdapterSram.scala
index db2dcdf..ac204c4 100644
--- a/hdl/chisel/src/chai/TlulAdapterSram.scala
+++ b/hdl/chisel/src/chai/TlulAdapterSram.scala
@@ -18,6 +18,8 @@
 
 import bus._
 
+
+
 package object sram_params {
   val SramAw = 17
   val SramDw = 256
@@ -27,15 +29,14 @@
   val EnableDataIntgPt = 0
 }
 
-class TlulAdapterSram extends BlackBox {
-  val tlul_p = new TLULParameters()
+class TlulAdapterSram(p: TLULParameters) extends BlackBox {
   val io = IO(new Bundle {
     val clk_i = Input(Clock())
     val rst_ni = Input(AsyncReset())
 
     // TL-UL
-    val tl_i = Input(new TileLinkULIO_H2D(tlul_p))
-    val tl_o = Output(new TileLinkULIO_D2H(tlul_p))
+    val tl_i = Input(new TileLinkULIO_H2D(p))
+    val tl_o = Output(new TileLinkULIO_D2H(p))
 
     // control
     val en_ifetch_i = Input(UInt(4.W)) // mubi4_t
diff --git a/hdl/chisel/src/chai/Uart.scala b/hdl/chisel/src/chai/Uart.scala
index 2e65da9..e81cd94 100644
--- a/hdl/chisel/src/chai/Uart.scala
+++ b/hdl/chisel/src/chai/Uart.scala
@@ -18,13 +18,13 @@
 
 import bus._
 
-class Uart(tlul_p: TLULParameters) extends BlackBox {
+class Uart(p: TLULParameters) extends BlackBox {
   val io = IO(new Bundle {
     val clk_i = Input(Clock())
     val rst_ni = Input(AsyncReset())
 
-    val tl_i = Input(new TileLinkULIO_H2D(tlul_p))
-    val tl_o = Output(new TileLinkULIO_D2H(tlul_p))
+    val tl_i = Input(new TileLinkULIO_H2D(p))
+    val tl_o = Output(new TileLinkULIO_D2H(p))
 
     // These have some alert_{rx|tx}_t types.
     val alert_rx_i = Input(UInt(4.W))
diff --git a/hdl/chisel/src/kelvin/BUILD b/hdl/chisel/src/kelvin/BUILD
index e8ad752..51ee1e7 100644
--- a/hdl/chisel/src/kelvin/BUILD
+++ b/hdl/chisel/src/kelvin/BUILD
@@ -359,6 +359,7 @@
         "Core.scala",
         "CoreAxi.scala",
         "CoreAxiCSR.scala",
+        "CoreTlul.scala",
         "Fabric.scala",
         "TCM.scala",
         "SRAM.scala",
@@ -571,6 +572,24 @@
 template_rule(
     chisel_cc_library,
     {
+        "rvv_core_mini_tlul_cc_library": {
+            "verilog_file_path": "RvvCoreMiniTlul.sv",
+            "extra_outs": [
+                "VRvvCoreMiniTlul_parameters.h",
+                "RvvCoreMiniTlul.zip",
+            ],
+            "gen_flags": [
+                "--moduleName=RvvCoreMini",
+                "--enableFetchL0=False",
+                "--fetchDataBits=128",
+                "--lsuDataBits=128",
+                "--enableVector=False",
+                "--enableRvv=True",
+                "--enableFloat=True",
+                "--useTlul=True",
+            ],
+            "module_name": "RvvCoreMiniTlul",
+        },
         "rvv_core_mini_axi_cc_library": {
             "verilog_file_path": "RvvCoreMiniAxi.sv",
             "extra_outs": [
@@ -627,6 +646,11 @@
     ],
 )
 
+filegroup(
+    name = "rvv_core_mini_tlul_verilog",
+    srcs = [":rvv_core_mini_tlul_cc_library_verilog"],
+)
+
 verilog_zip_bundle(
     name = "core_mini_axi_bundle",
     lib = ":core_mini_axi_cc_library_verilog",
diff --git a/hdl/chisel/src/kelvin/Core.scala b/hdl/chisel/src/kelvin/Core.scala
index 45b5766..9410810 100644
--- a/hdl/chisel/src/kelvin/Core.scala
+++ b/hdl/chisel/src/kelvin/Core.scala
@@ -110,6 +110,7 @@
   var chiselArgs = List[String]()
   var targetDir: Option[String] = None
   var useAxi = false
+  var useTlul = false
   for (arg <- args) {
     if (arg.startsWith("--enableFetchL0")) {
       p.enableFetchL0 = arg.split("=")(1).toBoolean
@@ -135,12 +136,15 @@
       p.tcmHighmem = true
     } else if (arg.startsWith("--useAxi")) {
       useAxi = true
+    } else if (arg.startsWith("--useTlul")) {
+      useTlul = true
     } else if (arg.startsWith("--target-dir")) {
       targetDir = Some(arg.split("=")(1))
     } else {
       chiselArgs = chiselArgs :+ arg
     }
   }
+  assert(!(useAxi && useTlul), "--useAxi and --useTlul are mutually exclusive")
 
   // The core module must be created in the ChiselStage context. Use lazy here
   // so it's created in ChiselStage, but referencable afterwards.
@@ -161,7 +165,15 @@
       )
       p.m = memoryRegions
     }
-      new CoreAxi(p, moduleName)
+    new CoreAxi(p, moduleName)
+  } else if (useTlul) {
+    p.m = Seq(
+      new MemoryRegion(0x0000, 0x2000, MemoryRegionType.IMEM), // ITCM
+      new MemoryRegion(0x10000, 0x8000, MemoryRegionType.DMEM), // DTCM
+      new MemoryRegion(0x30000, 0x2000, MemoryRegionType.Peripheral), // CSR
+    )
+    // The memory map must be in place before the core is constructed.
+    new CoreTlul(p, moduleName)
   } else {
     // "Matcha" memory layout
     p.m = Seq(
diff --git a/hdl/chisel/src/kelvin/CoreTlul.scala b/hdl/chisel/src/kelvin/CoreTlul.scala
new file mode 100644
index 0000000..7c41c55
--- /dev/null
+++ b/hdl/chisel/src/kelvin/CoreTlul.scala
@@ -0,0 +1,59 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kelvin
+
+import chisel3._
+import bus._
+
+class CoreTlul(p: Parameters, coreModuleName: String) extends RawModule {
+    override val desiredName = coreModuleName + "Tlul"
+    val memoryRegions = p.m
+    val io = IO(new Bundle {
+        val clk = Input(Clock())
+        val rst_ni = Input(AsyncReset())
+        // tl_host is bridged from the core's AXI master; tl_device is bridged onto its AXI slave.
+        val tl_host = new OpenTitanTileLink.Host2Device(new TLULParameters(p))
+        val tl_device = new OpenTitanTileLink.Device2Host(new TLULParameters(p))
+
+        // Core status interrupts
+        val halted = Output(Bool())
+        val fault = Output(Bool())
+        val wfi = Output(Bool())
+        val irq = Input(Bool())
+        val te = Input(Bool())
+    })
+    dontTouch(io)
+    // NOTE(review): CoreAxi gets io.rst_ni un-inverted as implicit reset while both bridges get it inverted -- presumably CoreAxi only uses io.aresetn; confirm.
+    val coreAxi = withClockAndReset(io.clk, io.rst_ni) { Module(new CoreAxi(p, coreModuleName)) }
+    val hostBridge = withClockAndReset(io.clk, (!io.rst_ni.asBool).asAsyncReset) { Module(new Axi2TLUL(p, () => new OpenTitanTileLink_A_User, () => new OpenTitanTileLink_D_User)) }
+    val deviceBridge = withClockAndReset(io.clk, (!io.rst_ni.asBool).asAsyncReset) { Module(new TLUL2Axi(p, () => new OpenTitanTileLink_A_User, () => new OpenTitanTileLink_D_User)) }
+    // Clock, reset and sideband signals are wired straight through to the wrapped core.
+    coreAxi.io.aclk := io.clk
+    coreAxi.io.aresetn := io.rst_ni
+    coreAxi.io.te := io.te
+    coreAxi.io.irq := io.irq
+    io.wfi := coreAxi.io.wfi
+    io.fault := coreAxi.io.fault
+    io.halted := coreAxi.io.halted
+    // AXI side: core master -> Axi2TLUL, TLUL2Axi -> core slave.
+    hostBridge.io.axi <> coreAxi.io.axi_master
+    deviceBridge.io.axi <> coreAxi.io.axi_slave
+    // Host-side TileLink: channel A and D of the Axi2TLUL bridge.
+    io.tl_host.a <> hostBridge.io.tl_a
+    hostBridge.io.tl_d <> io.tl_host.d
+    // Device-side TileLink: channel A and D of the TLUL2Axi bridge.
+    deviceBridge.io.tl_a <> io.tl_device.a
+    io.tl_device.d <> deviceBridge.io.tl_d
+}
\ No newline at end of file
diff --git a/hdl/verilog/TlulAdapterSram.sv b/hdl/verilog/TlulAdapterSram.sv
index e1a295c..7d07623 100644
--- a/hdl/verilog/TlulAdapterSram.sv
+++ b/hdl/verilog/TlulAdapterSram.sv
@@ -20,8 +20,8 @@
   input tl_i_a_valid,
   input [2:0] tl_i_a_opcode,
   input [2:0] tl_i_a_param,
-  input [5:0] tl_i_a_size,
-  input [9:0] tl_i_a_source,
+  input [4:0] tl_i_a_size,
+  input [5:0] tl_i_a_source,
   input [31:0] tl_i_a_address,
   input [31:0] tl_i_a_mask,
   input [255:0] tl_i_a_data,
@@ -38,8 +38,8 @@
   output tl_o_d_valid,
   output [2:0] tl_o_d_opcode,
   output [2:0] tl_o_d_param,
-  output [5:0] tl_o_d_size,
-  output [9:0] tl_o_d_source,
+  output [4:0] tl_o_d_size,
+  output [5:0] tl_o_d_source,
   output tl_o_d_sink,
   output [255:0] tl_o_d_data,
   output [6:0] tl_o_d_user_rsp_intg,
@@ -55,6 +55,16 @@
   output intg_error_o
 );
 
+logic [5:0] tl_i_a_size_padded;
+logic [9:0] tl_i_a_source_padded;
+logic [5:0] tl_o_d_size_unpadded;
+logic [9:0] tl_o_d_source_unpadded;
+
+assign tl_i_a_size_padded = {1'b0, tl_i_a_size};
+assign tl_i_a_source_padded = {4'b0, tl_i_a_source};
+assign tl_o_d_size = tl_o_d_size_unpadded[4:0];
+assign tl_o_d_source = tl_o_d_source_unpadded[5:0];
+
 mubi4_t en_ifetch_i_ = mubi4_t'(en_ifetch_i);
 
 tlul_adapter_sram #(
@@ -72,8 +82,8 @@
         tl_i_a_valid,
         tl_i_a_opcode,
         tl_i_a_param,
-        tl_i_a_size,
-        tl_i_a_source,
+        tl_i_a_size_padded,
+        tl_i_a_source_padded,
         tl_i_a_address,
         tl_i_a_mask,
         tl_i_a_data,
@@ -87,8 +97,8 @@
     tl_o_d_valid,
     tl_o_d_opcode,
     tl_o_d_param,
-    tl_o_d_size,
-    tl_o_d_source,
+    tl_o_d_size_unpadded,
+    tl_o_d_source_unpadded,
     tl_o_d_sink,
     tl_o_d_data,
     tl_o_d_user_rsp_intg,
@@ -110,4 +120,4 @@
   .rerror_i(rerror_i)
 );
 
-endmodule
\ No newline at end of file
+endmodule
diff --git a/hdl/verilog/Uart.sv b/hdl/verilog/Uart.sv
index b00a4b4..56f1784 100644
--- a/hdl/verilog/Uart.sv
+++ b/hdl/verilog/Uart.sv
@@ -18,8 +18,8 @@
   input tl_i_a_valid,
   input [2:0] tl_i_a_opcode,
   input [2:0] tl_i_a_param,
-  input [5:0] tl_i_a_size,
-  input [9:0] tl_i_a_source,
+  input [4:0] tl_i_a_size,
+  input [5:0] tl_i_a_source,
   input [31:0] tl_i_a_address,
   input [31:0] tl_i_a_mask,
   input [255:0] tl_i_a_data,
@@ -31,8 +31,8 @@
   output tl_o_d_valid,
   output [2:0] tl_o_d_opcode,
   output [2:0] tl_o_d_param,
-  output [5:0] tl_o_d_size,
-  output [9:0] tl_o_d_source,
+  output [4:0] tl_o_d_size,
+  output [5:0] tl_o_d_source,
   output tl_o_d_sink,
   output [255:0] tl_o_d_data,
   output [6:0] tl_o_d_user_rsp_intg,
@@ -57,6 +57,16 @@
   output intr_rx_parity_err_o
 );
 
+logic [5:0] tl_i_a_size_padded;
+logic [9:0] tl_i_a_source_padded;
+logic [5:0] tl_o_d_size_unpadded;
+logic [9:0] tl_o_d_source_unpadded;
+
+assign tl_i_a_size_padded = {1'b0, tl_i_a_size};
+assign tl_i_a_source_padded = {4'b0, tl_i_a_source};
+assign tl_o_d_size = tl_o_d_size_unpadded[4:0];
+assign tl_o_d_source = tl_o_d_source_unpadded[5:0];
+
 uart #() u_uart (
   .clk_i(clk_i),
   .rst_ni(rst_ni),
@@ -65,8 +75,8 @@
         tl_i_a_valid,
         tl_i_a_opcode,
         tl_i_a_param,
-        tl_i_a_size,
-        tl_i_a_source,
+        tl_i_a_size_padded,
+        tl_i_a_source_padded,
         tl_i_a_address,
         tl_i_a_mask,
         tl_i_a_data,
@@ -80,8 +90,8 @@
     tl_o_d_valid,
     tl_o_d_opcode,
     tl_o_d_param,
-    tl_o_d_size,
-    tl_o_d_source,
+    tl_o_d_size_unpadded,
+    tl_o_d_source_unpadded,
     tl_o_d_sink,
     tl_o_d_data,
     tl_o_d_user_rsp_intg,