feat(spi): Add packed write transaction support

This change introduces a "packed" write transaction to the SPI master,
allowing for a full TileLink write transaction to be sent in a single,
uninterrupted SPI burst. This significantly improves performance for
loading data to the device.

The following changes are included:

- A `packed_write_transaction` method has been added to the `SPIMaster`
  in `kelvin_test_utils/spi_master.py`.
- A new test case, `test_packed_write_transaction`, has been added to
  `tests/cocotb/tlul/test_spi_to_tlul.py` and registered in
  `tests/cocotb/tlul/BUILD` to validate the new functionality.
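- The `poll_reg_for_value` method in `SPIMaster` has been refactored to
  correctly handle the pipelined nature of SPI reads: each transaction
  returns the result of the previous read command, so an initial
  transaction is issued to prime the pipeline. This issue was discovered
  during the development of the packed write feature.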

Change-Id: I1703af4d083dc75781550a38c43ac726a40cfb43
diff --git a/kelvin_test_utils/spi_master.py b/kelvin_test_utils/spi_master.py
index c5182fc..32fe33e 100644
--- a/kelvin_test_utils/spi_master.py
+++ b/kelvin_test_utils/spi_master.py
@@ -94,13 +94,22 @@
 
     async def poll_reg_for_value(self, reg_addr, expected_value, max_polls=20):
         """Polls a register until it reads an expected value."""
-        status = -1
-        for _ in range(max_polls):
-            status = await self.read_reg(reg_addr)
-            if status == expected_value:
+        read_cmd = reg_addr # MSB is 0 for read
+        read_data = -1
+
+        # The first transaction just kicks off the read pipeline. The data is junk.
+        await self.spi_transaction(read_cmd)
+
+        for i in range(max_polls):
+            # Each subsequent transaction sends a new read command and receives the
+            # result of the PREVIOUS command.
+            read_data = await self.spi_transaction(read_cmd)
+            if read_data == expected_value:
+                self.log.info(f"Successfully polled 0x{reg_addr:x} and got 0x{expected_value:x} after {i+1} attempts.")
                 return True
             await ClockCycles(self.main_clk, 5) # Wait before next poll
-        self.log.error(f"Timed out after {max_polls} polls waiting for register 0x{reg_addr:x} to be 0x{expected_value:x}, got 0x{status:x}")
+
+        self.log.error(f"Timed out after {max_polls} polls waiting for register 0x{reg_addr:x} to be 0x{expected_value:x}, got 0x{read_data:x}")
         return False
 
     async def bulk_read_data(self, reg_addr, num_bytes):
@@ -134,3 +143,38 @@
         for i in range(num_bytes):
             byte = (data >> (i * 8)) & 0xFF
             await self.write_reg(reg_addr, byte, wait_cycles=5)
+
+    async def packed_write_transaction(self, target_addr, num_beats, data_generator):
+        await self._set_cs(True)
+        await ClockCycles(self.main_clk, 1)
+
+        await self.start_clock()
+
+        # Write the 32-bit target address, least-significant byte first
+        await self._clock_byte(0x80)
+        await self._clock_byte((target_addr >> 0) & 0xFF)
+        await self._clock_byte(0x81)
+        await self._clock_byte((target_addr >> 8) & 0xFF)
+        await self._clock_byte(0x82)
+        await self._clock_byte((target_addr >> 16) & 0xFF)
+        await self._clock_byte(0x83)
+        await self._clock_byte((target_addr >> 24) & 0xFF)
+
+        # Write the beat count, encoded as num_beats - 1
+        await self._clock_byte(0x84)
+        await self._clock_byte(num_beats - 1)
+
+        # Write each beat's data as 16 bytes, least-significant byte first
+        for j in range(num_beats):
+            data = data_generator(j)
+            for i in range(16):
+                byte = (data >> (i * 8)) & 0xFF
+                await self._clock_byte(0x87)
+                await self._clock_byte(byte)
+
+        await self._clock_byte(0x85)
+        await self._clock_byte(0x02)
+
+        await self.stop_clock()
+        await ClockCycles(self.main_clk, 1)
+        await self._set_cs(False)
diff --git a/tests/cocotb/tlul/BUILD b/tests/cocotb/tlul/BUILD
index 71676eb..e659183 100644
--- a/tests/cocotb/tlul/BUILD
+++ b/tests/cocotb/tlul/BUILD
@@ -354,6 +354,7 @@
     "test_tlul_multi_beat_read",
     "test_tlul_write",
     "test_tlul_multi_beat_write",
+    "test_packed_write_transaction",
 ]
 # END_TESTCASES_FOR_spi2tlul_cocotb
 
diff --git a/tests/cocotb/tlul/test_spi_to_tlul.py b/tests/cocotb/tlul/test_spi_to_tlul.py
index 2cc6599..0b4059b 100644
--- a/tests/cocotb/tlul/test_spi_to_tlul.py
+++ b/tests/cocotb/tlul/test_spi_to_tlul.py
@@ -362,4 +362,54 @@
     assert received_data_list == expected_data_list, f"Received data {received_data_list} does not match expected data {expected_data_list}"
 
     # 4. Clear the status to return FSM to Idle
-    await spi_master.write_reg(0x05, 0x00)
\ No newline at end of file
+    await spi_master.write_reg(0x05, 0x00)
+
+@cocotb.test()
+async def test_packed_write_transaction(dut):
+    clock = Clock(dut.clock, 10)
+    cocotb.start_soon(clock.start())
+
+    await setup_dut(dut)
+    spi_master = SPIMaster(
+        clk=dut.io_spi_clk,
+        csb=dut.io_spi_csb,
+        mosi=dut.io_spi_mosi,
+        miso=dut.io_spi_miso,
+        main_clk=dut.clock,
+        log=dut._log
+    )
+    tl_device = TileLinkULInterface(dut, device_if_name="io_tl", width=128)
+    await tl_device.init()
+
+    num_beats = 16
+    async def device_responder():
+        for i in range(num_beats):
+            req = await tl_device.device_get_request()
+            assert int(req['opcode']) in [0, 1], f"Expected PutFullData or PutPartialData, got opcode {req['opcode']}"
+            assert req['data'] == 0xDEADBEEF_CAFEF00D_ABAD1DEA_C0DED00D + i
+
+            # Send an AccessAck after each beat
+            await tl_device.device_respond(
+                opcode=0,  # AccessAck
+                param=0,
+                size=req['size'],
+                source=req['source'],
+                error=0,
+                width=128
+            )
+
+    responder_task = cocotb.start_soon(device_responder())
+
+    def data_generator(beat):
+        return 0xDEADBEEF_CAFEF00D_ABAD1DEA_C0DED00D + beat
+
+    await spi_master.packed_write_transaction(
+        target_addr=0x40001000,
+        num_beats=num_beats,
+        data_generator=data_generator
+    )
+
+    assert await spi_master.poll_reg_for_value(0x08, 0x02), "Timed out waiting for write status to be Done"
+    await spi_master.write_reg(0x05, 0x00)
+    await responder_task
+