tests/cocotb/rvv_load_store_test.py - hw/kelvin - Git at Google

 # Copyright 2025 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import cocotb
 import numpy as np

 from kelvin_test_utils.sim_test_fixture import Fixture
 from bazel_tools.tools.python.runfiles import runfiles


 async def vector_load_store(
         dut,
         elf_name: str,
         dtype,
         in_size: int,
         out_size: int,
         pattern: list[int],
 ):
     """RVV load-store test template.

     Each test performs some kind of patterned copy from `in_buf` to `out_buf`.
     """
     fixture = await Fixture.Create(dut)
     r = runfiles.Create()
     await fixture.load_elf_and_lookup_symbols(
         r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
         ['in_buf', 'out_buf'],
     )

     min_value = np.iinfo(dtype).min
     max_value = np.iinfo(dtype).max + 1  # One above.
     rng = np.random.default_rng()
     input_data = rng.integers(min_value, max_value, in_size, dtype=dtype)
     expected_outputs = input_data[pattern]
     sbz = np.zeros(out_size - len(pattern), dtype=dtype)
     expected_outputs = np.concat((expected_outputs, sbz))

     await fixture.write('in_buf', input_data)
     await fixture.write('out_buf', np.zeros([out_size], dtype=dtype))

     await fixture.run_to_halt()

     actual_outputs = (await fixture.read(
         'out_buf', out_size * np.dtype(dtype).itemsize)).view(dtype)
     debug_msg = str({
         'input': input_data,
         'expected': expected_outputs,
         'actual': actual_outputs,
     })

     assert (actual_outputs == expected_outputs).all(), debug_msg

 async def vector_load_store_v2(
         dut,
         elf_name: str,
         cases: list[dict],  # keys: impl, vl, in_size, out_size, pattern.
         dtype,
 ):
     """RVV load-store test template.

     Each test performs some kind of patterned copy from `in_buf` to `out_buf`.
     """
     fixture = await Fixture.Create(dut)
     r = runfiles.Create()
     await fixture.load_elf_and_lookup_symbols(
         r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
         ['impl', 'vl', 'in_buf', 'out_buf'] + [c['impl'] for c in cases],
     )

     min_value = np.iinfo(dtype).min
     max_value = np.iinfo(dtype).max + 1  # One above.
     rng = np.random.default_rng()
     for c in cases:
         impl = c['impl']
         vl = c['vl']
         in_size = c['in_size']
         out_size = c['out_size']
         pattern = c['pattern']

         input_data = rng.integers(min_value, max_value, in_size, dtype=dtype)
         expected_outputs = input_data[pattern]
         sbz = np.zeros(out_size - len(pattern), dtype=dtype)
         expected_outputs = np.concat((expected_outputs, sbz))

         await fixture.write_ptr('impl', impl)
         await fixture.write_word('vl', vl)
         await fixture.write('in_buf', input_data)
         await fixture.write('out_buf', np.zeros([out_size], dtype=dtype))

         await fixture.run_to_halt()

         actual_outputs = (await fixture.read(
             'out_buf', out_size * np.dtype(dtype).itemsize)).view(dtype)

         debug_msg = str({
             'impl': impl,
             'input': input_data,
             'expected': expected_outputs,
             'actual': actual_outputs,
         })
         assert (actual_outputs == expected_outputs).all(), debug_msg

 async def vector_load_indexed(
         dut,
         elf_name: str,
         dtype,
 ):
     """RVV load-store test template for indexed loads.

     Each test performs a gather operation and writes the result to an output.
     """
     fixture = await Fixture.Create(dut)
     r = runfiles.Create()
     await fixture.load_elf_and_lookup_symbols(
         r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
         ['input_indices', 'input_data', 'output_data'],
     )

     indices_count = 16 // np.dtype(dtype).itemsize
     in_data_count = 4096 // np.dtype(dtype).itemsize
     out_data_count = 16 // np.dtype(dtype).itemsize

     min_value = np.iinfo(dtype).min
     max_value = np.iinfo(dtype).max + 1  # One above.
     rng = np.random.default_rng()
     input_data = rng.integers(min_value, max_value, in_data_count, dtype=dtype)
     input_indices = rng.integers(
         0, min(max_value, in_data_count-1), indices_count, dtype=dtype)

     expected_outputs = np.take(input_data, input_indices)

     await fixture.write('input_data', input_data)
     await fixture.write('input_indices', input_indices)
     await fixture.write('output_data', np.zeros([out_data_count], dtype=dtype))

     await fixture.run_to_halt()

     actual_outputs = (await fixture.read(
         'output_data', out_data_count * np.dtype(dtype).itemsize)).view(dtype)

     assert (actual_outputs == expected_outputs).all()

 async def vector_store_indexed(
         dut,
         elf_name: str,
         dtype,
 ):
     """RVV load-store test template for indexed stores.

     Each test loads indices and data and performs a scatter operation.
     """
     fixture = await Fixture.Create(dut)
     r = runfiles.Create()
     await fixture.load_elf_and_lookup_symbols(
         r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
         ['input_indices', 'input_data', 'output_data'],
     )

     indices_count = 16 // np.dtype(dtype).itemsize
     in_data_count = 16 // np.dtype(dtype).itemsize
     out_data_count = 4096 // np.dtype(dtype).itemsize

     min_value = np.iinfo(dtype).min
     max_value = np.iinfo(dtype).max + 1  # One above.
     rng = np.random.default_rng()
     input_data = rng.integers(min_value, max_value, in_data_count, dtype=dtype)
     input_indices = rng.integers(
         0, min(max_value, out_data_count-1), indices_count, dtype=dtype)
     original_outputs = rng.integers(
         min_value, max_value, out_data_count, dtype=dtype)

     await fixture.write('input_data', input_data)
     await fixture.write('input_indices', input_indices)
     await fixture.write('output_data', original_outputs)

     expected_outputs = np.copy(original_outputs)
     for idx, data in zip(input_indices, input_data):
       expected_outputs[idx] = data

     await fixture.run_to_halt()

     actual_outputs = (await fixture.read(
         'output_data', out_data_count * np.dtype(dtype).itemsize)).view(dtype)

     assert (actual_outputs == expected_outputs).all()


 @cocotb.test()
 async def load_store_bits(dut):
     """Test vlm/vsm usage accessible from intrinsics."""
     # mask is not accessible from here.
     fixture = await Fixture.Create(dut)
     r = runfiles.Create()
     cases = [
         {'impl': 'vlm_vsm_v_b1', 'vl': 128},
         {'impl': 'vlm_vsm_v_b1', 'vl': 121},
         {'impl': 'vlm_vsm_v_b1', 'vl': 120},
         {'impl': 'vlm_vsm_v_b2', 'vl': 64},
         {'impl': 'vlm_vsm_v_b2', 'vl': 57},
         {'impl': 'vlm_vsm_v_b2', 'vl': 56},
         {'impl': 'vlm_vsm_v_b4', 'vl': 32},
         {'impl': 'vlm_vsm_v_b4', 'vl': 25},
         {'impl': 'vlm_vsm_v_b4', 'vl': 24},
         {'impl': 'vlm_vsm_v_b8', 'vl': 16},
         {'impl': 'vlm_vsm_v_b8', 'vl': 9},
         {'impl': 'vlm_vsm_v_b8', 'vl': 8},
         {'impl': 'vlm_vsm_v_b16', 'vl': 8},
         {'impl': 'vlm_vsm_v_b16', 'vl': 1},
         {'impl': 'vlm_vsm_v_b32', 'vl': 4},
         {'impl': 'vlm_vsm_v_b32', 'vl': 1},
     ]
     await fixture.load_elf_and_lookup_symbols(
         r.Rlocation(
             'kelvin_hw/tests/cocotb/rvv/load_store/load_store_bits.elf'),
         ['vl', 'in_buf', 'out_buf', 'impl'] + [c['impl'] for c in cases],
     )
     rng = np.random.default_rng()
     for c in cases:
         impl = c['impl']
         vl = c['vl']
         in_bytes = (vl + 7) // 8
         last_byte_mask = (1 << (vl % 8) - 1) if vl % 8 else 0xFF

         input_data = rng.integers(
             low=0, high=256, size=in_bytes, dtype=np.uint8)
         expected_output = input_data
         expected_output[-1] = expected_output[-1] & last_byte_mask

         await fixture.write_ptr('impl', impl)
         await fixture.write_word('vl', vl)
         await fixture.write('in_buf', input_data)
         await fixture.write('out_buf', np.zeros([in_bytes], dtype=np.uint8))

         await fixture.run_to_halt()

         actual_output = (await fixture.read('out_buf', in_bytes)).view(np.uint8)

         debug_msg = str({
             'impl': impl,
             'input': input_data,
             'expected': expected_output,
             'actual': actual_output,
         })
         assert (actual_output == expected_output).all(), debug_msg


 @cocotb.test()
 async def load8_stride2_m1(dut):
     await vector_load_store(
         dut = dut,
         elf_name = 'load8_stride2_m1.elf',
         dtype = np.uint8,
         in_size = 32,
         out_size = 16,
         pattern = list(range(0, 31, 2)),
     )

 @cocotb.test()
 async def load8_stride2_m1_partial(dut):
     await vector_load_store(
         dut = dut,
         elf_name = 'load8_stride2_m1_partial.elf',
         dtype = np.uint8,
         in_size = 32,
         out_size = 16,
         pattern = list(range(0, 29, 2)),
     )

 @cocotb.test()
 async def load8_stride2_mf4(dut):
     await vector_load_store(
         dut = dut,
         elf_name = 'load8_stride2_mf4.elf',
         dtype = np.uint8,
         in_size = 32,
         out_size = 16,
         pattern = [0, 2, 4, 6],
     )

 @cocotb.test()
 async def load16_stride4_m1(dut):
     await vector_load_store(
         dut = dut,
         elf_name = 'load16_stride4_m1.elf',
         dtype = np.uint16,
         in_size = 16,
         out_size = 8,
         pattern = list(range(0, 15, 2)),
     )

 @cocotb.test()
 async def load16_stride4_m1_partial(dut):
     await vector_load_store(
         dut = dut,
         elf_name = 'load16_stride4_m1_partial.elf',
         dtype = np.uint16,
         in_size = 16,
         out_size = 8,
         pattern = list(range(0, 13, 2)),
     )

 @cocotb.test()
 async def load16_stride4_mf2(dut):
     await vector_load_store(
         dut = dut,
         elf_name = 'load16_stride4_mf2.elf',
         dtype = np.uint16,
         in_size = 16,
         out_size = 8,
         pattern = [0, 2, 4, 6],
     )

 @cocotb.test()
 async def load32_stride8_m1(dut):
     await vector_load_store(
         dut = dut,
         elf_name = 'load32_stride8_m1.elf',
         dtype = np.uint32,
         in_size = 8,
         out_size = 4,
         pattern = [0, 2, 4, 6],
     )

 @cocotb.test()
 async def load32_stride8_m1_partial(dut):
     await vector_load_store(
         dut = dut,
         elf_name = 'load32_stride8_m1_partial.elf',
         dtype = np.uint32,
         in_size = 8,
         out_size = 4,
         pattern = [0, 2, 4],
     )

 @cocotb.test()
 async def load_store8_unit_m2(dut):
     await vector_load_store(
         dut = dut,
         elf_name = 'load_store8_unit_m2.elf',
         dtype = np.uint8,
         in_size = 64,
         out_size = 64,
         pattern = list(range(0, 32)),
     )


 @cocotb.test()
 async def load_store16_unit_m2(dut):
     await vector_load_store(
         dut=dut,
         elf_name='load_store16_unit_m2.elf',
         dtype=np.uint16,
         in_size=32,
         out_size=32,
         pattern=list(range(0, 16)),
     )


 @cocotb.test()
 async def load_store32_unit_m2(dut):
     await vector_load_store(
         dut=dut,
         elf_name='load_store32_unit_m2.elf',
         dtype=np.uint32,
         in_size=16,
         out_size=16,
         pattern=list(range(0, 8)),
     )

 @cocotb.test()
 async def load8_segment2_unit_m1(dut):
     await vector_load_store(
         dut=dut,
         elf_name='load8_segment2_unit_m1.elf',
         dtype=np.uint8,
         in_size=64,
         out_size=64,
         pattern=(list(range(0, 32, 2)) + list(range(1, 32, 2))),
     )


 @cocotb.test()
 async def load16_segment2_unit_m1(dut):
     await vector_load_store(
         dut=dut,
         elf_name='load16_segment2_unit_m1.elf',
         dtype=np.uint16,
         in_size=32,
         out_size=32,
         pattern=(list(range(0, 16, 2)) + list(range(1, 16, 2))),
     )


 @cocotb.test()
 async def load32_segment2_unit_m1(dut):
     await vector_load_store(
         dut=dut,
         elf_name='load32_segment2_unit_m1.elf',
         dtype=np.uint32,
         in_size=16,
         out_size=16,
         pattern=(list(range(0, 8, 2)) + list(range(1, 8, 2))),
     )


 # TODO(davidgao): re-enable once fixes are in.
 @cocotb.test(skip=True)
 async def load8_segment2_unit_m2(dut):
     await vector_load_store(
         dut=dut,
         elf_name='load8_segment2_unit_m2.elf',
         dtype=np.uint8,
         in_size=128,
         out_size=128,
         pattern=(list(range(0, 63, 2)) + list(range(1, 64, 2))),
     )


 # TODO(davidgao): re-enable once fixes are in.
 @cocotb.test(skip=True)
 async def load16_segment2_unit_m2(dut):
     await vector_load_store(
         dut=dut,
         elf_name='load16_segment2_unit_m2.elf',
         dtype=np.uint16,
         in_size=64,
         out_size=64,
         pattern=(list(range(0, 31, 2)) + list(range(1, 32, 2))),
     )


 # TODO(davidgao): re-enable once fixes are in.
 @cocotb.test(skip=True)
 async def load32_segment2_unit_m2(dut):
     await vector_load_store(
         dut=dut,
         elf_name='load32_segment2_unit_m2.elf',
         dtype=np.uint32,
         in_size=32,
         out_size=32,
         pattern=(list(range(0, 15, 2)) + list(range(1, 16, 2))),
     )


 @cocotb.test()
 async def load8_segment2_stride6_m1(dut):
     await vector_load_store(
         dut=dut,
         elf_name='load8_segment2_stride6_m1.elf',
         dtype=np.uint8,
         in_size=256,
         out_size=64,
         pattern=([i * 6 for i in range(16)] + [i * 6 + 1 for i in range(16)]),
     )


 @cocotb.test()
 async def load16_segment2_stride6_m1(dut):
     await vector_load_store(
         dut=dut,
         elf_name='load16_segment2_stride6_m1.elf',
         dtype=np.uint16,
         in_size=128,
         out_size=32,
         pattern=([i * 3 for i in range(8)] + [i * 3 + 1 for i in range(8)]),
     )


 @cocotb.test()
 async def load8_indexed_m1(dut):
     await vector_load_indexed(
         dut = dut,
         elf_name = 'load8_indexed_m1.elf',
         dtype = np.uint8,
     )

 @cocotb.test()
 async def store8_indexed_m1(dut):
     await vector_store_indexed(
         dut = dut,
         elf_name = 'store8_indexed_m1.elf',
         dtype = np.uint8,
     )


 @cocotb.test()
 async def store8_seg_unit(dut):
     """Test vsseg*e8  usage accessible from intrinsics."""
     def make_test_case(impl: str, vl: int, n_segs: int):
         return {
             'impl': impl,
             'vl': vl,
             'in_size': vl * n_segs * 2,
             'out_size': vl * n_segs * 2,
             'pattern': [i * vl + j for j in range(vl) for i in range(n_segs)]
         }

     await vector_load_store_v2(
         dut = dut,
         elf_name = 'store8_seg_unit.elf',
         cases = [
             # Seg 2
             make_test_case('vsseg2e8_v_u8mf4x2', vl=4, n_segs=2),
             make_test_case('vsseg2e8_v_u8mf4x2', vl=3, n_segs=2),
             make_test_case('vsseg2e8_v_u8mf2x2', vl=8, n_segs=2),
             make_test_case('vsseg2e8_v_u8mf2x2', vl=7, n_segs=2),
             make_test_case('vsseg2e8_v_u8m1x2', vl=16, n_segs=2),
             make_test_case('vsseg2e8_v_u8m1x2', vl=15, n_segs=2),
             # make_test_case('vsseg2e8_v_u8m2x2', vl=32, n_segs=2),
             # make_test_case('vsseg2e8_v_u8m2x2', vl=31, n_segs=2),
             # make_test_case('vsseg2e8_v_u8m4x2', vl=64, n_segs=2),
             # make_test_case('vsseg2e8_v_u8m4x2', vl=63, n_segs=2),
             # Seg 3
             make_test_case('vsseg3e8_v_u8mf4x3', vl=4, n_segs=3),
             make_test_case('vsseg3e8_v_u8mf4x3', vl=3, n_segs=3),
             make_test_case('vsseg3e8_v_u8mf2x3', vl=8, n_segs=3),
             make_test_case('vsseg3e8_v_u8mf2x3', vl=7, n_segs=3),
             make_test_case('vsseg3e8_v_u8m1x3', vl=16, n_segs=3),
             make_test_case('vsseg3e8_v_u8m1x3', vl=15, n_segs=3),
             # make_test_case('vsseg3e8_v_u8m2x3', vl=32, n_segs=3),
             # make_test_case('vsseg3e8_v_u8m2x3', vl=31, n_segs=3),
             # Seg 4
             make_test_case('vsseg4e8_v_u8mf4x4', vl=4, n_segs=4),
             make_test_case('vsseg4e8_v_u8mf4x4', vl=3, n_segs=4),
             make_test_case('vsseg4e8_v_u8mf2x4', vl=8, n_segs=4),
             make_test_case('vsseg4e8_v_u8mf2x4', vl=7, n_segs=4),
             make_test_case('vsseg4e8_v_u8m1x4', vl=16, n_segs=4),
             make_test_case('vsseg4e8_v_u8m1x4', vl=15, n_segs=4),
             # make_test_case('vsseg4e8_v_u8m2x4', vl=32, n_segs=4),
             # make_test_case('vsseg4e8_v_u8m2x4', vl=31, n_segs=4),
             # Seg 5
             make_test_case('vsseg5e8_v_u8mf4x5', vl=4, n_segs=5),
             make_test_case('vsseg5e8_v_u8mf4x5', vl=3, n_segs=5),
             make_test_case('vsseg5e8_v_u8mf2x5', vl=8, n_segs=5),
             make_test_case('vsseg5e8_v_u8mf2x5', vl=7, n_segs=5),
             make_test_case('vsseg5e8_v_u8m1x5', vl=16, n_segs=5),
             make_test_case('vsseg5e8_v_u8m1x5', vl=15, n_segs=5),
             # Seg 6
             make_test_case('vsseg6e8_v_u8mf4x6', vl=4, n_segs=6),
             make_test_case('vsseg6e8_v_u8mf4x6', vl=3, n_segs=6),
             make_test_case('vsseg6e8_v_u8mf2x6', vl=8, n_segs=6),
             make_test_case('vsseg6e8_v_u8mf2x6', vl=7, n_segs=6),
             make_test_case('vsseg6e8_v_u8m1x6', vl=16, n_segs=6),
             make_test_case('vsseg6e8_v_u8m1x6', vl=15, n_segs=6),
             # Seg 7
             make_test_case('vsseg7e8_v_u8mf4x7', vl=4, n_segs=7),
             make_test_case('vsseg7e8_v_u8mf4x7', vl=3, n_segs=7),
             make_test_case('vsseg7e8_v_u8mf2x7', vl=8, n_segs=7),
             make_test_case('vsseg7e8_v_u8mf2x7', vl=7, n_segs=7),
             make_test_case('vsseg7e8_v_u8m1x7', vl=16, n_segs=7),
             make_test_case('vsseg7e8_v_u8m1x7', vl=15, n_segs=7),
             # Seg 8
             make_test_case('vsseg8e8_v_u8mf4x8', vl=4, n_segs=8),
             make_test_case('vsseg8e8_v_u8mf4x8', vl=3, n_segs=8),
             make_test_case('vsseg8e8_v_u8mf2x8', vl=8, n_segs=8),
             make_test_case('vsseg8e8_v_u8mf2x8', vl=7, n_segs=8),
             make_test_case('vsseg8e8_v_u8m1x8', vl=16, n_segs=8),
             make_test_case('vsseg8e8_v_u8m1x8', vl=15, n_segs=8),
         ],
         dtype = np.uint8,
     )


 @cocotb.test()
 async def load_store8_test(dut):
     """Testbench to test RVV load."""
     fixture = await Fixture.Create(dut)
     r = runfiles.Create()
     await fixture.load_elf_and_lookup_symbols(
         r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/load_store8_test.elf'),
         ['buffer', 'in_ptr', 'out_ptr', 'vl'],
     )

     vl = 16
     input_data = np.random.randint(0, 255, vl, dtype=np.uint8)
     target_in_addr = fixture.symbols['buffer'] + 16
     target_out_addr = fixture.symbols['buffer'] + 64

     await fixture.core_mini_axi.write(target_in_addr, input_data)
     await fixture.write('in_ptr', np.array([target_in_addr], dtype=np.uint32))
     await fixture.write('out_ptr', np.array([target_out_addr], dtype=np.uint32))
     await fixture.write('vl', np.array([vl], dtype=np.uint32))

     await fixture.run_to_halt()

     routputs = (await fixture.core_mini_axi.read(target_out_addr, vl)).view(
         np.uint8)
     assert (input_data == routputs).all()
	# Copyright 2025 Google LLC
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# https://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import cocotb
	import numpy as np

	from kelvin_test_utils.sim_test_fixture import Fixture
	from bazel_tools.tools.python.runfiles import runfiles


	async def vector_load_store(
	dut,
	elf_name: str,
	dtype,
	in_size: int,
	out_size: int,
	pattern: list[int],
	):
	"""RVV load-store test template.

	Each test performs some kind of patterned copy from `in_buf` to `out_buf`.
	"""
	fixture = await Fixture.Create(dut)
	r = runfiles.Create()
	await fixture.load_elf_and_lookup_symbols(
	r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
	['in_buf', 'out_buf'],
	)

	min_value = np.iinfo(dtype).min
	max_value = np.iinfo(dtype).max + 1 # One above.
	rng = np.random.default_rng()
	input_data = rng.integers(min_value, max_value, in_size, dtype=dtype)
	expected_outputs = input_data[pattern]
	sbz = np.zeros(out_size - len(pattern), dtype=dtype)
	expected_outputs = np.concat((expected_outputs, sbz))

	await fixture.write('in_buf', input_data)
	await fixture.write('out_buf', np.zeros([out_size], dtype=dtype))

	await fixture.run_to_halt()

	actual_outputs = (await fixture.read(
	'out_buf', out_size * np.dtype(dtype).itemsize)).view(dtype)
	debug_msg = str({
	'input': input_data,
	'expected': expected_outputs,
	'actual': actual_outputs,
	})

	assert (actual_outputs == expected_outputs).all(), debug_msg

	async def vector_load_store_v2(
	dut,
	elf_name: str,
	cases: list[dict], # keys: impl, vl, in_size, out_size, pattern.
	dtype,
	):
	"""RVV load-store test template.

	Each test performs some kind of patterned copy from `in_buf` to `out_buf`.
	"""
	fixture = await Fixture.Create(dut)
	r = runfiles.Create()
	await fixture.load_elf_and_lookup_symbols(
	r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
	['impl', 'vl', 'in_buf', 'out_buf'] + [c['impl'] for c in cases],
	)

	min_value = np.iinfo(dtype).min
	max_value = np.iinfo(dtype).max + 1 # One above.
	rng = np.random.default_rng()
	for c in cases:
	impl = c['impl']
	vl = c['vl']
	in_size = c['in_size']
	out_size = c['out_size']
	pattern = c['pattern']

	input_data = rng.integers(min_value, max_value, in_size, dtype=dtype)
	expected_outputs = input_data[pattern]
	sbz = np.zeros(out_size - len(pattern), dtype=dtype)
	expected_outputs = np.concat((expected_outputs, sbz))

	await fixture.write_ptr('impl', impl)
	await fixture.write_word('vl', vl)
	await fixture.write('in_buf', input_data)
	await fixture.write('out_buf', np.zeros([out_size], dtype=dtype))

	await fixture.run_to_halt()

	actual_outputs = (await fixture.read(
	'out_buf', out_size * np.dtype(dtype).itemsize)).view(dtype)

	debug_msg = str({
	'impl': impl,
	'input': input_data,
	'expected': expected_outputs,
	'actual': actual_outputs,
	})
	assert (actual_outputs == expected_outputs).all(), debug_msg

	async def vector_load_indexed(
	dut,
	elf_name: str,
	dtype,
	):
	"""RVV load-store test template for indexed loads.

	Each test performs a gather operation and writes the result to an output.
	"""
	fixture = await Fixture.Create(dut)
	r = runfiles.Create()
	await fixture.load_elf_and_lookup_symbols(
	r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
	['input_indices', 'input_data', 'output_data'],
	)

	indices_count = 16 // np.dtype(dtype).itemsize
	in_data_count = 4096 // np.dtype(dtype).itemsize
	out_data_count = 16 // np.dtype(dtype).itemsize

	min_value = np.iinfo(dtype).min
	max_value = np.iinfo(dtype).max + 1 # One above.
	rng = np.random.default_rng()
	input_data = rng.integers(min_value, max_value, in_data_count, dtype=dtype)
	input_indices = rng.integers(
	0, min(max_value, in_data_count-1), indices_count, dtype=dtype)

	expected_outputs = np.take(input_data, input_indices)

	await fixture.write('input_data', input_data)
	await fixture.write('input_indices', input_indices)
	await fixture.write('output_data', np.zeros([out_data_count], dtype=dtype))

	await fixture.run_to_halt()

	actual_outputs = (await fixture.read(
	'output_data', out_data_count * np.dtype(dtype).itemsize)).view(dtype)

	assert (actual_outputs == expected_outputs).all()

	async def vector_store_indexed(
	dut,
	elf_name: str,
	dtype,
	):
	"""RVV load-store test template for indexed stores.

	Each test loads indices and data and performs a scatter operation.
	"""
	fixture = await Fixture.Create(dut)
	r = runfiles.Create()
	await fixture.load_elf_and_lookup_symbols(
	r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
	['input_indices', 'input_data', 'output_data'],
	)

	indices_count = 16 // np.dtype(dtype).itemsize
	in_data_count = 16 // np.dtype(dtype).itemsize
	out_data_count = 4096 // np.dtype(dtype).itemsize

	min_value = np.iinfo(dtype).min
	max_value = np.iinfo(dtype).max + 1 # One above.
	rng = np.random.default_rng()
	input_data = rng.integers(min_value, max_value, in_data_count, dtype=dtype)
	input_indices = rng.integers(
	0, min(max_value, out_data_count-1), indices_count, dtype=dtype)
	original_outputs = rng.integers(
	min_value, max_value, out_data_count, dtype=dtype)

	await fixture.write('input_data', input_data)
	await fixture.write('input_indices', input_indices)
	await fixture.write('output_data', original_outputs)

	expected_outputs = np.copy(original_outputs)
	for idx, data in zip(input_indices, input_data):
	expected_outputs[idx] = data

	await fixture.run_to_halt()

	actual_outputs = (await fixture.read(
	'output_data', out_data_count * np.dtype(dtype).itemsize)).view(dtype)

	assert (actual_outputs == expected_outputs).all()


	@cocotb.test()
	async def load_store_bits(dut):
	"""Test vlm/vsm usage accessible from intrinsics."""
	# mask is not accessible from here.
	fixture = await Fixture.Create(dut)
	r = runfiles.Create()
	cases = [
	{'impl': 'vlm_vsm_v_b1', 'vl': 128},
	{'impl': 'vlm_vsm_v_b1', 'vl': 121},
	{'impl': 'vlm_vsm_v_b1', 'vl': 120},
	{'impl': 'vlm_vsm_v_b2', 'vl': 64},
	{'impl': 'vlm_vsm_v_b2', 'vl': 57},
	{'impl': 'vlm_vsm_v_b2', 'vl': 56},
	{'impl': 'vlm_vsm_v_b4', 'vl': 32},
	{'impl': 'vlm_vsm_v_b4', 'vl': 25},
	{'impl': 'vlm_vsm_v_b4', 'vl': 24},
	{'impl': 'vlm_vsm_v_b8', 'vl': 16},
	{'impl': 'vlm_vsm_v_b8', 'vl': 9},
	{'impl': 'vlm_vsm_v_b8', 'vl': 8},
	{'impl': 'vlm_vsm_v_b16', 'vl': 8},
	{'impl': 'vlm_vsm_v_b16', 'vl': 1},
	{'impl': 'vlm_vsm_v_b32', 'vl': 4},
	{'impl': 'vlm_vsm_v_b32', 'vl': 1},
	]
	await fixture.load_elf_and_lookup_symbols(
	r.Rlocation(
	'kelvin_hw/tests/cocotb/rvv/load_store/load_store_bits.elf'),
	['vl', 'in_buf', 'out_buf', 'impl'] + [c['impl'] for c in cases],
	)
	rng = np.random.default_rng()
	for c in cases:
	impl = c['impl']
	vl = c['vl']
	in_bytes = (vl + 7) // 8
	last_byte_mask = (1 << (vl % 8) - 1) if vl % 8 else 0xFF

	input_data = rng.integers(
	low=0, high=256, size=in_bytes, dtype=np.uint8)
	expected_output = input_data
	expected_output[-1] = expected_output[-1] & last_byte_mask

	await fixture.write_ptr('impl', impl)
	await fixture.write_word('vl', vl)
	await fixture.write('in_buf', input_data)
	await fixture.write('out_buf', np.zeros([in_bytes], dtype=np.uint8))

	await fixture.run_to_halt()

	actual_output = (await fixture.read('out_buf', in_bytes)).view(np.uint8)

	debug_msg = str({
	'impl': impl,
	'input': input_data,
	'expected': expected_output,
	'actual': actual_output,
	})
	assert (actual_output == expected_output).all(), debug_msg


	@cocotb.test()
	async def load8_stride2_m1(dut):
	await vector_load_store(
	dut = dut,
	elf_name = 'load8_stride2_m1.elf',
	dtype = np.uint8,
	in_size = 32,
	out_size = 16,
	pattern = list(range(0, 31, 2)),
	)

	@cocotb.test()
	async def load8_stride2_m1_partial(dut):
	await vector_load_store(
	dut = dut,
	elf_name = 'load8_stride2_m1_partial.elf',
	dtype = np.uint8,
	in_size = 32,
	out_size = 16,
	pattern = list(range(0, 29, 2)),
	)

	@cocotb.test()
	async def load8_stride2_mf4(dut):
	await vector_load_store(
	dut = dut,
	elf_name = 'load8_stride2_mf4.elf',
	dtype = np.uint8,
	in_size = 32,
	out_size = 16,
	pattern = [0, 2, 4, 6],
	)

	@cocotb.test()
	async def load16_stride4_m1(dut):
	await vector_load_store(
	dut = dut,
	elf_name = 'load16_stride4_m1.elf',
	dtype = np.uint16,
	in_size = 16,
	out_size = 8,
	pattern = list(range(0, 15, 2)),
	)

	@cocotb.test()
	async def load16_stride4_m1_partial(dut):
	await vector_load_store(
	dut = dut,
	elf_name = 'load16_stride4_m1_partial.elf',
	dtype = np.uint16,
	in_size = 16,
	out_size = 8,
	pattern = list(range(0, 13, 2)),
	)

	@cocotb.test()
	async def load16_stride4_mf2(dut):
	await vector_load_store(
	dut = dut,
	elf_name = 'load16_stride4_mf2.elf',
	dtype = np.uint16,
	in_size = 16,
	out_size = 8,
	pattern = [0, 2, 4, 6],
	)

	@cocotb.test()
	async def load32_stride8_m1(dut):
	await vector_load_store(
	dut = dut,
	elf_name = 'load32_stride8_m1.elf',
	dtype = np.uint32,
	in_size = 8,
	out_size = 4,
	pattern = [0, 2, 4, 6],
	)

	@cocotb.test()
	async def load32_stride8_m1_partial(dut):
	await vector_load_store(
	dut = dut,
	elf_name = 'load32_stride8_m1_partial.elf',
	dtype = np.uint32,
	in_size = 8,
	out_size = 4,
	pattern = [0, 2, 4],
	)

	@cocotb.test()
	async def load_store8_unit_m2(dut):
	await vector_load_store(
	dut = dut,
	elf_name = 'load_store8_unit_m2.elf',
	dtype = np.uint8,
	in_size = 64,
	out_size = 64,
	pattern = list(range(0, 32)),
	)


	@cocotb.test()
	async def load_store16_unit_m2(dut):
	await vector_load_store(
	dut=dut,
	elf_name='load_store16_unit_m2.elf',
	dtype=np.uint16,
	in_size=32,
	out_size=32,
	pattern=list(range(0, 16)),
	)


	@cocotb.test()
	async def load_store32_unit_m2(dut):
	await vector_load_store(
	dut=dut,
	elf_name='load_store32_unit_m2.elf',
	dtype=np.uint32,
	in_size=16,
	out_size=16,
	pattern=list(range(0, 8)),
	)

	@cocotb.test()
	async def load8_segment2_unit_m1(dut):
	await vector_load_store(
	dut=dut,
	elf_name='load8_segment2_unit_m1.elf',
	dtype=np.uint8,
	in_size=64,
	out_size=64,
	pattern=(list(range(0, 32, 2)) + list(range(1, 32, 2))),
	)


	@cocotb.test()
	async def load16_segment2_unit_m1(dut):
	await vector_load_store(
	dut=dut,
	elf_name='load16_segment2_unit_m1.elf',
	dtype=np.uint16,
	in_size=32,
	out_size=32,
	pattern=(list(range(0, 16, 2)) + list(range(1, 16, 2))),
	)


	@cocotb.test()
	async def load32_segment2_unit_m1(dut):
	await vector_load_store(
	dut=dut,
	elf_name='load32_segment2_unit_m1.elf',
	dtype=np.uint32,
	in_size=16,
	out_size=16,
	pattern=(list(range(0, 8, 2)) + list(range(1, 8, 2))),
	)


	# TODO(davidgao): re-enable once fixes are in.
	@cocotb.test(skip=True)
	async def load8_segment2_unit_m2(dut):
	await vector_load_store(
	dut=dut,
	elf_name='load8_segment2_unit_m2.elf',
	dtype=np.uint8,
	in_size=128,
	out_size=128,
	pattern=(list(range(0, 63, 2)) + list(range(1, 64, 2))),
	)


	# TODO(davidgao): re-enable once fixes are in.
	@cocotb.test(skip=True)
	async def load16_segment2_unit_m2(dut):
	await vector_load_store(
	dut=dut,
	elf_name='load16_segment2_unit_m2.elf',
	dtype=np.uint16,
	in_size=64,
	out_size=64,
	pattern=(list(range(0, 31, 2)) + list(range(1, 32, 2))),
	)


	# TODO(davidgao): re-enable once fixes are in.
	@cocotb.test(skip=True)
	async def load32_segment2_unit_m2(dut):
	await vector_load_store(
	dut=dut,
	elf_name='load32_segment2_unit_m2.elf',
	dtype=np.uint32,
	in_size=32,
	out_size=32,
	pattern=(list(range(0, 15, 2)) + list(range(1, 16, 2))),
	)


	@cocotb.test()
	async def load8_segment2_stride6_m1(dut):
	await vector_load_store(
	dut=dut,
	elf_name='load8_segment2_stride6_m1.elf',
	dtype=np.uint8,
	in_size=256,
	out_size=64,
	pattern=([i * 6 for i in range(16)] + [i * 6 + 1 for i in range(16)]),
	)


	@cocotb.test()
	async def load16_segment2_stride6_m1(dut):
	await vector_load_store(
	dut=dut,
	elf_name='load16_segment2_stride6_m1.elf',
	dtype=np.uint16,
	in_size=128,
	out_size=32,
	pattern=([i * 3 for i in range(8)] + [i * 3 + 1 for i in range(8)]),
	)


	@cocotb.test()
	async def load8_indexed_m1(dut):
	await vector_load_indexed(
	dut = dut,
	elf_name = 'load8_indexed_m1.elf',
	dtype = np.uint8,
	)

	@cocotb.test()
	async def store8_indexed_m1(dut):
	await vector_store_indexed(
	dut = dut,
	elf_name = 'store8_indexed_m1.elf',
	dtype = np.uint8,
	)


	@cocotb.test()
	async def store8_seg_unit(dut):
	"""Test vsseg*e8 usage accessible from intrinsics."""
	def make_test_case(impl: str, vl: int, n_segs: int):
	return {
	'impl': impl,
	'vl': vl,
	'in_size': vl * n_segs * 2,
	'out_size': vl * n_segs * 2,
	'pattern': [i * vl + j for j in range(vl) for i in range(n_segs)]
	}

	await vector_load_store_v2(
	dut = dut,
	elf_name = 'store8_seg_unit.elf',
	cases = [
	# Seg 2
	make_test_case('vsseg2e8_v_u8mf4x2', vl=4, n_segs=2),
	make_test_case('vsseg2e8_v_u8mf4x2', vl=3, n_segs=2),
	make_test_case('vsseg2e8_v_u8mf2x2', vl=8, n_segs=2),
	make_test_case('vsseg2e8_v_u8mf2x2', vl=7, n_segs=2),
	make_test_case('vsseg2e8_v_u8m1x2', vl=16, n_segs=2),
	make_test_case('vsseg2e8_v_u8m1x2', vl=15, n_segs=2),
	# make_test_case('vsseg2e8_v_u8m2x2', vl=32, n_segs=2),
	# make_test_case('vsseg2e8_v_u8m2x2', vl=31, n_segs=2),
	# make_test_case('vsseg2e8_v_u8m4x2', vl=64, n_segs=2),
	# make_test_case('vsseg2e8_v_u8m4x2', vl=63, n_segs=2),
	# Seg 3
	make_test_case('vsseg3e8_v_u8mf4x3', vl=4, n_segs=3),
	make_test_case('vsseg3e8_v_u8mf4x3', vl=3, n_segs=3),
	make_test_case('vsseg3e8_v_u8mf2x3', vl=8, n_segs=3),
	make_test_case('vsseg3e8_v_u8mf2x3', vl=7, n_segs=3),
	make_test_case('vsseg3e8_v_u8m1x3', vl=16, n_segs=3),
	make_test_case('vsseg3e8_v_u8m1x3', vl=15, n_segs=3),
	# make_test_case('vsseg3e8_v_u8m2x3', vl=32, n_segs=3),
	# make_test_case('vsseg3e8_v_u8m2x3', vl=31, n_segs=3),
	# Seg 4
	make_test_case('vsseg4e8_v_u8mf4x4', vl=4, n_segs=4),
	make_test_case('vsseg4e8_v_u8mf4x4', vl=3, n_segs=4),
	make_test_case('vsseg4e8_v_u8mf2x4', vl=8, n_segs=4),
	make_test_case('vsseg4e8_v_u8mf2x4', vl=7, n_segs=4),
	make_test_case('vsseg4e8_v_u8m1x4', vl=16, n_segs=4),
	make_test_case('vsseg4e8_v_u8m1x4', vl=15, n_segs=4),
	# make_test_case('vsseg4e8_v_u8m2x4', vl=32, n_segs=4),
	# make_test_case('vsseg4e8_v_u8m2x4', vl=31, n_segs=4),
	# Seg 5
	make_test_case('vsseg5e8_v_u8mf4x5', vl=4, n_segs=5),
	make_test_case('vsseg5e8_v_u8mf4x5', vl=3, n_segs=5),
	make_test_case('vsseg5e8_v_u8mf2x5', vl=8, n_segs=5),
	make_test_case('vsseg5e8_v_u8mf2x5', vl=7, n_segs=5),
	make_test_case('vsseg5e8_v_u8m1x5', vl=16, n_segs=5),
	make_test_case('vsseg5e8_v_u8m1x5', vl=15, n_segs=5),
	# Seg 6
	make_test_case('vsseg6e8_v_u8mf4x6', vl=4, n_segs=6),
	make_test_case('vsseg6e8_v_u8mf4x6', vl=3, n_segs=6),
	make_test_case('vsseg6e8_v_u8mf2x6', vl=8, n_segs=6),
	make_test_case('vsseg6e8_v_u8mf2x6', vl=7, n_segs=6),
	make_test_case('vsseg6e8_v_u8m1x6', vl=16, n_segs=6),
	make_test_case('vsseg6e8_v_u8m1x6', vl=15, n_segs=6),
	# Seg 7
	make_test_case('vsseg7e8_v_u8mf4x7', vl=4, n_segs=7),
	make_test_case('vsseg7e8_v_u8mf4x7', vl=3, n_segs=7),
	make_test_case('vsseg7e8_v_u8mf2x7', vl=8, n_segs=7),
	make_test_case('vsseg7e8_v_u8mf2x7', vl=7, n_segs=7),
	make_test_case('vsseg7e8_v_u8m1x7', vl=16, n_segs=7),
	make_test_case('vsseg7e8_v_u8m1x7', vl=15, n_segs=7),
	# Seg 8
	make_test_case('vsseg8e8_v_u8mf4x8', vl=4, n_segs=8),
	make_test_case('vsseg8e8_v_u8mf4x8', vl=3, n_segs=8),
	make_test_case('vsseg8e8_v_u8mf2x8', vl=8, n_segs=8),
	make_test_case('vsseg8e8_v_u8mf2x8', vl=7, n_segs=8),
	make_test_case('vsseg8e8_v_u8m1x8', vl=16, n_segs=8),
	make_test_case('vsseg8e8_v_u8m1x8', vl=15, n_segs=8),
	],
	dtype = np.uint8,
	)


	@cocotb.test()
	async def load_store8_test(dut):
	"""Testbench to test RVV load."""
	fixture = await Fixture.Create(dut)
	r = runfiles.Create()
	await fixture.load_elf_and_lookup_symbols(
	r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/load_store8_test.elf'),
	['buffer', 'in_ptr', 'out_ptr', 'vl'],
	)

	vl = 16
	input_data = np.random.randint(0, 255, vl, dtype=np.uint8)
	target_in_addr = fixture.symbols['buffer'] + 16
	target_out_addr = fixture.symbols['buffer'] + 64

	await fixture.core_mini_axi.write(target_in_addr, input_data)
	await fixture.write('in_ptr', np.array([target_in_addr], dtype=np.uint32))
	await fixture.write('out_ptr', np.array([target_out_addr], dtype=np.uint32))
	await fixture.write('vl', np.array([vl], dtype=np.uint32))

	await fixture.run_to_halt()

	routputs = (await fixture.core_mini_axi.read(target_out_addr, vl)).view(
	np.uint8)
	assert (input_data == routputs).all()