Add load16/32_index8 tests Change-Id: Iaf0ce97e1c4f580d9541aa7c976b8f3e566a21f9
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD index e0d7820..7657acc 100644 --- a/tests/cocotb/BUILD +++ b/tests/cocotb/BUILD
@@ -180,10 +180,12 @@ "load8_stride2_m1", "load8_stride2_m1_partial", "load8_stride2_mf4", + "load16_index8", "load16_seg_unit", "load16_stride4_m1", "load16_stride4_m1_partial", "load16_stride4_mf2", + "load32_index8", "load32_seg_unit", "load32_stride8_m1", "load32_stride8_m1_partial",
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD index 4e07588..e0e3b4a 100644 --- a/tests/cocotb/rvv/load_store/BUILD +++ b/tests/cocotb/rvv/load_store/BUILD
@@ -62,6 +62,9 @@ "load_store32_unit_m2": { "srcs": ["load_store32_unit_m2.cc"], }, + "load16_index8": { + "srcs": ["load16_index8.cc"], + }, "load16_seg_unit": { "srcs": ["load16_seg_unit.cc"], }, @@ -74,6 +77,9 @@ "load16_stride4_mf2": { "srcs": ["load16_stride4_mf2.cc"], }, + "load32_index8": { + "srcs": ["load32_index8.cc"], + }, "load32_seg_unit": { "srcs": ["load32_seg_unit.cc"], }, @@ -115,10 +121,12 @@ ":load_store8_unit_m2.elf", ":load_store16_unit_m2.elf", ":load_store32_unit_m2.elf", + ":load16_index8.elf", ":load16_seg_unit.elf", ":load16_stride4_m1.elf", ":load16_stride4_m1_partial.elf", ":load16_stride4_mf2.elf", + ":load32_index8.elf", ":load32_seg_unit.elf", ":load32_stride8_m1.elf", ":load32_stride8_m1_partial.elf",
diff --git a/tests/cocotb/rvv/load_store/load16_index8.cc b/tests/cocotb/rvv/load_store/load16_index8.cc new file mode 100644 index 0000000..b8366a2 --- /dev/null +++ b/tests/cocotb/rvv/load_store/load16_index8.cc
@@ -0,0 +1,101 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <riscv_vector.h> +#include <stdint.h> + +namespace { +constexpr size_t lut_size = 256; +// Double sized so we can check trailing regions are not read/written. +constexpr size_t buf_size = 128; +} // namespace + +size_t vl __attribute__((section(".data"))) = 8; +// Indices are always unsigned. +uint8_t index_buf[buf_size] __attribute__((section(".data"))); +// These instructions don't differentiate signed/unsigned so we only need to +// test one. The types come from intrinsic level. +uint16_t in_buf[lut_size] __attribute__((section(".data"))); +uint16_t out_buf[buf_size] __attribute__((section(".data"))); + +extern "C" { +// Unordered +__attribute__((used, retain)) void vluxei8_v_u16mf2() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vluxei8_v_u16mf2(in_buf, indices, vl); + __riscv_vse16_v_u16mf2(out_buf, data, vl); +} + +__attribute__((used, retain)) void vluxei8_v_u16m1() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vluxei8_v_u16m1(in_buf, indices, vl); + __riscv_vse16_v_u16m1(out_buf, data, vl); +} + +__attribute__((used, retain)) void vluxei8_v_u16m2() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vluxei8_v_u16m2(in_buf, indices, vl); + __riscv_vse16_v_u16m2(out_buf, data, vl); +} + +__attribute__((used, retain)) void vluxei8_v_u16m4() { + auto indices = __riscv_vle8_v_u8m2(index_buf, vl); + auto data = __riscv_vluxei8_v_u16m4(in_buf, indices, vl); + __riscv_vse16_v_u16m4(out_buf, data, vl); +} + +__attribute__((used, retain)) void vluxei8_v_u16m8() { + auto indices = __riscv_vle8_v_u8m4(index_buf, vl); + auto data = __riscv_vluxei8_v_u16m8(in_buf, indices, vl); + __riscv_vse16_v_u16m8(out_buf, data, vl); +} + +// Ordered +__attribute__((used, retain)) void vloxei8_v_u16mf2() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vloxei8_v_u16mf2(in_buf, indices, vl); + __riscv_vse16_v_u16mf2(out_buf, data, vl); +} + +__attribute__((used, retain)) void vloxei8_v_u16m1() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vloxei8_v_u16m1(in_buf, indices, vl); + __riscv_vse16_v_u16m1(out_buf, data, vl); +} + +__attribute__((used, retain)) void vloxei8_v_u16m2() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vloxei8_v_u16m2(in_buf, indices, vl); + __riscv_vse16_v_u16m2(out_buf, data, vl); +} + +__attribute__((used, retain)) void vloxei8_v_u16m4() { + auto indices = __riscv_vle8_v_u8m2(index_buf, vl); + auto data = __riscv_vloxei8_v_u16m4(in_buf, indices, vl); + __riscv_vse16_v_u16m4(out_buf, data, vl); +} + +__attribute__((used, retain)) void vloxei8_v_u16m8() { + auto indices = __riscv_vle8_v_u8m4(index_buf, vl); + auto data = __riscv_vloxei8_v_u16m8(in_buf, indices, vl); + __riscv_vse16_v_u16m8(out_buf, data, vl); +} +} + +void (*impl)() __attribute__((section(".data"))) = &vluxei8_v_u16m1; + +int main(int argc, char** argv) { + impl(); + return 0; +}
diff --git a/tests/cocotb/rvv/load_store/load32_index8.cc b/tests/cocotb/rvv/load_store/load32_index8.cc new file mode 100644 index 0000000..a3a7a87 --- /dev/null +++ b/tests/cocotb/rvv/load_store/load32_index8.cc
@@ -0,0 +1,89 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <riscv_vector.h> +#include <stdint.h> + +namespace { +constexpr size_t lut_size = 256; +// Double sized so we can check trailing regions are not read/written. +constexpr size_t buf_size = 64; +} // namespace + +size_t vl __attribute__((section(".data"))) = 4; +// Indices are always unsigned. +uint8_t index_buf[buf_size] __attribute__((section(".data"))); +// These instructions don't differentiate signed/unsigned so we only need to +// test one. The types come from intrinsic level. +uint32_t in_buf[lut_size] __attribute__((section(".data"))); +uint32_t out_buf[buf_size] __attribute__((section(".data"))); + +extern "C" { +// Unordered +__attribute__((used, retain)) void vluxei8_v_u32m1() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vluxei8_v_u32m1(in_buf, indices, vl); + __riscv_vse32_v_u32m1(out_buf, data, vl); +} + +__attribute__((used, retain)) void vluxei8_v_u32m2() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vluxei8_v_u32m2(in_buf, indices, vl); + __riscv_vse32_v_u32m2(out_buf, data, vl); +} + +__attribute__((used, retain)) void vluxei8_v_u32m4() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vluxei8_v_u32m4(in_buf, indices, vl); + __riscv_vse32_v_u32m4(out_buf, data, vl); +} + +__attribute__((used, retain)) void vluxei8_v_u32m8() { + auto indices = __riscv_vle8_v_u8m2(index_buf, vl); + auto data = __riscv_vluxei8_v_u32m8(in_buf, indices, vl); + __riscv_vse32_v_u32m8(out_buf, data, vl); +} + +// Ordered +__attribute__((used, retain)) void vloxei8_v_u32m1() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vloxei8_v_u32m1(in_buf, indices, vl); + __riscv_vse32_v_u32m1(out_buf, data, vl); +} + +__attribute__((used, retain)) void vloxei8_v_u32m2() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vloxei8_v_u32m2(in_buf, indices, vl); + __riscv_vse32_v_u32m2(out_buf, data, vl); +} + +__attribute__((used, retain)) void vloxei8_v_u32m4() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vloxei8_v_u32m4(in_buf, indices, vl); + __riscv_vse32_v_u32m4(out_buf, data, vl); +} + +__attribute__((used, retain)) void vloxei8_v_u32m8() { + auto indices = __riscv_vle8_v_u8m2(index_buf, vl); + auto data = __riscv_vloxei8_v_u32m8(in_buf, indices, vl); + __riscv_vse32_v_u32m8(out_buf, data, vl); +} +} + +void (*impl)() __attribute__((section(".data"))) = &vluxei8_v_u32m1; + +int main(int argc, char** argv) { + impl(); + return 0; +}
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py index 843e6e2..ea99784 100644 --- a/tests/cocotb/rvv_load_store_test.py +++ b/tests/cocotb/rvv_load_store_test.py
@@ -77,7 +77,7 @@ await fixture.load_elf_and_lookup_symbols( r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name), ['impl', 'vl', 'in_buf', 'out_buf'] + - list(set([c['impl'] for c in cases])), + list({c['impl'] for c in cases}), ) min_value = np.iinfo(dtype).min @@ -117,7 +117,7 @@ async def vector_load_indexed( dut, elf_name: str, - cases: list[dict], # keys: impl, vl, in_size, out_size. + cases: list[dict], # keys: impl, vl, in_bytes, out_size. dtype, index_dtype, ): @@ -130,23 +130,28 @@ await fixture.load_elf_and_lookup_symbols( r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name), ['impl', 'vl', 'in_buf', 'out_buf', 'index_buf'] + - list(set([c['impl'] for c in cases])), + list({c['impl'] for c in cases}), ) - min_value = np.iinfo(dtype).min - max_value = np.iinfo(dtype).max + 1 # One above. rng = np.random.default_rng() for c in tqdm.tqdm(cases): impl = c['impl'] vl = c['vl'] - in_size = c['in_size'] + in_bytes = c['in_bytes'] out_size = c['out_size'] + # Don't go beyond the buffer. + index_max = in_bytes - np.dtype(dtype).itemsize + 1 # TODO(davidgao): currently assuming the vl is supported. # We'll eventually want to test unsupported vl. - indices = rng.integers(0, in_size, vl, dtype=index_dtype) - input_data = rng.integers(min_value, max_value, in_size, dtype=dtype) - expected_outputs = input_data[indices[:vl]] + indices = rng.integers(0, index_max, out_size, dtype=index_dtype) + # Index is in bytes so input needs to be in bytes. + input_data = rng.integers(0, 256, in_bytes, dtype=np.uint8) + # Input needs to be reinterpreted. + indices_in_use = np.array([ + np.arange(x, x + np.dtype(dtype).itemsize) + for x in indices[:vl].astype(np.uint32)]).astype(index_dtype) + expected_outputs = input_data[indices_in_use].view(dtype)[..., 0] sbz = np.zeros(out_size - vl, dtype=dtype) expected_outputs = np.concat((expected_outputs, sbz)) @@ -244,7 +249,7 @@ r.Rlocation( 'kelvin_hw/tests/cocotb/rvv/load_store/load_store_bits.elf'), ['vl', 'in_buf', 'out_buf', 'impl'] + - list(set([c['impl'] for c in cases])), + list({c['impl'] for c in cases}), ) rng = np.random.default_rng() for c in cases: @@ -487,7 +492,7 @@ return { 'impl': impl, 'vl': vl, - 'in_size': 256, + 'in_bytes': 256, 'out_size': vl * 2, } @@ -534,7 +539,7 @@ return { 'impl': impl, 'vl': vl, - 'in_size': 32000, # DTCM is 32KB + 'in_bytes': 32000, # DTCM is 32KB 'out_size': vl * 2, } @@ -577,7 +582,7 @@ return { 'impl': impl, 'vl': vl, - 'in_size': 32000, # DTCM is 32KB + 'in_bytes': 32000, # DTCM is 32KB 'out_size': vl * 2, } @@ -614,6 +619,49 @@ @cocotb.test() +async def load16_index8(dut): + """Test vl*xei8_v_u16 usage accessible from intrinsics.""" + def make_test_case(impl: str, vl: int): + return { + 'impl': impl, + 'vl': vl, + 'in_bytes': 256, + 'out_size': vl * 2, + } + + await vector_load_indexed( + dut = dut, + elf_name = 'load16_index8.elf', + cases = [ + # Unordered + make_test_case('vluxei8_v_u16mf2', vl = 4), + make_test_case('vluxei8_v_u16mf2', vl = 3), + make_test_case('vluxei8_v_u16m1', vl = 8), + make_test_case('vluxei8_v_u16m1', vl = 7), + # make_test_case('vluxei8_v_u16m2', vl = 16), + # make_test_case('vluxei8_v_u16m2', vl = 15), + # make_test_case('vluxei8_v_u16m4', vl = 32), + # make_test_case('vluxei8_v_u16m4', vl = 31), + # make_test_case('vluxei8_v_u16m8', vl = 64), + # make_test_case('vluxei8_v_u16m8', vl = 63), + # Ordered + make_test_case('vloxei8_v_u16mf2', vl = 4), + make_test_case('vloxei8_v_u16mf2', vl = 3), + make_test_case('vloxei8_v_u16m1', vl = 8), + make_test_case('vloxei8_v_u16m1', vl = 7), + # make_test_case('vloxei8_v_u16m1', vl = 16), + # make_test_case('vloxei8_v_u16m1', vl = 15), + # make_test_case('vloxei8_v_u16m4', vl = 32), + # make_test_case('vloxei8_v_u16m4', vl = 31), + # make_test_case('vloxei8_v_u16m8', vl = 64), + # make_test_case('vloxei8_v_u16m8', vl = 63), + ], + dtype = np.uint16, + index_dtype = np.uint8, + ) + + +@cocotb.test() async def load16_seg_unit(dut): """Test vlseg*e16 usage accessible from intrinsics.""" def make_test_case(impl: str, vl: int, n_segs: int): @@ -680,6 +728,45 @@ @cocotb.test() +async def load32_index8(dut): + """Test vl*xei8_v_u32 usage accessible from intrinsics.""" + def make_test_case(impl: str, vl: int): + return { + 'impl': impl, + 'vl': vl, + 'in_bytes': 256, + 'out_size': vl * 2, + } + + await vector_load_indexed( + dut = dut, + elf_name = 'load32_index8.elf', + cases = [ + # Unordered + make_test_case('vluxei8_v_u32m1', vl = 4), + make_test_case('vluxei8_v_u32m1', vl = 3), + # make_test_case('vluxei8_v_u32m2', vl = 8), + # make_test_case('vluxei8_v_u32m2', vl = 7), + # make_test_case('vluxei8_v_u32m4', vl = 16), + # make_test_case('vluxei8_v_u32m4', vl = 15), + # make_test_case('vluxei8_v_u32m8', vl = 32), + # make_test_case('vluxei8_v_u32m8', vl = 31), + # Ordered + make_test_case('vloxei8_v_u32m1', vl = 4), + make_test_case('vloxei8_v_u32m1', vl = 3), + # make_test_case('vloxei8_v_u32m1', vl = 16), + # make_test_case('vloxei8_v_u32m1', vl = 15), + # make_test_case('vloxei8_v_u32m4', vl = 32), + # make_test_case('vloxei8_v_u32m4', vl = 31), + # make_test_case('vloxei8_v_u32m8', vl = 64), + # make_test_case('vloxei8_v_u32m8', vl = 63), + ], + dtype = np.uint32, + index_dtype = np.uint8, + ) + + +@cocotb.test() async def load32_seg_unit(dut): """Test vlseg*e32 usage accessible from intrinsics.""" def make_test_case(impl: str, vl: int, n_segs: int):