Add load16/32_index8 tests

Change-Id: Iaf0ce97e1c4f580d9541aa7c976b8f3e566a21f9
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index e0d7820..7657acc 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -180,10 +180,12 @@
     "load8_stride2_m1",
     "load8_stride2_m1_partial",
     "load8_stride2_mf4",
+    "load16_index8",
     "load16_seg_unit",
     "load16_stride4_m1",
     "load16_stride4_m1_partial",
     "load16_stride4_mf2",
+    "load32_index8",
     "load32_seg_unit",
     "load32_stride8_m1",
     "load32_stride8_m1_partial",
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD
index 4e07588..e0e3b4a 100644
--- a/tests/cocotb/rvv/load_store/BUILD
+++ b/tests/cocotb/rvv/load_store/BUILD
@@ -62,6 +62,9 @@
         "load_store32_unit_m2": {
             "srcs": ["load_store32_unit_m2.cc"],
         },
+        "load16_index8": {
+            "srcs": ["load16_index8.cc"],
+        },
         "load16_seg_unit": {
             "srcs": ["load16_seg_unit.cc"],
         },
@@ -74,6 +77,9 @@
         "load16_stride4_mf2": {
             "srcs": ["load16_stride4_mf2.cc"],
         },
+        "load32_index8": {
+            "srcs": ["load32_index8.cc"],
+        },
         "load32_seg_unit": {
             "srcs": ["load32_seg_unit.cc"],
         },
@@ -115,10 +121,12 @@
         ":load_store8_unit_m2.elf",
         ":load_store16_unit_m2.elf",
         ":load_store32_unit_m2.elf",
+        ":load16_index8.elf",
         ":load16_seg_unit.elf",
         ":load16_stride4_m1.elf",
         ":load16_stride4_m1_partial.elf",
         ":load16_stride4_mf2.elf",
+        ":load32_index8.elf",
         ":load32_seg_unit.elf",
         ":load32_stride8_m1.elf",
         ":load32_stride8_m1_partial.elf",
diff --git a/tests/cocotb/rvv/load_store/load16_index8.cc b/tests/cocotb/rvv/load_store/load16_index8.cc
new file mode 100644
index 0000000..b8366a2
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load16_index8.cc
@@ -0,0 +1,101 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t lut_size = 256;
+// Double sized so we can check trailing regions are not read/written.
+constexpr size_t buf_size = 128;
+}  // namespace
+
+size_t vl __attribute__((section(".data"))) = 8;
+// Indices are always unsigned.
+uint8_t index_buf[buf_size] __attribute__((section(".data")));
+// These instructions don't differentiate signed/unsigned so we only need to
+// test one. The types come from intrinsic level.
+uint16_t in_buf[lut_size] __attribute__((section(".data")));
+uint16_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Unordered
+__attribute__((used, retain)) void vluxei8_v_u16mf2() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u16mf2(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u16m1() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u16m1(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u16m2() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u16m2(in_buf, indices, vl);
+  __riscv_vse16_v_u16m2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u16m4() {
+  auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u16m4(in_buf, indices, vl);
+  __riscv_vse16_v_u16m4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u16m8() {
+  auto indices = __riscv_vle8_v_u8m4(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u16m8(in_buf, indices, vl);
+  __riscv_vse16_v_u16m8(out_buf, data, vl);
+}
+
+// Ordered
+__attribute__((used, retain)) void vloxei8_v_u16mf2() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u16mf2(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u16m1() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u16m1(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u16m2() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u16m2(in_buf, indices, vl);
+  __riscv_vse16_v_u16m2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u16m4() {
+  auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u16m4(in_buf, indices, vl);
+  __riscv_vse16_v_u16m4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u16m8() {
+  auto indices = __riscv_vle8_v_u8m4(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u16m8(in_buf, indices, vl);
+  __riscv_vse16_v_u16m8(out_buf, data, vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vluxei8_v_u16m1;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load32_index8.cc b/tests/cocotb/rvv/load_store/load32_index8.cc
new file mode 100644
index 0000000..a3a7a87
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load32_index8.cc
@@ -0,0 +1,89 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t lut_size = 256;
+// Double sized so we can check trailing regions are not read/written.
+constexpr size_t buf_size = 64;
+}  // namespace
+
+size_t vl __attribute__((section(".data"))) = 4;
+// Indices are always unsigned.
+uint8_t index_buf[buf_size] __attribute__((section(".data")));
+// These instructions don't differentiate signed/unsigned so we only need to
+// test one. The types come from intrinsic level.
+uint32_t in_buf[lut_size] __attribute__((section(".data")));
+uint32_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Unordered
+__attribute__((used, retain)) void vluxei8_v_u32m1() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u32m1(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u32m2() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u32m2(in_buf, indices, vl);
+  __riscv_vse32_v_u32m2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u32m4() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u32m4(in_buf, indices, vl);
+  __riscv_vse32_v_u32m4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u32m8() {
+  auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u32m8(in_buf, indices, vl);
+  __riscv_vse32_v_u32m8(out_buf, data, vl);
+}
+
+// Ordered
+__attribute__((used, retain)) void vloxei8_v_u32m1() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u32m1(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u32m2() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u32m2(in_buf, indices, vl);
+  __riscv_vse32_v_u32m2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u32m4() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u32m4(in_buf, indices, vl);
+  __riscv_vse32_v_u32m4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u32m8() {
+  auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u32m8(in_buf, indices, vl);
+  __riscv_vse32_v_u32m8(out_buf, data, vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vluxei8_v_u32m1;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py
index 843e6e2..ea99784 100644
--- a/tests/cocotb/rvv_load_store_test.py
+++ b/tests/cocotb/rvv_load_store_test.py
@@ -77,7 +77,7 @@
     await fixture.load_elf_and_lookup_symbols(
         r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
         ['impl', 'vl', 'in_buf', 'out_buf'] +
-            list(set([c['impl'] for c in cases])),
+            list({c['impl'] for c in cases}),
     )
 
     min_value = np.iinfo(dtype).min
@@ -117,7 +117,7 @@
 async def vector_load_indexed(
         dut,
         elf_name: str,
-        cases: list[dict],  # keys: impl, vl, in_size, out_size.
+        cases: list[dict],  # keys: impl, vl, in_bytes, out_size.
         dtype,
         index_dtype,
 ):
@@ -130,23 +130,28 @@
     await fixture.load_elf_and_lookup_symbols(
         r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
         ['impl', 'vl', 'in_buf', 'out_buf', 'index_buf'] +
-            list(set([c['impl'] for c in cases])),
+            list({c['impl'] for c in cases}),
     )
 
-    min_value = np.iinfo(dtype).min
-    max_value = np.iinfo(dtype).max + 1  # One above.
     rng = np.random.default_rng()
     for c in tqdm.tqdm(cases):
         impl = c['impl']
         vl = c['vl']
-        in_size = c['in_size']
+        in_bytes = c['in_bytes']
         out_size = c['out_size']
 
+        # Don't go beyond the buffer.
+        index_max = in_bytes - np.dtype(dtype).itemsize + 1
         # TODO(davidgao): currently assuming the vl is supported.
         # We'll eventually want to test unsupported vl.
-        indices = rng.integers(0, in_size, vl, dtype=index_dtype)
-        input_data = rng.integers(min_value, max_value, in_size, dtype=dtype)
-        expected_outputs = input_data[indices[:vl]]
+        indices = rng.integers(0, index_max, out_size, dtype=index_dtype)
+        # Index is in bytes so input needs to be in bytes.
+        input_data = rng.integers(0, 256, in_bytes, dtype=np.uint8)
+        # Input needs to be reinterpreted.
+        indices_in_use = np.array([
+            np.arange(x, x + np.dtype(dtype).itemsize)
+            for x in indices[:vl].astype(np.uint32)]).astype(index_dtype)
+        expected_outputs = input_data[indices_in_use].view(dtype)[..., 0]
         sbz = np.zeros(out_size - vl, dtype=dtype)
         expected_outputs = np.concat((expected_outputs, sbz))
 
@@ -244,7 +249,7 @@
         r.Rlocation(
             'kelvin_hw/tests/cocotb/rvv/load_store/load_store_bits.elf'),
         ['vl', 'in_buf', 'out_buf', 'impl'] +
-            list(set([c['impl'] for c in cases])),
+            list({c['impl'] for c in cases}),
     )
     rng = np.random.default_rng()
     for c in cases:
@@ -487,7 +492,7 @@
         return {
             'impl': impl,
             'vl': vl,
-            'in_size': 256,
+            'in_bytes': 256,
             'out_size': vl * 2,
         }
 
@@ -534,7 +539,7 @@
         return {
             'impl': impl,
             'vl': vl,
-            'in_size': 32000,  # DTCM is 32KB
+            'in_bytes': 32000,  # DTCM is 32KB
             'out_size': vl * 2,
         }
 
@@ -577,7 +582,7 @@
         return {
             'impl': impl,
             'vl': vl,
-            'in_size': 32000,  # DTCM is 32KB
+            'in_bytes': 32000,  # DTCM is 32KB
             'out_size': vl * 2,
         }
 
@@ -614,6 +619,49 @@
 
 
 @cocotb.test()
+async def load16_index8(dut):
+    """Test vl*xei8_v_u16 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'in_bytes': 256,
+            'out_size': vl * 2,
+        }
+
+    await vector_load_indexed(
+        dut = dut,
+        elf_name = 'load16_index8.elf',
+        cases = [
+            # Unordered
+            make_test_case('vluxei8_v_u16mf2', vl = 4),
+            make_test_case('vluxei8_v_u16mf2', vl = 3),
+            make_test_case('vluxei8_v_u16m1', vl = 8),
+            make_test_case('vluxei8_v_u16m1', vl = 7),
+            # make_test_case('vluxei8_v_u16m2', vl = 16),
+            # make_test_case('vluxei8_v_u16m2', vl = 15),
+            # make_test_case('vluxei8_v_u16m4', vl = 32),
+            # make_test_case('vluxei8_v_u16m4', vl = 31),
+            # make_test_case('vluxei8_v_u16m8', vl = 64),
+            # make_test_case('vluxei8_v_u16m8', vl = 63),
+            # Ordered
+            make_test_case('vloxei8_v_u16mf2', vl = 4),
+            make_test_case('vloxei8_v_u16mf2', vl = 3),
+            make_test_case('vloxei8_v_u16m1', vl = 8),
+            make_test_case('vloxei8_v_u16m1', vl = 7),
+            # make_test_case('vloxei8_v_u16m1', vl = 16),
+            # make_test_case('vloxei8_v_u16m1', vl = 15),
+            # make_test_case('vloxei8_v_u16m4', vl = 32),
+            # make_test_case('vloxei8_v_u16m4', vl = 31),
+            # make_test_case('vloxei8_v_u16m8', vl = 64),
+            # make_test_case('vloxei8_v_u16m8', vl = 63),
+        ],
+        dtype = np.uint16,
+        index_dtype = np.uint8,
+    )
+
+
+@cocotb.test()
 async def load16_seg_unit(dut):
     """Test vlseg*e16 usage accessible from intrinsics."""
     def make_test_case(impl: str, vl: int, n_segs: int):
@@ -680,6 +728,45 @@
 
 
 @cocotb.test()
+async def load32_index8(dut):
+    """Test vl*xei8_v_u32 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'in_bytes': 256,
+            'out_size': vl * 2,
+        }
+
+    await vector_load_indexed(
+        dut = dut,
+        elf_name = 'load32_index8.elf',
+        cases = [
+            # Unordered
+            make_test_case('vluxei8_v_u32m1', vl = 4),
+            make_test_case('vluxei8_v_u32m1', vl = 3),
+            # make_test_case('vluxei8_v_u32m2', vl = 8),
+            # make_test_case('vluxei8_v_u32m2', vl = 7),
+            # make_test_case('vluxei8_v_u32m4', vl = 16),
+            # make_test_case('vluxei8_v_u32m4', vl = 15),
+            # make_test_case('vluxei8_v_u32m8', vl = 32),
+            # make_test_case('vluxei8_v_u32m8', vl = 31),
+            # Ordered
+            make_test_case('vloxei8_v_u32m1', vl = 4),
+            make_test_case('vloxei8_v_u32m1', vl = 3),
+            # make_test_case('vloxei8_v_u32m1', vl = 16),
+            # make_test_case('vloxei8_v_u32m1', vl = 15),
+            # make_test_case('vloxei8_v_u32m4', vl = 32),
+            # make_test_case('vloxei8_v_u32m4', vl = 31),
+            # make_test_case('vloxei8_v_u32m8', vl = 64),
+            # make_test_case('vloxei8_v_u32m8', vl = 63),
+        ],
+        dtype = np.uint32,
+        index_dtype = np.uint8,
+    )
+
+
+@cocotb.test()
 async def load32_seg_unit(dut):
     """Test vlseg*e32 usage accessible from intrinsics."""
     def make_test_case(impl: str, vl: int, n_segs: int):