Fix wrong results for mixed-precision indexed loads

Also add a batch of indexed load tests.

The fix is tested on data8 loads with 16-bit indices below m1, and with
32-bit indices below mf2. Larger LMULs are pending a separate fix.
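
For reference, a minimal sketch of the mixed-precision case being fixed,
mirroring the new load8_index16 tests (the helper name is illustrative):
the index width (16-bit EEW) comes from the instruction's width field,
while the data width (8-bit) comes from vtype's sew, so each lane loads
one byte from base + idx[i].

    #include <riscv_vector.h>
    #include <stdint.h>

    void gather_u8_by_u16(const uint8_t *base, const uint16_t *idx,
                          uint8_t *out, size_t vl) {
      // Index EEW is 16 (from the encoding); data width is 8 (from sew).
      vuint16mf2_t indices = __riscv_vle16_v_u16mf2(idx, vl);
      vuint8mf4_t data = __riscv_vluxei16_v_u8mf4(base, indices, vl);
      __riscv_vse8_v_u8mf4(out, data, vl);
    }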

TODO: test stores
TODO: test data16/data32 loads

Change-Id: I83b986339ff0900449fa4b57fa2eb895ca4e26f8
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 832db87..e3ef588 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -166,7 +166,12 @@
   val pc = UInt(32.W)
   val addr = UInt(32.W)
   val data = UInt(32.W)  // Doubles as rs2
+  // This aligns with "width" in the RVV spec. It controls index width in
+  // indexed loads/stores and data width otherwise.
   val elemWidth = Option.when(p.enableRvv) { UInt(3.W) }
+  // This is the sew from vtype. It controls data width in indexed
+  // loads/stores and is unused in other ops.
+  val sew = Option.when(p.enableRvv) { UInt(3.W) }
   val lmul = Option.when(p.enableRvv) { UInt(4.W) }
   val nfields = Option.when(p.enableRvv) { UInt(3.W) }
 
@@ -207,6 +212,7 @@
       result.lmul.get := 1.U(1.W) << effectiveLmul
       // If mask operation, force fields to zero
       result.nfields.get := Mux(cmd.isMaskOperation(), 0.U, cmd.nfields.get)
+      result.sew.get := rvvState.get.bits.sew
     }
 
     result
@@ -235,22 +241,35 @@
   def apply(bytesPerSlot: Int,
             baseAddr: UInt,
             indices: UInt,
-            elemWidth: UInt): Vec[UInt] = {
+            indexWidth: UInt,
+            sew: UInt): Vec[UInt] = {
     val indices8 = UIntToVec(indices, 8)
-    val indices16 = UIntToVec(indices, 16)
-    val indices32 = UIntToVec(indices, 32)
+    val indices16 = UIntToVec(indices, 16).padTo(bytesPerSlot, 0.U)
+    val indices32 = UIntToVec(indices, 32).padTo(bytesPerSlot, 0.U)
+
+    val indices_v = MuxCase(VecInit.fill(bytesPerSlot)(0.U(32.W)), Seq(
+      // 8-bit indices.
+      (indexWidth === "b000".U) -> VecInit((0 until bytesPerSlot).map(
+        i => Cat(0.U(24.W), indices8(i)))),
+      // 16-bit indices.
+      (indexWidth === "b101".U) -> VecInit((0 until bytesPerSlot).map(
+        i => Cat(0.U(16.W), indices16(i)))),
+      // 32-bit indices.
+      (indexWidth === "b110".U) -> VecInit((0 until bytesPerSlot).map(
+        i => indices32(i))),
+    ))
 
     MuxCase(VecInit.fill(bytesPerSlot)(0.U(32.W)), Seq(
       // elemWidth validation is done at decode time.
-      // 8-bit indices. Each byte has its own offset.
-      (elemWidth === "b000".U) -> VecInit((0 until bytesPerSlot).map(
-          i => (baseAddr + indices8(i))(31, 0))),
+      // 8-bit data. Each byte gets its own offset.
+      (sew === "b000".U) -> VecInit((0 until bytesPerSlot).map(
+          i => (baseAddr + indices_v(i)))),
-      // 16-bit indices. Each 2-byte element has an offset.
-      (elemWidth === "b101".U) -> VecInit((0 until bytesPerSlot).map(
-          i => (baseAddr + indices16(i >> 1))(31, 0) + (i & 1).U)),
+      // 16-bit data. Each 2-byte element shares one offset.
+      (sew === "b001".U) -> VecInit((0 until bytesPerSlot).map(
+          i => (baseAddr + indices_v(i >> 1) + (i & 1).U))),
-      // 32-bit indices. Each 4-byte element has an offset.
-      (elemWidth === "b110".U) -> VecInit((0 until bytesPerSlot).map(
-          i => (baseAddr + indices32(i >> 2))(31, 0) + (i & 3).U))
+      // 32-bit data. Each 4-byte element shares one offset.
+      (sew === "b010".U) -> VecInit((0 until bytesPerSlot).map(
+          i => (baseAddr + indices_v(i >> 2) + (i & 3).U)))
     ))
   }
 }
@@ -273,7 +292,12 @@
   val lmul = UInt(4.W)
   val elemStride = UInt(32.W)     // Stride between lanes in a vector
   val segmentStride = UInt(32.W)  // Stride between base addr between segments
+  // This aligns with "width" in the RVV spec. It controls index width in
+  // indexed loads/stores and data width otherwise.
   val elemWidth = UInt(3.W)
+  // This is the sew from vtype. It controls data width in indexed
+  // loads/stores and is unused in other ops.
+  val sew = UInt(3.W)
   val nfields = UInt(3.W)
   val segment = UInt(3.W)
   // Add this to find the next segment.
@@ -339,9 +363,10 @@
         op.isOneOf(LsuOp.VLOAD_OINDEXED, LsuOp.VLOAD_UINDEXED,
                    LsuOp.VSTORE_OINDEXED, LsuOp.VSTORE_UINDEXED) ->
             ComputeIndexedAddrs(bytesPerSlot, baseAddr, rvv2lsu.idx.bits.data,
-                                elemWidth)
+                                elemWidth, sew),
     ))
     result.elemWidth := elemWidth
+    result.sew := sew
 
     result.data := Mux(updated && LsuOp.isVector(op) && rvv2lsu.vregfile.valid,
         UIntToVec(rvv2lsu.vregfile.bits.data, 8), data)
@@ -386,6 +411,7 @@
     result.elemStride := elemStride
     result.segmentStride := segmentStride
     result.elemWidth := elemWidth
+    result.sew := sew
     result.nfields := nfields
     result.segment := segment
     result.nextSegmentVectorOffset := nextSegmentVectorOffset
@@ -412,6 +438,7 @@
     result.elemStride := elemStride
     result.segmentStride := segmentStride
     result.elemWidth := elemWidth
+    result.sew := sew
     result.nfields := nfields
     result.nextSegmentVectorOffset := nextSegmentVectorOffset
     result.nextLmulVectorRewind := nextLmulVectorRewind
@@ -492,6 +519,7 @@
     result.elemStride := elemStride
     result.segmentStride := segmentStride
     result.elemWidth := elemWidth
+    result.sew := sew
     result.nfields := nfields
     result.segment := segment
     result.nextSegmentVectorOffset := nextSegmentVectorOffset
@@ -571,6 +599,7 @@
     // Compute addrs
     result.baseAddr := uop.addr
     result.elemWidth := uop.elemWidth.getOrElse(0.U(3.W))
+    result.sew := uop.sew.getOrElse(0.U(3.W))
     result.addrs := Mux(
         uop.op.isOneOf(LsuOp.VLOAD_STRIDED, LsuOp.VSTORE_STRIDED),
         ComputeStridedAddrs(bytesPerSlot, uop.addr, uop.data, uop.elemWidth.getOrElse(0.U(3.W))),
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index ee7bec8..e0d7820 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -173,6 +173,9 @@
 # BEGIN_TESTCASES_FOR_rvv_load_store_test
 RVV_LOAD_STORE_TESTCASES = [
     "load_store_bits",
+    "load8_index8",
+    "load8_index16",
+    "load8_index32",
     "load8_seg_unit",
     "load8_stride2_m1",
     "load8_stride2_m1_partial",
@@ -189,7 +192,6 @@
     "load_store32_unit_m2",
     "load8_segment2_stride6_m1",
     "load16_segment2_stride6_m1",
-    "load8_indexed_m1",
     "store8_indexed_m1",
     "store8_seg_unit",
     "store16_seg_unit",
@@ -377,6 +379,7 @@
         "deps": [
             "//kelvin_test_utils:sim_test_fixture",
             "@bazel_tools//tools/python/runfiles",
+            requirement("tqdm"),
         ],
         "data": ["//tests/cocotb/rvv/load_store:rvv_load_store_tests"],
         "size": "large",
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD
index c5a79d3..4e07588 100644
--- a/tests/cocotb/rvv/load_store/BUILD
+++ b/tests/cocotb/rvv/load_store/BUILD
@@ -23,8 +23,14 @@
         "load_store_bits": {
             "srcs": ["load_store_bits.cc"],
         },
-        "load8_indexed_m1": {
-            "srcs": ["load8_indexed_m1.cc"],
+        "load8_index8": {
+            "srcs": ["load8_index8.cc"],
+        },
+        "load8_index16": {
+            "srcs": ["load8_index16.cc"],
+        },
+        "load8_index32": {
+            "srcs": ["load8_index32.cc"],
         },
         "load8_seg_unit": {
             "srcs": ["load8_seg_unit.cc"],
@@ -96,7 +102,9 @@
     name = "rvv_load_store_tests",
     srcs = [
         ":load_store_bits.elf",
-        ":load8_indexed_m1.elf",
+        ":load8_index8.elf",
+        ":load8_index16.elf",
+        ":load8_index32.elf",
         ":load8_seg_unit.elf",
         ":load8_segment2_stride6_m1.elf",
         ":load16_segment2_stride6_m1.elf",
diff --git a/tests/cocotb/rvv/load_store/load8_index16.cc b/tests/cocotb/rvv/load_store/load8_index16.cc
new file mode 100644
index 0000000..f6decff
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load8_index16.cc
@@ -0,0 +1,101 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t lut_size = 32000;  // DTCM is 32KB.
+// Double-sized so we can check that trailing regions are not read/written.
+constexpr size_t buf_size = 128;
+}  // namespace
+
+size_t vl __attribute__((section(".data"))) = 8;
+// Indices are always unsigned.
+uint16_t index_buf[buf_size] __attribute__((section(".data")));
+// These instructions don't differentiate signed/unsigned, so we only need to
+// test one; signedness exists only at the intrinsic level.
+uint8_t in_buf[lut_size] __attribute__((section(".data")));
+uint8_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Unordered
+__attribute__((used, retain)) void vluxei16_v_u8mf4() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vluxei16_v_u8mf4(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei16_v_u8mf2() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vluxei16_v_u8mf2(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei16_v_u8m1() {
+  auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vluxei16_v_u8m1(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei16_v_u8m2() {
+  auto indices = __riscv_vle16_v_u16m4(index_buf, vl);
+  auto data = __riscv_vluxei16_v_u8m2(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei16_v_u8m4() {
+  auto indices = __riscv_vle16_v_u16m8(index_buf, vl);
+  auto data = __riscv_vluxei16_v_u8m4(in_buf, indices, vl);
+  __riscv_vse8_v_u8m4(out_buf, data, vl);
+}
+
+// Ordered
+__attribute__((used, retain)) void vloxei16_v_u8mf4() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxei16_v_u8mf4(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei16_v_u8mf2() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxei16_v_u8mf2(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei16_v_u8m1() {
+  auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vloxei16_v_u8m1(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei16_v_u8m2() {
+  auto indices = __riscv_vle16_v_u16m4(index_buf, vl);
+  auto data = __riscv_vloxei16_v_u8m2(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei16_v_u8m4() {
+  auto indices = __riscv_vle16_v_u16m8(index_buf, vl);
+  auto data = __riscv_vloxei16_v_u8m4(in_buf, indices, vl);
+  __riscv_vse8_v_u8m4(out_buf, data, vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vluxei16_v_u8m1;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load8_index32.cc b/tests/cocotb/rvv/load_store/load8_index32.cc
new file mode 100644
index 0000000..524e86d
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load8_index32.cc
@@ -0,0 +1,89 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t lut_size = 32000;  // DTCM is 32KB.
+// Double-sized so we can check that trailing regions are not read/written.
+constexpr size_t buf_size = 64;
+}  // namespace
+
+size_t vl __attribute__((section(".data"))) = 8;
+// Indices are always unsigned.
+uint32_t index_buf[buf_size] __attribute__((section(".data")));
+// These instructions don't differentiate signed/unsigned, so we only need to
+// test one; signedness exists only at the intrinsic level.
+uint8_t in_buf[lut_size] __attribute__((section(".data")));
+uint8_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Unordered
+__attribute__((used, retain)) void vluxei32_v_u8mf4() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vluxei32_v_u8mf4(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei32_v_u8mf2() {
+  auto indices = __riscv_vle32_v_u32m2(index_buf, vl);
+  auto data = __riscv_vluxei32_v_u8mf2(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei32_v_u8m1() {
+  auto indices = __riscv_vle32_v_u32m4(index_buf, vl);
+  auto data = __riscv_vluxei32_v_u8m1(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei32_v_u8m2() {
+  auto indices = __riscv_vle32_v_u32m8(index_buf, vl);
+  auto data = __riscv_vluxei32_v_u8m2(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, data, vl);
+}
+
+// Ordered
+__attribute__((used, retain)) void vloxei32_v_u8mf4() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vloxei32_v_u8mf4(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei32_v_u8mf2() {
+  auto indices = __riscv_vle32_v_u32m2(index_buf, vl);
+  auto data = __riscv_vloxei32_v_u8mf2(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei32_v_u8m1() {
+  auto indices = __riscv_vle32_v_u32m4(index_buf, vl);
+  auto data = __riscv_vloxei32_v_u8m1(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei32_v_u8m2() {
+  auto indices = __riscv_vle32_v_u32m8(index_buf, vl);
+  auto data = __riscv_vloxei32_v_u8m2(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, data, vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vluxei32_v_u8m1;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load8_index8.cc b/tests/cocotb/rvv/load_store/load8_index8.cc
new file mode 100644
index 0000000..6a6d75f
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load8_index8.cc
@@ -0,0 +1,113 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t lut_size = 256;
+// Double-sized so we can check that trailing regions are not read/written.
+constexpr size_t buf_size = 256;
+}  // namespace
+
+size_t vl __attribute__((section(".data"))) = 16;
+// Indices are always unsigned.
+uint8_t index_buf[buf_size] __attribute__((section(".data")));
+// These instructions don't differentiate signed/unsigned, so we only need to
+// test one; signedness exists only at the intrinsic level.
+uint8_t in_buf[lut_size] __attribute__((section(".data")));
+uint8_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Unordered
+__attribute__((used, retain)) void vluxei8_v_u8mf4() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u8mf4(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u8mf2() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u8mf2(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u8m1() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u8m1(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u8m2() {
+  auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u8m2(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u8m4() {
+  auto indices = __riscv_vle8_v_u8m4(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u8m4(in_buf, indices, vl);
+  __riscv_vse8_v_u8m4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u8m8() {
+  auto indices = __riscv_vle8_v_u8m8(index_buf, vl);
+  auto data = __riscv_vluxei8_v_u8m8(in_buf, indices, vl);
+  __riscv_vse8_v_u8m8(out_buf, data, vl);
+}
+
+// Ordered
+__attribute__((used, retain)) void vloxei8_v_u8mf4() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u8mf4(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u8mf2() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u8mf2(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u8m1() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u8m1(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u8m2() {
+  auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u8m2(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u8m4() {
+  auto indices = __riscv_vle8_v_u8m4(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u8m4(in_buf, indices, vl);
+  __riscv_vse8_v_u8m4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u8m8() {
+  auto indices = __riscv_vle8_v_u8m8(index_buf, vl);
+  auto data = __riscv_vloxei8_v_u8m8(in_buf, indices, vl);
+  __riscv_vse8_v_u8m8(out_buf, data, vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vluxei8_v_u8m1;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load8_indexed_m1.cc b/tests/cocotb/rvv/load_store/load8_indexed_m1.cc
deleted file mode 100644
index 8c6c9a7..0000000
--- a/tests/cocotb/rvv/load_store/load8_indexed_m1.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2025 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <riscv_vector.h>
-#include <stdint.h>
-
-uint8_t input_indices[128] __attribute__((section(".data")));
-uint8_t input_data[4096] __attribute__((section(".data")));
-uint8_t output_data[128] __attribute__((section(".data")));
-
-int main(int argc, char **argv) {
-  vuint8m1_t indices = __riscv_vle8_v_u8m1(input_indices, /*vl=*/16);
-  vuint8m1_t data = __riscv_vloxei8_v_u8m1(input_data, indices, /*vl=*/16);
-  __riscv_vse8_v_u8m1(output_data, data, /*vl=*/16);
-
-  return 0;
-}
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py
index 2f8d9d3..843e6e2 100644
--- a/tests/cocotb/rvv_load_store_test.py
+++ b/tests/cocotb/rvv_load_store_test.py
@@ -14,6 +14,7 @@
 
 import cocotb
 import numpy as np
+import tqdm
 
 from bazel_tools.tools.python.runfiles import runfiles
 from kelvin_test_utils.sim_test_fixture import Fixture
@@ -75,13 +76,14 @@
     r = runfiles.Create()
     await fixture.load_elf_and_lookup_symbols(
         r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
-        ['impl', 'vl', 'in_buf', 'out_buf'] + [c['impl'] for c in cases],
+        ['impl', 'vl', 'in_buf', 'out_buf'] +
+            list(set([c['impl'] for c in cases])),
     )
 
     min_value = np.iinfo(dtype).min
     max_value = np.iinfo(dtype).max + 1  # One above.
     rng = np.random.default_rng()
-    for c in cases:
+    for c in tqdm.tqdm(cases):
         impl = c['impl']
         vl = c['vl']
         in_size = c['in_size']
@@ -111,10 +113,13 @@
         })
         assert (actual_outputs == expected_outputs).all(), debug_msg
 
+
 async def vector_load_indexed(
         dut,
         elf_name: str,
+        cases: list[dict],  # keys: impl, vl, in_size, out_size.
         dtype,
+        index_dtype,
 ):
     """RVV load-store test template for indexed loads.
 
@@ -124,32 +129,47 @@
     r = runfiles.Create()
     await fixture.load_elf_and_lookup_symbols(
         r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
-        ['input_indices', 'input_data', 'output_data'],
+        ['impl', 'vl', 'in_buf', 'out_buf', 'index_buf'] +
+            list(set([c['impl'] for c in cases])),
     )
 
-    indices_count = 16 // np.dtype(dtype).itemsize
-    in_data_count = 4096 // np.dtype(dtype).itemsize
-    out_data_count = 16 // np.dtype(dtype).itemsize
-
     min_value = np.iinfo(dtype).min
     max_value = np.iinfo(dtype).max + 1  # One above.
     rng = np.random.default_rng()
-    input_data = rng.integers(min_value, max_value, in_data_count, dtype=dtype)
-    input_indices = rng.integers(
-        0, min(max_value, in_data_count-1), indices_count, dtype=dtype)
+    for c in tqdm.tqdm(cases):
+        impl = c['impl']
+        vl = c['vl']
+        in_size = c['in_size']
+        out_size = c['out_size']
 
-    expected_outputs = np.take(input_data, input_indices)
+        # TODO(davidgao): currently assuming the vl is supported.
+        # We'll eventually want to test unsupported vl.
+        indices = rng.integers(0, in_size, vl, dtype=index_dtype)
+        input_data = rng.integers(min_value, max_value, in_size, dtype=dtype)
+        expected_outputs = input_data[indices[:vl]]
+        sbz = np.zeros(out_size - vl, dtype=dtype)
+        expected_outputs = np.concatenate((expected_outputs, sbz))
 
-    await fixture.write('input_data', input_data)
-    await fixture.write('input_indices', input_indices)
-    await fixture.write('output_data', np.zeros([out_data_count], dtype=dtype))
+        await fixture.write_ptr('impl', impl)
+        await fixture.write_word('vl', vl)
+        await fixture.write('index_buf', indices)
+        await fixture.write('in_buf', input_data)
+        await fixture.write('out_buf', np.zeros([out_size], dtype=dtype))
 
-    await fixture.run_to_halt()
+        await fixture.run_to_halt()
 
-    actual_outputs = (await fixture.read(
-        'output_data', out_data_count * np.dtype(dtype).itemsize)).view(dtype)
+        actual_outputs = (await fixture.read(
+            'out_buf', out_size * np.dtype(dtype).itemsize)).view(dtype)
 
-    assert (actual_outputs == expected_outputs).all()
+        debug_msg = str({
+            'impl': impl,
+            'input': input_data,
+            'indices': indices,
+            'expected': expected_outputs,
+            'actual': actual_outputs,
+        })
+        assert (actual_outputs == expected_outputs).all(), debug_msg
+
 
 async def vector_store_indexed(
         dut,
@@ -223,7 +243,8 @@
     await fixture.load_elf_and_lookup_symbols(
         r.Rlocation(
             'kelvin_hw/tests/cocotb/rvv/load_store/load_store_bits.elf'),
-        ['vl', 'in_buf', 'out_buf', 'impl'] + [c['impl'] for c in cases],
+        ['vl', 'in_buf', 'out_buf', 'impl'] +
+            list(set([c['impl'] for c in cases])),
     )
     rng = np.random.default_rng()
     for c in cases:
@@ -460,6 +481,139 @@
 
 
 @cocotb.test()
+async def load8_index8(dut):
+    """Test vl*xei8_v_u8 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'in_size': 256,
+            'out_size': vl * 2,
+        }
+
+    await vector_load_indexed(
+        dut = dut,
+        elf_name = 'load8_index8.elf',
+        cases = [
+            # Unordered
+            make_test_case('vluxei8_v_u8mf4', vl = 4),
+            make_test_case('vluxei8_v_u8mf4', vl = 3),
+            make_test_case('vluxei8_v_u8mf2', vl = 8),
+            make_test_case('vluxei8_v_u8mf2', vl = 7),
+            make_test_case('vluxei8_v_u8m1', vl = 16),
+            make_test_case('vluxei8_v_u8m1', vl = 15),
+            make_test_case('vluxei8_v_u8m2', vl = 32),
+            make_test_case('vluxei8_v_u8m2', vl = 31),
+            make_test_case('vluxei8_v_u8m4', vl = 64),
+            make_test_case('vluxei8_v_u8m4', vl = 63),
+            make_test_case('vluxei8_v_u8m8', vl = 128),
+            make_test_case('vluxei8_v_u8m8', vl = 127),
+            # Ordered
+            make_test_case('vloxei8_v_u8mf4', vl = 4),
+            make_test_case('vloxei8_v_u8mf4', vl = 3),
+            make_test_case('vloxei8_v_u8mf2', vl = 8),
+            make_test_case('vloxei8_v_u8mf2', vl = 7),
+            make_test_case('vloxei8_v_u8m1', vl = 16),
+            make_test_case('vloxei8_v_u8m1', vl = 15),
+            make_test_case('vloxei8_v_u8m2', vl = 32),
+            make_test_case('vloxei8_v_u8m2', vl = 31),
+            make_test_case('vloxei8_v_u8m4', vl = 64),
+            make_test_case('vloxei8_v_u8m4', vl = 63),
+            make_test_case('vloxei8_v_u8m8', vl = 128),
+            make_test_case('vloxei8_v_u8m8', vl = 127),
+        ],
+        dtype = np.uint8,
+        index_dtype = np.uint8,
+    )
+
+
+@cocotb.test()
+async def load8_index16(dut):
+    """Test vl*xei16_v_u8 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'in_size': 32000,  # DTCM is 32KB
+            'out_size': vl * 2,
+        }
+
+    await vector_load_indexed(
+        dut = dut,
+        elf_name = 'load8_index16.elf',
+        cases = [
+            # Unordered
+            make_test_case('vluxei16_v_u8mf4', vl = 4),
+            make_test_case('vluxei16_v_u8mf4', vl = 3),
+            make_test_case('vluxei16_v_u8mf2', vl = 8),
+            make_test_case('vluxei16_v_u8mf2', vl = 7),
+            # make_test_case('vluxei16_v_u8m1', vl = 16),
+            # make_test_case('vluxei16_v_u8m1', vl = 15),
+            # make_test_case('vluxei16_v_u8m2', vl = 32),
+            # make_test_case('vluxei16_v_u8m2', vl = 31),
+            # make_test_case('vluxei16_v_u8m4', vl = 64),
+            # make_test_case('vluxei16_v_u8m4', vl = 63),
+            # Ordered
+            make_test_case('vloxei16_v_u8mf4', vl = 4),
+            make_test_case('vloxei16_v_u8mf4', vl = 3),
+            make_test_case('vloxei16_v_u8mf2', vl = 8),
+            make_test_case('vloxei16_v_u8mf2', vl = 7),
+            # make_test_case('vloxei16_v_u8m1', vl = 16),
+            # make_test_case('vloxei16_v_u8m1', vl = 15),
+            # make_test_case('vloxei16_v_u8m2', vl = 32),
+            # make_test_case('vloxei16_v_u8m2', vl = 31),
+            # make_test_case('vloxei16_v_u8m4', vl = 64),
+            # make_test_case('vloxei16_v_u8m4', vl = 63),
+        ],
+        dtype = np.uint8,
+        index_dtype = np.uint16,
+    )
+
+
+@cocotb.test()
+async def load8_index32(dut):
+    """Test vl*xei32_v_u8 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'in_size': 32000,  # DTCM is 32KB
+            'out_size': vl * 2,
+        }
+
+    await vector_load_indexed(
+        dut = dut,
+        elf_name = 'load8_index32.elf',
+        cases = [
+            # Unordered
+            make_test_case('vluxei32_v_u8mf4', vl = 4),
+            make_test_case('vluxei32_v_u8mf4', vl = 3),
+            # make_test_case('vluxei32_v_u8mf2', vl = 8),
+            # make_test_case('vluxei32_v_u8mf2', vl = 7),
+            # make_test_case('vluxei32_v_u8m1', vl = 16),
+            # make_test_case('vluxei32_v_u8m1', vl = 15),
+            # make_test_case('vluxei32_v_u8m2', vl = 32),
+            # make_test_case('vluxei32_v_u8m2', vl = 31),
+            # make_test_case('vluxei32_v_u8m4', vl = 64),
+            # make_test_case('vluxei32_v_u8m4', vl = 63),
+            # Ordered
+            make_test_case('vloxei32_v_u8mf4', vl = 4),
+            make_test_case('vloxei32_v_u8mf4', vl = 3),
+            # make_test_case('vloxei32_v_u8mf2', vl = 8),
+            # make_test_case('vloxei32_v_u8mf2', vl = 7),
+            # make_test_case('vloxei32_v_u8m1', vl = 16),
+            # make_test_case('vloxei32_v_u8m1', vl = 15),
+            # make_test_case('vloxei32_v_u8m2', vl = 32),
+            # make_test_case('vloxei32_v_u8m2', vl = 31),
+            # make_test_case('vloxei32_v_u8m4', vl = 64),
+            # make_test_case('vloxei32_v_u8m4', vl = 63),
+        ],
+        dtype = np.uint8,
+        index_dtype = np.uint32,
+    )
+
+
+@cocotb.test()
 async def load16_seg_unit(dut):
     """Test vlseg*e16 usage accessible from intrinsics."""
     def make_test_case(impl: str, vl: int, n_segs: int):
@@ -602,14 +756,6 @@
 
 
 @cocotb.test()
-async def load8_indexed_m1(dut):
-    await vector_load_indexed(
-        dut = dut,
-        elf_name = 'load8_indexed_m1.elf',
-        dtype = np.uint8,
-    )
-
-@cocotb.test()
 async def store8_indexed_m1(dut):
     await vector_store_indexed(
         dut = dut,