Add indexed load/store support.

Change-Id: I684024c6f41cab613c4cf8003d8ad548c8a26bc7
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 60ae397..e034011 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -187,6 +187,30 @@
   }
 }
 
+object ComputeIndexedAddrs {
+  def apply(bytesPerSlot: Int,
+            baseAddr: UInt,
+            indices: UInt,
+            elemWidth: UInt): Vec[UInt] = {
+    val indices8 = UIntToVec(indices, 8)
+    val indices16 = UIntToVec(indices, 16)
+    val indices32 = UIntToVec(indices, 32)
+
+    MuxCase(VecInit.fill(bytesPerSlot)(0.U(32.W)), Seq(
+      // elemWidth validation is done at decode time.
+      // 8-bit indices. Each byte has its own offset.
+      (elemWidth === "b000".U) -> VecInit((0 until bytesPerSlot).map(
+          i => (baseAddr + indices8(i))(31, 0))),
+      // 16-bit indices. Each 2-byte element has an offset.
+      (elemWidth === "b101".U) -> VecInit((0 until bytesPerSlot).map(
+          i => (baseAddr + indices16(i >> 1))(31, 0) + (i & 1).U)),
+      // 32-bit indices. Each 4-byte element has an offset.
+      (elemWidth === "b110".U) -> VecInit((0 until bytesPerSlot).map(
+          i => (baseAddr + indices32(i >> 2))(31, 0) + (i & 3).U))
+    ))
+  }
+}
+
 // bytesPerSlot is the number of bytes in a vector register
 // bytesPerLine is the number of bytes in the AXI bus
 class LsuSlot(bytesPerSlot: Int, bytesPerLine: Int) extends Bundle {
@@ -253,8 +277,11 @@
         op.isOneOf(LsuOp.VLOAD_UNIT, LsuOp.VSTORE_UNIT) ->
             VecInit((0 until bytesPerSlot).map(i => baseAddr + i.U)),
         op.isOneOf(LsuOp.VLOAD_STRIDED, LsuOp.VSTORE_STRIDED) ->
-            ComputeStridedAddrs(bytesPerSlot, baseAddr, stride, elemWidth)
-        // TODO(derekjchow): Support indexed
+            ComputeStridedAddrs(bytesPerSlot, baseAddr, stride, elemWidth),
+        op.isOneOf(LsuOp.VLOAD_OINDEXED, LsuOp.VLOAD_UINDEXED,
+                   LsuOp.VSTORE_OINDEXED, LsuOp.VSTORE_UINDEXED) ->
+            ComputeIndexedAddrs(bytesPerSlot, baseAddr, rvv2lsu.idx.bits.data,
+                                elemWidth)
         // TODO(derekjchow): Support segmented
     ))
     result.elemWidth := elemWidth
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index 989df39..d274166 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -315,6 +315,7 @@
         "//kelvin_test_utils:sim_test_fixture",
     ],
     data = [
+        '//tests/cocotb/rvv/load_store:load8_indexed_m1.elf',
         '//tests/cocotb/rvv/load_store:load8_stride2_m1.elf',
         '//tests/cocotb/rvv/load_store:load8_stride2_m1_partial.elf',
         '//tests/cocotb/rvv/load_store:load8_stride2_mf4.elf',
@@ -324,6 +325,7 @@
         '//tests/cocotb/rvv/load_store:load16_stride4_mf2.elf',
         '//tests/cocotb/rvv/load_store:load32_stride8_m1.elf',
         '//tests/cocotb/rvv/load_store:load32_stride8_m1_partial.elf',
+        '//tests/cocotb/rvv/load_store:store8_indexed_m1.elf',
     ],
     size = "large",
 )
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD
index f6c3aad..d19281c 100644
--- a/tests/cocotb/rvv/load_store/BUILD
+++ b/tests/cocotb/rvv/load_store/BUILD
@@ -20,6 +20,9 @@
 template_rule(
     kelvin_v2_binary,
     {
+        "load8_indexed_m1": {
+            "srcs": ["load8_indexed_m1.cc"],
+        },
         "load8_stride2_m1": {
             "srcs": ["load8_stride2_m1.cc"],
         },
@@ -47,5 +50,8 @@
         "load32_stride8_m1_partial": {
             "srcs": ["load32_stride8_m1_partial.cc"],
         },
+        "store8_indexed_m1": {
+            "srcs": ["store8_indexed_m1.cc"],
+        },
     },
 )
diff --git a/tests/cocotb/rvv/load_store/load8_indexed_m1.cc b/tests/cocotb/rvv/load_store/load8_indexed_m1.cc
new file mode 100644
index 0000000..8c6c9a7
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load8_indexed_m1.cc
@@ -0,0 +1,28 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+uint8_t input_indices[128] __attribute__((section(".data")));
+uint8_t input_data[4096] __attribute__((section(".data")));
+uint8_t output_data[128] __attribute__((section(".data")));
+
+int main(int argc, char **argv) {
+  vuint8m1_t indices = __riscv_vle8_v_u8m1(input_indices, /*vl=*/16);
+  vuint8m1_t data = __riscv_vloxei8_v_u8m1(input_data, indices, /*vl=*/16);
+  __riscv_vse8_v_u8m1(output_data, data, /*vl=*/16);
+
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/store8_indexed_m1.cc b/tests/cocotb/rvv/load_store/store8_indexed_m1.cc
new file mode 100644
index 0000000..221ff76
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/store8_indexed_m1.cc
@@ -0,0 +1,28 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+uint8_t input_indices[128] __attribute__((section(".data")));
+uint8_t input_data[128] __attribute__((section(".data")));
+uint8_t output_data[4096] __attribute__((section(".data")));
+
+int main(int argc, char **argv) {
+  vuint8m1_t indices = __riscv_vle8_v_u8m1(input_indices, /*vl=*/16);
+  vuint8m1_t data = __riscv_vle8_v_u8m1(input_data, /*vl=*/16);
+  __riscv_vsoxei8_v_u8m1(output_data, indices, data, /*vl=*/16);
+
+  return 0;
+}
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py
index 6e4b481..640d1f2 100644
--- a/tests/cocotb/rvv_load_store_test.py
+++ b/tests/cocotb/rvv_load_store_test.py
@@ -59,6 +59,88 @@
 
     assert (actual_outputs == expected_outputs).all(), debug_msg
 
+async def vector_load_indexed(
+        dut,
+        elf_name: str,
+        dtype,
+):
+    """RVV load-store test template for indexed loads.
+
+    Each test performs a gather operation and writes the result to an output.
+    """
+    fixture = await Fixture.Create(dut)
+    await fixture.load_elf_and_lookup_symbols(
+        '../tests/cocotb/rvv/load_store/' + elf_name,
+        ['input_indices', 'input_data', 'output_data'],
+    )
+
+    indices_count = 16 // np.dtype(dtype).itemsize
+    in_data_count = 4096 // np.dtype(dtype).itemsize
+    out_data_count = 16 // np.dtype(dtype).itemsize
+
+    min_value = np.iinfo(dtype).min
+    max_value = np.iinfo(dtype).max + 1  # One above.
+    rng = np.random.default_rng()
+    input_data = rng.integers(min_value, max_value, in_data_count, dtype=dtype)
+    input_indices = rng.integers(
+        0, min(max_value, in_data_count-1), indices_count, dtype=dtype)
+
+    expected_outputs = np.take(input_data, input_indices)
+
+    await fixture.write('input_data', input_data)
+    await fixture.write('input_indices', input_indices)
+    await fixture.write('output_data', np.zeros([out_data_count], dtype=dtype))
+
+    await fixture.run_to_halt()
+
+    actual_outputs = (await fixture.read(
+        'output_data', out_data_count * np.dtype(dtype).itemsize)).view(dtype)
+
+    assert (actual_outputs == expected_outputs).all()
+
+async def vector_store_indexed(
+        dut,
+        elf_name: str,
+        dtype,
+):
+    """RVV load-store test template for indexed stores.
+
+    Each test loads indices and data and performs a scatter operation.
+    """
+    fixture = await Fixture.Create(dut)
+    await fixture.load_elf_and_lookup_symbols(
+        '../tests/cocotb/rvv/load_store/' + elf_name,
+        ['input_indices', 'input_data', 'output_data'],
+    )
+
+    indices_count = 16 // np.dtype(dtype).itemsize
+    in_data_count = 16 // np.dtype(dtype).itemsize
+    out_data_count = 4096 // np.dtype(dtype).itemsize
+
+    min_value = np.iinfo(dtype).min
+    max_value = np.iinfo(dtype).max + 1  # One above.
+    rng = np.random.default_rng()
+    input_data = rng.integers(min_value, max_value, in_data_count, dtype=dtype)
+    input_indices = rng.integers(
+        0, min(max_value, out_data_count-1), indices_count, dtype=dtype)
+    original_outputs = rng.integers(
+        min_value, max_value, out_data_count, dtype=dtype)
+
+    await fixture.write('input_data', input_data)
+    await fixture.write('input_indices', input_indices)
+    await fixture.write('output_data', original_outputs)
+
+    expected_outputs = np.copy(original_outputs)
+    for idx, data in zip(input_indices, input_data):
+      expected_outputs[idx] = data
+
+    await fixture.run_to_halt()
+
+    actual_outputs = (await fixture.read(
+        'output_data', out_data_count * np.dtype(dtype).itemsize)).view(dtype)
+
+    assert (actual_outputs == expected_outputs).all()
+
 @cocotb.test()
 async def load8_stride2_m1(dut):
     await vector_load_store(
@@ -157,3 +239,19 @@
         out_size = 64,
         pattern = list(range(0, 32)),
     )
+
+@cocotb.test()
+async def load8_indexed_m1(dut):
+    await vector_load_indexed(
+        dut = dut,
+        elf_name = 'load8_indexed_m1.elf',
+        dtype = np.uint8,
+    )
+
+@cocotb.test()
+async def store8_indexed_m1(dut):
+    await vector_store_indexed(
+        dut = dut,
+        elf_name = 'store8_indexed_m1.elf',
+        dtype = np.uint8,
+    )