Fix segmented indexed loads

Currently tested on data8 index8.

Adding the first set of tests. Will extend coverage.

Change-Id: I2283b4120f35778e2d7e7134932a3dcd211b1294
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 49e656d..cbe4ea3 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -391,7 +391,7 @@
     result.segmentStride := segmentStride
     result.vectorLoop := vectorLoop
 
-    val segmentBaseAddr = baseAddr + (segmentStride * vectorLoop.segment.curr)
+    val segmentBaseAddr = baseAddr + (segmentStride * vectorLoop.segment.curr)(31, 0)
     val bitsPerSlot = bytesPerSlot * 8
     val indices = MuxCase(rvv2lsu.idx.bits.data, Seq(
         // 2 of 2
@@ -410,7 +410,7 @@
             ComputeStridedAddrs(bytesPerSlot, segmentBaseAddr, elemStride, elemWidth),
         op.isOneOf(LsuOp.VLOAD_OINDEXED, LsuOp.VLOAD_UINDEXED,
                    LsuOp.VSTORE_OINDEXED, LsuOp.VSTORE_UINDEXED) ->
-            ComputeIndexedAddrs(bytesPerSlot, baseAddr, indices,
+            ComputeIndexedAddrs(bytesPerSlot, segmentBaseAddr, indices,
                                 elemWidth, sew),
     ))
     result.elemWidth := elemWidth
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index b184773..b00766a 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -174,6 +174,7 @@
 RVV_LOAD_STORE_TESTCASES = [
     "load_store_bits",
     "load8_index8",
+    "load8_index8_seg",
     "load8_index16",
     "load8_index32",
     "load8_seg_unit",
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD
index 30014f6..2efad52 100644
--- a/tests/cocotb/rvv/load_store/BUILD
+++ b/tests/cocotb/rvv/load_store/BUILD
@@ -26,6 +26,9 @@
         "load8_index8": {
             "srcs": ["load8_index8.cc"],
         },
+        "load8_index8_seg": {
+            "srcs": ["load8_index8_seg.cc"],
+        },
         "load8_index16": {
             "srcs": ["load8_index16.cc"],
         },
@@ -124,6 +127,7 @@
     srcs = [
         ":load_store_bits.elf",
         ":load8_index8.elf",
+        ":load8_index8_seg.elf",
         ":load8_index16.elf",
         ":load8_index32.elf",
         ":load8_seg_unit.elf",
diff --git a/tests/cocotb/rvv/load_store/load8_index8_seg.cc b/tests/cocotb/rvv/load_store/load8_index8_seg.cc
new file mode 100644
index 0000000..a9a0807
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load8_index8_seg.cc
@@ -0,0 +1,389 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t lut_size = 263;
+// Double sized so we can check trailing regions are not read/written.
+constexpr size_t buf_size = 256;
+}  // namespace
+
+size_t vl __attribute__((section(".data"))) = 16;
+// Indices are always unsigned.
+uint8_t index_buf[buf_size] __attribute__((section(".data")));
+// These instructions don't differentiate signed/unsigned so we only need to
+// test one. The types come from intrinsic level.
+uint8_t in_buf[lut_size] __attribute__((section(".data")));
+uint8_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Unordered, segment 2
+__attribute__((used, retain)) void vluxseg2ei8_v_u8mf4x2() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vluxseg2ei8_v_u8mf4x2(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x2_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x2_u8mf4(data, 1), vl);
+}
+
+__attribute__((used, retain)) void vluxseg2ei8_v_u8mf2x2() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vluxseg2ei8_v_u8mf2x2(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x2_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x2_u8mf2(data, 1), vl);
+}
+
+__attribute__((used, retain)) void vluxseg2ei8_v_u8m1x2() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vluxseg2ei8_v_u8m1x2(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x2_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x2_u8m1(data, 1), vl);
+}
+
+__attribute__((used, retain)) void vluxseg2ei8_v_u8m2x2() {
+  auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+  auto data = __riscv_vluxseg2ei8_v_u8m2x2(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x2_u8m2(data, 0), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x2_u8m2(data, 1), vl);
+}
+
+__attribute__((used, retain)) void vluxseg2ei8_v_u8m4x2() {
+  auto indices = __riscv_vle8_v_u8m4(index_buf, vl);
+  auto data = __riscv_vluxseg2ei8_v_u8m4x2(in_buf, indices, vl);
+  __riscv_vse8_v_u8m4(out_buf, __riscv_vget_v_u8m4x2_u8m4(data, 0), vl);
+  __riscv_vse8_v_u8m4(out_buf + vl, __riscv_vget_v_u8m4x2_u8m4(data, 1), vl);
+}
+
+// Unordered, segment 3
+__attribute__((used, retain)) void vluxseg3ei8_v_u8mf4x3() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vluxseg3ei8_v_u8mf4x3(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x3_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x3_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x3_u8mf4(data, 2),
+                       vl);
+}
+
+__attribute__((used, retain)) void vluxseg3ei8_v_u8mf2x3() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vluxseg3ei8_v_u8mf2x3(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x3_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x3_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x3_u8mf2(data, 2),
+                       vl);
+}
+
+__attribute__((used, retain)) void vluxseg3ei8_v_u8m1x3() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vluxseg3ei8_v_u8m1x3(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x3_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x3_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x3_u8m1(data, 2),
+                      vl);
+}
+
+__attribute__((used, retain)) void vluxseg3ei8_v_u8m2x3() {
+  auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+  auto data = __riscv_vluxseg3ei8_v_u8m2x3(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x3_u8m2(data, 0), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x3_u8m2(data, 1), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl * 2, __riscv_vget_v_u8m2x3_u8m2(data, 2),
+                      vl);
+}
+
+// Unordered, segment 4
+__attribute__((used, retain)) void vluxseg4ei8_v_u8mf4x4() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vluxseg4ei8_v_u8mf4x4(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x4_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x4_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x4_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x4_u8mf4(data, 3),
+                       vl);
+}
+
+__attribute__((used, retain)) void vluxseg4ei8_v_u8mf2x4() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vluxseg4ei8_v_u8mf2x4(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x4_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x4_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x4_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x4_u8mf2(data, 3),
+                       vl);
+}
+
+__attribute__((used, retain)) void vluxseg4ei8_v_u8m1x4() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vluxseg4ei8_v_u8m1x4(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x4_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x4_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x4_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x4_u8m1(data, 3),
+                      vl);
+}
+
+__attribute__((used, retain)) void vluxseg4ei8_v_u8m2x4() {
+  auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+  auto data = __riscv_vluxseg4ei8_v_u8m2x4(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x4_u8m2(data, 0), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x4_u8m2(data, 1), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl * 2, __riscv_vget_v_u8m2x4_u8m2(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m2(out_buf + vl * 3, __riscv_vget_v_u8m2x4_u8m2(data, 3),
+                      vl);
+}
+
+// Unordered, segment 5
+__attribute__((used, retain)) void vluxseg5ei8_v_u8mf4x5() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vluxseg5ei8_v_u8mf4x5(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x5_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x5_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x5_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x5_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x5_u8mf4(data, 4),
+                       vl);
+}
+
+__attribute__((used, retain)) void vluxseg5ei8_v_u8mf2x5() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vluxseg5ei8_v_u8mf2x5(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x5_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x5_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x5_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x5_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x5_u8mf2(data, 4),
+                       vl);
+}
+
+__attribute__((used, retain)) void vluxseg5ei8_v_u8m1x5() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vluxseg5ei8_v_u8m1x5(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x5_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x5_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x5_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x5_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x5_u8m1(data, 4),
+                      vl);
+}
+
+// Unordered, segment 6
+__attribute__((used, retain)) void vluxseg6ei8_v_u8mf4x6() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vluxseg6ei8_v_u8mf4x6(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x6_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x6_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x6_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x6_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x6_u8mf4(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x6_u8mf4(data, 5),
+                       vl);
+}
+
+__attribute__((used, retain)) void vluxseg6ei8_v_u8mf2x6() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vluxseg6ei8_v_u8mf2x6(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x6_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x6_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x6_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x6_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x6_u8mf2(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x6_u8mf2(data, 5),
+                       vl);
+}
+
+__attribute__((used, retain)) void vluxseg6ei8_v_u8m1x6() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vluxseg6ei8_v_u8m1x6(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x6_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x6_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x6_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x6_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x6_u8m1(data, 4),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x6_u8m1(data, 5),
+                      vl);
+}
+
+// Unordered, segment 7
+__attribute__((used, retain)) void vluxseg7ei8_v_u8mf4x7() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vluxseg7ei8_v_u8mf4x7(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x7_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x7_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x7_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x7_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x7_u8mf4(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x7_u8mf4(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 6, __riscv_vget_v_u8mf4x7_u8mf4(data, 6),
+                       vl);
+}
+
+__attribute__((used, retain)) void vluxseg7ei8_v_u8mf2x7() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vluxseg7ei8_v_u8mf2x7(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x7_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x7_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x7_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x7_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x7_u8mf2(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x7_u8mf2(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 6, __riscv_vget_v_u8mf2x7_u8mf2(data, 6),
+                       vl);
+}
+
+__attribute__((used, retain)) void vluxseg7ei8_v_u8m1x7() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vluxseg7ei8_v_u8m1x7(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x7_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x7_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x7_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x7_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x7_u8m1(data, 4),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x7_u8m1(data, 5),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 6, __riscv_vget_v_u8m1x7_u8m1(data, 6),
+                      vl);
+}
+
+// Unordered, segment 8
+__attribute__((used, retain)) void vluxseg8ei8_v_u8mf4x8() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vluxseg8ei8_v_u8mf4x8(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x8_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x8_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x8_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x8_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x8_u8mf4(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x8_u8mf4(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 6, __riscv_vget_v_u8mf4x8_u8mf4(data, 6),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 7, __riscv_vget_v_u8mf4x8_u8mf4(data, 7),
+                       vl);
+}
+
+__attribute__((used, retain)) void vluxseg8ei8_v_u8mf2x8() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vluxseg8ei8_v_u8mf2x8(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x8_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x8_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x8_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x8_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x8_u8mf2(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x8_u8mf2(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 6, __riscv_vget_v_u8mf2x8_u8mf2(data, 6),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 7, __riscv_vget_v_u8mf2x8_u8mf2(data, 7),
+                       vl);
+}
+
+__attribute__((used, retain)) void vluxseg8ei8_v_u8m1x8() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vluxseg8ei8_v_u8m1x8(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x8_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x8_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x8_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x8_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x8_u8m1(data, 4),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x8_u8m1(data, 5),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 6, __riscv_vget_v_u8m1x8_u8m1(data, 6),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 7, __riscv_vget_v_u8m1x8_u8m1(data, 7),
+                      vl);
+}
+
+// Ordered, segment 2
+__attribute__((used, retain)) void vloxseg2ei8_v_u8mf4x2() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vloxseg2ei8_v_u8mf4x2(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x2_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x2_u8mf4(data, 1), vl);
+}
+
+__attribute__((used, retain)) void vloxseg2ei8_v_u8mf2x2() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vloxseg2ei8_v_u8mf2x2(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x2_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x2_u8mf2(data, 1), vl);
+}
+
+__attribute__((used, retain)) void vloxseg2ei8_v_u8m1x2() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vloxseg2ei8_v_u8m1x2(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x2_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x2_u8m1(data, 1), vl);
+}
+
+__attribute__((used, retain)) void vloxseg2ei8_v_u8m2x2() {
+  auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+  auto data = __riscv_vloxseg2ei8_v_u8m2x2(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x2_u8m2(data, 0), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x2_u8m2(data, 1), vl);
+}
+
+__attribute__((used, retain)) void vloxseg2ei8_v_u8m4x2() {
+  auto indices = __riscv_vle8_v_u8m4(index_buf, vl);
+  auto data = __riscv_vloxseg2ei8_v_u8m4x2(in_buf, indices, vl);
+  __riscv_vse8_v_u8m4(out_buf, __riscv_vget_v_u8m4x2_u8m4(data, 0), vl);
+  __riscv_vse8_v_u8m4(out_buf + vl, __riscv_vget_v_u8m4x2_u8m4(data, 1), vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vluxseg2ei8_v_u8m1x2;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py
index dcfdb5a..cda8cf6 100644
--- a/tests/cocotb/rvv_load_store_test.py
+++ b/tests/cocotb/rvv_load_store_test.py
@@ -177,6 +177,75 @@
         assert (actual_outputs == expected_outputs).all(), debug_msg
 
 
+async def vector_load_segmented_indexed(
+        dut,
+        elf_name: str,
+        cases: list[dict],  # keys: impl, vl, segments, in_bytes, out_size.
+        dtype,
+        index_dtype,
+):
+    """RVV load-store test template for segmented indexed loads.
+
+    Each test performs a gather-unzip operation and writes the result to an output.
+    """
+    fixture = await Fixture.Create(dut)
+    r = runfiles.Create()
+    await fixture.load_elf_and_lookup_symbols(
+        r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
+        ['impl', 'vl', 'in_buf', 'out_buf', 'index_buf'] +
+            list({c['impl'] for c in cases}),
+    )
+
+    rng = np.random.default_rng()
+    for c in tqdm.tqdm(cases):
+        impl = c['impl']
+        vl = c['vl']
+        segments = c['segments']
+        in_bytes = c['in_bytes']
+        out_size = c['out_size']
+
+        # Don't go beyond the buffer.
+        index_max = min(
+            in_bytes - segments * np.dtype(dtype).itemsize,
+            np.iinfo(index_dtype).max)
+        # TODO(davidgao): currently assuming the vl is supported.
+        # We'll eventually want to test unsupported vl.
+        indices = rng.integers(0, index_max + 1, out_size, dtype=index_dtype)
+        # Index is in bytes so input needs to be in bytes.
+        input_data = rng.integers(0, 256, in_bytes, dtype=np.uint8)
+        # Input needs to be reinterpreted. Note indices in use can reach
+        # beyond index_dtype when dtype is wider than uint8.
+        indices_in_use = np.array([
+            np.arange(x + s, x + s + np.dtype(dtype).itemsize)
+            for s in range(segments)
+            for x in indices[:vl].astype(np.uint32)
+        ])
+        expected_outputs = input_data[indices_in_use].view(dtype)[..., 0]
+        sbz = np.zeros(out_size - vl * segments, dtype=dtype)
+        expected_outputs = np.concat((expected_outputs, sbz))
+
+        await fixture.write_ptr('impl', impl)
+        await fixture.write_word('vl', vl)
+        await fixture.write('index_buf', indices)
+        await fixture.write('in_buf', input_data)
+        await fixture.write('out_buf', np.zeros([out_size], dtype=dtype))
+
+        await fixture.run_to_halt()
+
+        actual_outputs = (await fixture.read(
+            'out_buf', out_size * np.dtype(dtype).itemsize)).view(dtype)
+
+        debug_msg = str({
+            'impl': impl,
+            'input': input_data,
+            'indices': indices,
+            'indices_in_use': indices_in_use[..., 0],
+            'expected': expected_outputs,
+            'actual': actual_outputs,
+        })
+        assert (actual_outputs == expected_outputs).all(), debug_msg
+
+
 async def vector_store_indexed(
         dut,
         elf_name: str,
@@ -428,6 +497,142 @@
         pattern=list(range(0, 8)),
     )
 
+@cocotb.test()
+async def load8_index8(dut):
+    """Test vl*xei8_v_u8 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'in_bytes': 256,
+            'out_size': vl * 2,
+        }
+
+    await vector_load_indexed(
+        dut = dut,
+        elf_name = 'load8_index8.elf',
+        cases = [
+            # Unordered
+            make_test_case('vluxei8_v_u8mf4', vl = 4),
+            make_test_case('vluxei8_v_u8mf4', vl = 3),
+            make_test_case('vluxei8_v_u8mf2', vl = 8),
+            make_test_case('vluxei8_v_u8mf2', vl = 7),
+            make_test_case('vluxei8_v_u8m1', vl = 16),
+            make_test_case('vluxei8_v_u8m1', vl = 15),
+            make_test_case('vluxei8_v_u8m2', vl = 32),
+            make_test_case('vluxei8_v_u8m2', vl = 31),
+            make_test_case('vluxei8_v_u8m4', vl = 64),
+            make_test_case('vluxei8_v_u8m4', vl = 63),
+            make_test_case('vluxei8_v_u8m8', vl = 128),
+            make_test_case('vluxei8_v_u8m8', vl = 127),
+            # Ordered
+            make_test_case('vloxei8_v_u8mf4', vl = 4),
+            make_test_case('vloxei8_v_u8mf4', vl = 3),
+            make_test_case('vloxei8_v_u8mf2', vl = 8),
+            make_test_case('vloxei8_v_u8mf2', vl = 7),
+            make_test_case('vloxei8_v_u8m1', vl = 16),
+            make_test_case('vloxei8_v_u8m1', vl = 15),
+            make_test_case('vloxei8_v_u8m2', vl = 32),
+            make_test_case('vloxei8_v_u8m2', vl = 31),
+            make_test_case('vloxei8_v_u8m4', vl = 64),
+            make_test_case('vloxei8_v_u8m4', vl = 63),
+            make_test_case('vloxei8_v_u8m8', vl = 128),
+            make_test_case('vloxei8_v_u8m8', vl = 127),
+        ],
+        dtype = np.uint8,
+        index_dtype = np.uint8,
+    )
+
+
+@cocotb.test()
+async def load8_index8_seg(dut):
+    """Test vl*xei8_v_u8 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int, n_segs: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'segments': n_segs,
+            'in_bytes': 263,
+            'out_size': vl * n_segs * 2,
+        }
+
+    await vector_load_segmented_indexed(
+        dut = dut,
+        elf_name = 'load8_index8_seg.elf',
+        cases = [
+            # Unordered, segment 2
+            make_test_case('vluxseg2ei8_v_u8mf4x2', vl=4, n_segs=2),
+            make_test_case('vluxseg2ei8_v_u8mf4x2', vl=3, n_segs=2),
+            make_test_case('vluxseg2ei8_v_u8mf2x2', vl=8, n_segs=2),
+            make_test_case('vluxseg2ei8_v_u8mf2x2', vl=7, n_segs=2),
+            make_test_case('vluxseg2ei8_v_u8m1x2', vl=16, n_segs=2),
+            make_test_case('vluxseg2ei8_v_u8m1x2', vl=15, n_segs=2),
+            make_test_case('vluxseg2ei8_v_u8m2x2', vl=32, n_segs=2),
+            make_test_case('vluxseg2ei8_v_u8m2x2', vl=31, n_segs=2),
+            make_test_case('vluxseg2ei8_v_u8m4x2', vl=64, n_segs=2),
+            make_test_case('vluxseg2ei8_v_u8m4x2', vl=63, n_segs=2),
+            # Unordered, segment 3
+            make_test_case('vluxseg3ei8_v_u8mf4x3', vl=4, n_segs=3),
+            make_test_case('vluxseg3ei8_v_u8mf4x3', vl=3, n_segs=3),
+            make_test_case('vluxseg3ei8_v_u8mf2x3', vl=8, n_segs=3),
+            make_test_case('vluxseg3ei8_v_u8mf2x3', vl=7, n_segs=3),
+            make_test_case('vluxseg3ei8_v_u8m1x3', vl=16, n_segs=3),
+            make_test_case('vluxseg3ei8_v_u8m1x3', vl=15, n_segs=3),
+            make_test_case('vluxseg3ei8_v_u8m2x3', vl=32, n_segs=3),
+            make_test_case('vluxseg3ei8_v_u8m2x3', vl=31, n_segs=3),
+            # Unordered, segment 4
+            make_test_case('vluxseg4ei8_v_u8mf4x4', vl=4, n_segs=4),
+            make_test_case('vluxseg4ei8_v_u8mf4x4', vl=3, n_segs=4),
+            make_test_case('vluxseg4ei8_v_u8mf2x4', vl=8, n_segs=4),
+            make_test_case('vluxseg4ei8_v_u8mf2x4', vl=7, n_segs=4),
+            make_test_case('vluxseg4ei8_v_u8m1x4', vl=16, n_segs=4),
+            make_test_case('vluxseg4ei8_v_u8m1x4', vl=15, n_segs=4),
+            make_test_case('vluxseg4ei8_v_u8m2x4', vl=32, n_segs=4),
+            make_test_case('vluxseg4ei8_v_u8m2x4', vl=31, n_segs=4),
+            # Unordered, segment 5
+            make_test_case('vluxseg5ei8_v_u8mf4x5', vl=4, n_segs=5),
+            make_test_case('vluxseg5ei8_v_u8mf4x5', vl=3, n_segs=5),
+            make_test_case('vluxseg5ei8_v_u8mf2x5', vl=8, n_segs=5),
+            make_test_case('vluxseg5ei8_v_u8mf2x5', vl=7, n_segs=5),
+            make_test_case('vluxseg5ei8_v_u8m1x5', vl=16, n_segs=5),
+            make_test_case('vluxseg5ei8_v_u8m1x5', vl=15, n_segs=5),
+            # Unordered, segment 6
+            make_test_case('vluxseg6ei8_v_u8mf4x6', vl=4, n_segs=6),
+            make_test_case('vluxseg6ei8_v_u8mf4x6', vl=3, n_segs=6),
+            make_test_case('vluxseg6ei8_v_u8mf2x6', vl=8, n_segs=6),
+            make_test_case('vluxseg6ei8_v_u8mf2x6', vl=7, n_segs=6),
+            make_test_case('vluxseg6ei8_v_u8m1x6', vl=16, n_segs=6),
+            make_test_case('vluxseg6ei8_v_u8m1x6', vl=15, n_segs=6),
+            # Unordered, segment 7
+            make_test_case('vluxseg7ei8_v_u8mf4x7', vl=4, n_segs=7),
+            make_test_case('vluxseg7ei8_v_u8mf4x7', vl=3, n_segs=7),
+            make_test_case('vluxseg7ei8_v_u8mf2x7', vl=8, n_segs=7),
+            make_test_case('vluxseg7ei8_v_u8mf2x7', vl=7, n_segs=7),
+            make_test_case('vluxseg7ei8_v_u8m1x7', vl=16, n_segs=7),
+            make_test_case('vluxseg7ei8_v_u8m1x7', vl=15, n_segs=7),
+            # Unordered, segment 8
+            make_test_case('vluxseg8ei8_v_u8mf4x8', vl=4, n_segs=8),
+            make_test_case('vluxseg8ei8_v_u8mf4x8', vl=3, n_segs=8),
+            make_test_case('vluxseg8ei8_v_u8mf2x8', vl=8, n_segs=8),
+            make_test_case('vluxseg8ei8_v_u8mf2x8', vl=7, n_segs=8),
+            make_test_case('vluxseg8ei8_v_u8m1x8', vl=16, n_segs=8),
+            make_test_case('vluxseg8ei8_v_u8m1x8', vl=15, n_segs=8),
+            # Ordered, segment 2
+            make_test_case('vloxseg2ei8_v_u8mf4x2', vl=4, n_segs=2),
+            make_test_case('vloxseg2ei8_v_u8mf4x2', vl=3, n_segs=2),
+            make_test_case('vloxseg2ei8_v_u8mf2x2', vl=8, n_segs=2),
+            make_test_case('vloxseg2ei8_v_u8mf2x2', vl=7, n_segs=2),
+            make_test_case('vloxseg2ei8_v_u8m1x2', vl=16, n_segs=2),
+            make_test_case('vloxseg2ei8_v_u8m1x2', vl=15, n_segs=2),
+            make_test_case('vloxseg2ei8_v_u8m2x2', vl=32, n_segs=2),
+            make_test_case('vloxseg2ei8_v_u8m2x2', vl=31, n_segs=2),
+            make_test_case('vloxseg2ei8_v_u8m4x2', vl=64, n_segs=2),
+            make_test_case('vloxseg2ei8_v_u8m4x2', vl=63, n_segs=2),
+        ],
+        dtype = np.uint8,
+        index_dtype = np.uint8,
+    )
+
 
 @cocotb.test()
 async def load8_seg_unit(dut):
@@ -510,53 +715,6 @@
 
 
 @cocotb.test()
-async def load8_index8(dut):
-    """Test vl*xei8_v_u8 usage accessible from intrinsics."""
-    def make_test_case(impl: str, vl: int):
-        return {
-            'impl': impl,
-            'vl': vl,
-            'in_bytes': 256,
-            'out_size': vl * 2,
-        }
-
-    await vector_load_indexed(
-        dut = dut,
-        elf_name = 'load8_index8.elf',
-        cases = [
-            # Unordered
-            make_test_case('vluxei8_v_u8mf4', vl = 4),
-            make_test_case('vluxei8_v_u8mf4', vl = 3),
-            make_test_case('vluxei8_v_u8mf2', vl = 8),
-            make_test_case('vluxei8_v_u8mf2', vl = 7),
-            make_test_case('vluxei8_v_u8m1', vl = 16),
-            make_test_case('vluxei8_v_u8m1', vl = 15),
-            make_test_case('vluxei8_v_u8m2', vl = 32),
-            make_test_case('vluxei8_v_u8m2', vl = 31),
-            make_test_case('vluxei8_v_u8m4', vl = 64),
-            make_test_case('vluxei8_v_u8m4', vl = 63),
-            make_test_case('vluxei8_v_u8m8', vl = 128),
-            make_test_case('vluxei8_v_u8m8', vl = 127),
-            # Ordered
-            make_test_case('vloxei8_v_u8mf4', vl = 4),
-            make_test_case('vloxei8_v_u8mf4', vl = 3),
-            make_test_case('vloxei8_v_u8mf2', vl = 8),
-            make_test_case('vloxei8_v_u8mf2', vl = 7),
-            make_test_case('vloxei8_v_u8m1', vl = 16),
-            make_test_case('vloxei8_v_u8m1', vl = 15),
-            make_test_case('vloxei8_v_u8m2', vl = 32),
-            make_test_case('vloxei8_v_u8m2', vl = 31),
-            make_test_case('vloxei8_v_u8m4', vl = 64),
-            make_test_case('vloxei8_v_u8m4', vl = 63),
-            make_test_case('vloxei8_v_u8m8', vl = 128),
-            make_test_case('vloxei8_v_u8m8', vl = 127),
-        ],
-        dtype = np.uint8,
-        index_dtype = np.uint8,
-    )
-
-
-@cocotb.test()
 async def load8_index16(dut):
     """Test vl*xei16_v_u8 usage accessible from intrinsics."""
     def make_test_case(impl: str, vl: int):