Extend segmented indexed load test coverage

Finish load8index8, add load16index16 and load32index32.

Change-Id: I948cbee6c3b65a14eee46162563f98a4dc01496b
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index b00766a..b58e098 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -182,11 +182,13 @@
     "load8_stride2_m1_partial",
     "load8_stride2_mf4",
     "load16_index8",
+    "load16_index16_seg",
     "load16_seg_unit",
     "load16_stride4_m1",
     "load16_stride4_m1_partial",
     "load16_stride4_mf2",
     "load32_index8",
+    "load32_index32_seg",
     "load32_seg_unit",
     "load32_stride8_m1",
     "load32_stride8_m1_partial",
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD
index 2efad52..9ab3fe9 100644
--- a/tests/cocotb/rvv/load_store/BUILD
+++ b/tests/cocotb/rvv/load_store/BUILD
@@ -68,6 +68,9 @@
         "load16_index8": {
             "srcs": ["load16_index8.cc"],
         },
+        "load16_index16_seg": {
+            "srcs": ["load16_index16_seg.cc"],
+        },
         "load16_seg_unit": {
             "srcs": ["load16_seg_unit.cc"],
         },
@@ -83,6 +86,9 @@
         "load32_index8": {
             "srcs": ["load32_index8.cc"],
         },
+        "load32_index32_seg": {
+            "srcs": ["load32_index32_seg.cc"],
+        },
         "load32_seg_unit": {
             "srcs": ["load32_seg_unit.cc"],
         },
@@ -141,11 +147,13 @@
         ":load_store16_unit_m2.elf",
         ":load_store32_unit_m2.elf",
         ":load16_index8.elf",
+        ":load16_index16_seg.elf",
         ":load16_seg_unit.elf",
         ":load16_stride4_m1.elf",
         ":load16_stride4_m1_partial.elf",
         ":load16_stride4_mf2.elf",
         ":load32_index8.elf",
+        ":load32_index32_seg.elf",
         ":load32_seg_unit.elf",
         ":load32_stride8_m1.elf",
         ":load32_stride8_m1_partial.elf",
diff --git a/tests/cocotb/rvv/load_store/load16_index16_seg.cc b/tests/cocotb/rvv/load_store/load16_index16_seg.cc
new file mode 100644
index 0000000..afffd25
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load16_index16_seg.cc
@@ -0,0 +1,521 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t lut_size = 15000;
+// Double sized so we can check trailing regions are not read/written.
+constexpr size_t buf_size = 128;
+}  // namespace
+
+size_t vl __attribute__((section(".data"))) = 8;
+// Indices are always unsigned.
+uint16_t index_buf[buf_size] __attribute__((section(".data")));
+// These instructions don't differentiate signed/unsigned so we only need to
+// test one. The types come from intrinsic level.
+uint16_t in_buf[lut_size] __attribute__((section(".data")));
+uint16_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Unordered, segment 2
+__attribute__((used, retain)) void vluxseg2ei16_v_u16mf2x2() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vluxseg2ei16_v_u16mf2x2(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x2_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x2_u16mf2(data, 1),
+                         vl);
+}
+
+__attribute__((used, retain)) void vluxseg2ei16_v_u16m1x2() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vluxseg2ei16_v_u16m1x2(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x2_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x2_u16m1(data, 1),
+                        vl);
+}
+
+__attribute__((used, retain)) void vluxseg2ei16_v_u16m2x2() {
+  auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vluxseg2ei16_v_u16m2x2(in_buf, indices, vl);
+  __riscv_vse16_v_u16m2(out_buf, __riscv_vget_v_u16m2x2_u16m2(data, 0), vl);
+  __riscv_vse16_v_u16m2(out_buf + vl, __riscv_vget_v_u16m2x2_u16m2(data, 1),
+                        vl);
+}
+
+__attribute__((used, retain)) void vluxseg2ei16_v_u16m4x2() {
+  auto indices = __riscv_vle16_v_u16m4(index_buf, vl);
+  auto data = __riscv_vluxseg2ei16_v_u16m4x2(in_buf, indices, vl);
+  __riscv_vse16_v_u16m4(out_buf, __riscv_vget_v_u16m4x2_u16m4(data, 0), vl);
+  __riscv_vse16_v_u16m4(out_buf + vl, __riscv_vget_v_u16m4x2_u16m4(data, 1),
+                        vl);
+}
+
+// Unordered, segment 3
+__attribute__((used, retain)) void vluxseg3ei16_v_u16mf2x3() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vluxseg3ei16_v_u16mf2x3(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x3_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x3_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x3_u16mf2(data, 2), vl);
+}
+
+__attribute__((used, retain)) void vluxseg3ei16_v_u16m1x3() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vluxseg3ei16_v_u16m1x3(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x3_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x3_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x3_u16m1(data, 2),
+                        vl);
+}
+
+__attribute__((used, retain)) void vluxseg3ei16_v_u16m2x3() {
+  auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vluxseg3ei16_v_u16m2x3(in_buf, indices, vl);
+  __riscv_vse16_v_u16m2(out_buf, __riscv_vget_v_u16m2x3_u16m2(data, 0), vl);
+  __riscv_vse16_v_u16m2(out_buf + vl, __riscv_vget_v_u16m2x3_u16m2(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m2(out_buf + vl * 2, __riscv_vget_v_u16m2x3_u16m2(data, 2),
+                        vl);
+}
+
+// Unordered, segment 4
+__attribute__((used, retain)) void vluxseg4ei16_v_u16mf2x4() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vluxseg4ei16_v_u16mf2x4(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x4_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x4_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x4_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x4_u16mf2(data, 3), vl);
+}
+
+__attribute__((used, retain)) void vluxseg4ei16_v_u16m1x4() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vluxseg4ei16_v_u16m1x4(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x4_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x4_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x4_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x4_u16m1(data, 3),
+                        vl);
+}
+
+__attribute__((used, retain)) void vluxseg4ei16_v_u16m2x4() {
+  auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vluxseg4ei16_v_u16m2x4(in_buf, indices, vl);
+  __riscv_vse16_v_u16m2(out_buf, __riscv_vget_v_u16m2x4_u16m2(data, 0), vl);
+  __riscv_vse16_v_u16m2(out_buf + vl, __riscv_vget_v_u16m2x4_u16m2(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m2(out_buf + vl * 2, __riscv_vget_v_u16m2x4_u16m2(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m2(out_buf + vl * 3, __riscv_vget_v_u16m2x4_u16m2(data, 3),
+                        vl);
+}
+
+// Unordered, segment 5
+__attribute__((used, retain)) void vluxseg5ei16_v_u16mf2x5() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vluxseg5ei16_v_u16mf2x5(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x5_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x5_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x5_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x5_u16mf2(data, 3), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 4,
+                         __riscv_vget_v_u16mf2x5_u16mf2(data, 4), vl);
+}
+
+__attribute__((used, retain)) void vluxseg5ei16_v_u16m1x5() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vluxseg5ei16_v_u16m1x5(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x5_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x5_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x5_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x5_u16m1(data, 3),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 4, __riscv_vget_v_u16m1x5_u16m1(data, 4),
+                        vl);
+}
+
+// Unordered, segment 6
+__attribute__((used, retain)) void vluxseg6ei16_v_u16mf2x6() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vluxseg6ei16_v_u16mf2x6(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x6_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x6_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x6_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x6_u16mf2(data, 3), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 4,
+                         __riscv_vget_v_u16mf2x6_u16mf2(data, 4), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 5,
+                         __riscv_vget_v_u16mf2x6_u16mf2(data, 5), vl);
+}
+
+__attribute__((used, retain)) void vluxseg6ei16_v_u16m1x6() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vluxseg6ei16_v_u16m1x6(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x6_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x6_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x6_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x6_u16m1(data, 3),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 4, __riscv_vget_v_u16m1x6_u16m1(data, 4),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 5, __riscv_vget_v_u16m1x6_u16m1(data, 5),
+                        vl);
+}
+
+// Unordered, segment 7
+__attribute__((used, retain)) void vluxseg7ei16_v_u16mf2x7() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vluxseg7ei16_v_u16mf2x7(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x7_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x7_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 3), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 4,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 4), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 5,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 5), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 6,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 6), vl);
+}
+
+__attribute__((used, retain)) void vluxseg7ei16_v_u16m1x7() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vluxseg7ei16_v_u16m1x7(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x7_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x7_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x7_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x7_u16m1(data, 3),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 4, __riscv_vget_v_u16m1x7_u16m1(data, 4),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 5, __riscv_vget_v_u16m1x7_u16m1(data, 5),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 6, __riscv_vget_v_u16m1x7_u16m1(data, 6),
+                        vl);
+}
+
+// Unordered, segment 8
+__attribute__((used, retain)) void vluxseg8ei16_v_u16mf2x8() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vluxseg8ei16_v_u16mf2x8(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x8_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x8_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 3), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 4,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 4), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 5,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 5), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 6,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 6), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 7,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 7), vl);
+}
+
+__attribute__((used, retain)) void vluxseg8ei16_v_u16m1x8() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vluxseg8ei16_v_u16m1x8(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x8_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x8_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x8_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x8_u16m1(data, 3),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 4, __riscv_vget_v_u16m1x8_u16m1(data, 4),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 5, __riscv_vget_v_u16m1x8_u16m1(data, 5),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 6, __riscv_vget_v_u16m1x8_u16m1(data, 6),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 7, __riscv_vget_v_u16m1x8_u16m1(data, 7),
+                        vl);
+}
+
+// Ordered, segment 2
+__attribute__((used, retain)) void vloxseg2ei16_v_u16mf2x2() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxseg2ei16_v_u16mf2x2(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x2_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x2_u16mf2(data, 1),
+                         vl);
+}
+
+__attribute__((used, retain)) void vloxseg2ei16_v_u16m1x2() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg2ei16_v_u16m1x2(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x2_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x2_u16m1(data, 1),
+                        vl);
+}
+
+__attribute__((used, retain)) void vloxseg2ei16_v_u16m2x2() {
+  auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vloxseg2ei16_v_u16m2x2(in_buf, indices, vl);
+  __riscv_vse16_v_u16m2(out_buf, __riscv_vget_v_u16m2x2_u16m2(data, 0), vl);
+  __riscv_vse16_v_u16m2(out_buf + vl, __riscv_vget_v_u16m2x2_u16m2(data, 1),
+                        vl);
+}
+
+__attribute__((used, retain)) void vloxseg2ei16_v_u16m4x2() {
+  auto indices = __riscv_vle16_v_u16m4(index_buf, vl);
+  auto data = __riscv_vloxseg2ei16_v_u16m4x2(in_buf, indices, vl);
+  __riscv_vse16_v_u16m4(out_buf, __riscv_vget_v_u16m4x2_u16m4(data, 0), vl);
+  __riscv_vse16_v_u16m4(out_buf + vl, __riscv_vget_v_u16m4x2_u16m4(data, 1),
+                        vl);
+}
+
+// Ordered, segment 3
+__attribute__((used, retain)) void vloxseg3ei16_v_u16mf2x3() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxseg3ei16_v_u16mf2x3(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x3_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x3_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x3_u16mf2(data, 2), vl);
+}
+
+__attribute__((used, retain)) void vloxseg3ei16_v_u16m1x3() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg3ei16_v_u16m1x3(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x3_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x3_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x3_u16m1(data, 2),
+                        vl);
+}
+
+__attribute__((used, retain)) void vloxseg3ei16_v_u16m2x3() {
+  auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vloxseg3ei16_v_u16m2x3(in_buf, indices, vl);
+  __riscv_vse16_v_u16m2(out_buf, __riscv_vget_v_u16m2x3_u16m2(data, 0), vl);
+  __riscv_vse16_v_u16m2(out_buf + vl, __riscv_vget_v_u16m2x3_u16m2(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m2(out_buf + vl * 2, __riscv_vget_v_u16m2x3_u16m2(data, 2),
+                        vl);
+}
+
+// Ordered, segment 4
+__attribute__((used, retain)) void vloxseg4ei16_v_u16mf2x4() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxseg4ei16_v_u16mf2x4(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x4_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x4_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x4_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x4_u16mf2(data, 3), vl);
+}
+
+__attribute__((used, retain)) void vloxseg4ei16_v_u16m1x4() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg4ei16_v_u16m1x4(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x4_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x4_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x4_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x4_u16m1(data, 3),
+                        vl);
+}
+
+__attribute__((used, retain)) void vloxseg4ei16_v_u16m2x4() {
+  auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vloxseg4ei16_v_u16m2x4(in_buf, indices, vl);
+  __riscv_vse16_v_u16m2(out_buf, __riscv_vget_v_u16m2x4_u16m2(data, 0), vl);
+  __riscv_vse16_v_u16m2(out_buf + vl, __riscv_vget_v_u16m2x4_u16m2(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m2(out_buf + vl * 2, __riscv_vget_v_u16m2x4_u16m2(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m2(out_buf + vl * 3, __riscv_vget_v_u16m2x4_u16m2(data, 3),
+                        vl);
+}
+
+// Ordered, segment 5
+__attribute__((used, retain)) void vloxseg5ei16_v_u16mf2x5() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxseg5ei16_v_u16mf2x5(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x5_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x5_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x5_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x5_u16mf2(data, 3), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 4,
+                         __riscv_vget_v_u16mf2x5_u16mf2(data, 4), vl);
+}
+
+__attribute__((used, retain)) void vloxseg5ei16_v_u16m1x5() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg5ei16_v_u16m1x5(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x5_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x5_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x5_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x5_u16m1(data, 3),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 4, __riscv_vget_v_u16m1x5_u16m1(data, 4),
+                        vl);
+}
+
+// Ordered, segment 6
+__attribute__((used, retain)) void vloxseg6ei16_v_u16mf2x6() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxseg6ei16_v_u16mf2x6(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x6_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x6_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x6_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x6_u16mf2(data, 3), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 4,
+                         __riscv_vget_v_u16mf2x6_u16mf2(data, 4), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 5,
+                         __riscv_vget_v_u16mf2x6_u16mf2(data, 5), vl);
+}
+
+__attribute__((used, retain)) void vloxseg6ei16_v_u16m1x6() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg6ei16_v_u16m1x6(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x6_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x6_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x6_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x6_u16m1(data, 3),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 4, __riscv_vget_v_u16m1x6_u16m1(data, 4),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 5, __riscv_vget_v_u16m1x6_u16m1(data, 5),
+                        vl);
+}
+
+// Ordered, segment 7
+__attribute__((used, retain)) void vloxseg7ei16_v_u16mf2x7() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxseg7ei16_v_u16mf2x7(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x7_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x7_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 3), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 4,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 4), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 5,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 5), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 6,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 6), vl);
+}
+
+__attribute__((used, retain)) void vloxseg7ei16_v_u16m1x7() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg7ei16_v_u16m1x7(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x7_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x7_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x7_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x7_u16m1(data, 3),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 4, __riscv_vget_v_u16m1x7_u16m1(data, 4),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 5, __riscv_vget_v_u16m1x7_u16m1(data, 5),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 6, __riscv_vget_v_u16m1x7_u16m1(data, 6),
+                        vl);
+}
+
+// Ordered, segment 8
+__attribute__((used, retain)) void vloxseg8ei16_v_u16mf2x8() {
+  auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxseg8ei16_v_u16mf2x8(in_buf, indices, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x8_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x8_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 3), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 4,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 4), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 5,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 5), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 6,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 6), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 7,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 7), vl);
+}
+
+__attribute__((used, retain)) void vloxseg8ei16_v_u16m1x8() {
+  auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg8ei16_v_u16m1x8(in_buf, indices, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x8_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x8_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x8_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x8_u16m1(data, 3),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 4, __riscv_vget_v_u16m1x8_u16m1(data, 4),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 5, __riscv_vget_v_u16m1x8_u16m1(data, 5),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 6, __riscv_vget_v_u16m1x8_u16m1(data, 6),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 7, __riscv_vget_v_u16m1x8_u16m1(data, 7),
+                        vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vluxseg2ei16_v_u16m1x2;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load32_index32_seg.cc b/tests/cocotb/rvv/load_store/load32_index32_seg.cc
new file mode 100644
index 0000000..2ec3332
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load32_index32_seg.cc
@@ -0,0 +1,325 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t lut_size = 7000;
+// Double sized so we can check trailing regions are not read/written.
+constexpr size_t buf_size = 64;
+}  // namespace
+
+size_t vl __attribute__((section(".data"))) = 4;
+// Indices are always unsigned.
+uint32_t index_buf[buf_size] __attribute__((section(".data")));
+// These instructions don't differentiate signed/unsigned so we only need to
+// test one. The types come from intrinsic level.
+uint32_t in_buf[lut_size] __attribute__((section(".data")));
+uint32_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Unordered, segment 2
+__attribute__((used, retain)) void vluxseg2ei32_v_u32m1x2() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vluxseg2ei32_v_u32m1x2(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x2_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x2_u32m1(data, 1),
+                        vl);
+}
+
+__attribute__((used, retain)) void vluxseg2ei32_v_u32m2x2() {
+  auto indices = __riscv_vle32_v_u32m2(index_buf, vl);
+  auto data = __riscv_vluxseg2ei32_v_u32m2x2(in_buf, indices, vl);
+  __riscv_vse32_v_u32m2(out_buf, __riscv_vget_v_u32m2x2_u32m2(data, 0), vl);
+  __riscv_vse32_v_u32m2(out_buf + vl, __riscv_vget_v_u32m2x2_u32m2(data, 1),
+                        vl);
+}
+
+__attribute__((used, retain)) void vluxseg2ei32_v_u32m4x2() {
+  auto indices = __riscv_vle32_v_u32m4(index_buf, vl);
+  auto data = __riscv_vluxseg2ei32_v_u32m4x2(in_buf, indices, vl);
+  __riscv_vse32_v_u32m4(out_buf, __riscv_vget_v_u32m4x2_u32m4(data, 0), vl);
+  __riscv_vse32_v_u32m4(out_buf + vl, __riscv_vget_v_u32m4x2_u32m4(data, 1),
+                        vl);
+}
+
+// Unordered, segment 3
+__attribute__((used, retain)) void vluxseg3ei32_v_u32m1x3() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vluxseg3ei32_v_u32m1x3(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x3_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x3_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x3_u32m1(data, 2),
+                        vl);
+}
+
+__attribute__((used, retain)) void vluxseg3ei32_v_u32m2x3() {
+  auto indices = __riscv_vle32_v_u32m2(index_buf, vl);
+  auto data = __riscv_vluxseg3ei32_v_u32m2x3(in_buf, indices, vl);
+  __riscv_vse32_v_u32m2(out_buf, __riscv_vget_v_u32m2x3_u32m2(data, 0), vl);
+  __riscv_vse32_v_u32m2(out_buf + vl, __riscv_vget_v_u32m2x3_u32m2(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m2(out_buf + vl * 2, __riscv_vget_v_u32m2x3_u32m2(data, 2),
+                        vl);
+}
+
+// Unordered, segment 4
+__attribute__((used, retain)) void vluxseg4ei32_v_u32m1x4() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vluxseg4ei32_v_u32m1x4(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x4_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x4_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x4_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x4_u32m1(data, 3),
+                        vl);
+}
+
+__attribute__((used, retain)) void vluxseg4ei32_v_u32m2x4() {
+  auto indices = __riscv_vle32_v_u32m2(index_buf, vl);
+  auto data = __riscv_vluxseg4ei32_v_u32m2x4(in_buf, indices, vl);
+  __riscv_vse32_v_u32m2(out_buf, __riscv_vget_v_u32m2x4_u32m2(data, 0), vl);
+  __riscv_vse32_v_u32m2(out_buf + vl, __riscv_vget_v_u32m2x4_u32m2(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m2(out_buf + vl * 2, __riscv_vget_v_u32m2x4_u32m2(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m2(out_buf + vl * 3, __riscv_vget_v_u32m2x4_u32m2(data, 3),
+                        vl);
+}
+
+// Unordered, segment 5
+__attribute__((used, retain)) void vluxseg5ei32_v_u32m1x5() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vluxseg5ei32_v_u32m1x5(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x5_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x5_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x5_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x5_u32m1(data, 3),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 4, __riscv_vget_v_u32m1x5_u32m1(data, 4),
+                        vl);
+}
+
+// Unordered, segment 6
+__attribute__((used, retain)) void vluxseg6ei32_v_u32m1x6() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vluxseg6ei32_v_u32m1x6(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x6_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x6_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x6_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x6_u32m1(data, 3),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 4, __riscv_vget_v_u32m1x6_u32m1(data, 4),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 5, __riscv_vget_v_u32m1x6_u32m1(data, 5),
+                        vl);
+}
+
+// Unordered, segment 7
+__attribute__((used, retain)) void vluxseg7ei32_v_u32m1x7() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vluxseg7ei32_v_u32m1x7(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x7_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x7_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x7_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x7_u32m1(data, 3),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 4, __riscv_vget_v_u32m1x7_u32m1(data, 4),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 5, __riscv_vget_v_u32m1x7_u32m1(data, 5),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 6, __riscv_vget_v_u32m1x7_u32m1(data, 6),
+                        vl);
+}
+
+// Unordered, segment 8
+__attribute__((used, retain)) void vluxseg8ei32_v_u32m1x8() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vluxseg8ei32_v_u32m1x8(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x8_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x8_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x8_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x8_u32m1(data, 3),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 4, __riscv_vget_v_u32m1x8_u32m1(data, 4),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 5, __riscv_vget_v_u32m1x8_u32m1(data, 5),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 6, __riscv_vget_v_u32m1x8_u32m1(data, 6),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 7, __riscv_vget_v_u32m1x8_u32m1(data, 7),
+                        vl);
+}
+
+// Ordered, segment 2
+__attribute__((used, retain)) void vloxseg2ei32_v_u32m1x2() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vloxseg2ei32_v_u32m1x2(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x2_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x2_u32m1(data, 1),
+                        vl);
+}
+
+__attribute__((used, retain)) void vloxseg2ei32_v_u32m2x2() {
+  auto indices = __riscv_vle32_v_u32m2(index_buf, vl);
+  auto data = __riscv_vloxseg2ei32_v_u32m2x2(in_buf, indices, vl);
+  __riscv_vse32_v_u32m2(out_buf, __riscv_vget_v_u32m2x2_u32m2(data, 0), vl);
+  __riscv_vse32_v_u32m2(out_buf + vl, __riscv_vget_v_u32m2x2_u32m2(data, 1),
+                        vl);
+}
+
+__attribute__((used, retain)) void vloxseg2ei32_v_u32m4x2() {
+  auto indices = __riscv_vle32_v_u32m4(index_buf, vl);
+  auto data = __riscv_vloxseg2ei32_v_u32m4x2(in_buf, indices, vl);
+  __riscv_vse32_v_u32m4(out_buf, __riscv_vget_v_u32m4x2_u32m4(data, 0), vl);
+  __riscv_vse32_v_u32m4(out_buf + vl, __riscv_vget_v_u32m4x2_u32m4(data, 1),
+                        vl);
+}
+
+// Ordered, segment 3
+__attribute__((used, retain)) void vloxseg3ei32_v_u32m1x3() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vloxseg3ei32_v_u32m1x3(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x3_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x3_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x3_u32m1(data, 2),
+                        vl);
+}
+
+__attribute__((used, retain)) void vloxseg3ei32_v_u32m2x3() {
+  auto indices = __riscv_vle32_v_u32m2(index_buf, vl);
+  auto data = __riscv_vloxseg3ei32_v_u32m2x3(in_buf, indices, vl);
+  __riscv_vse32_v_u32m2(out_buf, __riscv_vget_v_u32m2x3_u32m2(data, 0), vl);
+  __riscv_vse32_v_u32m2(out_buf + vl, __riscv_vget_v_u32m2x3_u32m2(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m2(out_buf + vl * 2, __riscv_vget_v_u32m2x3_u32m2(data, 2),
+                        vl);
+}
+
+// Ordered, segment 4
+__attribute__((used, retain)) void vloxseg4ei32_v_u32m1x4() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vloxseg4ei32_v_u32m1x4(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x4_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x4_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x4_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x4_u32m1(data, 3),
+                        vl);
+}
+
+__attribute__((used, retain)) void vloxseg4ei32_v_u32m2x4() {
+  auto indices = __riscv_vle32_v_u32m2(index_buf, vl);
+  auto data = __riscv_vloxseg4ei32_v_u32m2x4(in_buf, indices, vl);
+  __riscv_vse32_v_u32m2(out_buf, __riscv_vget_v_u32m2x4_u32m2(data, 0), vl);
+  __riscv_vse32_v_u32m2(out_buf + vl, __riscv_vget_v_u32m2x4_u32m2(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m2(out_buf + vl * 2, __riscv_vget_v_u32m2x4_u32m2(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m2(out_buf + vl * 3, __riscv_vget_v_u32m2x4_u32m2(data, 3),
+                        vl);
+}
+
+// Ordered, segment 5
+__attribute__((used, retain)) void vloxseg5ei32_v_u32m1x5() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vloxseg5ei32_v_u32m1x5(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x5_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x5_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x5_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x5_u32m1(data, 3),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 4, __riscv_vget_v_u32m1x5_u32m1(data, 4),
+                        vl);
+}
+
+// Ordered, segment 6
+__attribute__((used, retain)) void vloxseg6ei32_v_u32m1x6() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vloxseg6ei32_v_u32m1x6(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x6_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x6_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x6_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x6_u32m1(data, 3),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 4, __riscv_vget_v_u32m1x6_u32m1(data, 4),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 5, __riscv_vget_v_u32m1x6_u32m1(data, 5),
+                        vl);
+}
+
+// Ordered, segment 7
+__attribute__((used, retain)) void vloxseg7ei32_v_u32m1x7() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vloxseg7ei32_v_u32m1x7(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x7_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x7_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x7_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x7_u32m1(data, 3),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 4, __riscv_vget_v_u32m1x7_u32m1(data, 4),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 5, __riscv_vget_v_u32m1x7_u32m1(data, 5),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 6, __riscv_vget_v_u32m1x7_u32m1(data, 6),
+                        vl);
+}
+
+// Ordered, segment 8
+__attribute__((used, retain)) void vloxseg8ei32_v_u32m1x8() {
+  auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+  auto data = __riscv_vloxseg8ei32_v_u32m1x8(in_buf, indices, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x8_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x8_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x8_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x8_u32m1(data, 3),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 4, __riscv_vget_v_u32m1x8_u32m1(data, 4),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 5, __riscv_vget_v_u32m1x8_u32m1(data, 5),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 6, __riscv_vget_v_u32m1x8_u32m1(data, 6),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 7, __riscv_vget_v_u32m1x8_u32m1(data, 7),
+                        vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vluxseg2ei32_v_u32m1x2;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load8_index8_seg.cc b/tests/cocotb/rvv/load_store/load8_index8_seg.cc
index a9a0807..d2990c5 100644
--- a/tests/cocotb/rvv/load_store/load8_index8_seg.cc
+++ b/tests/cocotb/rvv/load_store/load8_index8_seg.cc
@@ -379,6 +379,284 @@
   __riscv_vse8_v_u8m4(out_buf, __riscv_vget_v_u8m4x2_u8m4(data, 0), vl);
   __riscv_vse8_v_u8m4(out_buf + vl, __riscv_vget_v_u8m4x2_u8m4(data, 1), vl);
 }
+
+// Ordered, segment 3
+__attribute__((used, retain)) void vloxseg3ei8_v_u8mf4x3() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vloxseg3ei8_v_u8mf4x3(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x3_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x3_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x3_u8mf4(data, 2),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg3ei8_v_u8mf2x3() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vloxseg3ei8_v_u8mf2x3(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x3_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x3_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x3_u8mf2(data, 2),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg3ei8_v_u8m1x3() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vloxseg3ei8_v_u8m1x3(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x3_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x3_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x3_u8m1(data, 2),
+                      vl);
+}
+
+__attribute__((used, retain)) void vloxseg3ei8_v_u8m2x3() {
+  auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+  auto data = __riscv_vloxseg3ei8_v_u8m2x3(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x3_u8m2(data, 0), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x3_u8m2(data, 1), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl * 2, __riscv_vget_v_u8m2x3_u8m2(data, 2),
+                      vl);
+}
+
+// Ordered, segment 4
+__attribute__((used, retain)) void vloxseg4ei8_v_u8mf4x4() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vloxseg4ei8_v_u8mf4x4(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x4_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x4_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x4_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x4_u8mf4(data, 3),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg4ei8_v_u8mf2x4() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vloxseg4ei8_v_u8mf2x4(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x4_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x4_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x4_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x4_u8mf2(data, 3),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg4ei8_v_u8m1x4() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vloxseg4ei8_v_u8m1x4(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x4_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x4_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x4_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x4_u8m1(data, 3),
+                      vl);
+}
+
+__attribute__((used, retain)) void vloxseg4ei8_v_u8m2x4() {
+  auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+  auto data = __riscv_vloxseg4ei8_v_u8m2x4(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x4_u8m2(data, 0), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x4_u8m2(data, 1), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl * 2, __riscv_vget_v_u8m2x4_u8m2(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m2(out_buf + vl * 3, __riscv_vget_v_u8m2x4_u8m2(data, 3),
+                      vl);
+}
+
+// Ordered, segment 5
+__attribute__((used, retain)) void vloxseg5ei8_v_u8mf4x5() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vloxseg5ei8_v_u8mf4x5(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x5_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x5_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x5_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x5_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x5_u8mf4(data, 4),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg5ei8_v_u8mf2x5() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vloxseg5ei8_v_u8mf2x5(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x5_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x5_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x5_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x5_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x5_u8mf2(data, 4),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg5ei8_v_u8m1x5() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vloxseg5ei8_v_u8m1x5(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x5_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x5_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x5_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x5_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x5_u8m1(data, 4),
+                      vl);
+}
+
+// Ordered, segment 6
+__attribute__((used, retain)) void vloxseg6ei8_v_u8mf4x6() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vloxseg6ei8_v_u8mf4x6(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x6_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x6_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x6_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x6_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x6_u8mf4(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x6_u8mf4(data, 5),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg6ei8_v_u8mf2x6() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vloxseg6ei8_v_u8mf2x6(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x6_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x6_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x6_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x6_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x6_u8mf2(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x6_u8mf2(data, 5),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg6ei8_v_u8m1x6() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vloxseg6ei8_v_u8m1x6(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x6_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x6_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x6_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x6_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x6_u8m1(data, 4),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x6_u8m1(data, 5),
+                      vl);
+}
+
+// Ordered, segment 7
+__attribute__((used, retain)) void vloxseg7ei8_v_u8mf4x7() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vloxseg7ei8_v_u8mf4x7(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x7_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x7_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x7_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x7_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x7_u8mf4(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x7_u8mf4(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 6, __riscv_vget_v_u8mf4x7_u8mf4(data, 6),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg7ei8_v_u8mf2x7() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vloxseg7ei8_v_u8mf2x7(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x7_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x7_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x7_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x7_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x7_u8mf2(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x7_u8mf2(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 6, __riscv_vget_v_u8mf2x7_u8mf2(data, 6),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg7ei8_v_u8m1x7() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vloxseg7ei8_v_u8m1x7(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x7_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x7_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x7_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x7_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x7_u8m1(data, 4),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x7_u8m1(data, 5),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 6, __riscv_vget_v_u8m1x7_u8m1(data, 6),
+                      vl);
+}
+
+// Ordered, segment 8
+__attribute__((used, retain)) void vloxseg8ei8_v_u8mf4x8() {
+  auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+  auto data = __riscv_vloxseg8ei8_v_u8mf4x8(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x8_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x8_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x8_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x8_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x8_u8mf4(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x8_u8mf4(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 6, __riscv_vget_v_u8mf4x8_u8mf4(data, 6),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 7, __riscv_vget_v_u8mf4x8_u8mf4(data, 7),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg8ei8_v_u8mf2x8() {
+  auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+  auto data = __riscv_vloxseg8ei8_v_u8mf2x8(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x8_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x8_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x8_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x8_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x8_u8mf2(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x8_u8mf2(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 6, __riscv_vget_v_u8mf2x8_u8mf2(data, 6),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 7, __riscv_vget_v_u8mf2x8_u8mf2(data, 7),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg8ei8_v_u8m1x8() {
+  auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+  auto data = __riscv_vloxseg8ei8_v_u8m1x8(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x8_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x8_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x8_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x8_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x8_u8m1(data, 4),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x8_u8m1(data, 5),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 6, __riscv_vget_v_u8m1x8_u8m1(data, 6),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 7, __riscv_vget_v_u8m1x8_u8m1(data, 7),
+                      vl);
+}
 }
 
 void (*impl)() __attribute__((section(".data"))) = &vluxseg2ei8_v_u8m1x2;
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py
index cda8cf6..a8e44f9 100644
--- a/tests/cocotb/rvv_load_store_test.py
+++ b/tests/cocotb/rvv_load_store_test.py
@@ -216,7 +216,7 @@
         # Input needs to be reinterpreted. Note indices in use can reach
         # beyond index_dtype when dtype is wider than uint8.
         indices_in_use = np.array([
-            np.arange(x + s, x + s + np.dtype(dtype).itemsize)
+            np.arange(x + s * np.dtype(dtype).itemsize, x + (s + 1) * np.dtype(dtype).itemsize)
             for s in range(segments)
             for x in indices[:vl].astype(np.uint32)
         ])
@@ -546,7 +546,7 @@
 
 @cocotb.test()
 async def load8_index8_seg(dut):
-    """Test vl*xei8_v_u8 usage accessible from intrinsics."""
+    """Test vl*xseg*ei8_v_u8 usage accessible from intrinsics."""
     def make_test_case(impl: str, vl: int, n_segs: int):
         return {
             'impl': impl,
@@ -628,6 +628,52 @@
             make_test_case('vloxseg2ei8_v_u8m2x2', vl=31, n_segs=2),
             make_test_case('vloxseg2ei8_v_u8m4x2', vl=64, n_segs=2),
             make_test_case('vloxseg2ei8_v_u8m4x2', vl=63, n_segs=2),
+            # Ordered, segment 3
+            make_test_case('vloxseg3ei8_v_u8mf4x3', vl=4, n_segs=3),
+            make_test_case('vloxseg3ei8_v_u8mf4x3', vl=3, n_segs=3),
+            make_test_case('vloxseg3ei8_v_u8mf2x3', vl=8, n_segs=3),
+            make_test_case('vloxseg3ei8_v_u8mf2x3', vl=7, n_segs=3),
+            make_test_case('vloxseg3ei8_v_u8m1x3', vl=16, n_segs=3),
+            make_test_case('vloxseg3ei8_v_u8m1x3', vl=15, n_segs=3),
+            make_test_case('vloxseg3ei8_v_u8m2x3', vl=32, n_segs=3),
+            make_test_case('vloxseg3ei8_v_u8m2x3', vl=31, n_segs=3),
+            # Ordered, segment 4
+            make_test_case('vloxseg4ei8_v_u8mf4x4', vl=4, n_segs=4),
+            make_test_case('vloxseg4ei8_v_u8mf4x4', vl=3, n_segs=4),
+            make_test_case('vloxseg4ei8_v_u8mf2x4', vl=8, n_segs=4),
+            make_test_case('vloxseg4ei8_v_u8mf2x4', vl=7, n_segs=4),
+            make_test_case('vloxseg4ei8_v_u8m1x4', vl=16, n_segs=4),
+            make_test_case('vloxseg4ei8_v_u8m1x4', vl=15, n_segs=4),
+            make_test_case('vloxseg4ei8_v_u8m2x4', vl=32, n_segs=4),
+            make_test_case('vloxseg4ei8_v_u8m2x4', vl=31, n_segs=4),
+            # Ordered, segment 5
+            make_test_case('vloxseg5ei8_v_u8mf4x5', vl=4, n_segs=5),
+            make_test_case('vloxseg5ei8_v_u8mf4x5', vl=3, n_segs=5),
+            make_test_case('vloxseg5ei8_v_u8mf2x5', vl=8, n_segs=5),
+            make_test_case('vloxseg5ei8_v_u8mf2x5', vl=7, n_segs=5),
+            make_test_case('vloxseg5ei8_v_u8m1x5', vl=16, n_segs=5),
+            make_test_case('vloxseg5ei8_v_u8m1x5', vl=15, n_segs=5),
+            # Ordered, segment 6
+            make_test_case('vloxseg6ei8_v_u8mf4x6', vl=4, n_segs=6),
+            make_test_case('vloxseg6ei8_v_u8mf4x6', vl=3, n_segs=6),
+            make_test_case('vloxseg6ei8_v_u8mf2x6', vl=8, n_segs=6),
+            make_test_case('vloxseg6ei8_v_u8mf2x6', vl=7, n_segs=6),
+            make_test_case('vloxseg6ei8_v_u8m1x6', vl=16, n_segs=6),
+            make_test_case('vloxseg6ei8_v_u8m1x6', vl=15, n_segs=6),
+            # Ordered, segment 7
+            make_test_case('vloxseg7ei8_v_u8mf4x7', vl=4, n_segs=7),
+            make_test_case('vloxseg7ei8_v_u8mf4x7', vl=3, n_segs=7),
+            make_test_case('vloxseg7ei8_v_u8mf2x7', vl=8, n_segs=7),
+            make_test_case('vloxseg7ei8_v_u8mf2x7', vl=7, n_segs=7),
+            make_test_case('vloxseg7ei8_v_u8m1x7', vl=16, n_segs=7),
+            make_test_case('vloxseg7ei8_v_u8m1x7', vl=15, n_segs=7),
+            # Ordered, segment 8
+            make_test_case('vloxseg8ei8_v_u8mf4x8', vl=4, n_segs=8),
+            make_test_case('vloxseg8ei8_v_u8mf4x8', vl=3, n_segs=8),
+            make_test_case('vloxseg8ei8_v_u8mf2x8', vl=8, n_segs=8),
+            make_test_case('vloxseg8ei8_v_u8mf2x8', vl=7, n_segs=8),
+            make_test_case('vloxseg8ei8_v_u8m1x8', vl=16, n_segs=8),
+            make_test_case('vloxseg8ei8_v_u8m1x8', vl=15, n_segs=8),
         ],
         dtype = np.uint8,
         index_dtype = np.uint8,
@@ -840,6 +886,114 @@
 
 
 @cocotb.test()
+async def load16_index16_seg(dut):
+    """Test vl*xseg*ei16_v_u16 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int, n_segs: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'segments': n_segs,
+            'in_bytes': 30000,
+            'out_size': vl * n_segs * 2,
+        }
+
+    await vector_load_segmented_indexed(
+        dut = dut,
+        elf_name = 'load16_index16_seg.elf',
+        cases = [
+            # Unordered, segment 2
+            make_test_case('vluxseg2ei16_v_u16mf2x2', vl=4, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u16mf2x2', vl=3, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u16m1x2', vl=8, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u16m1x2', vl=7, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u16m2x2', vl=16, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u16m2x2', vl=15, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u16m4x2', vl=32, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u16m4x2', vl=31, n_segs=2),
+            # Unordered, segment 3
+            make_test_case('vluxseg3ei16_v_u16mf2x3', vl=4, n_segs=3),
+            make_test_case('vluxseg3ei16_v_u16mf2x3', vl=3, n_segs=3),
+            make_test_case('vluxseg3ei16_v_u16m1x3', vl=8, n_segs=3),
+            make_test_case('vluxseg3ei16_v_u16m1x3', vl=7, n_segs=3),
+            make_test_case('vluxseg3ei16_v_u16m2x3', vl=16, n_segs=3),
+            make_test_case('vluxseg3ei16_v_u16m2x3', vl=15, n_segs=3),
+            # Unordered, segment 4
+            make_test_case('vluxseg4ei16_v_u16mf2x4', vl=4, n_segs=4),
+            make_test_case('vluxseg4ei16_v_u16mf2x4', vl=3, n_segs=4),
+            make_test_case('vluxseg4ei16_v_u16m1x4', vl=8, n_segs=4),
+            make_test_case('vluxseg4ei16_v_u16m1x4', vl=7, n_segs=4),
+            make_test_case('vluxseg4ei16_v_u16m2x4', vl=16, n_segs=4),
+            make_test_case('vluxseg4ei16_v_u16m2x4', vl=15, n_segs=4),
+            # Unordered, segment 5
+            make_test_case('vluxseg5ei16_v_u16mf2x5', vl=4, n_segs=5),
+            make_test_case('vluxseg5ei16_v_u16mf2x5', vl=3, n_segs=5),
+            make_test_case('vluxseg5ei16_v_u16m1x5', vl=8, n_segs=5),
+            make_test_case('vluxseg5ei16_v_u16m1x5', vl=7, n_segs=5),
+            # Unordered, segment 6
+            make_test_case('vluxseg6ei16_v_u16mf2x6', vl=4, n_segs=6),
+            make_test_case('vluxseg6ei16_v_u16mf2x6', vl=3, n_segs=6),
+            make_test_case('vluxseg6ei16_v_u16m1x6', vl=8, n_segs=6),
+            make_test_case('vluxseg6ei16_v_u16m1x6', vl=7, n_segs=6),
+            # Unordered, segment 7
+            make_test_case('vluxseg7ei16_v_u16mf2x7', vl=4, n_segs=7),
+            make_test_case('vluxseg7ei16_v_u16mf2x7', vl=3, n_segs=7),
+            make_test_case('vluxseg7ei16_v_u16m1x7', vl=8, n_segs=7),
+            make_test_case('vluxseg7ei16_v_u16m1x7', vl=7, n_segs=7),
+            # Unordered, segment 8
+            make_test_case('vluxseg8ei16_v_u16mf2x8', vl=4, n_segs=8),
+            make_test_case('vluxseg8ei16_v_u16mf2x8', vl=3, n_segs=8),
+            make_test_case('vluxseg8ei16_v_u16m1x8', vl=8, n_segs=8),
+            make_test_case('vluxseg8ei16_v_u16m1x8', vl=7, n_segs=8),
+            # Ordered, segment 2
+            make_test_case('vloxseg2ei16_v_u16mf2x2', vl=4, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u16mf2x2', vl=3, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u16m1x2', vl=8, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u16m1x2', vl=7, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u16m2x2', vl=16, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u16m2x2', vl=15, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u16m4x2', vl=32, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u16m4x2', vl=31, n_segs=2),
+            # Ordered, segment 3
+            make_test_case('vloxseg3ei16_v_u16mf2x3', vl=4, n_segs=3),
+            make_test_case('vloxseg3ei16_v_u16mf2x3', vl=3, n_segs=3),
+            make_test_case('vloxseg3ei16_v_u16m1x3', vl=8, n_segs=3),
+            make_test_case('vloxseg3ei16_v_u16m1x3', vl=7, n_segs=3),
+            make_test_case('vloxseg3ei16_v_u16m2x3', vl=16, n_segs=3),
+            make_test_case('vloxseg3ei16_v_u16m2x3', vl=15, n_segs=3),
+            # Ordered, segment 4
+            make_test_case('vloxseg4ei16_v_u16mf2x4', vl=4, n_segs=4),
+            make_test_case('vloxseg4ei16_v_u16mf2x4', vl=3, n_segs=4),
+            make_test_case('vloxseg4ei16_v_u16m1x4', vl=8, n_segs=4),
+            make_test_case('vloxseg4ei16_v_u16m1x4', vl=7, n_segs=4),
+            make_test_case('vloxseg4ei16_v_u16m2x4', vl=16, n_segs=4),
+            make_test_case('vloxseg4ei16_v_u16m2x4', vl=15, n_segs=4),
+            # Ordered, segment 5
+            make_test_case('vloxseg5ei16_v_u16mf2x5', vl=4, n_segs=5),
+            make_test_case('vloxseg5ei16_v_u16mf2x5', vl=3, n_segs=5),
+            make_test_case('vloxseg5ei16_v_u16m1x5', vl=8, n_segs=5),
+            make_test_case('vloxseg5ei16_v_u16m1x5', vl=7, n_segs=5),
+            # Ordered, segment 6
+            make_test_case('vloxseg6ei16_v_u16mf2x6', vl=4, n_segs=6),
+            make_test_case('vloxseg6ei16_v_u16mf2x6', vl=3, n_segs=6),
+            make_test_case('vloxseg6ei16_v_u16m1x6', vl=8, n_segs=6),
+            make_test_case('vloxseg6ei16_v_u16m1x6', vl=7, n_segs=6),
+            # Ordered, segment 7
+            make_test_case('vloxseg7ei16_v_u16mf2x7', vl=4, n_segs=7),
+            make_test_case('vloxseg7ei16_v_u16mf2x7', vl=3, n_segs=7),
+            make_test_case('vloxseg7ei16_v_u16m1x7', vl=8, n_segs=7),
+            make_test_case('vloxseg7ei16_v_u16m1x7', vl=7, n_segs=7),
+            # Ordered, segment 8
+            make_test_case('vloxseg8ei16_v_u16mf2x8', vl=4, n_segs=8),
+            make_test_case('vloxseg8ei16_v_u16mf2x8', vl=3, n_segs=8),
+            make_test_case('vloxseg8ei16_v_u16m1x8', vl=8, n_segs=8),
+            make_test_case('vloxseg8ei16_v_u16m1x8', vl=7, n_segs=8),
+        ],
+        dtype = np.uint16,
+        index_dtype = np.uint16,
+    )
+
+
+@cocotb.test()
 async def load16_seg_unit(dut):
     """Test vlseg*e16 usage accessible from intrinsics."""
     def make_test_case(impl: str, vl: int, n_segs: int):
@@ -945,6 +1099,86 @@
 
 
 @cocotb.test()
+async def load32_index32_seg(dut):
+    """Test vl*xseg*ei32_v_u32 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int, n_segs: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'segments': n_segs,
+            'in_bytes': 28000,
+            'out_size': vl * n_segs * 2,
+        }
+
+    await vector_load_segmented_indexed(
+        dut = dut,
+        elf_name = 'load32_index32_seg.elf',
+        cases = [
+            # Unordered, segment 2
+            make_test_case('vluxseg2ei32_v_u32m1x2', vl=4, n_segs=2),
+            make_test_case('vluxseg2ei32_v_u32m1x2', vl=3, n_segs=2),
+            make_test_case('vluxseg2ei32_v_u32m2x2', vl=8, n_segs=2),
+            make_test_case('vluxseg2ei32_v_u32m2x2', vl=7, n_segs=2),
+            make_test_case('vluxseg2ei32_v_u32m4x2', vl=16, n_segs=2),
+            make_test_case('vluxseg2ei32_v_u32m4x2', vl=15, n_segs=2),
+            # Unordered, segment 3
+            make_test_case('vluxseg3ei32_v_u32m1x3', vl=4, n_segs=3),
+            make_test_case('vluxseg3ei32_v_u32m1x3', vl=3, n_segs=3),
+            make_test_case('vluxseg3ei32_v_u32m2x3', vl=8, n_segs=3),
+            make_test_case('vluxseg3ei32_v_u32m2x3', vl=7, n_segs=3),
+            # Unordered, segment 4
+            make_test_case('vluxseg4ei32_v_u32m1x4', vl=4, n_segs=4),
+            make_test_case('vluxseg4ei32_v_u32m1x4', vl=3, n_segs=4),
+            make_test_case('vluxseg4ei32_v_u32m2x4', vl=8, n_segs=4),
+            make_test_case('vluxseg4ei32_v_u32m2x4', vl=7, n_segs=4),
+            # Unordered, segment 5
+            make_test_case('vluxseg5ei32_v_u32m1x5', vl=4, n_segs=5),
+            make_test_case('vluxseg5ei32_v_u32m1x5', vl=3, n_segs=5),
+            # Unordered, segment 6
+            make_test_case('vluxseg6ei32_v_u32m1x6', vl=4, n_segs=6),
+            make_test_case('vluxseg6ei32_v_u32m1x6', vl=3, n_segs=6),
+            # Unordered, segment 7
+            make_test_case('vluxseg7ei32_v_u32m1x7', vl=4, n_segs=7),
+            make_test_case('vluxseg7ei32_v_u32m1x7', vl=3, n_segs=7),
+            # Unordered, segment 8
+            make_test_case('vluxseg8ei32_v_u32m1x8', vl=4, n_segs=8),
+            make_test_case('vluxseg8ei32_v_u32m1x8', vl=3, n_segs=8),
+            # Ordered, segment 2
+            make_test_case('vloxseg2ei32_v_u32m1x2', vl=4, n_segs=2),
+            make_test_case('vloxseg2ei32_v_u32m1x2', vl=3, n_segs=2),
+            make_test_case('vloxseg2ei32_v_u32m2x2', vl=8, n_segs=2),
+            make_test_case('vloxseg2ei32_v_u32m2x2', vl=7, n_segs=2),
+            make_test_case('vloxseg2ei32_v_u32m4x2', vl=16, n_segs=2),
+            make_test_case('vloxseg2ei32_v_u32m4x2', vl=15, n_segs=2),
+            # Ordered, segment 3
+            make_test_case('vloxseg3ei32_v_u32m1x3', vl=4, n_segs=3),
+            make_test_case('vloxseg3ei32_v_u32m1x3', vl=3, n_segs=3),
+            make_test_case('vloxseg3ei32_v_u32m2x3', vl=8, n_segs=3),
+            make_test_case('vloxseg3ei32_v_u32m2x3', vl=7, n_segs=3),
+            # Ordered, segment 4
+            make_test_case('vloxseg4ei32_v_u32m1x4', vl=4, n_segs=4),
+            make_test_case('vloxseg4ei32_v_u32m1x4', vl=3, n_segs=4),
+            make_test_case('vloxseg4ei32_v_u32m2x4', vl=8, n_segs=4),
+            make_test_case('vloxseg4ei32_v_u32m2x4', vl=7, n_segs=4),
+            # Ordered, segment 5
+            make_test_case('vloxseg5ei32_v_u32m1x5', vl=4, n_segs=5),
+            make_test_case('vloxseg5ei32_v_u32m1x5', vl=3, n_segs=5),
+            # Ordered, segment 6
+            make_test_case('vloxseg6ei32_v_u32m1x6', vl=4, n_segs=6),
+            make_test_case('vloxseg6ei32_v_u32m1x6', vl=3, n_segs=6),
+            # Ordered, segment 7
+            make_test_case('vloxseg7ei32_v_u32m1x7', vl=4, n_segs=7),
+            make_test_case('vloxseg7ei32_v_u32m1x7', vl=3, n_segs=7),
+            # Ordered, segment 8
+            make_test_case('vloxseg8ei32_v_u32m1x8', vl=4, n_segs=8),
+            make_test_case('vloxseg8ei32_v_u32m1x8', vl=3, n_segs=8),
+        ],
+        dtype = np.uint32,
+        index_dtype = np.uint32,
+    )
+
+
+@cocotb.test()
 async def load32_seg_unit(dut):
     """Test vlseg*e32 usage accessible from intrinsics."""
     def make_test_case(impl: str, vl: int, n_segs: int):