Consolidate load8/16/32 segmented test cases

Also add more coverage for the vlseg* instructions.
Minor fixes to other existing tests to make the binaries runnable
without the test runtime.

Change-Id: I5e6a6e9d439291ec44ef00610072e37b1a152613
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index 7efc947..7a09d29 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -173,23 +173,20 @@
 # BEGIN_TESTCASES_FOR_rvv_load_store_test
 RVV_LOAD_STORE_TESTCASES = [
     "load_store_bits",
+    "load8_seg_unit",
     "load8_stride2_m1",
     "load8_stride2_m1_partial",
     "load8_stride2_mf4",
+    "load16_seg_unit",
     "load16_stride4_m1",
     "load16_stride4_m1_partial",
     "load16_stride4_mf2",
+    "load32_seg_unit",
     "load32_stride8_m1",
     "load32_stride8_m1_partial",
     "load_store8_unit_m2",
     "load_store16_unit_m2",
     "load_store32_unit_m2",
-    "load8_segment2_unit_m1",
-    "load16_segment2_unit_m1",
-    "load32_segment2_unit_m1",
-    "load8_segment2_unit_m2",
-    "load16_segment2_unit_m2",
-    "load32_segment2_unit_m2",
     "load8_segment2_stride6_m1",
     "load16_segment2_stride6_m1",
     "load8_indexed_m1",
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD
index 0131047..cf3dae6 100644
--- a/tests/cocotb/rvv/load_store/BUILD
+++ b/tests/cocotb/rvv/load_store/BUILD
@@ -26,23 +26,8 @@
         "load8_indexed_m1": {
             "srcs": ["load8_indexed_m1.cc"],
         },
-        "load8_segment2_unit_m1": {
-            "srcs": ["load8_segment2_unit_m1.cc"],
-        },
-        "load16_segment2_unit_m1": {
-            "srcs": ["load16_segment2_unit_m1.cc"],
-        },
-        "load32_segment2_unit_m1": {
-            "srcs": ["load32_segment2_unit_m1.cc"],
-        },
-        "load8_segment2_unit_m2": {
-            "srcs": ["load8_segment2_unit_m2.cc"],
-        },
-        "load16_segment2_unit_m2": {
-            "srcs": ["load16_segment2_unit_m2.cc"],
-        },
-        "load32_segment2_unit_m2": {
-            "srcs": ["load32_segment2_unit_m2.cc"],
+        "load8_seg_unit": {
+            "srcs": ["load8_seg_unit.cc"],
         },
         "load8_segment2_stride6_m1": {
             "srcs": ["load8_segment2_stride6_m1.cc"],
@@ -71,6 +56,9 @@
         "load_store32_unit_m2": {
             "srcs": ["load_store32_unit_m2.cc"],
         },
+        "load16_seg_unit": {
+            "srcs": ["load16_seg_unit.cc"],
+        },
         "load16_stride4_m1": {
             "srcs": ["load16_stride4_m1.cc"],
         },
@@ -80,6 +68,9 @@
         "load16_stride4_mf2": {
             "srcs": ["load16_stride4_mf2.cc"],
         },
+        "load32_seg_unit": {
+            "srcs": ["load32_seg_unit.cc"],
+        },
         "load32_stride8_m1": {
             "srcs": ["load32_stride8_m1.cc"],
         },
@@ -100,12 +91,7 @@
     srcs = [
         ":load_store_bits.elf",
         ":load8_indexed_m1.elf",
-        ":load8_segment2_unit_m1.elf",
-        ":load16_segment2_unit_m1.elf",
-        ":load32_segment2_unit_m1.elf",
-        ":load8_segment2_unit_m2.elf",
-        ":load16_segment2_unit_m2.elf",
-        ":load32_segment2_unit_m2.elf",
+        ":load8_seg_unit.elf",
         ":load8_segment2_stride6_m1.elf",
         ":load16_segment2_stride6_m1.elf",
         ":load8_stride2_m1.elf",
@@ -115,9 +101,11 @@
         ":load_store8_unit_m2.elf",
         ":load_store16_unit_m2.elf",
         ":load_store32_unit_m2.elf",
+        ":load16_seg_unit.elf",
         ":load16_stride4_m1.elf",
         ":load16_stride4_m1_partial.elf",
         ":load16_stride4_mf2.elf",
+        ":load32_seg_unit.elf",
         ":load32_stride8_m1.elf",
         ":load32_stride8_m1_partial.elf",
         ":store8_indexed_m1.elf",
diff --git a/tests/cocotb/rvv/load_store/load16_seg_unit.cc b/tests/cocotb/rvv/load_store/load16_seg_unit.cc
new file mode 100644
index 0000000..577bbf5
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load16_seg_unit.cc
@@ -0,0 +1,259 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+// Double sized so we can check trailing regions are not read/written.
+constexpr size_t buf_size = 128;
+}  // namespace
+
+size_t vl __attribute__((section(".data"))) = 8;
+// These instructions don't differentiate signed/unsigned so we only need to
+// test one. The types come from intrinsic level.
+uint16_t in_buf[buf_size] __attribute__((section(".data")));
+uint16_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Segment 2
+__attribute__((used, retain)) void vlseg2e16_v_u16mf2x2() {
+  auto data = __riscv_vlseg2e16_v_u16mf2x2(in_buf, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x2_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x2_u16mf2(data, 1),
+                         vl);
+}
+
+__attribute__((used, retain)) void vlseg2e16_v_u16m1x2() {
+  auto data = __riscv_vlseg2e16_v_u16m1x2(in_buf, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x2_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x2_u16m1(data, 1),
+                        vl);
+}
+
+__attribute__((used, retain)) void vlseg2e16_v_u16m2x2() {
+  auto data = __riscv_vlseg2e16_v_u16m2x2(in_buf, vl);
+  __riscv_vse16_v_u16m2(out_buf, __riscv_vget_v_u16m2x2_u16m2(data, 0), vl);
+  __riscv_vse16_v_u16m2(out_buf + vl, __riscv_vget_v_u16m2x2_u16m2(data, 1),
+                        vl);
+}
+
+__attribute__((used, retain)) void vlseg2e16_v_u16m4x2() {
+  auto data = __riscv_vlseg2e16_v_u16m4x2(in_buf, vl);
+  __riscv_vse16_v_u16m4(out_buf, __riscv_vget_v_u16m4x2_u16m4(data, 0), vl);
+  __riscv_vse16_v_u16m4(out_buf + vl, __riscv_vget_v_u16m4x2_u16m4(data, 1),
+                        vl);
+}
+
+// Segment 3
+__attribute__((used, retain)) void vlseg3e16_v_u16mf2x3() {
+  auto data = __riscv_vlseg3e16_v_u16mf2x3(in_buf, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x3_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x3_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x3_u16mf2(data, 2), vl);
+}
+
+__attribute__((used, retain)) void vlseg3e16_v_u16m1x3() {
+  auto data = __riscv_vlseg3e16_v_u16m1x3(in_buf, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x3_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x3_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x3_u16m1(data, 2),
+                        vl);
+}
+
+__attribute__((used, retain)) void vlseg3e16_v_u16m2x3() {
+  auto data = __riscv_vlseg3e16_v_u16m2x3(in_buf, vl);
+  __riscv_vse16_v_u16m2(out_buf, __riscv_vget_v_u16m2x3_u16m2(data, 0), vl);
+  __riscv_vse16_v_u16m2(out_buf + vl, __riscv_vget_v_u16m2x3_u16m2(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m2(out_buf + vl * 2, __riscv_vget_v_u16m2x3_u16m2(data, 2),
+                        vl);
+}
+
+// Segment 4
+__attribute__((used, retain)) void vlseg4e16_v_u16mf2x4() {
+  auto data = __riscv_vlseg4e16_v_u16mf2x4(in_buf, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x4_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x4_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x4_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x4_u16mf2(data, 3), vl);
+}
+
+__attribute__((used, retain)) void vlseg4e16_v_u16m1x4() {
+  auto data = __riscv_vlseg4e16_v_u16m1x4(in_buf, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x4_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x4_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x4_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x4_u16m1(data, 3),
+                        vl);
+}
+
+__attribute__((used, retain)) void vlseg4e16_v_u16m2x4() {
+  auto data = __riscv_vlseg4e16_v_u16m2x4(in_buf, vl);
+  __riscv_vse16_v_u16m2(out_buf, __riscv_vget_v_u16m2x4_u16m2(data, 0), vl);
+  __riscv_vse16_v_u16m2(out_buf + vl, __riscv_vget_v_u16m2x4_u16m2(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m2(out_buf + vl * 2, __riscv_vget_v_u16m2x4_u16m2(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m2(out_buf + vl * 3, __riscv_vget_v_u16m2x4_u16m2(data, 3),
+                        vl);
+}
+
+// Segment 5
+__attribute__((used, retain)) void vlseg5e16_v_u16mf2x5() {
+  auto data = __riscv_vlseg5e16_v_u16mf2x5(in_buf, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x5_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x5_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x5_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x5_u16mf2(data, 3), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 4,
+                         __riscv_vget_v_u16mf2x5_u16mf2(data, 4), vl);
+}
+
+__attribute__((used, retain)) void vlseg5e16_v_u16m1x5() {
+  auto data = __riscv_vlseg5e16_v_u16m1x5(in_buf, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x5_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x5_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x5_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x5_u16m1(data, 3),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 4, __riscv_vget_v_u16m1x5_u16m1(data, 4),
+                        vl);
+}
+
+// Segment 6
+__attribute__((used, retain)) void vlseg6e16_v_u16mf2x6() {
+  auto data = __riscv_vlseg6e16_v_u16mf2x6(in_buf, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x6_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x6_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x6_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x6_u16mf2(data, 3), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 4,
+                         __riscv_vget_v_u16mf2x6_u16mf2(data, 4), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 5,
+                         __riscv_vget_v_u16mf2x6_u16mf2(data, 5), vl);
+}
+
+__attribute__((used, retain)) void vlseg6e16_v_u16m1x6() {
+  auto data = __riscv_vlseg6e16_v_u16m1x6(in_buf, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x6_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x6_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x6_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x6_u16m1(data, 3),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 4, __riscv_vget_v_u16m1x6_u16m1(data, 4),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 5, __riscv_vget_v_u16m1x6_u16m1(data, 5),
+                        vl);
+}
+
+// Segment 7
+__attribute__((used, retain)) void vlseg7e16_v_u16mf2x7() {
+  auto data = __riscv_vlseg7e16_v_u16mf2x7(in_buf, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x7_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x7_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 3), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 4,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 4), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 5,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 5), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 6,
+                         __riscv_vget_v_u16mf2x7_u16mf2(data, 6), vl);
+}
+
+__attribute__((used, retain)) void vlseg7e16_v_u16m1x7() {
+  auto data = __riscv_vlseg7e16_v_u16m1x7(in_buf, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x7_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x7_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x7_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x7_u16m1(data, 3),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 4, __riscv_vget_v_u16m1x7_u16m1(data, 4),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 5, __riscv_vget_v_u16m1x7_u16m1(data, 5),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 6, __riscv_vget_v_u16m1x7_u16m1(data, 6),
+                        vl);
+}
+
+// Segment 8
+__attribute__((used, retain)) void vlseg8e16_v_u16mf2x8() {
+  auto data = __riscv_vlseg8e16_v_u16mf2x8(in_buf, vl);
+  __riscv_vse16_v_u16mf2(out_buf, __riscv_vget_v_u16mf2x8_u16mf2(data, 0), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl, __riscv_vget_v_u16mf2x8_u16mf2(data, 1),
+                         vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 2,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 2), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 3,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 3), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 4,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 4), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 5,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 5), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 6,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 6), vl);
+  __riscv_vse16_v_u16mf2(out_buf + vl * 7,
+                         __riscv_vget_v_u16mf2x8_u16mf2(data, 7), vl);
+}
+
+__attribute__((used, retain)) void vlseg8e16_v_u16m1x8() {
+  auto data = __riscv_vlseg8e16_v_u16m1x8(in_buf, vl);
+  __riscv_vse16_v_u16m1(out_buf, __riscv_vget_v_u16m1x8_u16m1(data, 0), vl);
+  __riscv_vse16_v_u16m1(out_buf + vl, __riscv_vget_v_u16m1x8_u16m1(data, 1),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 2, __riscv_vget_v_u16m1x8_u16m1(data, 2),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 3, __riscv_vget_v_u16m1x8_u16m1(data, 3),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 4, __riscv_vget_v_u16m1x8_u16m1(data, 4),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 5, __riscv_vget_v_u16m1x8_u16m1(data, 5),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 6, __riscv_vget_v_u16m1x8_u16m1(data, 6),
+                        vl);
+  __riscv_vse16_v_u16m1(out_buf + vl * 7, __riscv_vget_v_u16m1x8_u16m1(data, 7),
+                        vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vlseg2e16_v_u16m1x2;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load16_segment2_unit_m1.cc b/tests/cocotb/rvv/load_store/load16_segment2_unit_m1.cc
deleted file mode 100644
index f7f8d03..0000000
--- a/tests/cocotb/rvv/load_store/load16_segment2_unit_m1.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2025 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <riscv_vector.h>
-#include <stdint.h>
-
-// Enough space for m2.
-uint16_t in_buf[32] __attribute__((section(".data")));
-uint16_t out_buf[32] __attribute__((section(".data")));
-
-__attribute__((used, retain)) void test_intrinsic(const uint16_t *x,
-                                                  uint16_t *y) {
-  vuint16m1x2_t v = __riscv_vlseg2e16_v_u16m1x2(in_buf, 16);
-
-  vuint16m2_t vv = __riscv_vcreate_v_u16m1_u16m2(
-      __riscv_vget_v_u16m1x2_u16m1(v, 0),
-      __riscv_vget_v_u16m1x2_u16m1(v, 1));
-
-  __riscv_vse16_v_u16m2(y, vv, /*vl=*/16);
-}
-
-int main(int argc, char **argv) {
-  test_intrinsic(in_buf, out_buf);
-  return 0;
-}
diff --git a/tests/cocotb/rvv/load_store/load16_segment2_unit_m2.cc b/tests/cocotb/rvv/load_store/load16_segment2_unit_m2.cc
deleted file mode 100644
index 68fab72..0000000
--- a/tests/cocotb/rvv/load_store/load16_segment2_unit_m2.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2025 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <riscv_vector.h>
-#include <stdint.h>
-
-// Enough space for m4.
-uint16_t in_buf[64] __attribute__((section(".data")));
-uint16_t out_buf[64] __attribute__((section(".data")));
-
-__attribute__((used, retain)) void test_intrinsic(const uint16_t *x,
-                                                  uint16_t *y) {
-  vuint16m2x2_t v = __riscv_vlseg2e16_v_u16m2x2(in_buf, 16);
-
-  vuint16m4_t vv = __riscv_vcreate_v_u16m2_u16m4(
-      __riscv_vget_v_u16m2x2_u16m2(v, 0),
-      __riscv_vget_v_u16m2x2_u16m2(v, 1));
-
-  __riscv_vse16_v_u16m4(y, vv, /*vl=*/32);
-}
-
-int main(int argc, char **argv) {
-  test_intrinsic(in_buf, out_buf);
-  return 0;
-}
diff --git a/tests/cocotb/rvv/load_store/load32_seg_unit.cc b/tests/cocotb/rvv/load_store/load32_seg_unit.cc
new file mode 100644
index 0000000..82b8ce1
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load32_seg_unit.cc
@@ -0,0 +1,168 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+// Double sized so we can check trailing regions are not read/written.
+constexpr size_t buf_size = 64;
+}  // namespace
+
+size_t vl __attribute__((section(".data"))) = 4;
+// These instructions don't differentiate signed/unsigned so we only need to
+// test one. The types come from intrinsic level.
+uint32_t in_buf[buf_size] __attribute__((section(".data")));
+uint32_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Segment 2
+__attribute__((used, retain)) void vlseg2e32_v_u32m1x2() {
+  auto data = __riscv_vlseg2e32_v_u32m1x2(in_buf, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x2_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x2_u32m1(data, 1),
+                        vl);
+}
+
+__attribute__((used, retain)) void vlseg2e32_v_u32m2x2() {
+  auto data = __riscv_vlseg2e32_v_u32m2x2(in_buf, vl);
+  __riscv_vse32_v_u32m2(out_buf, __riscv_vget_v_u32m2x2_u32m2(data, 0), vl);
+  __riscv_vse32_v_u32m2(out_buf + vl, __riscv_vget_v_u32m2x2_u32m2(data, 1),
+                        vl);
+}
+
+__attribute__((used, retain)) void vlseg2e32_v_u32m4x2() {
+  auto data = __riscv_vlseg2e32_v_u32m4x2(in_buf, vl);
+  __riscv_vse32_v_u32m4(out_buf, __riscv_vget_v_u32m4x2_u32m4(data, 0), vl);
+  __riscv_vse32_v_u32m4(out_buf + vl, __riscv_vget_v_u32m4x2_u32m4(data, 1),
+                        vl);
+}
+
+// Segment 3
+__attribute__((used, retain)) void vlseg3e32_v_u32m1x3() {
+  auto data = __riscv_vlseg3e32_v_u32m1x3(in_buf, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x3_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x3_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x3_u32m1(data, 2),
+                        vl);
+}
+
+__attribute__((used, retain)) void vlseg3e32_v_u32m2x3() {
+  auto data = __riscv_vlseg3e32_v_u32m2x3(in_buf, vl);
+  __riscv_vse32_v_u32m2(out_buf, __riscv_vget_v_u32m2x3_u32m2(data, 0), vl);
+  __riscv_vse32_v_u32m2(out_buf + vl, __riscv_vget_v_u32m2x3_u32m2(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m2(out_buf + vl * 2, __riscv_vget_v_u32m2x3_u32m2(data, 2),
+                        vl);
+}
+
+// Segment 4
+__attribute__((used, retain)) void vlseg4e32_v_u32m1x4() {
+  auto data = __riscv_vlseg4e32_v_u32m1x4(in_buf, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x4_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x4_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x4_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x4_u32m1(data, 3),
+                        vl);
+}
+
+__attribute__((used, retain)) void vlseg4e32_v_u32m2x4() {
+  auto data = __riscv_vlseg4e32_v_u32m2x4(in_buf, vl);
+  __riscv_vse32_v_u32m2(out_buf, __riscv_vget_v_u32m2x4_u32m2(data, 0), vl);
+  __riscv_vse32_v_u32m2(out_buf + vl, __riscv_vget_v_u32m2x4_u32m2(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m2(out_buf + vl * 2, __riscv_vget_v_u32m2x4_u32m2(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m2(out_buf + vl * 3, __riscv_vget_v_u32m2x4_u32m2(data, 3),
+                        vl);
+}
+
+// Segment 5
+__attribute__((used, retain)) void vlseg5e32_v_u32m1x5() {
+  auto data = __riscv_vlseg5e32_v_u32m1x5(in_buf, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x5_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x5_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x5_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x5_u32m1(data, 3),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 4, __riscv_vget_v_u32m1x5_u32m1(data, 4),
+                        vl);
+}
+
+// Segment 6
+__attribute__((used, retain)) void vlseg6e32_v_u32m1x6() {
+  auto data = __riscv_vlseg6e32_v_u32m1x6(in_buf, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x6_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x6_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x6_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x6_u32m1(data, 3),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 4, __riscv_vget_v_u32m1x6_u32m1(data, 4),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 5, __riscv_vget_v_u32m1x6_u32m1(data, 5),
+                        vl);
+}
+
+// Segment 7
+__attribute__((used, retain)) void vlseg7e32_v_u32m1x7() {
+  auto data = __riscv_vlseg7e32_v_u32m1x7(in_buf, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x7_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x7_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x7_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x7_u32m1(data, 3),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 4, __riscv_vget_v_u32m1x7_u32m1(data, 4),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 5, __riscv_vget_v_u32m1x7_u32m1(data, 5),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 6, __riscv_vget_v_u32m1x7_u32m1(data, 6),
+                        vl);
+}
+
+// Segment 8
+__attribute__((used, retain)) void vlseg8e32_v_u32m1x8() {
+  auto data = __riscv_vlseg8e32_v_u32m1x8(in_buf, vl);
+  __riscv_vse32_v_u32m1(out_buf, __riscv_vget_v_u32m1x8_u32m1(data, 0), vl);
+  __riscv_vse32_v_u32m1(out_buf + vl, __riscv_vget_v_u32m1x8_u32m1(data, 1),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 2, __riscv_vget_v_u32m1x8_u32m1(data, 2),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 3, __riscv_vget_v_u32m1x8_u32m1(data, 3),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 4, __riscv_vget_v_u32m1x8_u32m1(data, 4),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 5, __riscv_vget_v_u32m1x8_u32m1(data, 5),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 6, __riscv_vget_v_u32m1x8_u32m1(data, 6),
+                        vl);
+  __riscv_vse32_v_u32m1(out_buf + vl * 7, __riscv_vget_v_u32m1x8_u32m1(data, 7),
+                        vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vlseg2e32_v_u32m1x2;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load32_segment2_unit_m1.cc b/tests/cocotb/rvv/load_store/load32_segment2_unit_m1.cc
deleted file mode 100644
index bfb3a7b..0000000
--- a/tests/cocotb/rvv/load_store/load32_segment2_unit_m1.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2025 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <riscv_vector.h>
-#include <stdint.h>
-
-// Enough space for m2.
-uint32_t in_buf[16] __attribute__((section(".data")));
-uint32_t out_buf[16] __attribute__((section(".data")));
-
-__attribute__((used, retain)) void test_intrinsic(const uint32_t *x,
-                                                  uint32_t *y) {
-  vuint32m1x2_t v = __riscv_vlseg2e32_v_u32m1x2(in_buf, 8);
-
-  vuint32m2_t vv = __riscv_vcreate_v_u32m1_u32m2(
-      __riscv_vget_v_u32m1x2_u32m1(v, 0),
-      __riscv_vget_v_u32m1x2_u32m1(v, 1));
-
-  __riscv_vse32_v_u32m2(y, vv, /*vl=*/8);
-}
-
-int main(int argc, char **argv) {
-  test_intrinsic(in_buf, out_buf);
-  return 0;
-}
diff --git a/tests/cocotb/rvv/load_store/load32_segment2_unit_m2.cc b/tests/cocotb/rvv/load_store/load32_segment2_unit_m2.cc
deleted file mode 100644
index 5790e4f..0000000
--- a/tests/cocotb/rvv/load_store/load32_segment2_unit_m2.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2025 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <riscv_vector.h>
-#include <stdint.h>
-
-// Enough space for m4.
-uint32_t in_buf[32] __attribute__((section(".data")));
-uint32_t out_buf[32] __attribute__((section(".data")));
-
-__attribute__((used, retain)) void test_intrinsic(const uint32_t *x,
-                                                  uint32_t *y) {
-  vuint32m2x2_t v = __riscv_vlseg2e32_v_u32m2x2(in_buf, 8);
-
-  vuint32m4_t vv = __riscv_vcreate_v_u32m2_u32m4(
-      __riscv_vget_v_u32m2x2_u32m2(v, 0),
-      __riscv_vget_v_u32m2x2_u32m2(v, 1));
-
-  __riscv_vse32_v_u32m4(y, vv, /*vl=*/16);
-}
-
-int main(int argc, char **argv) {
-  test_intrinsic(in_buf, out_buf);
-  return 0;
-}
diff --git a/tests/cocotb/rvv/load_store/load8_seg_unit.cc b/tests/cocotb/rvv/load_store/load8_seg_unit.cc
new file mode 100644
index 0000000..a8aeb3c
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load8_seg_unit.cc
@@ -0,0 +1,325 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+// Double sized so we can check trailing regions are not read/written.
+constexpr size_t buf_size = 256;
+}  // namespace
+
+size_t vl __attribute__((section(".data"))) = 16;
+// These instructions don't differentiate signed/unsigned so we only need to
+// test one. The types come from intrinsic level.
+uint8_t in_buf[buf_size] __attribute__((section(".data")));
+uint8_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Segment 2
+__attribute__((used, retain)) void vlseg2e8_v_u8mf4x2() {
+  auto data = __riscv_vlseg2e8_v_u8mf4x2(in_buf, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x2_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x2_u8mf4(data, 1), vl);
+}
+
+__attribute__((used, retain)) void vlseg2e8_v_u8mf2x2() {
+  auto data = __riscv_vlseg2e8_v_u8mf2x2(in_buf, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x2_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x2_u8mf2(data, 1), vl);
+}
+
+__attribute__((used, retain)) void vlseg2e8_v_u8m1x2() {
+  auto data = __riscv_vlseg2e8_v_u8m1x2(in_buf, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x2_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x2_u8m1(data, 1), vl);
+}
+
+__attribute__((used, retain)) void vlseg2e8_v_u8m2x2() {
+  auto data = __riscv_vlseg2e8_v_u8m2x2(in_buf, vl);
+  __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x2_u8m2(data, 0), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x2_u8m2(data, 1), vl);
+}
+
+__attribute__((used, retain)) void vlseg2e8_v_u8m4x2() {
+  auto data = __riscv_vlseg2e8_v_u8m4x2(in_buf, vl);
+  __riscv_vse8_v_u8m4(out_buf, __riscv_vget_v_u8m4x2_u8m4(data, 0), vl);
+  __riscv_vse8_v_u8m4(out_buf + vl, __riscv_vget_v_u8m4x2_u8m4(data, 1), vl);
+}
+
+// Segment 3
+__attribute__((used, retain)) void vlseg3e8_v_u8mf4x3() {
+  auto data = __riscv_vlseg3e8_v_u8mf4x3(in_buf, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x3_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x3_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x3_u8mf4(data, 2),
+                       vl);
+}
+
+__attribute__((used, retain)) void vlseg3e8_v_u8mf2x3() {
+  auto data = __riscv_vlseg3e8_v_u8mf2x3(in_buf, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x3_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x3_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x3_u8mf2(data, 2),
+                       vl);
+}
+
+__attribute__((used, retain)) void vlseg3e8_v_u8m1x3() {
+  auto data = __riscv_vlseg3e8_v_u8m1x3(in_buf, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x3_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x3_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x3_u8m1(data, 2),
+                      vl);
+}
+
+__attribute__((used, retain)) void vlseg3e8_v_u8m2x3() {
+  auto data = __riscv_vlseg3e8_v_u8m2x3(in_buf, vl);
+  __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x3_u8m2(data, 0), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x3_u8m2(data, 1), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl * 2, __riscv_vget_v_u8m2x3_u8m2(data, 2),
+                      vl);
+}
+
+// Segment 4
+__attribute__((used, retain)) void vlseg4e8_v_u8mf4x4() {
+  auto data = __riscv_vlseg4e8_v_u8mf4x4(in_buf, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x4_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x4_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x4_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x4_u8mf4(data, 3),
+                       vl);
+}
+
+__attribute__((used, retain)) void vlseg4e8_v_u8mf2x4() {
+  auto data = __riscv_vlseg4e8_v_u8mf2x4(in_buf, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x4_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x4_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x4_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x4_u8mf2(data, 3),
+                       vl);
+}
+
+__attribute__((used, retain)) void vlseg4e8_v_u8m1x4() {
+  auto data = __riscv_vlseg4e8_v_u8m1x4(in_buf, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x4_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x4_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x4_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x4_u8m1(data, 3),
+                      vl);
+}
+
+__attribute__((used, retain)) void vlseg4e8_v_u8m2x4() {
+  auto data = __riscv_vlseg4e8_v_u8m2x4(in_buf, vl);
+  __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x4_u8m2(data, 0), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x4_u8m2(data, 1), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl * 2, __riscv_vget_v_u8m2x4_u8m2(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m2(out_buf + vl * 3, __riscv_vget_v_u8m2x4_u8m2(data, 3),
+                      vl);
+}
+
+// Segment 5
+__attribute__((used, retain)) void vlseg5e8_v_u8mf4x5() {
+  auto data = __riscv_vlseg5e8_v_u8mf4x5(in_buf, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x5_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x5_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x5_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x5_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x5_u8mf4(data, 4),
+                       vl);
+}
+
+__attribute__((used, retain)) void vlseg5e8_v_u8mf2x5() {
+  auto data = __riscv_vlseg5e8_v_u8mf2x5(in_buf, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x5_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x5_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x5_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x5_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x5_u8mf2(data, 4),
+                       vl);
+}
+
+__attribute__((used, retain)) void vlseg5e8_v_u8m1x5() {
+  auto data = __riscv_vlseg5e8_v_u8m1x5(in_buf, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x5_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x5_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x5_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x5_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x5_u8m1(data, 4),
+                      vl);
+}
+
+// Segment 6
+__attribute__((used, retain)) void vlseg6e8_v_u8mf4x6() {
+  auto data = __riscv_vlseg6e8_v_u8mf4x6(in_buf, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x6_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x6_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x6_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x6_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x6_u8mf4(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x6_u8mf4(data, 5),
+                       vl);
+}
+
+__attribute__((used, retain)) void vlseg6e8_v_u8mf2x6() {
+  auto data = __riscv_vlseg6e8_v_u8mf2x6(in_buf, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x6_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x6_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x6_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x6_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x6_u8mf2(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x6_u8mf2(data, 5),
+                       vl);
+}
+
+__attribute__((used, retain)) void vlseg6e8_v_u8m1x6() {
+  auto data = __riscv_vlseg6e8_v_u8m1x6(in_buf, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x6_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x6_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x6_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x6_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x6_u8m1(data, 4),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x6_u8m1(data, 5),
+                      vl);
+}
+
+// Segment 7
+__attribute__((used, retain)) void vlseg7e8_v_u8mf4x7() {
+  auto data = __riscv_vlseg7e8_v_u8mf4x7(in_buf, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x7_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x7_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x7_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x7_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x7_u8mf4(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x7_u8mf4(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 6, __riscv_vget_v_u8mf4x7_u8mf4(data, 6),
+                       vl);
+}
+
+__attribute__((used, retain)) void vlseg7e8_v_u8mf2x7() {
+  auto data = __riscv_vlseg7e8_v_u8mf2x7(in_buf, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x7_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x7_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x7_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x7_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x7_u8mf2(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x7_u8mf2(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 6, __riscv_vget_v_u8mf2x7_u8mf2(data, 6),
+                       vl);
+}
+
+__attribute__((used, retain)) void vlseg7e8_v_u8m1x7() {
+  auto data = __riscv_vlseg7e8_v_u8m1x7(in_buf, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x7_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x7_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x7_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x7_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x7_u8m1(data, 4),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x7_u8m1(data, 5),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 6, __riscv_vget_v_u8m1x7_u8m1(data, 6),
+                      vl);
+}
+
+// Segment 8
+__attribute__((used, retain)) void vlseg8e8_v_u8mf4x8() {
+  auto data = __riscv_vlseg8e8_v_u8mf4x8(in_buf, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x8_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x8_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x8_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x8_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x8_u8mf4(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x8_u8mf4(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 6, __riscv_vget_v_u8mf4x8_u8mf4(data, 6),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 7, __riscv_vget_v_u8mf4x8_u8mf4(data, 7),
+                       vl);
+}
+
+__attribute__((used, retain)) void vlseg8e8_v_u8mf2x8() {
+  auto data = __riscv_vlseg8e8_v_u8mf2x8(in_buf, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x8_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x8_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x8_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x8_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x8_u8mf2(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x8_u8mf2(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 6, __riscv_vget_v_u8mf2x8_u8mf2(data, 6),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 7, __riscv_vget_v_u8mf2x8_u8mf2(data, 7),
+                       vl);
+}
+
+__attribute__((used, retain)) void vlseg8e8_v_u8m1x8() {
+  auto data = __riscv_vlseg8e8_v_u8m1x8(in_buf, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x8_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x8_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x8_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x8_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x8_u8m1(data, 4),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x8_u8m1(data, 5),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 6, __riscv_vget_v_u8m1x8_u8m1(data, 6),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 7, __riscv_vget_v_u8m1x8_u8m1(data, 7),
+                      vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vlseg2e8_v_u8m1x2;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load8_segment2_unit_m1.cc b/tests/cocotb/rvv/load_store/load8_segment2_unit_m1.cc
deleted file mode 100644
index 6d187d9..0000000
--- a/tests/cocotb/rvv/load_store/load8_segment2_unit_m1.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2025 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <riscv_vector.h>
-#include <stdint.h>
-
-// Enough space for m2.
-uint8_t in_buf[64] __attribute__((section(".data")));
-uint8_t out_buf[64] __attribute__((section(".data")));
-
-__attribute__((used, retain)) void test_intrinsic(const uint8_t *x,
-                                                  uint8_t *y) {
-  vuint8m1x2_t v = __riscv_vlseg2e8_v_u8m1x2(in_buf, 32);
-
-  vuint8m2_t vv = __riscv_vcreate_v_u8m1_u8m2(
-      __riscv_vget_v_u8m1x2_u8m1(v, 0),
-      __riscv_vget_v_u8m1x2_u8m1(v, 1));
-
-  __riscv_vse8_v_u8m2(y, vv, /*vl=*/32);
-}
-
-int main(int argc, char **argv) {
-  test_intrinsic(in_buf, out_buf);
-  return 0;
-}
diff --git a/tests/cocotb/rvv/load_store/load8_segment2_unit_m2.cc b/tests/cocotb/rvv/load_store/load8_segment2_unit_m2.cc
deleted file mode 100644
index 2f3a10f..0000000
--- a/tests/cocotb/rvv/load_store/load8_segment2_unit_m2.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2025 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <riscv_vector.h>
-#include <stdint.h>
-
-// Enough space for m4.
-uint8_t in_buf[128] __attribute__((section(".data")));
-uint8_t out_buf[128] __attribute__((section(".data")));
-
-__attribute__((used, retain)) void test_intrinsic(const uint8_t *x,
-                                                  uint8_t *y) {
-  auto v = __riscv_vlseg2e8_v_u8m2x2(in_buf, 32);
-
-  vuint8m4_t vv = __riscv_vcreate_v_u8m2_u8m4(
-      __riscv_vget_v_u8m2x2_u8m2(v, 0),
-      __riscv_vget_v_u8m2x2_u8m2(v, 1));
-
-  __riscv_vse8_v_u8m4(y, vv, /*vl=*/64);
-}
-
-int main(int argc, char **argv) {
-  test_intrinsic(in_buf, out_buf);
-  return 0;
-}
diff --git a/tests/cocotb/rvv/load_store/store8_seg_unit.cc b/tests/cocotb/rvv/load_store/store8_seg_unit.cc
index 399de73..8086b34 100644
--- a/tests/cocotb/rvv/load_store/store8_seg_unit.cc
+++ b/tests/cocotb/rvv/load_store/store8_seg_unit.cc
@@ -20,7 +20,7 @@
 constexpr size_t buf_size = 256;
 }  // namespace
 
-size_t vl __attribute__((section(".data"))) = 64;
+size_t vl __attribute__((section(".data"))) = 16;
 // These instructions don't differentiate signed/unsigned so we only need to
 // test one. The types come from intrinsic level.
 uint8_t in_buf[buf_size] __attribute__((section(".data")));
diff --git a/tests/cocotb/rvv/vcpop_test.cc b/tests/cocotb/rvv/vcpop_test.cc
index a91826e..8d46758 100644
--- a/tests/cocotb/rvv/vcpop_test.cc
+++ b/tests/cocotb/rvv/vcpop_test.cc
@@ -19,10 +19,9 @@
 constexpr size_t buf_size = 16;
 }
 
-size_t vl __attribute__((section(".data"))) = buf_size;
+size_t vl __attribute__((section(".data"))) = 16;
 uint8_t in_buf[buf_size] __attribute__((section(".data")));
 uint32_t result __attribute__((section(".data")));
-void (*impl)() __attribute__((section(".data"))) = nullptr;
 
 extern "C" {
 __attribute__((used, retain)) void vcpop_m_b1() {
@@ -62,6 +61,8 @@
 }
 }
 
+void (*impl)() __attribute__((section(".data"))) = vcpop_m_b8;
+
 int main(int argc, char** argv) {
   impl();
 
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py
index f7d0b40..b22f8b9 100644
--- a/tests/cocotb/rvv_load_store_test.py
+++ b/tests/cocotb/rvv_load_store_test.py
@@ -15,8 +15,8 @@
 import cocotb
 import numpy as np
 
-from kelvin_test_utils.sim_test_fixture import Fixture
 from bazel_tools.tools.python.runfiles import runfiles
+from kelvin_test_utils.sim_test_fixture import Fixture
 
 
 async def vector_load_store(
@@ -378,75 +378,202 @@
         pattern=list(range(0, 8)),
     )
 
+
 @cocotb.test()
-async def load8_segment2_unit_m1(dut):
-    await vector_load_store(
-        dut=dut,
-        elf_name='load8_segment2_unit_m1.elf',
-        dtype=np.uint8,
-        in_size=64,
-        out_size=64,
-        pattern=(list(range(0, 32, 2)) + list(range(1, 32, 2))),
+async def load8_seg_unit(dut):
+    """Test vlseg*e8 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int, n_segs: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'in_size': vl * n_segs * 2,
+            'out_size': vl * n_segs * 2,
+            'pattern':[
+                i * n_segs + j
+                for j in range(n_segs) for i in range(vl)]
+        }
+
+    await vector_load_store_v2(
+        dut = dut,
+        elf_name = 'load8_seg_unit.elf',
+        cases = [
+            # Seg 2
+            make_test_case('vlseg2e8_v_u8mf4x2', vl=4, n_segs=2),
+            make_test_case('vlseg2e8_v_u8mf4x2', vl=3, n_segs=2),
+            make_test_case('vlseg2e8_v_u8mf2x2', vl=8, n_segs=2),
+            make_test_case('vlseg2e8_v_u8mf2x2', vl=7, n_segs=2),
+            make_test_case('vlseg2e8_v_u8m1x2', vl=16, n_segs=2),
+            make_test_case('vlseg2e8_v_u8m1x2', vl=15, n_segs=2),
+            make_test_case('vlseg2e8_v_u8m2x2', vl=32, n_segs=2),
+            make_test_case('vlseg2e8_v_u8m2x2', vl=31, n_segs=2),
+            make_test_case('vlseg2e8_v_u8m4x2', vl=64, n_segs=2),
+            make_test_case('vlseg2e8_v_u8m4x2', vl=63, n_segs=2),
+            # Seg 3
+            make_test_case('vlseg3e8_v_u8mf4x3', vl=4, n_segs=3),
+            make_test_case('vlseg3e8_v_u8mf4x3', vl=3, n_segs=3),
+            make_test_case('vlseg3e8_v_u8mf2x3', vl=8, n_segs=3),
+            make_test_case('vlseg3e8_v_u8mf2x3', vl=7, n_segs=3),
+            make_test_case('vlseg3e8_v_u8m1x3', vl=16, n_segs=3),
+            make_test_case('vlseg3e8_v_u8m1x3', vl=15, n_segs=3),
+            make_test_case('vlseg3e8_v_u8m2x3', vl=32, n_segs=3),
+            make_test_case('vlseg3e8_v_u8m2x3', vl=31, n_segs=3),
+            # Seg 4
+            make_test_case('vlseg4e8_v_u8mf4x4', vl=4, n_segs=4),
+            make_test_case('vlseg4e8_v_u8mf4x4', vl=3, n_segs=4),
+            make_test_case('vlseg4e8_v_u8mf2x4', vl=8, n_segs=4),
+            make_test_case('vlseg4e8_v_u8mf2x4', vl=7, n_segs=4),
+            make_test_case('vlseg4e8_v_u8m1x4', vl=16, n_segs=4),
+            make_test_case('vlseg4e8_v_u8m1x4', vl=15, n_segs=4),
+            make_test_case('vlseg4e8_v_u8m2x4', vl=32, n_segs=4),
+            make_test_case('vlseg4e8_v_u8m2x4', vl=31, n_segs=4),
+            # Seg 5
+            make_test_case('vlseg5e8_v_u8mf4x5', vl=4, n_segs=5),
+            make_test_case('vlseg5e8_v_u8mf4x5', vl=3, n_segs=5),
+            make_test_case('vlseg5e8_v_u8mf2x5', vl=8, n_segs=5),
+            make_test_case('vlseg5e8_v_u8mf2x5', vl=7, n_segs=5),
+            make_test_case('vlseg5e8_v_u8m1x5', vl=16, n_segs=5),
+            make_test_case('vlseg5e8_v_u8m1x5', vl=15, n_segs=5),
+            # Seg 6
+            make_test_case('vlseg6e8_v_u8mf4x6', vl=4, n_segs=6),
+            make_test_case('vlseg6e8_v_u8mf4x6', vl=3, n_segs=6),
+            make_test_case('vlseg6e8_v_u8mf2x6', vl=8, n_segs=6),
+            make_test_case('vlseg6e8_v_u8mf2x6', vl=7, n_segs=6),
+            make_test_case('vlseg6e8_v_u8m1x6', vl=16, n_segs=6),
+            make_test_case('vlseg6e8_v_u8m1x6', vl=15, n_segs=6),
+            # Seg 7
+            make_test_case('vlseg7e8_v_u8mf4x7', vl=4, n_segs=7),
+            make_test_case('vlseg7e8_v_u8mf4x7', vl=3, n_segs=7),
+            make_test_case('vlseg7e8_v_u8mf2x7', vl=8, n_segs=7),
+            make_test_case('vlseg7e8_v_u8mf2x7', vl=7, n_segs=7),
+            make_test_case('vlseg7e8_v_u8m1x7', vl=16, n_segs=7),
+            make_test_case('vlseg7e8_v_u8m1x7', vl=15, n_segs=7),
+            # Seg 8
+            make_test_case('vlseg8e8_v_u8mf4x8', vl=4, n_segs=8),
+            make_test_case('vlseg8e8_v_u8mf4x8', vl=3, n_segs=8),
+            make_test_case('vlseg8e8_v_u8mf2x8', vl=8, n_segs=8),
+            make_test_case('vlseg8e8_v_u8mf2x8', vl=7, n_segs=8),
+            make_test_case('vlseg8e8_v_u8m1x8', vl=16, n_segs=8),
+            make_test_case('vlseg8e8_v_u8m1x8', vl=15, n_segs=8),
+        ],
+        dtype = np.uint8,
     )
 
 
 @cocotb.test()
-async def load16_segment2_unit_m1(dut):
-    await vector_load_store(
-        dut=dut,
-        elf_name='load16_segment2_unit_m1.elf',
-        dtype=np.uint16,
-        in_size=32,
-        out_size=32,
-        pattern=(list(range(0, 16, 2)) + list(range(1, 16, 2))),
+async def load16_seg_unit(dut):
+    """Test vlseg*e16 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int, n_segs: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'in_size': vl * n_segs * 2,
+            'out_size': vl * n_segs * 2,
+            'pattern':[
+                i * n_segs + j
+                for j in range(n_segs) for i in range(vl)]
+        }
+
+    await vector_load_store_v2(
+        dut = dut,
+        elf_name = 'load16_seg_unit.elf',
+        cases = [
+            # Seg 2
+            make_test_case('vlseg2e16_v_u16mf2x2', vl=4, n_segs=2),
+            make_test_case('vlseg2e16_v_u16mf2x2', vl=3, n_segs=2),
+            make_test_case('vlseg2e16_v_u16m1x2', vl=8, n_segs=2),
+            make_test_case('vlseg2e16_v_u16m1x2', vl=7, n_segs=2),
+            make_test_case('vlseg2e16_v_u16m2x2', vl=16, n_segs=2),
+            make_test_case('vlseg2e16_v_u16m2x2', vl=15, n_segs=2),
+            make_test_case('vlseg2e16_v_u16m4x2', vl=32, n_segs=2),
+            make_test_case('vlseg2e16_v_u16m4x2', vl=31, n_segs=2),
+            # Seg 3
+            make_test_case('vlseg3e16_v_u16mf2x3', vl=4, n_segs=3),
+            make_test_case('vlseg3e16_v_u16mf2x3', vl=3, n_segs=3),
+            make_test_case('vlseg3e16_v_u16m1x3', vl=8, n_segs=3),
+            make_test_case('vlseg3e16_v_u16m1x3', vl=7, n_segs=3),
+            make_test_case('vlseg3e16_v_u16m2x3', vl=16, n_segs=3),
+            make_test_case('vlseg3e16_v_u16m2x3', vl=15, n_segs=3),
+            # Seg 4
+            make_test_case('vlseg4e16_v_u16mf2x4', vl=4, n_segs=4),
+            make_test_case('vlseg4e16_v_u16mf2x4', vl=3, n_segs=4),
+            make_test_case('vlseg4e16_v_u16m1x4', vl=8, n_segs=4),
+            make_test_case('vlseg4e16_v_u16m1x4', vl=7, n_segs=4),
+            make_test_case('vlseg4e16_v_u16m2x4', vl=16, n_segs=4),
+            make_test_case('vlseg4e16_v_u16m2x4', vl=15, n_segs=4),
+            # Seg 5
+            make_test_case('vlseg5e16_v_u16mf2x5', vl=4, n_segs=5),
+            make_test_case('vlseg5e16_v_u16mf2x5', vl=3, n_segs=5),
+            make_test_case('vlseg5e16_v_u16m1x5', vl=8, n_segs=5),
+            make_test_case('vlseg5e16_v_u16m1x5', vl=7, n_segs=5),
+            # Seg 6
+            make_test_case('vlseg6e16_v_u16mf2x6', vl=4, n_segs=6),
+            make_test_case('vlseg6e16_v_u16mf2x6', vl=3, n_segs=6),
+            make_test_case('vlseg6e16_v_u16m1x6', vl=8, n_segs=6),
+            make_test_case('vlseg6e16_v_u16m1x6', vl=7, n_segs=6),
+            # Seg 7
+            make_test_case('vlseg7e16_v_u16mf2x7', vl=4, n_segs=7),
+            make_test_case('vlseg7e16_v_u16mf2x7', vl=3, n_segs=7),
+            make_test_case('vlseg7e16_v_u16m1x7', vl=8, n_segs=7),
+            make_test_case('vlseg7e16_v_u16m1x7', vl=7, n_segs=7),
+            # Seg 8
+            make_test_case('vlseg8e16_v_u16mf2x8', vl=4, n_segs=8),
+            make_test_case('vlseg8e16_v_u16mf2x8', vl=3, n_segs=8),
+            make_test_case('vlseg8e16_v_u16m1x8', vl=8, n_segs=8),
+            make_test_case('vlseg8e16_v_u16m1x8', vl=7, n_segs=8),
+        ],
+        dtype = np.uint16,
     )
 
 
 @cocotb.test()
-async def load32_segment2_unit_m1(dut):
-    await vector_load_store(
-        dut=dut,
-        elf_name='load32_segment2_unit_m1.elf',
-        dtype=np.uint32,
-        in_size=16,
-        out_size=16,
-        pattern=(list(range(0, 8, 2)) + list(range(1, 8, 2))),
-    )
+async def load32_seg_unit(dut):
+    """Test vlseg*e32 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int, n_segs: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'in_size': vl * n_segs * 2,
+            'out_size': vl * n_segs * 2,
+            'pattern':[
+                i * n_segs + j
+                for j in range(n_segs) for i in range(vl)]
+        }
 
-
-@cocotb.test()
-async def load8_segment2_unit_m2(dut):
-    await vector_load_store(
-        dut=dut,
-        elf_name='load8_segment2_unit_m2.elf',
-        dtype=np.uint8,
-        in_size=128,
-        out_size=128,
-        pattern=(list(range(0, 63, 2)) + list(range(1, 64, 2))),
-    )
-
-
-@cocotb.test()
-async def load16_segment2_unit_m2(dut):
-    await vector_load_store(
-        dut=dut,
-        elf_name='load16_segment2_unit_m2.elf',
-        dtype=np.uint16,
-        in_size=64,
-        out_size=64,
-        pattern=(list(range(0, 31, 2)) + list(range(1, 32, 2))),
-    )
-
-
-@cocotb.test()
-async def load32_segment2_unit_m2(dut):
-    await vector_load_store(
-        dut=dut,
-        elf_name='load32_segment2_unit_m2.elf',
-        dtype=np.uint32,
-        in_size=32,
-        out_size=32,
-        pattern=(list(range(0, 15, 2)) + list(range(1, 16, 2))),
+    await vector_load_store_v2(
+        dut = dut,
+        elf_name = 'load32_seg_unit.elf',
+        cases = [
+            # Seg 2
+            make_test_case('vlseg2e32_v_u32m1x2', vl=4, n_segs=2),
+            make_test_case('vlseg2e32_v_u32m1x2', vl=3, n_segs=2),
+            make_test_case('vlseg2e32_v_u32m2x2', vl=8, n_segs=2),
+            make_test_case('vlseg2e32_v_u32m2x2', vl=7, n_segs=2),
+            make_test_case('vlseg2e32_v_u32m4x2', vl=16, n_segs=2),
+            make_test_case('vlseg2e32_v_u32m4x2', vl=15, n_segs=2),
+            # Seg 3
+            make_test_case('vlseg3e32_v_u32m1x3', vl=4, n_segs=3),
+            make_test_case('vlseg3e32_v_u32m1x3', vl=3, n_segs=3),
+            make_test_case('vlseg3e32_v_u32m2x3', vl=8, n_segs=3),
+            make_test_case('vlseg3e32_v_u32m2x3', vl=7, n_segs=3),
+            # Seg 4
+            make_test_case('vlseg4e32_v_u32m1x4', vl=4, n_segs=4),
+            make_test_case('vlseg4e32_v_u32m1x4', vl=3, n_segs=4),
+            make_test_case('vlseg4e32_v_u32m2x4', vl=8, n_segs=4),
+            make_test_case('vlseg4e32_v_u32m2x4', vl=7, n_segs=4),
+            # Seg 5
+            make_test_case('vlseg5e32_v_u32m1x5', vl=4, n_segs=5),
+            make_test_case('vlseg5e32_v_u32m1x5', vl=3, n_segs=5),
+            # Seg 6
+            make_test_case('vlseg6e32_v_u32m1x6', vl=4, n_segs=6),
+            make_test_case('vlseg6e32_v_u32m1x6', vl=3, n_segs=6),
+            # Seg 7
+            make_test_case('vlseg7e32_v_u32m1x7', vl=4, n_segs=7),
+            make_test_case('vlseg7e32_v_u32m1x7', vl=3, n_segs=7),
+            # Seg 8
+            make_test_case('vlseg8e32_v_u32m1x8', vl=4, n_segs=8),
+            make_test_case('vlseg8e32_v_u32m1x8', vl=3, n_segs=8),
+        ],
+        dtype = np.uint32,
     )
 
 
@@ -493,7 +620,7 @@
 
 @cocotb.test()
 async def store8_seg_unit(dut):
-    """Test vsseg*e8  usage accessible from intrinsics."""
+    """Test vsseg*e8 usage accessible from intrinsics."""
     def make_test_case(impl: str, vl: int, n_segs: int):
         return {
             'impl': impl,