Add support for segmented vector loads in the LSU.

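Segmented accesses are decomposed into one pass per field: the strided
address generator is reused with the base address offset by
field_index * SEW bytes and the stride set to the distance between
consecutive segments ((nf + 1) * SEW bytes for unit-stride segment ops,
the rs2 byte stride for strided ones). LMUL > 1 repeats the per-field
sequence, advancing the base address past the segments already
transferred.

A rough Python sketch of the per-field address pattern (illustrative
only; the names below are not taken from the RTL):

  # Field f of segment s is read from base + s*seg_stride + f*sew_bytes,
  # where seg_stride is (nf + 1)*sew_bytes for unit-stride segment ops
  # and the rs2 byte stride for strided segment ops.
  def segment_field_addrs(base, sew_bytes, nf, seg_stride, n_segments):
      return [[base + s * seg_stride + f * sew_bytes
               for s in range(n_segments)]
              for f in range(nf + 1)]

  # Example: a two-field e8 unit-stride segment load from base 0 with
  # four segments: field 0 -> [0, 2, 4, 6], field 1 -> [1, 3, 5, 7].
  assert segment_field_addrs(0, 1, 1, 2, 4) == [[0, 2, 4, 6], [1, 3, 5, 7]]

New cocotb tests cover two-field unit-stride loads (e8/e16/e32 at m1 and
m2), two-field strided loads (e8/e16 at m1), and unit-stride load/store
at e16/e32 with m2.
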
Change-Id: Iabe9a16f4a44e5c3815a1af994d3c6a66cb1a0cd
diff --git a/hdl/chisel/src/kelvin/scalar/Decode.scala b/hdl/chisel/src/kelvin/scalar/Decode.scala
index 8b0715d..5f93edc 100644
--- a/hdl/chisel/src/kelvin/scalar/Decode.scala
+++ b/hdl/chisel/src/kelvin/scalar/Decode.scala
@@ -636,6 +636,7 @@
     io.lsu(i).bits.pc := io.inst(i).bits.addr
     if (p.enableRvv) {
       io.lsu(i).bits.elemWidth.get := io.inst(i).bits.inst(14,12)
+      io.lsu(i).bits.nfields.get := io.inst(i).bits.inst(31,29)
     }
 
     // -------------------------------------------------------------------------
@@ -1105,6 +1106,7 @@
   io.lsu.bits.pc := io.inst.bits.addr
   if (p.enableRvv) {
     io.lsu.bits.elemWidth.get := io.inst.bits.inst(14,12)
+    io.lsu.bits.nfields.get := io.inst.bits.inst(31,29)
   }
 
   // MLU opcode.
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 94bd30b..e3e390f 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -140,10 +140,11 @@
   val op = LsuOp()
   val pc = UInt(32.W)
   val elemWidth = Option.when(p.enableRvv) { UInt(3.W) }
+  val nfields = Option.when(p.enableRvv) { UInt(3.W) }
 
   override def toPrintable: Printable = {
     cf"LsuCmd(store -> ${store}, addr -> 0x${addr}%x, op -> ${op}, " +
-    cf"pc -> 0x${pc}%x, elemWidth -> ${elemWidth})"
+    cf"pc -> 0x${pc}%x, elemWidth -> ${elemWidth}, nfields -> ${nfields})"
   }
 }
 
@@ -156,6 +157,7 @@
   val data = UInt(32.W)  // Doubles as rs2
   val elemWidth = Option.when(p.enableRvv) { UInt(3.W) }
   val lmul = Option.when(p.enableRvv) { UInt(4.W) }
+  val nfields = Option.when(p.enableRvv) { UInt(3.W) }
 
   override def toPrintable: Printable = {
     cf"LsuUOp(store -> ${store}, rd -> ${rd}, op -> ${op}, " +
@@ -185,6 +187,7 @@
     }
     if (p.enableRvv) {
       result.elemWidth.get := cmd.elemWidth.get
+      result.nfields.get := cmd.nfields.get
       // Treat fractional LMULs as LMUL=1
       val effectiveLmul = Mux(rvvState.get.bits.lmul(2),
                               0.U(2.W), rvvState.get.bits.lmul(1, 0))
@@ -253,14 +256,18 @@
   val pendingVector = Bool()
   val pendingWriteback = Bool()
   val lmul = UInt(4.W)
-  val stride = UInt(32.W)
+  val elemStride = UInt(32.W)     // Byte stride between consecutive segments
+  val segmentStride = UInt(32.W)  // Byte offset between the fields within a segment
   val elemWidth = UInt(3.W)
+  val nfields = UInt(3.W)         // nf: fields per segment, minus one
+  val segment = UInt(3.W)         // Current field index (counts 0 to nfields)
 
   // If the slot has no pending tasks and can accept a new operation
   def slotIdle(): Bool = {
     !(pendingVector ||                        // Awaiting data from RVV Core
       active.reduce(_||_) ||                  // Active transaction
       pendingWriteback ||                     // Send result back to regfile
+      (nfields =/= segment) ||                // Segment fields remaining
       (LsuOp.isVector(op) && (lmul =/= 0.U))  // More operations in progress
     )
   }
@@ -298,17 +305,20 @@
     result.pendingWriteback := pendingWriteback
     result.lmul := lmul
     result.baseAddr := baseAddr
-    result.stride := stride
+    result.elemStride := elemStride
+    result.segmentStride := segmentStride
+
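+    // Offset the base address for the current pass: 'segment' selects which
+    // of the nfields + 1 fields is being transferred.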
+    val segmentBaseAddr = baseAddr + (segmentStride * segment)
     result.addrs := MuxCase(addrs, Seq(
         op.isOneOf(LsuOp.VLOAD_UNIT, LsuOp.VSTORE_UNIT) ->
-            VecInit((0 until bytesPerSlot).map(i => baseAddr + i.U)),
+            ComputeStridedAddrs(bytesPerSlot, segmentBaseAddr, elemStride, elemWidth),
         op.isOneOf(LsuOp.VLOAD_STRIDED, LsuOp.VSTORE_STRIDED) ->
-            ComputeStridedAddrs(bytesPerSlot, baseAddr, stride, elemWidth),
+            ComputeStridedAddrs(bytesPerSlot, segmentBaseAddr, elemStride, elemWidth),
         op.isOneOf(LsuOp.VLOAD_OINDEXED, LsuOp.VLOAD_UINDEXED,
                    LsuOp.VSTORE_OINDEXED, LsuOp.VSTORE_UINDEXED) ->
             ComputeIndexedAddrs(bytesPerSlot, baseAddr, rvv2lsu.idx.bits.data,
                                 elemWidth)
-        // TODO(derekjchow): Support segmented
     ))
     result.elemWidth := elemWidth
 
@@ -317,6 +327,9 @@
     result.active := Mux(updated && LsuOp.isVector(op) && rvv2lsu.mask.valid,
         VecInit(rvv2lsu.mask.bits.asBools), active)
     result.pendingVector := pendingVector && !updated
+    result.nfields := nfields
+    result.segment := segment
+
     result
   }
 
@@ -349,8 +362,11 @@
         i => active(i) & ~lineActive(i))
     result.data := VecInit((0 until bytesPerSlot).map(
         i => Mux(lineActive(i), gatheredData(i), data(i))))
-    result.stride := stride
+    result.elemStride := elemStride
+    result.segmentStride := segmentStride
     result.elemWidth := elemWidth
+    result.nfields := nfields
+    result.segment := segment
 
     result
   }
@@ -370,35 +386,51 @@
     result.addrs := addrs
     result.active := active
     result.data := data
-    result.stride := stride
+    result.elemStride := elemStride
+    result.segmentStride := segmentStride
     result.elemWidth := elemWidth
+    result.nfields := nfields
 
     val vectorWriteback = writeback && LsuOp.isVector(op)
+
+    val segmentNext = MuxCase(segment, Seq(
+      // Final segment, No next LMUL: don't reset segment
+      (vectorWriteback && (segment === nfields) && (lmul === 1.U)) -> segment,
+      // Final segment, next LMUL: reset segment
+      (vectorWriteback && (segment === nfields)) -> 0.U,
+      // Next segment
+      vectorWriteback -> (segment + 1.U),
+    ))
+    result.segment := segmentNext
+
+    val lmulUpdate = vectorWriteback && (segment === nfields)
     val lmulNext = MuxCase(lmul, Seq(
+        // Don't decrease below 0!
         (lmul === 0.U) -> 0.U,
-        vectorWriteback -> (lmul - 1.U),
+        // Move to next LMUL if final segment
+        (lmulUpdate) -> (lmul - 1.U),
     ))
     result.lmul := lmulNext
     result.pendingVector := MuxCase(pendingVector, Seq(
         (!writeback) -> pendingWriteback,
-        (!LsuOp.isVector(op)) -> false.B, // No vector update for non-vector
-        (lmulNext =/= 0.U) -> true.B,     // Next LMUL
-        (lmulNext === 0.U) -> false.B,    // Final LMUL
+        (!LsuOp.isVector(op)) -> false.B,    // No vector update for non-vector
+        (lmulNext =/= 0.U) -> true.B,        // Next LMUL
+        (lmulNext === 0.U) -> false.B,       // Final LMUL
     ))
     result.pendingWriteback := MuxCase(pendingWriteback, Seq(
       (!writeback) -> pendingWriteback,
-      (!LsuOp.isVector(op)) -> false.B, // One writeback for non-vector ops
-      (lmulNext =/= 0.U) -> true.B,     // Next LMUL
-      (lmulNext === 0.U) -> false.B,    // Final LMUL
+      (!LsuOp.isVector(op)) -> false.B,    // One writeback for non-vector ops
+      (lmulNext =/= 0.U) -> true.B,        // Next LMUL
+      (lmulNext === 0.U) -> false.B,       // Final LMUL
     ))
 
     result.baseAddr := MuxCase(baseAddr, Seq(
-      !writeback -> baseAddr,
-      // For Unit Updates
-      op.isOneOf(LsuOp.VLOAD_UNIT, LsuOp.VSTORE_UNIT) -> (baseAddr + 16.U),
-      // For Strided Updates
-      op.isOneOf(LsuOp.VLOAD_STRIDED, LsuOp.VSTORE_STRIDED) ->
-          (baseAddr + (stride*16.U)),
+      (!writeback || !lmulUpdate) -> baseAddr,
+      // For unit-stride and strided updates
+      op.isOneOf(LsuOp.VLOAD_UNIT, LsuOp.VSTORE_UNIT,
+                 LsuOp.VLOAD_STRIDED, LsuOp.VSTORE_STRIDED) ->
+          (baseAddr + (nfields * 16.U) + 16.U),
+      // Indexed ops keep the base address unchanged.
     ))
     result.rd := rd + 1.U  // Move to next vector register
 
@@ -427,8 +459,11 @@
     result.addrs := addrs
     result.data := data
     result.lmul := lmul
-    result.stride := stride
+    result.elemStride := elemStride
+    result.segmentStride := segmentStride
     result.elemWidth := elemWidth
+    result.nfields := nfields
+    result.segment := segment
     result
   }
 
@@ -456,6 +491,8 @@
         cf"  $i: ${active(i)}, 0x${addrs(i)}%x, 0x${data(i)}%x\n")
     cf"store: $store\n  op: ${op}\n  pendingVector: ${pendingVector}\n" +
     cf"  pendingWriteback: ${pendingWriteback}\n  lmul: ${lmul}\n" +
+    cf"  nfields: ${nfields}\n  segment: ${segment}\n" +
+    cf"  elemWidth: 0b${elemWidth}%b elemStride: ${elemStride}\n" +
     lines.reduce(_+_)
   }
 }
@@ -471,7 +508,13 @@
     result.rd := uop.rd
     result.store := uop.store
     result.pc := uop.pc
-    result.lmul := uop.lmul.getOrElse(0.U)
+    if (p.enableRvv) {
+      result.lmul := uop.lmul.getOrElse(0.U)
+      result.nfields := Mux(LsuOp.isVector(uop.op), uop.nfields.get, 0.U)
+      result.segment := 0.U
+    } else {
+      result.lmul := 0.U
+      // Keep segment bookkeeping inert when RVV is disabled.
+      result.nfields := 0.U
+      result.segment := 0.U
+    }
 
     // All vector ops require writeback. Lsu needs to inform RVV core store uop
     // has completed.
@@ -494,7 +537,18 @@
         uop.op.isOneOf(LsuOp.VLOAD_STRIDED, LsuOp.VSTORE_STRIDED),
         ComputeStridedAddrs(bytesPerSlot, uop.addr, uop.data, uop.elemWidth.getOrElse(0.U(3.W))),
         VecInit((0 until bytesPerSlot).map(i => uop.addr + i.U)))
-    result.stride := uop.data
+
+    val unitStride = MuxCase(1.U, Seq(
+        (uop.elemWidth.get === "b000".U) -> 1.U,  // 1-byte elements
+        (uop.elemWidth.get === "b101".U) -> 2.U,  // 2-byte elements
+        (uop.elemWidth.get === "b110".U) -> 4.U,  // 4-byte elements
+    ))
+
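+    // segmentStride is the byte offset between the fields of one segment
+    // (one element, SEW bytes). elemStride is the byte distance between
+    // consecutive segments: (nfields + 1) * SEW for unit-stride segment
+    // ops, or the rs2 byte stride for strided ops.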
+    result.segmentStride := unitStride
+    result.elemStride := Mux(
+        uop.op.isOneOf(LsuOp.VLOAD_UNIT, LsuOp.VSTORE_UNIT),
+        unitStride + (uop.nfields.get * unitStride),
+        uop.data)
 
     result.data(0) := uop.data(7, 0)
     result.data(1) := uop.data(15, 8)
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index d274166..45f2ab4 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -316,10 +316,20 @@
     ],
     data = [
         '//tests/cocotb/rvv/load_store:load8_indexed_m1.elf',
+        '//tests/cocotb/rvv/load_store:load8_segment2_unit_m1.elf',
+        '//tests/cocotb/rvv/load_store:load16_segment2_unit_m1.elf',
+        '//tests/cocotb/rvv/load_store:load32_segment2_unit_m1.elf',
+        '//tests/cocotb/rvv/load_store:load8_segment2_unit_m2.elf',
+        '//tests/cocotb/rvv/load_store:load16_segment2_unit_m2.elf',
+        '//tests/cocotb/rvv/load_store:load32_segment2_unit_m2.elf',
+        '//tests/cocotb/rvv/load_store:load8_segment2_stride6_m1.elf',
+        '//tests/cocotb/rvv/load_store:load16_segment2_stride6_m1.elf',
         '//tests/cocotb/rvv/load_store:load8_stride2_m1.elf',
         '//tests/cocotb/rvv/load_store:load8_stride2_m1_partial.elf',
         '//tests/cocotb/rvv/load_store:load8_stride2_mf4.elf',
         '//tests/cocotb/rvv/load_store:load_store8_unit_m2.elf',
+        '//tests/cocotb/rvv/load_store:load_store16_unit_m2.elf',
+        '//tests/cocotb/rvv/load_store:load_store32_unit_m2.elf',
         '//tests/cocotb/rvv/load_store:load16_stride4_m1.elf',
         '//tests/cocotb/rvv/load_store:load16_stride4_m1_partial.elf',
         '//tests/cocotb/rvv/load_store:load16_stride4_mf2.elf',
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD
index d19281c..2664908 100644
--- a/tests/cocotb/rvv/load_store/BUILD
+++ b/tests/cocotb/rvv/load_store/BUILD
@@ -23,6 +23,30 @@
         "load8_indexed_m1": {
             "srcs": ["load8_indexed_m1.cc"],
         },
+        "load8_segment2_unit_m1": {
+            "srcs": ["load8_segment2_unit_m1.cc"],
+        },
+        "load16_segment2_unit_m1": {
+            "srcs": ["load16_segment2_unit_m1.cc"],
+        },
+        "load32_segment2_unit_m1": {
+            "srcs": ["load32_segment2_unit_m1.cc"],
+        },
+        "load8_segment2_unit_m2": {
+            "srcs": ["load8_segment2_unit_m2.cc"],
+        },
+        "load16_segment2_unit_m2": {
+            "srcs": ["load16_segment2_unit_m2.cc"],
+        },
+        "load32_segment2_unit_m2": {
+            "srcs": ["load32_segment2_unit_m2.cc"],
+        },
+        "load8_segment2_stride6_m1": {
+            "srcs": ["load8_segment2_stride6_m1.cc"],
+        },
+        "load16_segment2_stride6_m1": {
+            "srcs": ["load16_segment2_stride6_m1.cc"],
+        },
         "load8_stride2_m1": {
             "srcs": ["load8_stride2_m1.cc"],
         },
@@ -35,6 +59,12 @@
         "load_store8_unit_m2": {
             "srcs": ["load_store8_unit_m2.cc"],
         },
+        "load_store16_unit_m2": {
+            "srcs": ["load_store16_unit_m2.cc"],
+        },
+        "load_store32_unit_m2": {
+            "srcs": ["load_store32_unit_m2.cc"],
+        },
         "load16_stride4_m1": {
             "srcs": ["load16_stride4_m1.cc"],
         },
diff --git a/tests/cocotb/rvv/load_store/load16_segment2_stride6_m1.cc b/tests/cocotb/rvv/load_store/load16_segment2_stride6_m1.cc
new file mode 100644
index 0000000..97e461e
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load16_segment2_stride6_m1.cc
@@ -0,0 +1,35 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+uint16_t in_buf[128] __attribute__((section(".data")));
+uint16_t out_buf[32] __attribute__((section(".data")));
+
+__attribute__((used, retain)) void test_intrinsic(const uint16_t *x,
+                                                  uint16_t *y) {
+  vuint16m1x2_t v = __riscv_vlsseg2e16_v_u16m1x2(x, /*bstride=*/6, /*vl=*/8);
+
+  vuint16m2_t vv = __riscv_vcreate_v_u16m1_u16m2(
+      __riscv_vget_v_u16m1x2_u16m1(v, 0),
+      __riscv_vget_v_u16m1x2_u16m1(v, 1));
+
+  __riscv_vse16_v_u16m2(y, vv, /*vl=*/16);
+}
+
+int main(int argc, char **argv) {
+  test_intrinsic(in_buf, out_buf);
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load16_segment2_unit_m1.cc b/tests/cocotb/rvv/load_store/load16_segment2_unit_m1.cc
new file mode 100644
index 0000000..f7f8d03
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load16_segment2_unit_m1.cc
@@ -0,0 +1,36 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+// Enough space for m2.
+uint16_t in_buf[32] __attribute__((section(".data")));
+uint16_t out_buf[32] __attribute__((section(".data")));
+
+__attribute__((used, retain)) void test_intrinsic(const uint16_t *x,
+                                                  uint16_t *y) {
+  vuint16m1x2_t v = __riscv_vlseg2e16_v_u16m1x2(x, /*vl=*/16);
+
+  vuint16m2_t vv = __riscv_vcreate_v_u16m1_u16m2(
+      __riscv_vget_v_u16m1x2_u16m1(v, 0),
+      __riscv_vget_v_u16m1x2_u16m1(v, 1));
+
+  __riscv_vse16_v_u16m2(y, vv, /*vl=*/16);
+}
+
+int main(int argc, char **argv) {
+  test_intrinsic(in_buf, out_buf);
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load16_segment2_unit_m2.cc b/tests/cocotb/rvv/load_store/load16_segment2_unit_m2.cc
new file mode 100644
index 0000000..98c2c7b
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load16_segment2_unit_m2.cc
@@ -0,0 +1,36 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+// Enough space for m4.
+uint16_t in_buf[64] __attribute__((section(".data")));
+uint16_t out_buf[64] __attribute__((section(".data")));
+
+__attribute__((used, retain)) void test_intrinsic(const uint16_t *x,
+                                                  uint16_t *y) {
+  vuint16m2x2_t v = __riscv_vlseg2e16_v_u16m2x2(x, /*vl=*/32);
+
+  vuint16m4_t vv = __riscv_vcreate_v_u16m2_u16m4(
+      __riscv_vget_v_u16m2x2_u16m2(v, 0),
+      __riscv_vget_v_u16m2x2_u16m2(v, 1));
+
+  __riscv_vse16_v_u16m4(y, vv, /*vl=*/32);
+}
+
+int main(int argc, char **argv) {
+  test_intrinsic(in_buf, out_buf);
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load32_segment2_unit_m1.cc b/tests/cocotb/rvv/load_store/load32_segment2_unit_m1.cc
new file mode 100644
index 0000000..bfb3a7b
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load32_segment2_unit_m1.cc
@@ -0,0 +1,36 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+// Enough space for m2.
+uint32_t in_buf[16] __attribute__((section(".data")));
+uint32_t out_buf[16] __attribute__((section(".data")));
+
+__attribute__((used, retain)) void test_intrinsic(const uint32_t *x,
+                                                  uint32_t *y) {
+  vuint32m1x2_t v = __riscv_vlseg2e32_v_u32m1x2(x, /*vl=*/8);
+
+  vuint32m2_t vv = __riscv_vcreate_v_u32m1_u32m2(
+      __riscv_vget_v_u32m1x2_u32m1(v, 0),
+      __riscv_vget_v_u32m1x2_u32m1(v, 1));
+
+  __riscv_vse32_v_u32m2(y, vv, /*vl=*/8);
+}
+
+int main(int argc, char **argv) {
+  test_intrinsic(in_buf, out_buf);
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load32_segment2_unit_m2.cc b/tests/cocotb/rvv/load_store/load32_segment2_unit_m2.cc
new file mode 100644
index 0000000..e88f774
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load32_segment2_unit_m2.cc
@@ -0,0 +1,36 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+// Enough space for m4.
+uint32_t in_buf[32] __attribute__((section(".data")));
+uint32_t out_buf[32] __attribute__((section(".data")));
+
+__attribute__((used, retain)) void test_intrinsic(const uint32_t *x,
+                                                  uint32_t *y) {
+  vuint32m2x2_t v = __riscv_vlseg2e32_v_u32m2x2(x, /*vl=*/16);
+
+  vuint32m4_t vv = __riscv_vcreate_v_u32m2_u32m4(
+      __riscv_vget_v_u32m2x2_u32m2(v, 0),
+      __riscv_vget_v_u32m2x2_u32m2(v, 1));
+
+  __riscv_vse32_v_u32m4(y, vv, /*vl=*/16);
+}
+
+int main(int argc, char **argv) {
+  test_intrinsic(in_buf, out_buf);
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load8_segment2_stride6_m1.cc b/tests/cocotb/rvv/load_store/load8_segment2_stride6_m1.cc
new file mode 100644
index 0000000..bafc188
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load8_segment2_stride6_m1.cc
@@ -0,0 +1,35 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+uint8_t in_buf[256] __attribute__((section(".data")));
+uint8_t out_buf[64] __attribute__((section(".data")));
+
+__attribute__((used, retain)) void test_intrinsic(const uint8_t *x,
+                                                  uint8_t *y) {
+  vuint8m1x2_t v = __riscv_vlsseg2e8_v_u8m1x2(x, /*bstride=*/6, /*vl=*/16);
+
+  vuint8m2_t vv = __riscv_vcreate_v_u8m1_u8m2(
+      __riscv_vget_v_u8m1x2_u8m1(v, 0),
+      __riscv_vget_v_u8m1x2_u8m1(v, 1));
+
+  __riscv_vse8_v_u8m2(y, vv, /*vl=*/32);
+}
+
+int main(int argc, char **argv) {
+  test_intrinsic(in_buf, out_buf);
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load8_segment2_unit_m1.cc b/tests/cocotb/rvv/load_store/load8_segment2_unit_m1.cc
new file mode 100644
index 0000000..6d187d9
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load8_segment2_unit_m1.cc
@@ -0,0 +1,36 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+// Enough space for m2.
+uint8_t in_buf[64] __attribute__((section(".data")));
+uint8_t out_buf[64] __attribute__((section(".data")));
+
+__attribute__((used, retain)) void test_intrinsic(const uint8_t *x,
+                                                  uint8_t *y) {
+  vuint8m1x2_t v = __riscv_vlseg2e8_v_u8m1x2(x, /*vl=*/32);
+
+  vuint8m2_t vv = __riscv_vcreate_v_u8m1_u8m2(
+      __riscv_vget_v_u8m1x2_u8m1(v, 0),
+      __riscv_vget_v_u8m1x2_u8m1(v, 1));
+
+  __riscv_vse8_v_u8m2(y, vv, /*vl=*/32);
+}
+
+int main(int argc, char **argv) {
+  test_intrinsic(in_buf, out_buf);
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load8_segment2_unit_m2.cc b/tests/cocotb/rvv/load_store/load8_segment2_unit_m2.cc
new file mode 100644
index 0000000..97da9ae
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load8_segment2_unit_m2.cc
@@ -0,0 +1,36 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+// Enough space for m4.
+uint8_t in_buf[128] __attribute__((section(".data")));
+uint8_t out_buf[128] __attribute__((section(".data")));
+
+__attribute__((used, retain)) void test_intrinsic(const uint8_t *x,
+                                                  uint8_t *y) {
+  vuint8m2x2_t v = __riscv_vlseg2e8_v_u8m2x2(x, /*vl=*/64);
+
+  vuint8m4_t vv = __riscv_vcreate_v_u8m2_u8m4(
+      __riscv_vget_v_u8m2x2_u8m2(v, 0),
+      __riscv_vget_v_u8m2x2_u8m2(v, 1));
+
+  __riscv_vse8_v_u8m4(y, vv, /*vl=*/64);
+}
+
+int main(int argc, char **argv) {
+  test_intrinsic(in_buf, out_buf);
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load_store16_unit_m2.cc b/tests/cocotb/rvv/load_store/load_store16_unit_m2.cc
new file mode 100644
index 0000000..c73813c
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load_store16_unit_m2.cc
@@ -0,0 +1,31 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+// Enough space for m2.
+uint16_t in_buf[32] __attribute__((section(".data")));
+uint16_t out_buf[32] __attribute__((section(".data")));
+
+__attribute__((used, retain)) void test_intrinsic(const uint16_t *x,
+                                                  uint16_t *y) {
+  vuint16m2_t v = __riscv_vle16_v_u16m2(x, /*vl=*/16);
+  __riscv_vse16_v_u16m2(y, v, /*vl=*/16);
+}
+
+int main(int argc, char **argv) {
+  test_intrinsic(in_buf, out_buf);
+  return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load_store32_unit_m2.cc b/tests/cocotb/rvv/load_store/load_store32_unit_m2.cc
new file mode 100644
index 0000000..e327d12
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load_store32_unit_m2.cc
@@ -0,0 +1,31 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+// Enough space for m2.
+uint32_t in_buf[16] __attribute__((section(".data")));
+uint32_t out_buf[16] __attribute__((section(".data")));
+
+__attribute__((used, retain)) void test_intrinsic(const uint32_t *x,
+                                                  uint32_t *y) {
+  vuint32m2_t v = __riscv_vle32_v_u32m2(x, /*vl=*/8);
+  __riscv_vse32_v_u32m2(y, v, /*vl=*/8);
+}
+
+int main(int argc, char **argv) {
+  test_intrinsic(in_buf, out_buf);
+  return 0;
+}
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py
index 640d1f2..7fadb4a 100644
--- a/tests/cocotb/rvv_load_store_test.py
+++ b/tests/cocotb/rvv_load_store_test.py
@@ -240,6 +240,129 @@
         pattern = list(range(0, 32)),
     )
 
+
+@cocotb.test()
+async def load_store16_unit_m2(dut):
+    await vector_load_store(
+        dut=dut,
+        elf_name='load_store16_unit_m2.elf',
+        dtype=np.uint16,
+        in_size=32,
+        out_size=32,
+        pattern=list(range(0, 16)),
+    )
+
+
+@cocotb.test()
+async def load_store32_unit_m2(dut):
+    await vector_load_store(
+        dut=dut,
+        elf_name='load_store32_unit_m2.elf',
+        dtype=np.uint32,
+        in_size=16,
+        out_size=16,
+        pattern=list(range(0, 8)),
+    )
+
+
+@cocotb.test()
+async def load8_segment2_unit_m1(dut):
+    await vector_load_store(
+        dut=dut,
+        elf_name='load8_segment2_unit_m1.elf',
+        dtype=np.uint8,
+        in_size=64,
+        out_size=64,
+        pattern=(list(range(0, 32, 2)) + list(range(1, 32, 2))),
+    )
+
+
+@cocotb.test()
+async def load16_segment2_unit_m1(dut):
+    await vector_load_store(
+        dut=dut,
+        elf_name='load16_segment2_unit_m1.elf',
+        dtype=np.uint16,
+        in_size=32,
+        out_size=32,
+        pattern=(list(range(0, 16, 2)) + list(range(1, 16, 2))),
+    )
+
+
+@cocotb.test()
+async def load32_segment2_unit_m1(dut):
+    await vector_load_store(
+        dut=dut,
+        elf_name='load32_segment2_unit_m1.elf',
+        dtype=np.uint32,
+        in_size=16,
+        out_size=16,
+        pattern=(list(range(0, 8, 2)) + list(range(1, 8, 2))),
+    )
+
+
+@cocotb.test()
+async def load8_segment2_unit_m2(dut):
+    await vector_load_store(
+        dut=dut,
+        elf_name='load8_segment2_unit_m2.elf',
+        dtype=np.uint8,
+        in_size=128,
+        out_size=128,
+        pattern=(list(range(0, 32, 2)) + list(range(1, 32, 2)) +
+                 list(range(32, 64, 2)) + list(range(33, 64, 2))),
+    )
+
+
+@cocotb.test()
+async def load16_segment2_unit_m2(dut):
+    await vector_load_store(
+        dut=dut,
+        elf_name='load16_segment2_unit_m2.elf',
+        dtype=np.uint16,
+        in_size=64,
+        out_size=64,
+        pattern=(list(range(0, 16, 2)) + list(range(1, 16, 2)) +
+                 list(range(16, 32, 2)) + list(range(17, 32, 2))),
+    )
+
+
+@cocotb.test()
+async def load32_segment2_unit_m2(dut):
+    await vector_load_store(
+        dut=dut,
+        elf_name='load32_segment2_unit_m2.elf',
+        dtype=np.uint32,
+        in_size=32,
+        out_size=32,
+        pattern=(list(range(0, 8, 2)) + list(range(1, 8, 2)) +
+                 list(range(8, 16, 2)) + list(range(9, 16, 2))),
+    )
+
+
+@cocotb.test()
+async def load8_segment2_stride6_m1(dut):
+    await vector_load_store(
+        dut=dut,
+        elf_name='load8_segment2_stride6_m1.elf',
+        dtype=np.uint8,
+        in_size=256,
+        out_size=64,
+        pattern=([i * 6 for i in range(16)] + [i * 6 + 1 for i in range(16)]),
+    )
+
+
+@cocotb.test()
+async def load16_segment2_stride6_m1(dut):
+    await vector_load_store(
+        dut=dut,
+        elf_name='load16_segment2_stride6_m1.elf',
+        dtype=np.uint16,
+        in_size=128,
+        out_size=32,
+        pattern=([i * 3 for i in range(8)] + [i * 3 + 1 for i in range(8)]),
+    )
+
+
 @cocotb.test()
 async def load8_indexed_m1(dut):
     await vector_load_indexed(