Add VLM/VSM tests

This set of tests covers the allowed vlm/vsm usage reachable from the intrinsics.

These only pass after change 74304.
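
Each test function round-trips a mask value through memory with a
matching vlm/vsm intrinsic pair; as a sketch, the b1 case (the other
ratios follow the same pattern):

  auto mask = __riscv_vlm_v_b1(in_buf, vl);  // reads ceil(vl / 8) bytes
  __riscv_vsm_v_b1(out_buf, mask, vl);       // writes ceil(vl / 8) bytes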

Change-Id: I6fbd9ad14e86aeb4827dd4dc57323602b22a815a
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index 369baeb..0b030d8 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -172,6 +172,7 @@
 
 # BEGIN_TESTCASES_FOR_rvv_load_store_test
 RVV_LOAD_STORE_TESTCASES = [
+    "load_store_bits",
     "load8_stride2_m1",
     "load8_stride2_m1_partial",
     "load8_stride2_mf4",
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD
index 167cd30..31e659f 100644
--- a/tests/cocotb/rvv/load_store/BUILD
+++ b/tests/cocotb/rvv/load_store/BUILD
@@ -20,6 +20,9 @@
 template_rule(
     kelvin_v2_binary,
     {
+        "load_store_bits": {
+            "srcs": ["load_store_bits.cc"],
+        },
         "load8_indexed_m1": {
             "srcs": ["load8_indexed_m1.cc"],
         },
@@ -92,6 +95,7 @@
 filegroup(
     name = "rvv_load_store_tests",
     srcs = [
+        ":load_store_bits.elf",
         ":load8_indexed_m1.elf",
         ":load8_segment2_unit_m1.elf",
         ":load16_segment2_unit_m1.elf",
diff --git a/tests/cocotb/rvv/load_store/load_store_bits.cc b/tests/cocotb/rvv/load_store/load_store_bits.cc
new file mode 100644
index 0000000..f4a73a7
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load_store_bits.cc
@@ -0,0 +1,63 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stddef.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t buf_size = 16;
+}
+
+size_t vl __attribute__((section(".data"))) = buf_size;
+uint8_t in_buf[buf_size] __attribute__((section(".data")));
+uint8_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+__attribute__((used, retain)) void vlm_vsm_v_b1() {
+  auto data = __riscv_vlm_v_b1(in_buf, vl);
+  __riscv_vsm_v_b1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vlm_vsm_v_b2() {
+  auto data = __riscv_vlm_v_b2(in_buf, vl);
+  __riscv_vsm_v_b2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vlm_vsm_v_b4() {
+  auto data = __riscv_vlm_v_b4(in_buf, vl);
+  __riscv_vsm_v_b4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vlm_vsm_v_b8() {
+  auto data = __riscv_vlm_v_b8(in_buf, vl);
+  __riscv_vsm_v_b8(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vlm_vsm_v_b16() {
+  auto data = __riscv_vlm_v_b16(in_buf, vl);
+  __riscv_vsm_v_b16(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vlm_vsm_v_b32() {
+  auto data = __riscv_vlm_v_b32(in_buf, vl);
+  __riscv_vsm_v_b32(out_buf, data, vl);
+}
+}
+
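+// Selects which variant main() runs; the cocotb test overwrites this
+// pointer through its ELF symbol before each case.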
+void (*impl)() __attribute__((section(".data"))) = &vlm_vsm_v_b1;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py
index 78da0b4..2f3f11a 100644
--- a/tests/cocotb/rvv_load_store_test.py
+++ b/tests/cocotb/rvv_load_store_test.py
@@ -145,6 +145,66 @@
 
     assert (actual_outputs == expected_outputs).all()
 
+
+@cocotb.test()
+async def load_store_bits(dut):
+    """Test vlm/vsm usage accessible from intrinsics."""
+    # The intrinsics expose no masked form of vlm/vsm, so only the unmasked
+    # usage is exercised here.
+    fixture = await Fixture.Create(dut)
+    r = runfiles.Create()
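+    # Each mask ratio is exercised at its maximum vl, at a vl that leaves a
+    # partial final byte, and, where the ratio allows, at a smaller
+    # whole-byte vl.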
+    cases = [
+        {'impl': 'vlm_vsm_v_b1', 'vl': 128},
+        {'impl': 'vlm_vsm_v_b1', 'vl': 121},
+        {'impl': 'vlm_vsm_v_b1', 'vl': 120},
+        {'impl': 'vlm_vsm_v_b2', 'vl': 64},
+        {'impl': 'vlm_vsm_v_b2', 'vl': 57},
+        {'impl': 'vlm_vsm_v_b2', 'vl': 56},
+        {'impl': 'vlm_vsm_v_b4', 'vl': 32},
+        {'impl': 'vlm_vsm_v_b4', 'vl': 25},
+        {'impl': 'vlm_vsm_v_b4', 'vl': 24},
+        {'impl': 'vlm_vsm_v_b8', 'vl': 16},
+        {'impl': 'vlm_vsm_v_b8', 'vl': 9},
+        {'impl': 'vlm_vsm_v_b8', 'vl': 8},
+        {'impl': 'vlm_vsm_v_b16', 'vl': 8},
+        {'impl': 'vlm_vsm_v_b16', 'vl': 1},
+        {'impl': 'vlm_vsm_v_b32', 'vl': 4},
+        {'impl': 'vlm_vsm_v_b32', 'vl': 1},
+    ]
+    await fixture.load_elf_and_lookup_symbols(
+        r.Rlocation(
+            'kelvin_hw/tests/cocotb/rvv/load_store/load_store_bits.elf'),
+        ['vl', 'in_buf', 'out_buf', 'impl'] + [c['impl'] for c in cases],
+    )
+    rng = np.random.default_rng()
+    for c in cases:
+        impl = c['impl']
+        vl = c['vl']
+        in_bytes = (vl + 7) // 8
+        # Only the low (vl % 8) bits of the final byte carry mask data.
+        last_byte_mask = ((1 << (vl % 8)) - 1) if vl % 8 else 0xFF
+
+        input_data = rng.integers(
+            low=0, high=256, size=in_bytes, dtype=np.uint8)
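+        # expected_output aliases input_data, so masking the final byte here
+        # also masks it in the data written to in_buf below.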
+        expected_output = input_data
+        expected_output[-1] = expected_output[-1] & last_byte_mask
+
+        await fixture.write_ptr('impl', impl)
+        await fixture.write_word('vl', vl)
+        await fixture.write('in_buf', input_data)
+        await fixture.write('out_buf', np.zeros([in_bytes], dtype=np.uint8))
+
+        await fixture.run_to_halt()
+
+        actual_output = (await fixture.read('out_buf', in_bytes)).view(np.uint8)
+
+        debug_msg = str({
+            'impl': impl,
+            'input': input_data,
+            'expected': expected_output,
+            'actual': actual_output,
+        })
+        assert (actual_output == expected_output).all(), debug_msg
+
+
 @cocotb.test()
 async def load8_stride2_m1(dut):
     await vector_load_store(