Add whole register into Lsu.

Change-Id: Iee07259b7c63aca7748a231ccc89a6f77bbfb4a3
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 755309a..75cd112 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -163,6 +163,16 @@
     }
   }
 
+  def isWholeRegister(): Bool = {
+    if (p.enableRvv) {
+      (umop.get === "b01000".U) &&
+      op.isOneOf(LsuOp.VLOAD_UNIT, LsuOp.VSTORE_UNIT)
+    } else {
+      false.B
+    }
+
+  }
+
   override def toPrintable: Printable = {
     cf"LsuCmd(store -> ${store}, addr -> 0x${addr}%x, op -> ${op}, " +
     cf"pc -> 0x${pc}%x, elemWidth -> ${elemWidth}, nfields -> ${nfields})"
@@ -215,9 +225,23 @@
       result.elemWidth.get := cmd.elemWidth.get
       // If mask operation, always make LMUL=1.
       result.lmul.get := Mux(cmd.isMaskOperation(), 0.U, rvvState.get.bits.lmul)
+      result.lmul.get := MuxCase(rvvState.get.bits.lmul, Seq(
+          cmd.isMaskOperation() -> 0.U,
+          // Section 7.9 of RVV Spec: "The nf field encoders how many vector
+          // registers to load and store".
+          cmd.isWholeRegister() -> MuxCase(0.U, Seq(
+              (cmd.nfields.get === 0.U) -> 0.U,  // NF1 -> LMUL1
+              (cmd.nfields.get === 1.U) -> 1.U,  // NF2 -> LMUL2
+              (cmd.nfields.get === 3.U) -> 2.U,  // NF4 -> LMUL4
+              (cmd.nfields.get === 7.U) -> 3.U,  // NF8 -> LMUL8
+          )),
+      ))
 
       // If mask operation, force fields to zero
-      result.nfields.get := Mux(cmd.isMaskOperation(), 0.U, cmd.nfields.get)
+      result.nfields.get := MuxCase(cmd.nfields.get, Seq(
+          cmd.isMaskOperation() -> 0.U,
+          cmd.isWholeRegister() -> 0.U,
+      ))
       result.sew.get := rvvState.get.bits.sew
     }
 
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index 21b6f84..dedacec 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -174,6 +174,7 @@
 # BEGIN_TESTCASES_FOR_rvv_load_store_test
 RVV_LOAD_STORE_TESTCASES = [
     "load_store_bits",
+    "load_store_whole_register_test",
     "load_unit_masked",
     "load_unit_all_vtypes_test",
     "load_strided_all_vtypes_test",
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD
index 73d5979..727c38d 100644
--- a/tests/cocotb/rvv/load_store/BUILD
+++ b/tests/cocotb/rvv/load_store/BUILD
@@ -23,6 +23,9 @@
         "load_store_bits": {
             "srcs": ["load_store_bits.cc"],
         },
+        "load_store_whole_register": {
+            "srcs": ["load_store_whole_register.cc"],
+        },
         "load_unit_masked": {
             "srcs": ["load_unit_masked.cc"],
         },
@@ -165,6 +168,7 @@
     name = "rvv_load_store_tests",
     srcs = [
         ":load_store_bits.elf",
+        ":load_store_whole_register.elf",
         ":load_unit_masked.elf",
         ":load_unit_vtype.elf",
         ":load_stride_vtype.elf",
diff --git a/tests/cocotb/rvv/load_store/load_store_whole_register.cc b/tests/cocotb/rvv/load_store/load_store_whole_register.cc
new file mode 100644
index 0000000..afdd9cf
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load_store_whole_register.cc
@@ -0,0 +1,73 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t buf_size = 128;
+}
+
+size_t vl __attribute__((section(".data"))) = buf_size;
+size_t vtype __attribute__((section(".data"))) = buf_size;
+uint8_t load_data[buf_size] __attribute__((section(".data")));
+uint8_t store_data[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+#define CREATE_LOAD_FN(name, n_registers) \
+__attribute__((used, retain)) void name() { \
+  size_t eight_vl = 8*__riscv_vlenb(); \
+  asm("vsetvli zero, %[eight_vl], e8, m8, ta, ma;" \
+      "vmv.v.i v8, 0;" \
+      "vsetvl zero, %[vl], %[vtype];" \
+      "vl" #n_registers "r.v v8, %[load_data];" \
+      "vs8r.v v8, %[store_data];" \
+      : [store_data] "=m"(store_data) \
+      : [eight_vl] "r"(eight_vl), \
+        [vl] "r"(vl), \
+        [vtype] "r"(vtype), \
+        [load_data] "m"(load_data) \
+      : "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", \
+        "vl", "vtype"); \
+}
+
+#define CREATE_STORE_FN(name, n_registers) \
+__attribute__((used, retain)) void name() { \
+  asm("vsetvl zero, %[vl], %[vtype];" \
+      "vl8r.v v8, %[load_data];" \
+      "vs" #n_registers "r.v v8, %[store_data];" \
+      : [store_data] "=m"(store_data) \
+      : [vl] "r"(vl), \
+        [vtype] "r"(vtype), \
+        [load_data] "m"(load_data) \
+      : "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", \
+        "vl", "vtype"); \
+}
+
+CREATE_LOAD_FN(test_vl1r, 1)
+CREATE_LOAD_FN(test_vl2r, 2)
+CREATE_LOAD_FN(test_vl4r, 4)
+CREATE_LOAD_FN(test_vl8r, 8)
+CREATE_STORE_FN(test_vs1r, 1)
+CREATE_STORE_FN(test_vs2r, 2)
+CREATE_STORE_FN(test_vs4r, 4)
+CREATE_STORE_FN(test_vs8r, 8)
+}
+
+void (*impl)() __attribute__((section(".data"))) = &test_vl1r;
+
+int main(int argc, char** argv) {
+  impl();
+  return 0;
+}
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py
index d83d0c3..64610c9 100644
--- a/tests/cocotb/rvv_load_store_test.py
+++ b/tests/cocotb/rvv_load_store_test.py
@@ -2816,3 +2816,58 @@
 
             store_data = (await fixture.read('store_data', 8192))
             assert (expected_store_data == store_data).all()
+
+@cocotb.test()
+async def load_store_whole_register_test(dut):
+    """Testbench to test RVV strided/segmented store, with all vtypes."""
+    fixture = await Fixture.Create(dut)
+    r = runfiles.Create()
+    functions = [
+        # Name, store, n_registers
+        ("test_vl1r", False, 1),
+        ("test_vl2r", False, 2),
+        ("test_vl4r", False, 4),
+        ("test_vl8r", False, 8),
+        ("test_vs1r", True, 1),
+        ("test_vs2r", True, 2),
+        ("test_vs4r", True, 4),
+        ("test_vs8r", True, 8),
+    ]
+
+    await fixture.load_elf_and_lookup_symbols(
+        r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/load_store_whole_register.elf'),
+        ['vl', 'vtype', 'stride', 'load_data', 'store_data', 'impl'] +
+            list(f[0] for f in functions),
+    )
+
+    vlenb = 16
+    with tqdm.tqdm(functions) as t:
+      for (function, store, n_registers) in t:
+        for sew in SEWS:
+          for lmul, _ in SEW_TO_LMULS_AND_VLMAXS[sew]:
+            t.set_postfix({
+                'function': function,
+                'sew': sew,
+                'lmul': lmul,
+            })
+
+            await fixture.write_ptr('impl', function)
+            vtype = construct_vtype(1, 1, sew, lmul)
+            await fixture.write_word('vtype', vtype)
+            await fixture.write_word('vl', 1)
+
+            load_data = np.random.randint(0, 255, 128, dtype=np.uint8)
+            await fixture.write('load_data', load_data)
+            await fixture.write('store_data', 0xFF*np.ones(128, dtype=np.uint8))
+
+            await fixture.run_to_halt()
+
+            store_data = (await fixture.read('store_data', 128))
+
+            data_written = vlenb * n_registers
+            assert (load_data[0:data_written] ==
+                    store_data[0:data_written]).all()
+            if store:
+                assert(store_data[data_written:] == 0xFF).all()
+            else:
+                assert(store_data[data_written:] == 0x00).all()