Confirm no more rvv instructions write rd

Tests are added for vcpop.

Change-Id: I0178482dd8a8c7623ae14a234dddcc5bbcdee0aa
diff --git a/hdl/chisel/src/kelvin/rvv/RvvDecode.scala b/hdl/chisel/src/kelvin/rvv/RvvDecode.scala
index e508492..e1d7400 100644
--- a/hdl/chisel/src/kelvin/rvv/RvvDecode.scala
+++ b/hdl/chisel/src/kelvin/rvv/RvvDecode.scala
@@ -113,9 +113,8 @@
 
   def writesRd(): Bool = {
     isVset() ||
-    // OP MVV, VWXUNARY0
+    // OP MVV, VWXUNARY0 (all): vmv.x.s, vcpop, vfirst.
     (opcode === RvvCompressedOpcode.RVVALU && funct3() === "b010".U && funct6() === "b010000".U)
-    // TODO(derekjchow): Add all cases that write scalar rd.
   }
 
   def writesVectorRegister(): Bool = {
diff --git a/kelvin_test_utils/sim_test_fixture.py b/kelvin_test_utils/sim_test_fixture.py
index ccb2c82..9813d41 100644
--- a/kelvin_test_utils/sim_test_fixture.py
+++ b/kelvin_test_utils/sim_test_fixture.py
@@ -48,16 +48,29 @@
     async def write(self, symbol: str, data):
         await self.core_mini_axi.write(self.symbols[symbol], data)
 
+    async def write_word(self, symbol: str, data):
+        await self.core_mini_axi.write_word(self.symbols[symbol], data)
+
+    async def write_ptr(
+            self, addr_symbol: str, data_symbol: str, offset: int = 0):
+        await self.core_mini_axi.write_word(
+            self.symbols[addr_symbol], self.symbols[data_symbol] + offset)
+
     async def read(self, symbol: str, size: int):
         return await self.core_mini_axi.read(self.symbols[symbol], size)
 
+    async def read_word(self, symbol: str):
+        return await self.core_mini_axi.read_word(self.symbols[symbol])
+
     async def run_to_halt(self, timeout_cycles=10000):
         await self.core_mini_axi.execute_from(self.entry_point)
-        return (await self.core_mini_axi.wait_for_halted(timeout_cycles=timeout_cycles))
+        return await self.core_mini_axi.wait_for_halted(
+            timeout_cycles=timeout_cycles)
 
     async def run_to_fault(self, timeout_cycles=10000):
         await self.core_mini_axi.execute_from(self.entry_point)
-        return (await self.core_mini_axi.wait_for_fault(timeout_cycles=timeout_cycles))
+        return await self.core_mini_axi.wait_for_fault(
+            timeout_cycles=timeout_cycles)
 
     def fault(self):
-        return (self.core_mini_axi.dut.io_fault.value == 1)
\ No newline at end of file
+        return self.core_mini_axi.dut.io_fault.value == 1
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index 5c2fad2..369baeb 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -160,6 +160,7 @@
     "core_mini_vcsr_test",
     "core_mini_viota_test",
     "core_mini_vfirst_test",
+    "core_mini_vcpop_exception_test",
     "core_mini_vcpop_test",
     "core_mini_vcompress_test",
     "core_mini_vmsbf_test",
@@ -259,6 +260,7 @@
         "test_module": ["rvv_assembly_cocotb_test.py"],
         "deps": [
             "//kelvin_test_utils:core_mini_axi_sim_interface",
+            "//kelvin_test_utils:sim_test_fixture",
             requirement("tqdm"),
             "@bazel_tools//tools/python/runfiles",
         ],
diff --git a/tests/cocotb/rvv/BUILD b/tests/cocotb/rvv/BUILD
index bb767b6..70f8121 100644
--- a/tests/cocotb/rvv/BUILD
+++ b/tests/cocotb/rvv/BUILD
@@ -41,6 +41,9 @@
         "vfirst_test": {
             "srcs": ["vfirst_test.cc"],
         },
+        "vcpop_exception_test": {
+            "srcs": ["vcpop_exception_test.cc"],
+        },
         "vcpop_test": {
             "srcs": ["vcpop_test.cc"],
         },
@@ -69,6 +72,7 @@
         "vill_test.elf",
         "viota_test.elf",
         "vfirst_test.elf",
+        "vcpop_exception_test.elf",
         "vcpop_test.elf",
         "vcompress_test.elf",
         "vmsbf_test.elf",
diff --git a/tests/cocotb/rvv/vcpop_exception_test.cc b/tests/cocotb/rvv/vcpop_exception_test.cc
new file mode 100644
index 0000000..f520568
--- /dev/null
+++ b/tests/cocotb/rvv/vcpop_exception_test.cc
@@ -0,0 +1,62 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+uint32_t vma __attribute__((section(".data"))) = 0;
+uint32_t vta __attribute__((section(".data"))) = 0;
+uint32_t sew __attribute__((section(".data"))) = 0;
+uint32_t lmul __attribute__((section(".data"))) = 0;
+uint32_t vl __attribute__((section(".data"))) = 16;
+uint32_t vstart __attribute__((section(".data"))) = 0;
+
+uint8_t mask_data[16] __attribute__((section(".data")));
+uint32_t result __attribute__((section(".data")));
+
+uint32_t faulted __attribute__((section(".data"))) = 0;
+uint32_t mcause __attribute__((section(".data"))) = 0;
+
+// Fault handler to log fault
+extern "C" {
+void kelvin_exception_handler() {
+  faulted = 1;
+  uint32_t local_mcause;
+  asm volatile("csrr %0, mcause" : "=r"(local_mcause));
+  mcause = local_mcause;
+
+  asm volatile("ebreak");
+  while (1) {
+  }
+}
+}
+
+int main(int argc, char** argv) {
+  // Load mask data
+  asm volatile("vsetivli x0, 16, e8, m1, ta, ma");
+  asm volatile("vle8.v v0, (%0)" : : "r"(mask_data));
+
+  // Set configuration state
+  uint32_t vtype_to_write = (vma << 7) | (vta << 6) | (sew << 3) | lmul;
+  asm volatile("vsetvl x0, %0, %1" : : "r"(vl), "r"(vtype_to_write));
+  uint32_t local_vstart = vstart;
+  asm volatile("csrw vstart, %0" : : "r"(local_vstart));
+
+  // Run vcpop
+  uint32_t local_result;
+  asm volatile("vcpop.m %0, v0" : "=r"(local_result));
+  result = local_result;
+
+  return 0;
+}
diff --git a/tests/cocotb/rvv/vcpop_test.cc b/tests/cocotb/rvv/vcpop_test.cc
index 8cded24..0e3ac5f 100644
--- a/tests/cocotb/rvv/vcpop_test.cc
+++ b/tests/cocotb/rvv/vcpop_test.cc
@@ -15,47 +15,49 @@
 #include <riscv_vector.h>
 #include <stdint.h>
 
-uint32_t vma __attribute__((section(".data"))) = 0;
-uint32_t vta __attribute__((section(".data"))) = 0;
-uint32_t sew __attribute__((section(".data"))) = 0;
-uint32_t lmul __attribute__((section(".data"))) = 0;
-uint32_t vl __attribute__((section(".data"))) = 16;
-uint32_t vstart __attribute__((section(".data"))) = 0;
+namespace {
+constexpr size_t buf_size = 128;
+}
 
-uint8_t mask_data[16] __attribute__((section(".data")));
+size_t vl __attribute__((section(".data"))) = buf_size;
+uint8_t in_buf[buf_size] __attribute__((section(".data")));
 uint32_t result __attribute__((section(".data")));
+void (*impl)() __attribute__((section(".data"))) = nullptr;
 
-uint32_t faulted __attribute__((section(".data"))) = 0;
-uint32_t mcause __attribute__((section(".data"))) = 0;
-
-// Fault handler to log fault
 extern "C" {
-void kelvin_exception_handler() {
-  faulted = 1;
-  uint32_t local_mcause;
-  asm volatile("csrr %0, mcause" : "=r"(local_mcause));
-  mcause = local_mcause;
+__attribute__((used, retain)) void vcpop_m_b1() {
+  auto data = __riscv_vlm_v_b1(in_buf, vl);
+  result = __riscv_vcpop_m_b1(data, vl);
+}
 
-  asm volatile("ebreak");
-  while (1) {}
+__attribute__((used, retain)) void vcpop_m_b2() {
+  auto data = __riscv_vlm_v_b2(in_buf, vl);
+  result = __riscv_vcpop_m_b2(data, vl);
+}
+
+__attribute__((used, retain)) void vcpop_m_b4() {
+  auto data = __riscv_vlm_v_b4(in_buf, vl);
+  result = __riscv_vcpop_m_b4(data, vl);
+}
+
+__attribute__((used, retain)) void vcpop_m_b8() {
+  auto data = __riscv_vlm_v_b8(in_buf, vl);
+  result = __riscv_vcpop_m_b8(data, vl);
+}
+
+__attribute__((used, retain)) void vcpop_m_b16() {
+  auto data = __riscv_vlm_v_b16(in_buf, vl);
+  result = __riscv_vcpop_m_b16(data, vl);
+}
+
+__attribute__((used, retain)) void vcpop_m_b32() {
+  auto data = __riscv_vlm_v_b32(in_buf, vl);
+  result = __riscv_vcpop_m_b32(data, vl);
 }
 }
 
-int main(int argc, char **argv) {
-  // Load mask data
-  asm volatile("vsetivli x0, 16, e8, m1, ta, ma");
-  asm volatile("vle8.v v0, (%0)" : : "r"(mask_data));
-
-  // Set configuration state
-  uint32_t vtype_to_write = (vma << 7) | (vta << 6) | (sew << 3) | lmul;
-  asm volatile("vsetvl x0, %0, %1": : "r"(vl), "r"(vtype_to_write));
-  uint32_t local_vstart = vstart;
-  asm volatile("csrw vstart, %0" : : "r"(local_vstart));
-
-  // Run vcpop
-  uint32_t local_result;
-  asm volatile("vcpop.m %0, v0" : "=r"(local_result));
-  result = local_result;
+int main(int argc, char** argv) {
+  impl();
 
   return 0;
-}
\ No newline at end of file
+}
diff --git a/tests/cocotb/rvv_assembly_cocotb_test.py b/tests/cocotb/rvv_assembly_cocotb_test.py
index d66d356..ca7ee80 100644
--- a/tests/cocotb/rvv_assembly_cocotb_test.py
+++ b/tests/cocotb/rvv_assembly_cocotb_test.py
@@ -1,6 +1,7 @@
 import cocotb
 import numpy as np
 from kelvin_test_utils.core_mini_axi_interface import CoreMiniAxiInterface
+from kelvin_test_utils.sim_test_fixture import Fixture
 from bazel_tools.tools.python.runfiles import runfiles
 
 SEWS = [
@@ -271,10 +272,60 @@
 
 
 @cocotb.test()
-async def core_mini_vcpop_test(dut):
+async def core_mini_vcpop_exception_test(dut):
     """Testbench to test vstart!=0 vcpop."""
     await test_vstart_not_zero_failure(
-        dut, "kelvin_hw/tests/cocotb/rvv/vcpop_test.elf")
+        dut, "kelvin_hw/tests/cocotb/rvv/vcpop_exception_test.elf")
+
+
+@cocotb.test()
+async def core_mini_vcpop_test(dut):
+    """Test vcpop usage accessible from intrinsics."""
+    # mask is not accessible from here.
+    fixture = await Fixture.Create(dut)
+    r = runfiles.Create()
+    cases = [
+        # lmul>1 currently fail
+        # {'impl': 'vcpop_m_b1', 'vl': 1024},
+        # {'impl': 'vcpop_m_b2', 'vl': 512},
+        # {'impl': 'vcpop_m_b4', 'vl': 256},
+        {'impl': 'vcpop_m_b8', 'vl': 128},
+        {'impl': 'vcpop_m_b16', 'vl': 64},
+        {'impl': 'vcpop_m_b32', 'vl': 32},
+    ]
+    await fixture.load_elf_and_lookup_symbols(
+        r.Rlocation('kelvin_hw/tests/cocotb/rvv/vcpop_test.elf'),
+        ['vl', 'in_buf', 'result', 'impl'] + [c['impl'] for c in cases],
+    )
+    rng = np.random.default_rng()
+    for c in cases:
+        impl = c['impl']
+        vl = c['vl']
+        # TODO(davidgao): test other vl. Need special handling of expected
+        # output if not full bytes.
+        in_bytes = vl // 8
+
+        input_data = rng.integers(
+            low=0, high=256, size=in_bytes, dtype=np.uint8)
+        expected_output = np.sum(
+            np.bitwise_count(input_data), dtype=np.uint32)
+
+        await fixture.write_ptr('impl', impl)
+        await fixture.write_word('vl', vl)
+        await fixture.write('in_buf', input_data)
+        await fixture.write_word('result', 0)
+
+        await fixture.run_to_halt()
+
+        actual_output = (await fixture.read_word('result')).view(np.uint32)
+
+        debug_msg = str({
+            'impl': impl,
+            'input': input_data,
+            'expected': expected_output,
+            'actual': actual_output,
+        })
+        assert (actual_output == expected_output), debug_msg
 
 
 @cocotb.test()