Switch from VLM to VLE in VCPOP tests

At the moment our VLM is broken. Use VLE with reinterpret for VCPOP tests.
Dedicated VLM/VSM tests will be added later.

Added fractional byte support so we can test VCPOP more thoroughly.

Change-Id: I531270db1fe23d380903fb96c4c8772cecf850b6
diff --git a/tests/cocotb/rvv/vcpop_test.cc b/tests/cocotb/rvv/vcpop_test.cc
index 0e3ac5f..a91826e 100644
--- a/tests/cocotb/rvv/vcpop_test.cc
+++ b/tests/cocotb/rvv/vcpop_test.cc
@@ -16,7 +16,7 @@
 #include <stdint.h>
 
 namespace {
-constexpr size_t buf_size = 128;
+constexpr size_t buf_size = 16;
 }
 
 size_t vl __attribute__((section(".data"))) = buf_size;
@@ -26,32 +26,38 @@
 
 extern "C" {
 __attribute__((used, retain)) void vcpop_m_b1() {
-  auto data = __riscv_vlm_v_b1(in_buf, vl);
+  auto data =
+      __riscv_vreinterpret_v_u8m1_b1(__riscv_vle8_v_u8m1(in_buf, (vl + 7) / 8));
   result = __riscv_vcpop_m_b1(data, vl);
 }
 
 __attribute__((used, retain)) void vcpop_m_b2() {
-  auto data = __riscv_vlm_v_b2(in_buf, vl);
+  auto data =
+      __riscv_vreinterpret_v_u8m1_b2(__riscv_vle8_v_u8m1(in_buf, (vl + 7) / 8));
   result = __riscv_vcpop_m_b2(data, vl);
 }
 
 __attribute__((used, retain)) void vcpop_m_b4() {
-  auto data = __riscv_vlm_v_b4(in_buf, vl);
+  auto data =
+      __riscv_vreinterpret_v_u8m1_b4(__riscv_vle8_v_u8m1(in_buf, (vl + 7) / 8));
   result = __riscv_vcpop_m_b4(data, vl);
 }
 
 __attribute__((used, retain)) void vcpop_m_b8() {
-  auto data = __riscv_vlm_v_b8(in_buf, vl);
+  auto data =
+      __riscv_vreinterpret_v_u8m1_b8(__riscv_vle8_v_u8m1(in_buf, (vl + 7) / 8));
   result = __riscv_vcpop_m_b8(data, vl);
 }
 
 __attribute__((used, retain)) void vcpop_m_b16() {
-  auto data = __riscv_vlm_v_b16(in_buf, vl);
+  auto data = __riscv_vreinterpret_v_u8m1_b16(
+      __riscv_vle8_v_u8m1(in_buf, (vl + 7) / 8));
   result = __riscv_vcpop_m_b16(data, vl);
 }
 
 __attribute__((used, retain)) void vcpop_m_b32() {
-  auto data = __riscv_vlm_v_b32(in_buf, vl);
+  auto data = __riscv_vreinterpret_v_u8m1_b32(
+      __riscv_vle8_v_u8m1(in_buf, (vl + 7) / 8));
   result = __riscv_vcpop_m_b32(data, vl);
 }
 }
diff --git a/tests/cocotb/rvv_assembly_cocotb_test.py b/tests/cocotb/rvv_assembly_cocotb_test.py
index ca7ee80..8af90c7 100644
--- a/tests/cocotb/rvv_assembly_cocotb_test.py
+++ b/tests/cocotb/rvv_assembly_cocotb_test.py
@@ -285,13 +285,22 @@
     fixture = await Fixture.Create(dut)
     r = runfiles.Create()
     cases = [
-        # lmul>1 currently fail
-        # {'impl': 'vcpop_m_b1', 'vl': 1024},
-        # {'impl': 'vcpop_m_b2', 'vl': 512},
-        # {'impl': 'vcpop_m_b4', 'vl': 256},
-        {'impl': 'vcpop_m_b8', 'vl': 128},
-        {'impl': 'vcpop_m_b16', 'vl': 64},
-        {'impl': 'vcpop_m_b32', 'vl': 32},
+        {'impl': 'vcpop_m_b1', 'vl': 128},
+        {'impl': 'vcpop_m_b1', 'vl': 121},
+        {'impl': 'vcpop_m_b1', 'vl': 120},
+        {'impl': 'vcpop_m_b2', 'vl': 64},
+        {'impl': 'vcpop_m_b2', 'vl': 57},
+        {'impl': 'vcpop_m_b2', 'vl': 56},
+        {'impl': 'vcpop_m_b4', 'vl': 32},
+        {'impl': 'vcpop_m_b4', 'vl': 25},
+        {'impl': 'vcpop_m_b4', 'vl': 24},
+        {'impl': 'vcpop_m_b8', 'vl': 16},
+        {'impl': 'vcpop_m_b8', 'vl': 9},
+        {'impl': 'vcpop_m_b8', 'vl': 8},
+        {'impl': 'vcpop_m_b16', 'vl': 8},
+        {'impl': 'vcpop_m_b16', 'vl': 1},
+        {'impl': 'vcpop_m_b32', 'vl': 4},
+        {'impl': 'vcpop_m_b32', 'vl': 1},
     ]
     await fixture.load_elf_and_lookup_symbols(
         r.Rlocation('kelvin_hw/tests/cocotb/rvv/vcpop_test.elf'),
@@ -301,12 +310,13 @@
     for c in cases:
         impl = c['impl']
         vl = c['vl']
-        # TODO(davidgao): test other vl. Need special handling of expected
-        # output if not full bytes.
-        in_bytes = vl // 8
+        in_bytes = (vl + 7) // 8
+        last_byte_mask = (1 << (vl % 8 or 8)) - 1  # valid bits of last byte
 
         input_data = rng.integers(
             low=0, high=256, size=in_bytes, dtype=np.uint8)
+        input_data_trimmed = input_data.copy()  # copy, do not alias
+        input_data_trimmed[-1] &= last_byte_mask  # drop bits beyond vl
         expected_output = np.sum(
-            np.bitwise_count(input_data), dtype=np.uint32)
+            np.bitwise_count(input_data_trimmed), dtype=np.uint32)