Matmul test with RVV inline assembly
There is a bug in the widening-multiply and 32-bit store intrinsics,
so inline assembly instructions are used as a workaround.
Change-Id: I6220b848dd942d493279ed857d3966b30d5ce1d0
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index 673daad..206b156 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -415,3 +415,33 @@
"//tests/cocotb/rvv/load_store:rvv_load_store_tests",
],
)
+
+cocotb_test_suite(
+    name = "rvv_ml_ops_cocotb_test",
+    simulators = [
+        "verilator",
+        "vcs",
+    ],
+    # NOTE(review): reuses the load/store testcase list -- confirm that
+    # ml_ops should not declare its own testcase list.
+    testcases = RVV_LOAD_STORE_TESTCASES,
+    testcases_vname = "RVV_LOAD_STORE_TESTCASES",
+    tests_kwargs = {
+        "hdl_toplevel": "RvvCoreMiniAxi",
+        "waves": True,
+        "seed": "42",
+        "test_module": ["rvv_ml_ops_cocotb_test.py"],
+        "deps": [
+            "//kelvin_test_utils:sim_test_fixture",
+            "@bazel_tools//tools/python/runfiles",
+        ],
+        "data": ["//tests/cocotb/rvv/ml_ops:rvv_matmul_assembly"],
+        "size": "large",
+    },
+    vcs_data = [
+        "//tests/cocotb/rvv/ml_ops:rvv_matmul_assembly",
+        ":coverage_exclude.cfg",
+    ],
+    vcs_build_args = VCS_BUILD_ARGS,
+    vcs_test_args = VCS_TEST_ARGS,
+    vcs_defines = VCS_DEFINES,
+    vcs_verilog_sources = ["//hdl/chisel/src/kelvin:rvv_core_mini_axi_cc_library_verilog"],
+    verilator_model = ":rvv_core_mini_axi_model",
+)
diff --git a/tests/cocotb/rvv/ml_ops/BUILD b/tests/cocotb/rvv/ml_ops/BUILD
new file mode 100644
index 0000000..e6f4ff4
--- /dev/null
+++ b/tests/cocotb/rvv/ml_ops/BUILD
@@ -0,0 +1,28 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load("//rules:kelvin_v2.bzl", "kelvin_v2_binary")
+load("//rules:utils.bzl", "template_rule")
+
+package(default_visibility = ["//visibility:public"])
+
+# Bare-metal RVV matmul binary loaded by the cocotb testbench.
+template_rule(
+    kelvin_v2_binary,
+    {
+        "rvv_matmul_assembly": {
+            "srcs": ["rvv_matmul_assembly.cc"],
+        },
+    },
+)
diff --git a/tests/cocotb/rvv/ml_ops/rvv_matmul_assembly.cc b/tests/cocotb/rvv/ml_ops/rvv_matmul_assembly.cc
new file mode 100644
index 0000000..d50981d
--- /dev/null
+++ b/tests/cocotb/rvv/ml_ops/rvv_matmul_assembly.cc
@@ -0,0 +1,87 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+constexpr size_t kLhsRows = 16;
+constexpr size_t kRhsCols = 16;
+constexpr size_t kInner = 24;
+
+// Buffers live in .data so the testbench can fill/read them by symbol name.
+int8_t lhs_input[kLhsRows*kInner] __attribute__((section(".data"))) __attribute__((aligned(16)));
+int8_t rhs_input[kInner*kRhsCols] __attribute__((section(".data"))) __attribute__((aligned(16)));
+int32_t result_output[kLhsRows*kRhsCols] __attribute__((section(".data"))) __attribute__((aligned(16)));
+
+// int8 x int8 -> int32 matrix multiply with RVV inline assembly.
+// Assumes rhs is column major. Inline asm is used instead of intrinsics to
+// work around a bug in the widening-multiply and 32-bit store intrinsics.
+void MatMul(size_t lhs_rows, size_t inner, size_t rhs_cols,
+            const int8_t* lhs, const int8_t* rhs, int32_t* result) {
+  const size_t vlenb = __riscv_vlenb();
+
+  // v0 provides the zero scalar operand consumed by vredsum below.
+  asm volatile("vsetvli zero, %0, e32, m4, ta, ma;"
+               "vmv.v.i v0, 0;" : : "r" (vlenb));
+
+  for (size_t r = 0; r < lhs_rows; r++) {
+    const int8_t* lhs_data = lhs + (r * inner);
+    int32_t* result_row = result + (r * rhs_cols);
+    for (size_t c = 0; c < rhs_cols; c++) {
+      const int8_t* rhs_data = rhs + (c * inner);
+
+      // Reset the e32/m4 accumulator register group (v8-v11).
+      asm volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r" (vlenb));
+      asm volatile("vmv.v.i v8, 0");
+
+      // Inner dot-product loop; vlenb is the maximum vl at e8/m1.
+      size_t k = 0;
+      size_t vl = vlenb;
+      while (k < inner) {
+        if (inner - k < vl) {
+          vl = inner - k;
+        }
+        // Load one vl-element slice of each operand. The "memory" clobber
+        // keeps the compiler from reordering stores across these reads.
+        asm volatile("vsetvli zero, %0, e8, m1, ta, ma" : : "r" (vl));
+        asm volatile("vle8.v v14, (%0)" : : "r" (lhs_data + k) : "memory");
+        asm volatile("vle8.v v15, (%0)" : : "r" (rhs_data + k) : "memory");
+
+        // Widening multiply (8b x 8b -> 16b product in v12), then widening
+        // accumulate (32b += 16b) into v8.
+        asm volatile("vsetvli zero, %0, e8, m1, ta, ma;"
+                     "vwmul.vv v12, v14, v15;"
+                     "vsetvli zero, %0, e16, m2, ta, ma;"
+                     "vwadd.wv v8, v8, v12;" : : "r" (vl));
+
+        k += vl;
+      }
+
+      // Reduce the 32-bit partial sums into a single scalar in v8[0].
+      asm volatile("vsetvli zero, %0, e32, m4, ta, ma;"
+                   "vredsum.vs v8, v8, v0;" : : "r" (vlenb));
+
+      // Store one result element; "memory" makes the write visible to the
+      // compiler so it cannot assume result_output is unchanged.
+      asm volatile("vsetivli zero, 1, e32, m1, ta, ma;"
+                   "vse32.v v8, (%0);" : : "r" (result_row + c) : "memory");
+    }
+  }
+}
+
+int main() {
+  MatMul(kLhsRows, kInner, kRhsCols, lhs_input, rhs_input, result_output);
+  return 0;
+}
diff --git a/tests/cocotb/rvv_ml_ops_cocotb_test.py b/tests/cocotb/rvv_ml_ops_cocotb_test.py
new file mode 100644
index 0000000..22f8f8b
--- /dev/null
+++ b/tests/cocotb/rvv_ml_ops_cocotb_test.py
@@ -0,0 +1,47 @@
+import cocotb
+import numpy as np
+import argparse
+
+from kelvin_test_utils.sim_test_fixture import Fixture
+from bazel_tools.tools.python.runfiles import runfiles
+
+
+@cocotb.test()
+async def core_mini_rvv_matmul_test(dut):
+ """Testbench to test matmul with rvv intrinsics.
+
+ This test performs matmul in M1 16x24 M2 24x16 matrices.
+ Compares results with native numpy matmul.
+ """
+
+ LHS_ROWS = 16
+ RHS_COLS = 16
+ INNER = 24
+
+ fixture = await Fixture.Create(dut)
+ r = runfiles.Create()
+ await fixture.load_elf_and_lookup_symbols(
+ r.Rlocation(
+ 'kelvin_hw/tests/cocotb/rvv/ml_ops/rvv_matmul_assembly.elf'),
+ ['lhs_input', 'rhs_input', 'result_output'])
+ np_type = np.int8
+ min_value = np.iinfo(np_type).min
+ max_value = np.iinfo(np_type).max + 1 # One above.
+ lhs_data = np.random.randint(min_value,
+ max_value, [LHS_ROWS, INNER],
+ dtype=np_type)
+ rhs_data = np.random.randint(min_value,
+ max_value, [INNER, RHS_COLS],
+ dtype=np_type)
+ result_data = np.matmul(lhs_data.astype(np.int32),
+ rhs_data.astype(np.int32))
+
+ await fixture.write('lhs_input', lhs_data.flatten())
+ await fixture.write('rhs_input', rhs_data.transpose().flatten())
+
+ await fixture.run_to_halt(timeout_cycles=1000000)
+
+ output_matmul_result = (await fixture.read(
+ 'result_output', LHS_ROWS * RHS_COLS *
+ 4)).view(dtype=np.int32).reshape([LHS_ROWS, RHS_COLS])
+ assert ((result_data == output_matmul_result).all())