Matmul with RVV intrinsics

There is a bug in the widening-multiply and 32-bit store RVV intrinsics,
so inline assembly instructions are used as a workaround.

Change-Id: I6220b848dd942d493279ed857d3966b30d5ce1d0
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index 673daad..206b156 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -415,3 +415,34 @@
         "//tests/cocotb/rvv/load_store:rvv_load_store_tests",
     ],
 )
+
+# Cocotb suite for the RVV ML-ops (matmul) test.
+# NOTE(review): `testcases` reuses RVV_LOAD_STORE_TESTCASES -- confirm the
+# load/store testcase matrix is the intended one for ml_ops.
+cocotb_test_suite(
+    name = "rvv_ml_ops_cocotb_test",
+    simulators = [
+        "verilator",
+        "vcs",
+    ],
+    testcases = RVV_LOAD_STORE_TESTCASES,
+    testcases_vname = "RVV_LOAD_STORE_TESTCASES",
+    tests_kwargs = {
+        "hdl_toplevel": "RvvCoreMiniAxi",
+        "waves": True,
+        "seed": "42",
+        "test_module": ["rvv_ml_ops_cocotb_test.py"],
+        "deps": [
+            "//kelvin_test_utils:sim_test_fixture",
+            "@bazel_tools//tools/python/runfiles",
+        ],
+        "data": ["//tests/cocotb/rvv/ml_ops:rvv_matmul_assembly"],
+        "size": "large",
+    },
+    vcs_data = ["//tests/cocotb/rvv/ml_ops:rvv_matmul_assembly"] + [":coverage_exclude.cfg"],
+    vcs_build_args = VCS_BUILD_ARGS,
+    vcs_test_args = VCS_TEST_ARGS,
+    vcs_defines = VCS_DEFINES,
+    vcs_verilog_sources = ["//hdl/chisel/src/kelvin:rvv_core_mini_axi_cc_library_verilog"],
+    verilator_model = ":rvv_core_mini_axi_model",
+)
diff --git a/tests/cocotb/rvv/ml_ops/BUILD b/tests/cocotb/rvv/ml_ops/BUILD
new file mode 100644
index 0000000..e6f4ff4
--- /dev/null
+++ b/tests/cocotb/rvv/ml_ops/BUILD
@@ -0,0 +1,29 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load("//rules:kelvin_v2.bzl", "kelvin_v2_binary")
+load("//rules:utils.bzl", "template_rule")
+
+package(default_visibility = ["//visibility:public"])
+
+# Builds the RVV matmul test binary for the Kelvin v2 core; the cocotb
+# suite loads the resulting .elf at runtime.
+template_rule(
+    kelvin_v2_binary,
+    {
+        "rvv_matmul_assembly": {
+            "srcs": ["rvv_matmul_assembly.cc"],
+        },
+    },
+)
diff --git a/tests/cocotb/rvv/ml_ops/rvv_matmul_assembly.cc b/tests/cocotb/rvv/ml_ops/rvv_matmul_assembly.cc
new file mode 100644
index 0000000..d50981d
--- /dev/null
+++ b/tests/cocotb/rvv/ml_ops/rvv_matmul_assembly.cc
@@ -0,0 +1,78 @@
+#include <riscv_vector.h>
+#include <stdint.h>
+
+constexpr size_t kLhsRows = 16;
+constexpr size_t kRhsCols = 16;
+constexpr size_t kInner = 24;
+
+int8_t lhs_input[kLhsRows*kInner] __attribute__((section(".data"))) __attribute__((aligned(16)));
+int8_t rhs_input[kInner*kRhsCols] __attribute__((section(".data"))) __attribute__((aligned(16)));
+int32_t result_output[kLhsRows*kRhsCols] __attribute__((section(".data"))) __attribute__((aligned(16)));
+
+// int8 x int8 -> int32 matrix multiply using RVV inline assembly.
+// Assume rhs is column major, so each dot product reads both operands
+// contiguously.
+void MatMul(size_t lhs_rows, size_t inner, size_t rhs_cols,
+            const int8_t* lhs, const int8_t* rhs, int32_t* result) {
+  // vlenb (VLEN in bytes) equals the max vl for both e8/m1 and e32/m4.
+  const size_t vlenb = __riscv_vlenb();
+
+  // Create zero register (v0) used as the scalar seed for vredsum.
+  asm("vsetvli zero, %0, e32, m4, ta, ma;"
+      "vmv.v.i v0, 0;" : : "r" (vlenb));
+
+  for (size_t r = 0; r < lhs_rows; r++) {
+    const int8_t* lhs_data = lhs + (r * inner);
+    int32_t* result_row = result + (r * rhs_cols);
+    for (size_t c = 0; c < rhs_cols; c++) {
+      const int8_t* rhs_data = rhs + (c * inner);
+
+      // Reset the e32/m4 accumulator group (v8-v11) at full vl so the
+      // full-length reduction below only ever sums real partial sums.
+      asm("vsetvli zero, %0, e32, m4, ta, ma" : : "r" (vlenb));
+      asm("vmv.v.i v8, 0");
+
+      // Inner dot product loop
+      size_t k = 0;
+      size_t vl = vlenb;
+      while (k < inner) {
+        if (inner - k < vl) {
+          vl = inner - k;
+        }
+        // Load weights/activations. The "memory" clobber stops the
+        // compiler from reordering these asm loads across stores.
+        asm("vsetvli zero, %0, e8, m1, ta, ma" : : "r" (vl));
+        asm("vle8.v  v14, (%0)" : : "r" (lhs_data + k) : "memory");
+        asm("vle8.v  v15, (%0)" : : "r" (rhs_data + k) : "memory");
+
+        // Multiply-accumulate: vwmul widens i8*i8 -> i16 into v12/v13,
+        // then vwadd.wv widens-and-adds the i16 products into the i32
+        // accumulator (v8-v11).
+        // The accumulate step must be tail-UNDISTURBED (tu): on the
+        // final partial iteration vl < vlenb, and a tail-agnostic
+        // policy permits the tail of v8-v11 to be clobbered, which
+        // would corrupt the full-length vredsum that follows.
+        asm("vsetvli zero, %0, e8, m1, ta, ma;"
+            "vwmul.vv v12, v14, v15;"
+            "vsetvli zero, %0, e16, m2, tu, ma;"
+            "vwadd.wv v8, v8, v12;" : : "r" (vl));
+
+        k += vl;
+      }
+
+      // Reduce all vlenb i32 partial sums into v8[0].
+      asm("vsetvli zero, %0, e32, m4, ta, ma;"
+          "vredsum.vs v8, v8, v0;" : : "r" (vlenb));
+
+      // Store the single i32 result. "memory" makes the write visible
+      // to the compiler.
+      asm("vsetivli zero, 1, e32, m1, ta, ma;"
+          "vse32.v v8, (%0);" : : "r" (result_row + c) : "memory");
+    }
+  }
+}
+
+int main() {
+  MatMul(kLhsRows, kInner, kRhsCols, lhs_input, rhs_input, result_output);
+  return 0;
+}
diff --git a/tests/cocotb/rvv_ml_ops_cocotb_test.py b/tests/cocotb/rvv_ml_ops_cocotb_test.py
new file mode 100644
index 0000000..22f8f8b
--- /dev/null
+++ b/tests/cocotb/rvv_ml_ops_cocotb_test.py
@@ -0,0 +1,47 @@
+import cocotb
+import numpy as np
+import argparse
+
+from kelvin_test_utils.sim_test_fixture import Fixture
+from bazel_tools.tools.python.runfiles import runfiles
+
+
+@cocotb.test()
+async def core_mini_rvv_matmul_test(dut):
+    """Testbench to test matmul with rvv intrinsics.
+
+    This test performs matmul in M1 16x24 M2 24x16 matrices.
+    Compares results with native numpy matmul.
+    """
+
+    LHS_ROWS = 16
+    RHS_COLS = 16
+    INNER = 24
+
+    fixture = await Fixture.Create(dut)
+    r = runfiles.Create()
+    await fixture.load_elf_and_lookup_symbols(
+        r.Rlocation(
+            'kelvin_hw/tests/cocotb/rvv/ml_ops/rvv_matmul_assembly.elf'),
+        ['lhs_input', 'rhs_input', 'result_output'])
+    np_type = np.int8
+    min_value = np.iinfo(np_type).min
+    max_value = np.iinfo(np_type).max + 1  # One above.
+    lhs_data = np.random.randint(min_value,
+                                 max_value, [LHS_ROWS, INNER],
+                                 dtype=np_type)
+    rhs_data = np.random.randint(min_value,
+                                 max_value, [INNER, RHS_COLS],
+                                 dtype=np_type)
+    result_data = np.matmul(lhs_data.astype(np.int32),
+                            rhs_data.astype(np.int32))
+
+    await fixture.write('lhs_input', lhs_data.flatten())
+    await fixture.write('rhs_input', rhs_data.transpose().flatten())
+
+    await fixture.run_to_halt(timeout_cycles=1000000)
+
+    output_matmul_result = (await fixture.read(
+        'result_output', LHS_ROWS * RHS_COLS *
+        4)).view(dtype=np.int32).reshape([LHS_ROWS, RHS_COLS])
+    assert ((result_data == output_matmul_result).all())