#include <riscv_vector.h>
#include <stddef.h>  // size_t
#include <stdint.h>
constexpr size_t kLhsRows = 16;
constexpr size_t kRhsCols = 16;
constexpr size_t kInner = 48;
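// Static test buffers, 16-byte aligned for vector access. They live in
// .data and start zero-initialized; presumably a test harness or debugger
// fills the inputs before the kernel runs (assumption, not shown in this
// file).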
int8_t lhs_input[kLhsRows * kInner] __attribute__((section(".data")))
    __attribute__((aligned(16)));
int8_t rhs_input[kInner * kRhsCols] __attribute__((section(".data")))
    __attribute__((aligned(16)));
int32_t result_output[kLhsRows * kRhsCols] __attribute__((section(".data")))
    __attribute__((aligned(16)));
// Computes result = lhs * rhs, where lhs is (lhs_rows x inner) row major,
// rhs is (inner x rhs_cols) stored column major, and result is
// (lhs_rows x rhs_cols) row major. int8 inputs, int32 accumulation.
void MatMul(size_t lhs_rows, size_t inner, size_t rhs_cols, const int8_t* lhs,
            const int8_t* rhs, int32_t* result) {
  // vlenb (bytes per vector register) equals VLMAX for 8-bit elements at
  // LMUL=1, so it is a valid full-strip vl for the int8 loads below.
  const size_t vlenb = __riscv_vlenb();
  for (size_t r = 0; r < lhs_rows; r++) {
    const int8_t* lhs_data = lhs + (r * inner);
    int32_t* result_row = result + (r * rhs_cols);
    for (size_t c = 0; c < rhs_cols; c++) {
      // rhs is column major, so column c is contiguous in memory.
      const int8_t* rhs_data = rhs + (c * inner);
      // Zero the accumulator; the widening reduction below only reads (and
      // writes) element 0, so vl=1 suffices.
      vint32m1_t vacc = __riscv_vmv_v_x_i32m1(0, 1);
      // Strip-mined inner dot product loop.
      size_t k = 0;
      size_t vl = vlenb;
      while (k < inner) {
        if (inner - k < vl) {
          vl = inner - k;  // Partial strip for the loop tail.
        }
        // Load one strip of weights/activations.
        vint8m1_t vlhs_data = __riscv_vle8_v_i8m1(lhs_data + k, vl);
        vint8m1_t vrhs_data =
            __riscv_vle8_v_i8m1(rhs_data + k, vl);  // input rhs is transposed
        // Widening multiply: int8 x int8 -> int16 cannot overflow.
        vint16m2_t vmul_16 = __riscv_vwmul_vv_i16m2(vlhs_data, vrhs_data, vl);
        // Widening reduction into the int32 accumulator. Reduce only the vl
        // valid products: elements past vl are undefined under tail-agnostic
        // policy, so reducing over vlenb would fold garbage into the final
        // partial strip.
        vacc = __riscv_vwredsum_vs_i16m2_i32m1(vmul_16, vacc, vl);
        k += vl;
      }
      // Element 0 of vacc holds the dot product; store that single element.
      __riscv_vse32_v_i32m1(result_row + c, vacc, 1);
    }
  }
}
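// A minimal scalar reference for the same column-major matmul (an addition
// for illustration, not part of the original kernel); assuming the layout
// described above, it can be used to cross-check the vectorized path.
void MatMulScalar(size_t lhs_rows, size_t inner, size_t rhs_cols,
                  const int8_t* lhs, const int8_t* rhs, int32_t* result) {
  for (size_t r = 0; r < lhs_rows; r++) {
    for (size_t c = 0; c < rhs_cols; c++) {
      int32_t acc = 0;
      for (size_t k = 0; k < inner; k++) {
        // Same widening semantics as vwmul + vwredsum: int8 products
        // accumulated in int32.
        acc += (int32_t)lhs[r * inner + k] * (int32_t)rhs[c * inner + k];
      }
      result[r * rhs_cols + c] = acc;
    }
  }
}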
int main() {
  MatMul(kLhsRows, kInner, kRhsCols, lhs_input, rhs_input, result_output);
  return 0;
}