Fix wrong results for mixed-precision indexed loads

Also adds a set of indexed-load tests.

The fix is verified on 8-bit data loads with 16-bit indices at LMUL below
m1, and with 32-bit indices at LMUL below mf2. Larger LMUL configurations
are pending a separate fix.
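Root cause, for context: in indexed loads/stores the instruction "width"
field encodes the index width, while the data element width comes from
vtype's SEW. ComputeIndexedAddrs previously used the width field for both,
so e.g. vluxei16 with SEW=8 strode through the data as if the elements were
16 bits wide. A minimal software sketch of the corrected addressing follows
(illustrative only, not the RTL; object and parameter names are made up):

    object IndexedAddrSketch {
      // One byte address per data byte: the index width only affects how the
      // index vector is decoded, not the per-element data stride (SEW does).
      def addrs(baseAddr: Long, indices: Seq[Long], sewBytes: Int): Seq[Long] =
        indices.flatMap(idx => (0 until sewBytes).map(b => baseAddr + idx + b))

      def main(args: Array[String]): Unit = {
        // vluxei16.v with SEW=8: 16-bit indices, 1-byte elements.
        println(addrs(0x1000L, Seq(5L, 0x300L), sewBytes = 1))
        // -> List(4101, 4864): one byte fetched per index. The old code used
        //    the 16-bit index width as the data width and fetched two.
      }
    }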
TODO: test indexed stores
TODO: test loads with 16/32-bit data
Change-Id: I83b986339ff0900449fa4b57fa2eb895ca4e26f8
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 832db87..e3ef588 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -166,7 +166,12 @@
val pc = UInt(32.W)
val addr = UInt(32.W)
val data = UInt(32.W) // Doubles as rs2
+ // This aligns with "width" in the spec. It controls index width in
+ // indexed loads/stores and data width otherwise.
val elemWidth = Option.when(p.enableRvv) { UInt(3.W) }
+ // This is the sew from vtype. It controls data width in indexed
+ // loads/stores and is unused in other ops.
+ val sew = Option.when(p.enableRvv) { UInt(3.W) }
val lmul = Option.when(p.enableRvv) { UInt(4.W) }
val nfields = Option.when(p.enableRvv) { UInt(3.W) }
@@ -207,6 +212,7 @@
result.lmul.get := 1.U(1.W) << effectiveLmul
// If mask operation, force fields to zero
result.nfields.get := Mux(cmd.isMaskOperation(), 0.U, cmd.nfields.get)
+ result.sew.get := rvvState.get.bits.sew
}
result
@@ -235,22 +241,35 @@
def apply(bytesPerSlot: Int,
baseAddr: UInt,
indices: UInt,
- elemWidth: UInt): Vec[UInt] = {
+ indexWidth: UInt,
+ sew: UInt): Vec[UInt] = {
val indices8 = UIntToVec(indices, 8)
- val indices16 = UIntToVec(indices, 16)
- val indices32 = UIntToVec(indices, 32)
+ val indices16 = UIntToVec(indices, 16).padTo(bytesPerSlot, 0.U)
+ val indices32 = UIntToVec(indices, 32).padTo(bytesPerSlot, 0.U)
+
+ val indices_v = MuxCase(VecInit.fill(bytesPerSlot)(0.U(32.W)), Seq(
+ // 8-bit indices.
+ (indexWidth === "b000".U) -> VecInit((0 until bytesPerSlot).map(
+ i => Cat(0.U(24.W), indices8(i)))),
+ // 16-bit indices.
+ (indexWidth === "b101".U) -> VecInit((0 until bytesPerSlot).map(
+ i => Cat(0.U(16.W), indices16(i)))),
+ // 32-bit indices.
+ (indexWidth === "b110".U) -> VecInit((0 until bytesPerSlot).map(
+ i => indices32(i))),
+ ))
MuxCase(VecInit.fill(bytesPerSlot)(0.U(32.W)), Seq(
// elemWidth validation is done at decode time.
// 8-bit indices. Each byte has its own offset.
- (elemWidth === "b000".U) -> VecInit((0 until bytesPerSlot).map(
- i => (baseAddr + indices8(i))(31, 0))),
+ (sew === "b000".U) -> VecInit((0 until bytesPerSlot).map(
+ i => (baseAddr + indices_v(i)))),
// 16-bit indices. Each 2-byte element has an offset.
- (elemWidth === "b101".U) -> VecInit((0 until bytesPerSlot).map(
- i => (baseAddr + indices16(i >> 1))(31, 0) + (i & 1).U)),
+ (sew === "b001".U) -> VecInit((0 until bytesPerSlot).map(
+ i => (baseAddr + indices_v(i >> 1) + (i & 1).U))),
// 32-bit indices. Each 4-byte element has an offset.
- (elemWidth === "b110".U) -> VecInit((0 until bytesPerSlot).map(
- i => (baseAddr + indices32(i >> 2))(31, 0) + (i & 3).U))
+ (sew === "b010".U) -> VecInit((0 until bytesPerSlot).map(
+ i => (baseAddr + indices_v(i >> 2) + (i & 3).U)))
))
}
}
@@ -273,7 +292,12 @@
val lmul = UInt(4.W)
val elemStride = UInt(32.W) // Stride between lanes in a vector
val segmentStride = UInt(32.W) // Stride between base addr between segments
+ // This aligns with "width" in the spec. It controls index width in
+ // indexed loads/stores and data width otherwise.
val elemWidth = UInt(3.W)
+ // This controls data width in indexed loads/stores and is unused in
+ // other ops.
+ val sew = UInt(3.W)
val nfields = UInt(3.W)
val segment = UInt(3.W)
// Add this to find the next segment.
@@ -339,9 +363,10 @@
op.isOneOf(LsuOp.VLOAD_OINDEXED, LsuOp.VLOAD_UINDEXED,
LsuOp.VSTORE_OINDEXED, LsuOp.VSTORE_UINDEXED) ->
ComputeIndexedAddrs(bytesPerSlot, baseAddr, rvv2lsu.idx.bits.data,
- elemWidth)
+ elemWidth, sew),
))
result.elemWidth := elemWidth
+ result.sew := sew
result.data := Mux(updated && LsuOp.isVector(op) && rvv2lsu.vregfile.valid,
UIntToVec(rvv2lsu.vregfile.bits.data, 8), data)
@@ -386,6 +411,7 @@
result.elemStride := elemStride
result.segmentStride := segmentStride
result.elemWidth := elemWidth
+ result.sew := sew
result.nfields := nfields
result.segment := segment
result.nextSegmentVectorOffset := nextSegmentVectorOffset
@@ -412,6 +438,7 @@
result.elemStride := elemStride
result.segmentStride := segmentStride
result.elemWidth := elemWidth
+ result.sew := sew
result.nfields := nfields
result.nextSegmentVectorOffset := nextSegmentVectorOffset
result.nextLmulVectorRewind := nextLmulVectorRewind
@@ -492,6 +519,7 @@
result.elemStride := elemStride
result.segmentStride := segmentStride
result.elemWidth := elemWidth
+ result.sew := sew
result.nfields := nfields
result.segment := segment
result.nextSegmentVectorOffset := nextSegmentVectorOffset
@@ -571,6 +599,7 @@
// Compute addrs
result.baseAddr := uop.addr
result.elemWidth := uop.elemWidth.getOrElse(0.U(3.W))
+ result.sew := uop.sew.getOrElse(0.U(3.W))
result.addrs := Mux(
uop.op.isOneOf(LsuOp.VLOAD_STRIDED, LsuOp.VSTORE_STRIDED),
ComputeStridedAddrs(bytesPerSlot, uop.addr, uop.data, uop.elemWidth.getOrElse(0.U(3.W))),
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index ee7bec8..e0d7820 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -173,6 +173,9 @@
# BEGIN_TESTCASES_FOR_rvv_load_store_test
RVV_LOAD_STORE_TESTCASES = [
"load_store_bits",
+ "load8_index8",
+ "load8_index16",
+ "load8_index32",
"load8_seg_unit",
"load8_stride2_m1",
"load8_stride2_m1_partial",
@@ -189,7 +192,6 @@
"load_store32_unit_m2",
"load8_segment2_stride6_m1",
"load16_segment2_stride6_m1",
- "load8_indexed_m1",
"store8_indexed_m1",
"store8_seg_unit",
"store16_seg_unit",
@@ -377,6 +379,7 @@
"deps": [
"//kelvin_test_utils:sim_test_fixture",
"@bazel_tools//tools/python/runfiles",
+ requirement("tqdm"),
],
"data": ["//tests/cocotb/rvv/load_store:rvv_load_store_tests"],
"size": "large",
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD
index c5a79d3..4e07588 100644
--- a/tests/cocotb/rvv/load_store/BUILD
+++ b/tests/cocotb/rvv/load_store/BUILD
@@ -23,8 +23,14 @@
"load_store_bits": {
"srcs": ["load_store_bits.cc"],
},
- "load8_indexed_m1": {
- "srcs": ["load8_indexed_m1.cc"],
+ "load8_index8": {
+ "srcs": ["load8_index8.cc"],
+ },
+ "load8_index16": {
+ "srcs": ["load8_index16.cc"],
+ },
+ "load8_index32": {
+ "srcs": ["load8_index32.cc"],
},
"load8_seg_unit": {
"srcs": ["load8_seg_unit.cc"],
@@ -96,7 +102,9 @@
name = "rvv_load_store_tests",
srcs = [
":load_store_bits.elf",
- ":load8_indexed_m1.elf",
+ ":load8_index8.elf",
+ ":load8_index16.elf",
+ ":load8_index32.elf",
":load8_seg_unit.elf",
":load8_segment2_stride6_m1.elf",
":load16_segment2_stride6_m1.elf",
diff --git a/tests/cocotb/rvv/load_store/load8_index16.cc b/tests/cocotb/rvv/load_store/load8_index16.cc
new file mode 100644
index 0000000..f6decff
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load8_index16.cc
@@ -0,0 +1,101 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t lut_size = 32000; // DTCM is 32KB.
+// Double sized so we can check trailing regions are not read/written.
+constexpr size_t buf_size = 128;
+} // namespace
+
+size_t vl __attribute__((section(".data"))) = 8;
+// Indices are always unsigned.
+uint16_t index_buf[buf_size] __attribute__((section(".data")));
+// These instructions don't differentiate signed/unsigned so we only need to
+// test one. The types come from intrinsic level.
+uint8_t in_buf[lut_size] __attribute__((section(".data")));
+uint8_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Unordered
+__attribute__((used, retain)) void vluxei16_v_u8mf4() {
+ auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+ auto data = __riscv_vluxei16_v_u8mf4(in_buf, indices, vl);
+ __riscv_vse8_v_u8mf4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei16_v_u8mf2() {
+ auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+ auto data = __riscv_vluxei16_v_u8mf2(in_buf, indices, vl);
+ __riscv_vse8_v_u8mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei16_v_u8m1() {
+ auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+ auto data = __riscv_vluxei16_v_u8m1(in_buf, indices, vl);
+ __riscv_vse8_v_u8m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei16_v_u8m2() {
+ auto indices = __riscv_vle16_v_u16m4(index_buf, vl);
+ auto data = __riscv_vluxei16_v_u8m2(in_buf, indices, vl);
+ __riscv_vse8_v_u8m2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei16_v_u8m4() {
+ auto indices = __riscv_vle16_v_u16m8(index_buf, vl);
+ auto data = __riscv_vluxei16_v_u8m4(in_buf, indices, vl);
+ __riscv_vse8_v_u8m4(out_buf, data, vl);
+}
+
+// Ordered
+__attribute__((used, retain)) void vloxei16_v_u8mf4() {
+ auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+ auto data = __riscv_vloxei16_v_u8mf4(in_buf, indices, vl);
+ __riscv_vse8_v_u8mf4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei16_v_u8mf2() {
+ auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+ auto data = __riscv_vloxei16_v_u8mf2(in_buf, indices, vl);
+ __riscv_vse8_v_u8mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei16_v_u8m1() {
+ auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+ auto data = __riscv_vloxei16_v_u8m1(in_buf, indices, vl);
+ __riscv_vse8_v_u8m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei16_v_u8m2() {
+ auto indices = __riscv_vle16_v_u16m4(index_buf, vl);
+ auto data = __riscv_vloxei16_v_u8m2(in_buf, indices, vl);
+ __riscv_vse8_v_u8m2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei16_v_u8m4() {
+ auto indices = __riscv_vle16_v_u16m8(index_buf, vl);
+ auto data = __riscv_vloxei16_v_u8m4(in_buf, indices, vl);
+ __riscv_vse8_v_u8m4(out_buf, data, vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vluxei16_v_u8m1;
+
+int main(int argc, char** argv) {
+ impl();
+ return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load8_index32.cc b/tests/cocotb/rvv/load_store/load8_index32.cc
new file mode 100644
index 0000000..524e86d
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load8_index32.cc
@@ -0,0 +1,89 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t lut_size = 32000; // DTCM is 32KB.
+// Double sized so we can check trailing regions are not read/written.
+constexpr size_t buf_size = 64;
+} // namespace
+
+size_t vl __attribute__((section(".data"))) = 8;
+// Indices are always unsigned.
+uint32_t index_buf[buf_size] __attribute__((section(".data")));
+// These instructions don't differentiate signed/unsigned so we only need to
+// test one. The types come from intrinsic level.
+uint8_t in_buf[lut_size] __attribute__((section(".data")));
+uint8_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Unordered
+__attribute__((used, retain)) void vluxei32_v_u8mf4() {
+ auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+ auto data = __riscv_vluxei32_v_u8mf4(in_buf, indices, vl);
+ __riscv_vse8_v_u8mf4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei32_v_u8mf2() {
+ auto indices = __riscv_vle32_v_u32m2(index_buf, vl);
+ auto data = __riscv_vluxei32_v_u8mf2(in_buf, indices, vl);
+ __riscv_vse8_v_u8mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei32_v_u8m1() {
+ auto indices = __riscv_vle32_v_u32m4(index_buf, vl);
+ auto data = __riscv_vluxei32_v_u8m1(in_buf, indices, vl);
+ __riscv_vse8_v_u8m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei32_v_u8m2() {
+ auto indices = __riscv_vle32_v_u32m8(index_buf, vl);
+ auto data = __riscv_vluxei32_v_u8m2(in_buf, indices, vl);
+ __riscv_vse8_v_u8m2(out_buf, data, vl);
+}
+
+// Ordered
+__attribute__((used, retain)) void vloxei32_v_u8mf4() {
+ auto indices = __riscv_vle32_v_u32m1(index_buf, vl);
+ auto data = __riscv_vloxei32_v_u8mf4(in_buf, indices, vl);
+ __riscv_vse8_v_u8mf4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei32_v_u8mf2() {
+ auto indices = __riscv_vle32_v_u32m2(index_buf, vl);
+ auto data = __riscv_vloxei32_v_u8mf2(in_buf, indices, vl);
+ __riscv_vse8_v_u8mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei32_v_u8m1() {
+ auto indices = __riscv_vle32_v_u32m4(index_buf, vl);
+ auto data = __riscv_vloxei32_v_u8m1(in_buf, indices, vl);
+ __riscv_vse8_v_u8m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei32_v_u8m2() {
+ auto indices = __riscv_vle32_v_u32m8(index_buf, vl);
+ auto data = __riscv_vloxei32_v_u8m2(in_buf, indices, vl);
+ __riscv_vse8_v_u8m2(out_buf, data, vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vluxei32_v_u8m1;
+
+int main(int argc, char** argv) {
+ impl();
+ return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load8_index8.cc b/tests/cocotb/rvv/load_store/load8_index8.cc
new file mode 100644
index 0000000..6a6d75f
--- /dev/null
+++ b/tests/cocotb/rvv/load_store/load8_index8.cc
@@ -0,0 +1,113 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+namespace {
+constexpr size_t lut_size = 256;
+// Double sized so we can check trailing regions are not read/written.
+constexpr size_t buf_size = 256;
+} // namespace
+
+size_t vl __attribute__((section(".data"))) = 16;
+// Indices are always unsigned.
+uint8_t index_buf[buf_size] __attribute__((section(".data")));
+// These instructions don't differentiate signed/unsigned so we only need to
+// test one. The types come from intrinsic level.
+uint8_t in_buf[lut_size] __attribute__((section(".data")));
+uint8_t out_buf[buf_size] __attribute__((section(".data")));
+
+extern "C" {
+// Unordered
+__attribute__((used, retain)) void vluxei8_v_u8mf4() {
+ auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+ auto data = __riscv_vluxei8_v_u8mf4(in_buf, indices, vl);
+ __riscv_vse8_v_u8mf4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u8mf2() {
+ auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+ auto data = __riscv_vluxei8_v_u8mf2(in_buf, indices, vl);
+ __riscv_vse8_v_u8mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u8m1() {
+ auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+ auto data = __riscv_vluxei8_v_u8m1(in_buf, indices, vl);
+ __riscv_vse8_v_u8m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u8m2() {
+ auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+ auto data = __riscv_vluxei8_v_u8m2(in_buf, indices, vl);
+ __riscv_vse8_v_u8m2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u8m4() {
+ auto indices = __riscv_vle8_v_u8m4(index_buf, vl);
+ auto data = __riscv_vluxei8_v_u8m4(in_buf, indices, vl);
+ __riscv_vse8_v_u8m4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vluxei8_v_u8m8() {
+ auto indices = __riscv_vle8_v_u8m8(index_buf, vl);
+ auto data = __riscv_vluxei8_v_u8m8(in_buf, indices, vl);
+ __riscv_vse8_v_u8m8(out_buf, data, vl);
+}
+
+// Ordered
+__attribute__((used, retain)) void vloxei8_v_u8mf4() {
+ auto indices = __riscv_vle8_v_u8mf4(index_buf, vl);
+ auto data = __riscv_vloxei8_v_u8mf4(in_buf, indices, vl);
+ __riscv_vse8_v_u8mf4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u8mf2() {
+ auto indices = __riscv_vle8_v_u8mf2(index_buf, vl);
+ auto data = __riscv_vloxei8_v_u8mf2(in_buf, indices, vl);
+ __riscv_vse8_v_u8mf2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u8m1() {
+ auto indices = __riscv_vle8_v_u8m1(index_buf, vl);
+ auto data = __riscv_vloxei8_v_u8m1(in_buf, indices, vl);
+ __riscv_vse8_v_u8m1(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u8m2() {
+ auto indices = __riscv_vle8_v_u8m2(index_buf, vl);
+ auto data = __riscv_vloxei8_v_u8m2(in_buf, indices, vl);
+ __riscv_vse8_v_u8m2(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u8m4() {
+ auto indices = __riscv_vle8_v_u8m4(index_buf, vl);
+ auto data = __riscv_vloxei8_v_u8m4(in_buf, indices, vl);
+ __riscv_vse8_v_u8m4(out_buf, data, vl);
+}
+
+__attribute__((used, retain)) void vloxei8_v_u8m8() {
+ auto indices = __riscv_vle8_v_u8m8(index_buf, vl);
+ auto data = __riscv_vloxei8_v_u8m8(in_buf, indices, vl);
+ __riscv_vse8_v_u8m8(out_buf, data, vl);
+}
+}
+
+void (*impl)() __attribute__((section(".data"))) = &vluxei8_v_u8m1;
+
+int main(int argc, char** argv) {
+ impl();
+ return 0;
+}
diff --git a/tests/cocotb/rvv/load_store/load8_indexed_m1.cc b/tests/cocotb/rvv/load_store/load8_indexed_m1.cc
deleted file mode 100644
index 8c6c9a7..0000000
--- a/tests/cocotb/rvv/load_store/load8_indexed_m1.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2025 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <riscv_vector.h>
-#include <stdint.h>
-
-uint8_t input_indices[128] __attribute__((section(".data")));
-uint8_t input_data[4096] __attribute__((section(".data")));
-uint8_t output_data[128] __attribute__((section(".data")));
-
-int main(int argc, char **argv) {
- vuint8m1_t indices = __riscv_vle8_v_u8m1(input_indices, /*vl=*/16);
- vuint8m1_t data = __riscv_vloxei8_v_u8m1(input_data, indices, /*vl=*/16);
- __riscv_vse8_v_u8m1(output_data, data, /*vl=*/16);
-
- return 0;
-}
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py
index 2f8d9d3..843e6e2 100644
--- a/tests/cocotb/rvv_load_store_test.py
+++ b/tests/cocotb/rvv_load_store_test.py
@@ -14,6 +14,7 @@
import cocotb
import numpy as np
+import tqdm
from bazel_tools.tools.python.runfiles import runfiles
from kelvin_test_utils.sim_test_fixture import Fixture
@@ -75,13 +76,14 @@
r = runfiles.Create()
await fixture.load_elf_and_lookup_symbols(
r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
- ['impl', 'vl', 'in_buf', 'out_buf'] + [c['impl'] for c in cases],
+ ['impl', 'vl', 'in_buf', 'out_buf'] +
+ list(set([c['impl'] for c in cases])),
)
min_value = np.iinfo(dtype).min
max_value = np.iinfo(dtype).max + 1 # One above.
rng = np.random.default_rng()
- for c in cases:
+ for c in tqdm.tqdm(cases):
impl = c['impl']
vl = c['vl']
in_size = c['in_size']
@@ -111,10 +113,13 @@
})
assert (actual_outputs == expected_outputs).all(), debug_msg
+
async def vector_load_indexed(
dut,
elf_name: str,
+ cases: list[dict], # keys: impl, vl, in_size, out_size.
dtype,
+ index_dtype,
):
"""RVV load-store test template for indexed loads.
@@ -124,32 +129,47 @@
r = runfiles.Create()
await fixture.load_elf_and_lookup_symbols(
r.Rlocation('kelvin_hw/tests/cocotb/rvv/load_store/' + elf_name),
- ['input_indices', 'input_data', 'output_data'],
+ ['impl', 'vl', 'in_buf', 'out_buf', 'index_buf'] +
+ list(set([c['impl'] for c in cases])),
)
- indices_count = 16 // np.dtype(dtype).itemsize
- in_data_count = 4096 // np.dtype(dtype).itemsize
- out_data_count = 16 // np.dtype(dtype).itemsize
-
min_value = np.iinfo(dtype).min
max_value = np.iinfo(dtype).max + 1 # One above.
rng = np.random.default_rng()
- input_data = rng.integers(min_value, max_value, in_data_count, dtype=dtype)
- input_indices = rng.integers(
- 0, min(max_value, in_data_count-1), indices_count, dtype=dtype)
+ for c in tqdm.tqdm(cases):
+ impl = c['impl']
+ vl = c['vl']
+ in_size = c['in_size']
+ out_size = c['out_size']
- expected_outputs = np.take(input_data, input_indices)
+ # TODO(davidgao): currently assuming the vl is supported.
+ # We'll eventually want to test unsupported vl.
+ indices = rng.integers(0, in_size, vl, dtype=index_dtype)
+ input_data = rng.integers(min_value, max_value, in_size, dtype=dtype)
+ expected_outputs = input_data[indices[:vl]]
+ sbz = np.zeros(out_size - vl, dtype=dtype)
+ expected_outputs = np.concat((expected_outputs, sbz))
- await fixture.write('input_data', input_data)
- await fixture.write('input_indices', input_indices)
- await fixture.write('output_data', np.zeros([out_data_count], dtype=dtype))
+ await fixture.write_ptr('impl', impl)
+ await fixture.write_word('vl', vl)
+ await fixture.write('index_buf', indices)
+ await fixture.write('in_buf', input_data)
+ await fixture.write('out_buf', np.zeros([out_size], dtype=dtype))
- await fixture.run_to_halt()
+ await fixture.run_to_halt()
- actual_outputs = (await fixture.read(
- 'output_data', out_data_count * np.dtype(dtype).itemsize)).view(dtype)
+ actual_outputs = (await fixture.read(
+ 'out_buf', out_size * np.dtype(dtype).itemsize)).view(dtype)
- assert (actual_outputs == expected_outputs).all()
+ debug_msg = str({
+ 'impl': impl,
+ 'input': input_data,
+ 'indices': indices,
+ 'expected': expected_outputs,
+ 'actual': actual_outputs,
+ })
+ assert (actual_outputs == expected_outputs).all(), debug_msg
+
async def vector_store_indexed(
dut,
@@ -223,7 +243,8 @@
await fixture.load_elf_and_lookup_symbols(
r.Rlocation(
'kelvin_hw/tests/cocotb/rvv/load_store/load_store_bits.elf'),
- ['vl', 'in_buf', 'out_buf', 'impl'] + [c['impl'] for c in cases],
+ ['vl', 'in_buf', 'out_buf', 'impl'] +
+ list(set([c['impl'] for c in cases])),
)
rng = np.random.default_rng()
for c in cases:
@@ -460,6 +481,139 @@
@cocotb.test()
+async def load8_index8(dut):
+ """Test vl*xei8_v_u8 usage accessible from intrinsics."""
+ def make_test_case(impl: str, vl: int):
+ return {
+ 'impl': impl,
+ 'vl': vl,
+ 'in_size': 256,
+ 'out_size': vl * 2,
+ }
+
+ await vector_load_indexed(
+ dut = dut,
+ elf_name = 'load8_index8.elf',
+ cases = [
+ # Unordered
+ make_test_case('vluxei8_v_u8mf4', vl = 4),
+ make_test_case('vluxei8_v_u8mf4', vl = 3),
+ make_test_case('vluxei8_v_u8mf2', vl = 8),
+ make_test_case('vluxei8_v_u8mf2', vl = 7),
+ make_test_case('vluxei8_v_u8m1', vl = 16),
+ make_test_case('vluxei8_v_u8m1', vl = 15),
+ make_test_case('vluxei8_v_u8m2', vl = 32),
+ make_test_case('vluxei8_v_u8m2', vl = 31),
+ make_test_case('vluxei8_v_u8m4', vl = 64),
+ make_test_case('vluxei8_v_u8m4', vl = 63),
+ make_test_case('vluxei8_v_u8m8', vl = 128),
+ make_test_case('vluxei8_v_u8m8', vl = 127),
+ # Ordered
+ make_test_case('vloxei8_v_u8mf4', vl = 4),
+ make_test_case('vloxei8_v_u8mf4', vl = 3),
+ make_test_case('vloxei8_v_u8mf2', vl = 8),
+ make_test_case('vloxei8_v_u8mf2', vl = 7),
+ make_test_case('vloxei8_v_u8m1', vl = 16),
+ make_test_case('vloxei8_v_u8m1', vl = 15),
+ make_test_case('vloxei8_v_u8m2', vl = 32),
+ make_test_case('vloxei8_v_u8m2', vl = 31),
+ make_test_case('vloxei8_v_u8m4', vl = 64),
+ make_test_case('vloxei8_v_u8m4', vl = 63),
+ make_test_case('vloxei8_v_u8m8', vl = 128),
+ make_test_case('vloxei8_v_u8m8', vl = 127),
+ ],
+ dtype = np.uint8,
+ index_dtype = np.uint8,
+ )
+
+
+@cocotb.test()
+async def load8_index16(dut):
+ """Test vl*xei16_v_u8 usage accessible from intrinsics."""
+ def make_test_case(impl: str, vl: int):
+ return {
+ 'impl': impl,
+ 'vl': vl,
+ 'in_size': 32000, # DTCM is 32KB
+ 'out_size': vl * 2,
+ }
+
+ await vector_load_indexed(
+ dut = dut,
+ elf_name = 'load8_index16.elf',
+ cases = [
+ # Unordered
+ make_test_case('vluxei16_v_u8mf4', vl = 4),
+ make_test_case('vluxei16_v_u8mf4', vl = 3),
+ make_test_case('vluxei16_v_u8mf2', vl = 8),
+ make_test_case('vluxei16_v_u8mf2', vl = 7),
+ # make_test_case('vluxei16_v_u8m1', vl = 16),
+ # make_test_case('vluxei16_v_u8m1', vl = 15),
+ # make_test_case('vluxei16_v_u8m2', vl = 32),
+ # make_test_case('vluxei16_v_u8m2', vl = 31),
+ # make_test_case('vluxei16_v_u8m4', vl = 64),
+ # make_test_case('vluxei16_v_u8m4', vl = 63),
+ # Ordered
+ make_test_case('vloxei16_v_u8mf4', vl = 4),
+ make_test_case('vloxei16_v_u8mf4', vl = 3),
+ make_test_case('vloxei16_v_u8mf2', vl = 8),
+ make_test_case('vloxei16_v_u8mf2', vl = 7),
+ # make_test_case('vloxei16_v_u8m1', vl = 16),
+ # make_test_case('vloxei16_v_u8m1', vl = 15),
+ # make_test_case('vloxei16_v_u8m2', vl = 32),
+ # make_test_case('vloxei16_v_u8m2', vl = 31),
+ # make_test_case('vloxei16_v_u8m4', vl = 64),
+ # make_test_case('vloxei16_v_u8m4', vl = 63),
+ ],
+ dtype = np.uint8,
+ index_dtype = np.uint16,
+ )
+
+
+@cocotb.test()
+async def load8_index32(dut):
+ """Test vl*xei32_v_u8 usage accessible from intrinsics."""
+ def make_test_case(impl: str, vl: int):
+ return {
+ 'impl': impl,
+ 'vl': vl,
+ 'in_size': 32000, # DTCM is 32KB
+ 'out_size': vl * 2,
+ }
+
+ await vector_load_indexed(
+ dut = dut,
+ elf_name = 'load8_index32.elf',
+ cases = [
+ # Unordered
+ make_test_case('vluxei32_v_u8mf4', vl = 4),
+ make_test_case('vluxei32_v_u8mf4', vl = 3),
+ # make_test_case('vluxei32_v_u8mf2', vl = 8),
+ # make_test_case('vluxei32_v_u8mf2', vl = 7),
+ # make_test_case('vluxei32_v_u8m1', vl = 16),
+ # make_test_case('vluxei32_v_u8m1', vl = 15),
+ # make_test_case('vluxei32_v_u8m2', vl = 32),
+ # make_test_case('vluxei32_v_u8m2', vl = 31),
+ # make_test_case('vluxei32_v_u8m4', vl = 64),
+ # make_test_case('vluxei32_v_u8m4', vl = 63),
+ # Ordered
+ make_test_case('vloxei32_v_u8mf4', vl = 4),
+ make_test_case('vloxei32_v_u8mf4', vl = 3),
+ # make_test_case('vloxei32_v_u8mf2', vl = 8),
+ # make_test_case('vloxei32_v_u8mf2', vl = 7),
+ # make_test_case('vloxei32_v_u8m1', vl = 16),
+ # make_test_case('vloxei32_v_u8m1', vl = 15),
+ # make_test_case('vloxei32_v_u8m2', vl = 32),
+ # make_test_case('vloxei32_v_u8m2', vl = 31),
+ # make_test_case('vloxei32_v_u8m4', vl = 64),
+ # make_test_case('vloxei32_v_u8m4', vl = 63),
+ ],
+ dtype = np.uint8,
+ index_dtype = np.uint32,
+ )
+
+
+@cocotb.test()
async def load16_seg_unit(dut):
"""Test vlseg*e16 usage accessible from intrinsics."""
def make_test_case(impl: str, vl: int, n_segs: int):
@@ -602,14 +756,6 @@
@cocotb.test()
-async def load8_indexed_m1(dut):
- await vector_load_indexed(
- dut = dut,
- elf_name = 'load8_indexed_m1.elf',
- dtype = np.uint8,
- )
-
-@cocotb.test()
async def store8_indexed_m1(dut):
await vector_store_indexed(
dut = dut,