Fix element length for segmented indexed loads. Tests for load8_index16_seg are added. Currently the core hangs at vluxseg2ei16_v_u8m4x2 (index is u16m8). Change-Id: I62a604a7b5880731cd3f1617867b2e9dcf123ed4
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala index cbe4ea3..e62d108 100644 --- a/hdl/chisel/src/kelvin/scalar/Lsu.scala +++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -265,13 +265,13 @@ MuxCase(VecInit.fill(bytesPerSlot)(0.U(32.W)), Seq( // elemWidth validation is done at decode time. - // 8-bit indices. Each byte has its own offset. + // 8-bit data. Each byte has its own offset. (sew === "b000".U) -> VecInit((0 until bytesPerSlot).map( i => (baseAddr + indices_v(i)))), - // 16-bit indices. Each 2-byte element has an offset. + // 16-bit data. Each 2-byte element has an offset. (sew === "b001".U) -> VecInit((0 until bytesPerSlot).map( i => (baseAddr + indices_v(i >> 1) + (i & 1).U))), - // 32-bit indices. Each 4-byte element has an offset. + // 32-bit data. Each 4-byte element has an offset. (sew === "b010".U) -> VecInit((0 until bytesPerSlot).map( i => (baseAddr + indices_v(i >> 2) + (i & 3).U))) )) @@ -653,11 +653,22 @@ ComputeStridedAddrs(bytesPerSlot, uop.addr, uop.data, uop.elemWidth.getOrElse(0.U(3.W))), VecInit((0 until bytesPerSlot).map(i => uop.addr + i.U))) - val unitStride = MuxCase(1.U, Seq( - (uop.elemWidth.get === "b000".U) -> 1.U, // 1-byte elements - (uop.elemWidth.get === "b101".U) -> 2.U, // 2-byte elements - (uop.elemWidth.get === "b110".U) -> 4.U, // 4-byte elements - )) + val unitStride = Mux( + uop.op.isOneOf(LsuOp.VLOAD_OINDEXED, LsuOp.VLOAD_UINDEXED, + LsuOp.VSTORE_OINDEXED, LsuOp.VSTORE_UINDEXED), + // Indexed load. The unit stride also controls segment stride. + MuxCase(1.U, Seq( + (result.sew === "b000".U) -> 1.U, // 1-byte elements + (result.sew === "b001".U) -> 2.U, // 2-byte elements + (result.sew === "b010".U) -> 4.U, // 4-byte elements + )), + // Non-indexed load. + MuxCase(1.U, Seq( + (uop.elemWidth.get === "b000".U) -> 1.U, // 1-byte elements + (uop.elemWidth.get === "b101".U) -> 2.U, // 2-byte elements + (uop.elemWidth.get === "b110".U) -> 4.U, // 4-byte elements + )), + ) result.segmentStride := unitStride result.elemStride := Mux(
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD index b58e098..3cadbae 100644 --- a/tests/cocotb/BUILD +++ b/tests/cocotb/BUILD
@@ -176,6 +176,7 @@ "load8_index8", "load8_index8_seg", "load8_index16", + "load8_index16_seg", "load8_index32", "load8_seg_unit", "load8_stride2_m1",
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD index 9ab3fe9..1de8c1b 100644 --- a/tests/cocotb/rvv/load_store/BUILD +++ b/tests/cocotb/rvv/load_store/BUILD
@@ -32,6 +32,9 @@ "load8_index16": { "srcs": ["load8_index16.cc"], }, + "load8_index16_seg": { + "srcs": ["load8_index16_seg.cc"], + }, "load8_index32": { "srcs": ["load8_index32.cc"], }, @@ -135,6 +138,7 @@ ":load8_index8.elf", ":load8_index8_seg.elf", ":load8_index16.elf", + ":load8_index16_seg.elf", ":load8_index32.elf", ":load8_seg_unit.elf", ":load8_segment2_stride6_m1.elf",
diff --git a/tests/cocotb/rvv/load_store/load8_index16_seg.cc b/tests/cocotb/rvv/load_store/load8_index16_seg.cc new file mode 100644 index 0000000..596404c --- /dev/null +++ b/tests/cocotb/rvv/load_store/load8_index16_seg.cc
@@ -0,0 +1,967 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <riscv_vector.h> +#include <stdint.h> + +namespace { +constexpr size_t lut_size = 30000; +// Double sized so we can check trailing regions are not read/written. +constexpr size_t buf_size = 256; +} // namespace + +size_t vl __attribute__((section(".data"))) = 16; +// Indices are always unsigned. +uint16_t index_buf[buf_size] __attribute__((section(".data"))); +// These instructions don't differentiate signed/unsigned so we only need to +// test one. The types come from intrinsic level. 
+uint8_t in_buf[lut_size] __attribute__((section(".data"))); +uint8_t out_buf[buf_size] __attribute__((section(".data"))); + +extern "C" { +// Unordered, segment 2 +__attribute__((used, retain)) void vluxseg2ei16_v_u8mf4x2() { + vuint16mf2_t indices; + asm("vsetvli zero, %[vl], e16, mf2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl); + auto data = __riscv_vluxseg2ei16_v_u8mf4x2(in_buf, indices, vl); + __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x2_u8mf4(data, 0), vl); + __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x2_u8mf4(data, 1), vl); +} + +__attribute__((used, retain)) void vluxseg2ei16_v_u8mf2x2() { + vuint16m1_t indices; + asm("vsetvli zero, %[vl], e16, m1, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m1(index_buf, vl); + auto data = __riscv_vluxseg2ei16_v_u8mf2x2(in_buf, indices, vl); + __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x2_u8mf2(data, 0), vl); + __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x2_u8mf2(data, 1), vl); +} + +__attribute__((used, retain)) void vluxseg2ei16_v_u8m1x2() { + vuint16m2_t indices; + asm("vsetvli zero, %[vl], e16, m2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m2(index_buf, vl); + auto data = __riscv_vluxseg2ei16_v_u8m1x2(in_buf, indices, vl); + __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x2_u8m1(data, 0), vl); + __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x2_u8m1(data, 1), vl); +} + +__attribute__((used, retain)) void vluxseg2ei16_v_u8m2x2() { + vuint16m4_t indices; + asm("vsetvli 
zero, %[vl], e16, m4, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m4(index_buf, vl); + auto data = __riscv_vluxseg2ei16_v_u8m2x2(in_buf, indices, vl); + __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x2_u8m2(data, 0), vl); + __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x2_u8m2(data, 1), vl); +} + +__attribute__((used, retain)) void vluxseg2ei16_v_u8m4x2() { + vuint16m8_t indices; + asm("vsetvli zero, %[vl], e16, m8, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m8(index_buf, vl); + auto data = __riscv_vluxseg2ei16_v_u8m4x2(in_buf, indices, vl); + __riscv_vse8_v_u8m4(out_buf, __riscv_vget_v_u8m4x2_u8m4(data, 0), vl); + __riscv_vse8_v_u8m4(out_buf + vl, __riscv_vget_v_u8m4x2_u8m4(data, 1), vl); +} + +// // Unordered, segment 3 +__attribute__((used, retain)) void vluxseg3ei16_v_u8mf4x3() { + vuint16mf2_t indices; + asm("vsetvli zero, %[vl], e16, mf2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl); + auto data = __riscv_vluxseg3ei16_v_u8mf4x3(in_buf, indices, vl); + __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x3_u8mf4(data, 0), vl); + __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x3_u8mf4(data, 1), vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x3_u8mf4(data, 2), + vl); +} + +__attribute__((used, retain)) void vluxseg3ei16_v_u8mf2x3() { + vuint16m1_t indices; + asm("vsetvli zero, %[vl], e16, m1, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert 
once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m1(index_buf, vl); + auto data = __riscv_vluxseg3ei16_v_u8mf2x3(in_buf, indices, vl); + __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x3_u8mf2(data, 0), vl); + __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x3_u8mf2(data, 1), vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x3_u8mf2(data, 2), + vl); +} + +__attribute__((used, retain)) void vluxseg3ei16_v_u8m1x3() { + vuint16m2_t indices; + asm("vsetvli zero, %[vl], e16, m2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m2(index_buf, vl); + auto data = __riscv_vluxseg3ei16_v_u8m1x3(in_buf, indices, vl); + __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x3_u8m1(data, 0), vl); + __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x3_u8m1(data, 1), vl); + __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x3_u8m1(data, 2), + vl); +} + +__attribute__((used, retain)) void vluxseg3ei16_v_u8m2x3() { + vuint16m4_t indices; + asm("vsetvli zero, %[vl], e16, m4, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m4(index_buf, vl); + auto data = __riscv_vluxseg3ei16_v_u8m2x3(in_buf, indices, vl); + __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x3_u8m2(data, 0), vl); + __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x3_u8m2(data, 1), vl); + __riscv_vse8_v_u8m2(out_buf + vl * 2, __riscv_vget_v_u8m2x3_u8m2(data, 2), + vl); +} + +// Unordered, segment 4 +__attribute__((used, retain)) void vluxseg4ei16_v_u8mf4x4() { + vuint16mf2_t indices; + asm("vsetvli zero, %[vl], e16, mf2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: 
Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl); + auto data = __riscv_vluxseg4ei16_v_u8mf4x4(in_buf, indices, vl); + __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x4_u8mf4(data, 0), vl); + __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x4_u8mf4(data, 1), vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x4_u8mf4(data, 2), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x4_u8mf4(data, 3), + vl); +} + +__attribute__((used, retain)) void vluxseg4ei16_v_u8mf2x4() { + vuint16m1_t indices; + asm("vsetvli zero, %[vl], e16, m1, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m1(index_buf, vl); + auto data = __riscv_vluxseg4ei16_v_u8mf2x4(in_buf, indices, vl); + __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x4_u8mf2(data, 0), vl); + __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x4_u8mf2(data, 1), vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x4_u8mf2(data, 2), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x4_u8mf2(data, 3), + vl); +} + +__attribute__((used, retain)) void vluxseg4ei16_v_u8m1x4() { + vuint16m2_t indices; + asm("vsetvli zero, %[vl], e16, m2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m2(index_buf, vl); + auto data = __riscv_vluxseg4ei16_v_u8m1x4(in_buf, indices, vl); + __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x4_u8m1(data, 0), vl); + __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x4_u8m1(data, 1), vl); + __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x4_u8m1(data, 2), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x4_u8m1(data, 3), + vl); +} + 
+__attribute__((used, retain)) void vluxseg4ei16_v_u8m2x4() { + vuint16m4_t indices; + asm("vsetvli zero, %[vl], e16, m4, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m4(index_buf, vl); + auto data = __riscv_vluxseg4ei16_v_u8m2x4(in_buf, indices, vl); + __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x4_u8m2(data, 0), vl); + __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x4_u8m2(data, 1), vl); + __riscv_vse8_v_u8m2(out_buf + vl * 2, __riscv_vget_v_u8m2x4_u8m2(data, 2), + vl); + __riscv_vse8_v_u8m2(out_buf + vl * 3, __riscv_vget_v_u8m2x4_u8m2(data, 3), + vl); +} + +// Unordered, segment 5 +__attribute__((used, retain)) void vluxseg5ei16_v_u8mf4x5() { + vuint16mf2_t indices; + asm("vsetvli zero, %[vl], e16, mf2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl); + auto data = __riscv_vluxseg5ei16_v_u8mf4x5(in_buf, indices, vl); + __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x5_u8mf4(data, 0), vl); + __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x5_u8mf4(data, 1), vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x5_u8mf4(data, 2), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x5_u8mf4(data, 3), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x5_u8mf4(data, 4), + vl); +} + +__attribute__((used, retain)) void vluxseg5ei16_v_u8mf2x5() { + vuint16m1_t indices; + asm("vsetvli zero, %[vl], e16, m1, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m1(index_buf, vl); + auto data = 
__riscv_vluxseg5ei16_v_u8mf2x5(in_buf, indices, vl); + __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x5_u8mf2(data, 0), vl); + __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x5_u8mf2(data, 1), vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x5_u8mf2(data, 2), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x5_u8mf2(data, 3), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x5_u8mf2(data, 4), + vl); +} + +__attribute__((used, retain)) void vluxseg5ei16_v_u8m1x5() { + vuint16m2_t indices; + asm("vsetvli zero, %[vl], e16, m2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m2(index_buf, vl); + auto data = __riscv_vluxseg5ei16_v_u8m1x5(in_buf, indices, vl); + __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x5_u8m1(data, 0), vl); + __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x5_u8m1(data, 1), vl); + __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x5_u8m1(data, 2), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x5_u8m1(data, 3), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x5_u8m1(data, 4), + vl); +} + +// Unordered, segment 6 +__attribute__((used, retain)) void vluxseg6ei16_v_u8mf4x6() { + vuint16mf2_t indices; + asm("vsetvli zero, %[vl], e16, mf2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl); + auto data = __riscv_vluxseg6ei16_v_u8mf4x6(in_buf, indices, vl); + __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x6_u8mf4(data, 0), vl); + __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x6_u8mf4(data, 1), vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x6_u8mf4(data, 2), + vl); + 
__riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x6_u8mf4(data, 3), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x6_u8mf4(data, 4), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x6_u8mf4(data, 5), + vl); +} + +__attribute__((used, retain)) void vluxseg6ei16_v_u8mf2x6() { + vuint16m1_t indices; + asm("vsetvli zero, %[vl], e16, m1, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m1(index_buf, vl); + auto data = __riscv_vluxseg6ei16_v_u8mf2x6(in_buf, indices, vl); + __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x6_u8mf2(data, 0), vl); + __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x6_u8mf2(data, 1), vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x6_u8mf2(data, 2), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x6_u8mf2(data, 3), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x6_u8mf2(data, 4), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x6_u8mf2(data, 5), + vl); +} + +__attribute__((used, retain)) void vluxseg6ei16_v_u8m1x6() { + vuint16m2_t indices; + asm("vsetvli zero, %[vl], e16, m2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m2(index_buf, vl); + auto data = __riscv_vluxseg6ei16_v_u8m1x6(in_buf, indices, vl); + __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x6_u8m1(data, 0), vl); + __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x6_u8m1(data, 1), vl); + __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x6_u8m1(data, 2), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x6_u8m1(data, 3), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x6_u8m1(data, 
4), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x6_u8m1(data, 5), + vl); +} + +// Unordered, segment 7 +__attribute__((used, retain)) void vluxseg7ei16_v_u8mf4x7() { + vuint16mf2_t indices; + asm("vsetvli zero, %[vl], e16, mf2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl); + auto data = __riscv_vluxseg7ei16_v_u8mf4x7(in_buf, indices, vl); + __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x7_u8mf4(data, 0), vl); + __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x7_u8mf4(data, 1), vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x7_u8mf4(data, 2), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x7_u8mf4(data, 3), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x7_u8mf4(data, 4), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x7_u8mf4(data, 5), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 6, __riscv_vget_v_u8mf4x7_u8mf4(data, 6), + vl); +} + +__attribute__((used, retain)) void vluxseg7ei16_v_u8mf2x7() { + vuint16m1_t indices; + asm("vsetvli zero, %[vl], e16, m1, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m1(index_buf, vl); + auto data = __riscv_vluxseg7ei16_v_u8mf2x7(in_buf, indices, vl); + __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x7_u8mf2(data, 0), vl); + __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x7_u8mf2(data, 1), vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x7_u8mf2(data, 2), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x7_u8mf2(data, 3), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x7_u8mf2(data, 4), + vl); + 
__riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x7_u8mf2(data, 5), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 6, __riscv_vget_v_u8mf2x7_u8mf2(data, 6), + vl); +} + +__attribute__((used, retain)) void vluxseg7ei16_v_u8m1x7() { + vuint16m2_t indices; + asm("vsetvli zero, %[vl], e16, m2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m2(index_buf, vl); + auto data = __riscv_vluxseg7ei16_v_u8m1x7(in_buf, indices, vl); + __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x7_u8m1(data, 0), vl); + __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x7_u8m1(data, 1), vl); + __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x7_u8m1(data, 2), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x7_u8m1(data, 3), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x7_u8m1(data, 4), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x7_u8m1(data, 5), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 6, __riscv_vget_v_u8m1x7_u8m1(data, 6), + vl); +} + +// Unordered, segment 8 +__attribute__((used, retain)) void vluxseg8ei16_v_u8mf4x8() { + vuint16mf2_t indices; + asm("vsetvli zero, %[vl], e16, mf2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl); + auto data = __riscv_vluxseg8ei16_v_u8mf4x8(in_buf, indices, vl); + __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x8_u8mf4(data, 0), vl); + __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x8_u8mf4(data, 1), vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x8_u8mf4(data, 2), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x8_u8mf4(data, 3), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 4, 
__riscv_vget_v_u8mf4x8_u8mf4(data, 4), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x8_u8mf4(data, 5), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 6, __riscv_vget_v_u8mf4x8_u8mf4(data, 6), + vl); + __riscv_vse8_v_u8mf4(out_buf + vl * 7, __riscv_vget_v_u8mf4x8_u8mf4(data, 7), + vl); +} + +__attribute__((used, retain)) void vluxseg8ei16_v_u8mf2x8() { + vuint16m1_t indices; + asm("vsetvli zero, %[vl], e16, m1, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m1(index_buf, vl); + auto data = __riscv_vluxseg8ei16_v_u8mf2x8(in_buf, indices, vl); + __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x8_u8mf2(data, 0), vl); + __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x8_u8mf2(data, 1), vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x8_u8mf2(data, 2), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x8_u8mf2(data, 3), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x8_u8mf2(data, 4), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x8_u8mf2(data, 5), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 6, __riscv_vget_v_u8mf2x8_u8mf2(data, 6), + vl); + __riscv_vse8_v_u8mf2(out_buf + vl * 7, __riscv_vget_v_u8mf2x8_u8mf2(data, 7), + vl); +} + +__attribute__((used, retain)) void vluxseg8ei16_v_u8m1x8() { + vuint16m2_t indices; + asm("vsetvli zero, %[vl], e16, m2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m2(index_buf, vl); + auto data = __riscv_vluxseg8ei16_v_u8m1x8(in_buf, indices, vl); + __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x8_u8m1(data, 0), vl); + __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x8_u8m1(data, 1), vl); + 
__riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x8_u8m1(data, 2), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x8_u8m1(data, 3), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x8_u8m1(data, 4), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x8_u8m1(data, 5), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 6, __riscv_vget_v_u8m1x8_u8m1(data, 6), + vl); + __riscv_vse8_v_u8m1(out_buf + vl * 7, __riscv_vget_v_u8m1x8_u8m1(data, 7), + vl); +} + +// Ordered, segment 2 +__attribute__((used, retain)) void vloxseg2ei16_v_u8mf4x2() { + vuint16mf2_t indices; + asm("vsetvli zero, %[vl], e16, mf2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl); + auto data = __riscv_vloxseg2ei16_v_u8mf4x2(in_buf, indices, vl); + __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x2_u8mf4(data, 0), vl); + __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x2_u8mf4(data, 1), vl); +} + +__attribute__((used, retain)) void vloxseg2ei16_v_u8mf2x2() { + vuint16m1_t indices; + asm("vsetvli zero, %[vl], e16, m1, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m1(index_buf, vl); + auto data = __riscv_vloxseg2ei16_v_u8mf2x2(in_buf, indices, vl); + __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x2_u8mf2(data, 0), vl); + __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x2_u8mf2(data, 1), vl); +} + +__attribute__((used, retain)) void vloxseg2ei16_v_u8m1x2() { + vuint16m2_t indices; + asm("vsetvli zero, %[vl], e16, m2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices 
= __riscv_vle16_v_u16m2(index_buf, vl); + auto data = __riscv_vloxseg2ei16_v_u8m1x2(in_buf, indices, vl); + __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x2_u8m1(data, 0), vl); + __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x2_u8m1(data, 1), vl); +} + +__attribute__((used, retain)) void vloxseg2ei16_v_u8m2x2() { + vuint16m4_t indices; + asm("vsetvli zero, %[vl], e16, m4, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m4(index_buf, vl); + auto data = __riscv_vloxseg2ei16_v_u8m2x2(in_buf, indices, vl); + __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x2_u8m2(data, 0), vl); + __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x2_u8m2(data, 1), vl); +} + +__attribute__((used, retain)) void vloxseg2ei16_v_u8m4x2() { + vuint16m8_t indices; + asm("vsetvli zero, %[vl], e16, m8, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16m8(index_buf, vl); + auto data = __riscv_vloxseg2ei16_v_u8m4x2(in_buf, indices, vl); + __riscv_vse8_v_u8m4(out_buf, __riscv_vget_v_u8m4x2_u8m4(data, 0), vl); + __riscv_vse8_v_u8m4(out_buf + vl, __riscv_vget_v_u8m4x2_u8m4(data, 1), vl); +} + +// // Ordered, segment 3 +__attribute__((used, retain)) void vloxseg3ei16_v_u8mf4x3() { + vuint16mf2_t indices; + asm("vsetvli zero, %[vl], e16, mf2, ta, ma;" + "vle16.v %[index], %[index_buf];" + : [index] "=vr"(indices) + : [vl] "r"(vl), [index_buf] "m"(index_buf)); + // TODO: Revert once compiler bug is eliminated + // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl); + auto data = __riscv_vloxseg3ei16_v_u8mf4x3(in_buf, indices, vl); + __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x3_u8mf4(data, 0), vl); + __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x3_u8mf4(data, 
1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x3_u8mf4(data, 2),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg3ei16_v_u8mf2x3() {
+  vuint16m1_t indices;  // e16 index LMUL (m1) is 2x the e8 data LMUL (mf2)
+  asm("vsetvli zero, %[vl], e16, m1, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg3ei16_v_u8mf2x3(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x3_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x3_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x3_u8mf2(data, 2),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg3ei16_v_u8m1x3() {
+  vuint16m2_t indices;  // e16 index LMUL (m2) is 2x the e8 data LMUL (m1)
+  asm("vsetvli zero, %[vl], e16, m2, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vloxseg3ei16_v_u8m1x3(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x3_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x3_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x3_u8m1(data, 2),
+                      vl);
+}
+
+__attribute__((used, retain)) void vloxseg3ei16_v_u8m2x3() {
+  vuint16m4_t indices;  // e16 index LMUL (m4) is 2x the e8 data LMUL (m2)
+  asm("vsetvli zero, %[vl], e16, m4, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m4(index_buf, vl);
+  auto data = __riscv_vloxseg3ei16_v_u8m2x3(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x3_u8m2(data, 0), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x3_u8m2(data, 1), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl * 2, __riscv_vget_v_u8m2x3_u8m2(data, 2),
+                      vl);
+}
+
+// Ordered (vloxseg4ei16), 4 fields per segment
+__attribute__((used, retain)) void vloxseg4ei16_v_u8mf4x4() {
+  vuint16mf2_t indices;  // e16 index LMUL (mf2) is 2x the e8 data LMUL (mf4)
+  asm("vsetvli zero, %[vl], e16, mf2, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxseg4ei16_v_u8mf4x4(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x4_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x4_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x4_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x4_u8mf4(data, 3),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg4ei16_v_u8mf2x4() {
+  vuint16m1_t indices;  // e16 index LMUL (m1) is 2x the e8 data LMUL (mf2)
+  asm("vsetvli zero, %[vl], e16, m1, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg4ei16_v_u8mf2x4(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x4_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x4_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x4_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x4_u8mf2(data, 3),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg4ei16_v_u8m1x4() {
+  vuint16m2_t indices;  // e16 index LMUL (m2) is 2x the e8 data LMUL (m1)
+  asm("vsetvli zero, %[vl], e16, m2, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vloxseg4ei16_v_u8m1x4(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x4_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x4_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x4_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x4_u8m1(data, 3),
+                      vl);
+}
+
+__attribute__((used, retain)) void vloxseg4ei16_v_u8m2x4() {
+  vuint16m4_t indices;  // e16 index LMUL (m4) is 2x the e8 data LMUL (m2)
+  asm("vsetvli zero, %[vl], e16, m4, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m4(index_buf, vl);
+  auto data = __riscv_vloxseg4ei16_v_u8m2x4(in_buf, indices, vl);
+  __riscv_vse8_v_u8m2(out_buf, __riscv_vget_v_u8m2x4_u8m2(data, 0), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl, __riscv_vget_v_u8m2x4_u8m2(data, 1), vl);
+  __riscv_vse8_v_u8m2(out_buf + vl * 2, __riscv_vget_v_u8m2x4_u8m2(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m2(out_buf + vl * 3, __riscv_vget_v_u8m2x4_u8m2(data, 3),
+                      vl);
+}
+
+// Ordered (vloxseg5ei16), 5 fields per segment
+__attribute__((used, retain)) void vloxseg5ei16_v_u8mf4x5() {
+  vuint16mf2_t indices;  // e16 index LMUL (mf2) is 2x the e8 data LMUL (mf4)
+  asm("vsetvli zero, %[vl], e16, mf2, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxseg5ei16_v_u8mf4x5(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x5_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x5_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x5_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x5_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x5_u8mf4(data, 4),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg5ei16_v_u8mf2x5() {
+  vuint16m1_t indices;  // e16 index LMUL (m1) is 2x the e8 data LMUL (mf2)
+  asm("vsetvli zero, %[vl], e16, m1, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg5ei16_v_u8mf2x5(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x5_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x5_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x5_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x5_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x5_u8mf2(data, 4),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg5ei16_v_u8m1x5() {
+  vuint16m2_t indices;  // e16 index LMUL (m2) is 2x the e8 data LMUL (m1)
+  asm("vsetvli zero, %[vl], e16, m2, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vloxseg5ei16_v_u8m1x5(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x5_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x5_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x5_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x5_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x5_u8m1(data, 4),
+                      vl);
+}
+
+// Ordered (vloxseg6ei16), 6 fields per segment
+__attribute__((used, retain)) void vloxseg6ei16_v_u8mf4x6() {
+  vuint16mf2_t indices;  // e16 index LMUL (mf2) is 2x the e8 data LMUL (mf4)
+  asm("vsetvli zero, %[vl], e16, mf2, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxseg6ei16_v_u8mf4x6(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x6_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x6_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x6_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x6_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x6_u8mf4(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x6_u8mf4(data, 5),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg6ei16_v_u8mf2x6() {
+  vuint16m1_t indices;  // e16 index LMUL (m1) is 2x the e8 data LMUL (mf2)
+  asm("vsetvli zero, %[vl], e16, m1, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg6ei16_v_u8mf2x6(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x6_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x6_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x6_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x6_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x6_u8mf2(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x6_u8mf2(data, 5),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg6ei16_v_u8m1x6() {
+  vuint16m2_t indices;  // e16 index LMUL (m2) is 2x the e8 data LMUL (m1)
+  asm("vsetvli zero, %[vl], e16, m2, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vloxseg6ei16_v_u8m1x6(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x6_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x6_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x6_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x6_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x6_u8m1(data, 4),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x6_u8m1(data, 5),
+                      vl);
+}
+
+// Ordered (vloxseg7ei16), 7 fields per segment
+__attribute__((used, retain)) void vloxseg7ei16_v_u8mf4x7() {
+  vuint16mf2_t indices;  // e16 index LMUL (mf2) is 2x the e8 data LMUL (mf4)
+  asm("vsetvli zero, %[vl], e16, mf2, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxseg7ei16_v_u8mf4x7(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x7_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x7_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x7_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x7_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x7_u8mf4(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x7_u8mf4(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 6, __riscv_vget_v_u8mf4x7_u8mf4(data, 6),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg7ei16_v_u8mf2x7() {
+  vuint16m1_t indices;  // e16 index LMUL (m1) is 2x the e8 data LMUL (mf2)
+  asm("vsetvli zero, %[vl], e16, m1, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg7ei16_v_u8mf2x7(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x7_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x7_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x7_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x7_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x7_u8mf2(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x7_u8mf2(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 6, __riscv_vget_v_u8mf2x7_u8mf2(data, 6),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg7ei16_v_u8m1x7() {
+  vuint16m2_t indices;  // e16 index LMUL (m2) is 2x the e8 data LMUL (m1)
+  asm("vsetvli zero, %[vl], e16, m2, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vloxseg7ei16_v_u8m1x7(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x7_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x7_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x7_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x7_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x7_u8m1(data, 4),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x7_u8m1(data, 5),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 6, __riscv_vget_v_u8m1x7_u8m1(data, 6),
+                      vl);
+}
+
+// Ordered (vloxseg8ei16), 8 fields per segment
+__attribute__((used, retain)) void vloxseg8ei16_v_u8mf4x8() {
+  vuint16mf2_t indices;  // e16 index LMUL (mf2) is 2x the e8 data LMUL (mf4)
+  asm("vsetvli zero, %[vl], e16, mf2, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16mf2(index_buf, vl);
+  auto data = __riscv_vloxseg8ei16_v_u8mf4x8(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf4(out_buf, __riscv_vget_v_u8mf4x8_u8mf4(data, 0), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl, __riscv_vget_v_u8mf4x8_u8mf4(data, 1), vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 2, __riscv_vget_v_u8mf4x8_u8mf4(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 3, __riscv_vget_v_u8mf4x8_u8mf4(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 4, __riscv_vget_v_u8mf4x8_u8mf4(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 5, __riscv_vget_v_u8mf4x8_u8mf4(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 6, __riscv_vget_v_u8mf4x8_u8mf4(data, 6),
+                       vl);
+  __riscv_vse8_v_u8mf4(out_buf + vl * 7, __riscv_vget_v_u8mf4x8_u8mf4(data, 7),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg8ei16_v_u8mf2x8() {
+  vuint16m1_t indices;  // e16 index LMUL (m1) is 2x the e8 data LMUL (mf2)
+  asm("vsetvli zero, %[vl], e16, m1, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m1(index_buf, vl);
+  auto data = __riscv_vloxseg8ei16_v_u8mf2x8(in_buf, indices, vl);
+  __riscv_vse8_v_u8mf2(out_buf, __riscv_vget_v_u8mf2x8_u8mf2(data, 0), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl, __riscv_vget_v_u8mf2x8_u8mf2(data, 1), vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 2, __riscv_vget_v_u8mf2x8_u8mf2(data, 2),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 3, __riscv_vget_v_u8mf2x8_u8mf2(data, 3),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 4, __riscv_vget_v_u8mf2x8_u8mf2(data, 4),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 5, __riscv_vget_v_u8mf2x8_u8mf2(data, 5),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 6, __riscv_vget_v_u8mf2x8_u8mf2(data, 6),
+                       vl);
+  __riscv_vse8_v_u8mf2(out_buf + vl * 7, __riscv_vget_v_u8mf2x8_u8mf2(data, 7),
+                       vl);
+}
+
+__attribute__((used, retain)) void vloxseg8ei16_v_u8m1x8() {
+  vuint16m2_t indices;  // e16 index LMUL (m2) is 2x the e8 data LMUL (m1)
+  asm("vsetvli zero, %[vl], e16, m2, ta, ma;"
+      "vle16.v %[index], %[index_buf];"
+      : [index] "=vr"(indices)
+      : [vl] "r"(vl), [index_buf] "m"(index_buf));
+  // TODO: Revert the asm to this intrinsic once the compiler bug is fixed.
+  // auto indices = __riscv_vle16_v_u16m2(index_buf, vl);
+  auto data = __riscv_vloxseg8ei16_v_u8m1x8(in_buf, indices, vl);
+  __riscv_vse8_v_u8m1(out_buf, __riscv_vget_v_u8m1x8_u8m1(data, 0), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl, __riscv_vget_v_u8m1x8_u8m1(data, 1), vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 2, __riscv_vget_v_u8m1x8_u8m1(data, 2),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 3, __riscv_vget_v_u8m1x8_u8m1(data, 3),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 4, __riscv_vget_v_u8m1x8_u8m1(data, 4),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 5, __riscv_vget_v_u8m1x8_u8m1(data, 5),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 6, __riscv_vget_v_u8m1x8_u8m1(data, 6),
+                      vl);
+  __riscv_vse8_v_u8m1(out_buf + vl * 7, __riscv_vget_v_u8m1x8_u8m1(data, 7),
+                      vl);
+}
+}  // closes a block opened above this chunk (presumably extern "C") -- confirm
+// Kernel that main() calls; kept in .data, presumably patched per test -- confirm
+void (*impl)() __attribute__((section(".data"))) = &vluxseg2ei16_v_u8m1x2;
+
+int main(int argc, char** argv) {
+  impl();  // run whichever kernel impl currently points at
+  return 0;
+}
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py index a8e44f9..9d12c63 100644 --- a/tests/cocotb/rvv_load_store_test.py +++ b/tests/cocotb/rvv_load_store_test.py
@@ -681,6 +681,185 @@
 
 
 @cocotb.test()
+async def load8_index16(dut):
+    """Test vl*xei16_v_u8 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'in_bytes': 32000,  # DTCM is 32KB
+            'out_size': vl * 2,
+        }
+
+    await vector_load_indexed(
+        dut = dut,
+        elf_name = 'load8_index16.elf',
+        cases = [  # each impl runs at two vl values, n and n - 1
+            # Unordered (vluxei16)
+            make_test_case('vluxei16_v_u8mf4', vl = 4),
+            make_test_case('vluxei16_v_u8mf4', vl = 3),
+            make_test_case('vluxei16_v_u8mf2', vl = 8),
+            make_test_case('vluxei16_v_u8mf2', vl = 7),
+            make_test_case('vluxei16_v_u8m1', vl = 16),
+            make_test_case('vluxei16_v_u8m1', vl = 15),
+            make_test_case('vluxei16_v_u8m2', vl = 32),
+            make_test_case('vluxei16_v_u8m2', vl = 31),
+            make_test_case('vluxei16_v_u8m4', vl = 64),
+            make_test_case('vluxei16_v_u8m4', vl = 63),
+            # Ordered (vloxei16)
+            make_test_case('vloxei16_v_u8mf4', vl = 4),
+            make_test_case('vloxei16_v_u8mf4', vl = 3),
+            make_test_case('vloxei16_v_u8mf2', vl = 8),
+            make_test_case('vloxei16_v_u8mf2', vl = 7),
+            make_test_case('vloxei16_v_u8m1', vl = 16),
+            make_test_case('vloxei16_v_u8m1', vl = 15),
+            make_test_case('vloxei16_v_u8m2', vl = 32),
+            make_test_case('vloxei16_v_u8m2', vl = 31),
+            make_test_case('vloxei16_v_u8m4', vl = 64),
+            make_test_case('vloxei16_v_u8m4', vl = 63),
+        ],
+        dtype = np.uint8,  # element type of the loaded data
+        index_dtype = np.uint16,  # element type of the index vector
+    )
+
+
+@cocotb.test()
+async def load8_index16_seg(dut):
+    """Test vl*xseg*ei16_v_u8 usage accessible from intrinsics."""
+    def make_test_case(impl: str, vl: int, n_segs: int):
+        return {
+            'impl': impl,
+            'vl': vl,
+            'segments': n_segs,
+            'in_bytes': 30000,
+            'out_size': vl * n_segs * 2,
+        }
+
+    await vector_load_segmented_indexed(
+        dut = dut,
+        elf_name = 'load8_index16_seg.elf',
+        cases = [  # each impl runs at two vl values, n and n - 1
+            # Unordered, segment 2 (m4 disabled: core hangs, index is u16m8)
+            make_test_case('vluxseg2ei16_v_u8mf4x2', vl=4, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u8mf4x2', vl=3, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u8mf2x2', vl=8, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u8mf2x2', vl=7, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u8m1x2', vl=16, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u8m1x2', vl=15, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u8m2x2', vl=32, n_segs=2),
+            make_test_case('vluxseg2ei16_v_u8m2x2', vl=31, n_segs=2),
+            # make_test_case('vluxseg2ei16_v_u8m4x2', vl=64, n_segs=2),
+            # make_test_case('vluxseg2ei16_v_u8m4x2', vl=63, n_segs=2),
+            # Unordered, segment 3 (m2 disabled; same hang? TODO confirm)
+            make_test_case('vluxseg3ei16_v_u8mf4x3', vl=4, n_segs=3),
+            make_test_case('vluxseg3ei16_v_u8mf4x3', vl=3, n_segs=3),
+            make_test_case('vluxseg3ei16_v_u8mf2x3', vl=8, n_segs=3),
+            make_test_case('vluxseg3ei16_v_u8mf2x3', vl=7, n_segs=3),
+            make_test_case('vluxseg3ei16_v_u8m1x3', vl=16, n_segs=3),
+            make_test_case('vluxseg3ei16_v_u8m1x3', vl=15, n_segs=3),
+            # make_test_case('vluxseg3ei16_v_u8m2x3', vl=32, n_segs=3),
+            # make_test_case('vluxseg3ei16_v_u8m2x3', vl=31, n_segs=3),
+            # Unordered, segment 4 (m2 disabled; same hang? TODO confirm)
+            make_test_case('vluxseg4ei16_v_u8mf4x4', vl=4, n_segs=4),
+            make_test_case('vluxseg4ei16_v_u8mf4x4', vl=3, n_segs=4),
+            make_test_case('vluxseg4ei16_v_u8mf2x4', vl=8, n_segs=4),
+            make_test_case('vluxseg4ei16_v_u8mf2x4', vl=7, n_segs=4),
+            make_test_case('vluxseg4ei16_v_u8m1x4', vl=16, n_segs=4),
+            make_test_case('vluxseg4ei16_v_u8m1x4', vl=15, n_segs=4),
+            # make_test_case('vluxseg4ei16_v_u8m2x4', vl=32, n_segs=4),
+            # make_test_case('vluxseg4ei16_v_u8m2x4', vl=31, n_segs=4),
+            # Unordered, segment 5 (m1 disabled; same hang? TODO confirm)
+            make_test_case('vluxseg5ei16_v_u8mf4x5', vl=4, n_segs=5),
+            make_test_case('vluxseg5ei16_v_u8mf4x5', vl=3, n_segs=5),
+            make_test_case('vluxseg5ei16_v_u8mf2x5', vl=8, n_segs=5),
+            make_test_case('vluxseg5ei16_v_u8mf2x5', vl=7, n_segs=5),
+            # make_test_case('vluxseg5ei16_v_u8m1x5', vl=16, n_segs=5),
+            # make_test_case('vluxseg5ei16_v_u8m1x5', vl=15, n_segs=5),
+            # Unordered, segment 6 (m1 disabled; same hang? TODO confirm)
+            make_test_case('vluxseg6ei16_v_u8mf4x6', vl=4, n_segs=6),
+            make_test_case('vluxseg6ei16_v_u8mf4x6', vl=3, n_segs=6),
+            make_test_case('vluxseg6ei16_v_u8mf2x6', vl=8, n_segs=6),
+            make_test_case('vluxseg6ei16_v_u8mf2x6', vl=7, n_segs=6),
+            # make_test_case('vluxseg6ei16_v_u8m1x6', vl=16, n_segs=6),
+            # make_test_case('vluxseg6ei16_v_u8m1x6', vl=15, n_segs=6),
+            # Unordered, segment 7 (m1 disabled; same hang? TODO confirm)
+            make_test_case('vluxseg7ei16_v_u8mf4x7', vl=4, n_segs=7),
+            make_test_case('vluxseg7ei16_v_u8mf4x7', vl=3, n_segs=7),
+            make_test_case('vluxseg7ei16_v_u8mf2x7', vl=8, n_segs=7),
+            make_test_case('vluxseg7ei16_v_u8mf2x7', vl=7, n_segs=7),
+            # make_test_case('vluxseg7ei16_v_u8m1x7', vl=16, n_segs=7),
+            # make_test_case('vluxseg7ei16_v_u8m1x7', vl=15, n_segs=7),
+            # Unordered, segment 8 (m1 disabled; same hang? TODO confirm)
+            make_test_case('vluxseg8ei16_v_u8mf4x8', vl=4, n_segs=8),
+            make_test_case('vluxseg8ei16_v_u8mf4x8', vl=3, n_segs=8),
+            make_test_case('vluxseg8ei16_v_u8mf2x8', vl=8, n_segs=8),
+            make_test_case('vluxseg8ei16_v_u8mf2x8', vl=7, n_segs=8),
+            # make_test_case('vluxseg8ei16_v_u8m1x8', vl=16, n_segs=8),
+            # make_test_case('vluxseg8ei16_v_u8m1x8', vl=15, n_segs=8),
+            # Ordered, segment 2 (m4 disabled; same hang? TODO confirm)
+            make_test_case('vloxseg2ei16_v_u8mf4x2', vl=4, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u8mf4x2', vl=3, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u8mf2x2', vl=8, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u8mf2x2', vl=7, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u8m1x2', vl=16, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u8m1x2', vl=15, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u8m2x2', vl=32, n_segs=2),
+            make_test_case('vloxseg2ei16_v_u8m2x2', vl=31, n_segs=2),
+            # make_test_case('vloxseg2ei16_v_u8m4x2', vl=64, n_segs=2),
+            # make_test_case('vloxseg2ei16_v_u8m4x2', vl=63, n_segs=2),
+            # Ordered, segment 3 (m2 disabled; same hang? TODO confirm)
+            make_test_case('vloxseg3ei16_v_u8mf4x3', vl=4, n_segs=3),
+            make_test_case('vloxseg3ei16_v_u8mf4x3', vl=3, n_segs=3),
+            make_test_case('vloxseg3ei16_v_u8mf2x3', vl=8, n_segs=3),
+            make_test_case('vloxseg3ei16_v_u8mf2x3', vl=7, n_segs=3),
+            make_test_case('vloxseg3ei16_v_u8m1x3', vl=16, n_segs=3),
+            make_test_case('vloxseg3ei16_v_u8m1x3', vl=15, n_segs=3),
+            # make_test_case('vloxseg3ei16_v_u8m2x3', vl=32, n_segs=3),
+            # make_test_case('vloxseg3ei16_v_u8m2x3', vl=31, n_segs=3),
+            # Ordered, segment 4 (m2 disabled; same hang? TODO confirm)
+            make_test_case('vloxseg4ei16_v_u8mf4x4', vl=4, n_segs=4),
+            make_test_case('vloxseg4ei16_v_u8mf4x4', vl=3, n_segs=4),
+            make_test_case('vloxseg4ei16_v_u8mf2x4', vl=8, n_segs=4),
+            make_test_case('vloxseg4ei16_v_u8mf2x4', vl=7, n_segs=4),
+            make_test_case('vloxseg4ei16_v_u8m1x4', vl=16, n_segs=4),
+            make_test_case('vloxseg4ei16_v_u8m1x4', vl=15, n_segs=4),
+            # make_test_case('vloxseg4ei16_v_u8m2x4', vl=32, n_segs=4),
+            # make_test_case('vloxseg4ei16_v_u8m2x4', vl=31, n_segs=4),
+            # Ordered, segment 5 (m1 disabled; same hang? TODO confirm)
+            make_test_case('vloxseg5ei16_v_u8mf4x5', vl=4, n_segs=5),
+            make_test_case('vloxseg5ei16_v_u8mf4x5', vl=3, n_segs=5),
+            make_test_case('vloxseg5ei16_v_u8mf2x5', vl=8, n_segs=5),
+            make_test_case('vloxseg5ei16_v_u8mf2x5', vl=7, n_segs=5),
+            # make_test_case('vloxseg5ei16_v_u8m1x5', vl=16, n_segs=5),
+            # make_test_case('vloxseg5ei16_v_u8m1x5', vl=15, n_segs=5),
+            # Ordered, segment 6 (m1 disabled; same hang? TODO confirm)
+            make_test_case('vloxseg6ei16_v_u8mf4x6', vl=4, n_segs=6),
+            make_test_case('vloxseg6ei16_v_u8mf4x6', vl=3, n_segs=6),
+            make_test_case('vloxseg6ei16_v_u8mf2x6', vl=8, n_segs=6),
+            make_test_case('vloxseg6ei16_v_u8mf2x6', vl=7, n_segs=6),
+            # make_test_case('vloxseg6ei16_v_u8m1x6', vl=16, n_segs=6),
+            # make_test_case('vloxseg6ei16_v_u8m1x6', vl=15, n_segs=6),
+            # Ordered, segment 7 (m1 disabled; same hang? TODO confirm)
+            make_test_case('vloxseg7ei16_v_u8mf4x7', vl=4, n_segs=7),
+            make_test_case('vloxseg7ei16_v_u8mf4x7', vl=3, n_segs=7),
+            make_test_case('vloxseg7ei16_v_u8mf2x7', vl=8, n_segs=7),
+            make_test_case('vloxseg7ei16_v_u8mf2x7', vl=7, n_segs=7),
+            # make_test_case('vloxseg7ei16_v_u8m1x7', vl=16, n_segs=7),
+            # make_test_case('vloxseg7ei16_v_u8m1x7', vl=15, n_segs=7),
+            # Ordered, segment 8 (m1 disabled; same hang? TODO confirm)
+            make_test_case('vloxseg8ei16_v_u8mf4x8', vl=4, n_segs=8),
+            make_test_case('vloxseg8ei16_v_u8mf4x8', vl=3, n_segs=8),
+            make_test_case('vloxseg8ei16_v_u8mf2x8', vl=8, n_segs=8),
+            make_test_case('vloxseg8ei16_v_u8mf2x8', vl=7, n_segs=8),
+            # make_test_case('vloxseg8ei16_v_u8m1x8', vl=16, n_segs=8),
+            # make_test_case('vloxseg8ei16_v_u8m1x8', vl=15, n_segs=8),
+        ],
+        dtype = np.uint8,  # element type of the loaded data
+        index_dtype = np.uint16,  # element type of the index vector
+    )
+
+
+@cocotb.test()
 async def load8_seg_unit(dut):
     """Test vlseg*e8 usage accessible from intrinsics."""
     def make_test_case(impl: str, vl: int, n_segs: int):
@@ -761,49 +940,6 @@
 
 
 @cocotb.test()
-async def load8_index16(dut):
-    """Test vl*xei16_v_u8 usage accessible from intrinsics."""
-    def make_test_case(impl: str, vl: int):
-        return {
-            'impl': impl,
-            'vl': vl,
-            'in_bytes': 32000,  # DTCM is 32KB
-            'out_size': vl * 2,
-        }
-
-    await vector_load_indexed(
-        dut = dut,
-        elf_name = 'load8_index16.elf',
-        cases = [
-            # Unordered
-            make_test_case('vluxei16_v_u8mf4', vl = 4),
-            make_test_case('vluxei16_v_u8mf4', vl = 3),
-            make_test_case('vluxei16_v_u8mf2', vl = 8),
-            make_test_case('vluxei16_v_u8mf2', vl = 7),
-            make_test_case('vluxei16_v_u8m1', vl = 16),
-            make_test_case('vluxei16_v_u8m1', vl = 15),
-            make_test_case('vluxei16_v_u8m2', vl = 32),
-            make_test_case('vluxei16_v_u8m2', vl = 31),
-            make_test_case('vluxei16_v_u8m4', vl = 64),
-            make_test_case('vluxei16_v_u8m4', vl = 63),
-            # Ordered
-            make_test_case('vloxei16_v_u8mf4', vl = 4),
-            make_test_case('vloxei16_v_u8mf4', vl = 3),
-            make_test_case('vloxei16_v_u8mf2', vl = 8),
-            make_test_case('vloxei16_v_u8mf2', vl = 7),
-            make_test_case('vloxei16_v_u8m1', vl = 16),
-            make_test_case('vloxei16_v_u8m1', vl = 15),
-            make_test_case('vloxei16_v_u8m2', vl = 32),
-            make_test_case('vloxei16_v_u8m2', vl = 31),
-            make_test_case('vloxei16_v_u8m4', vl = 64),
-            make_test_case('vloxei16_v_u8m4', vl = 63),
-        ],
-        dtype = np.uint8,
-        index_dtype = np.uint16,
-    )
-
-
-@cocotb.test()
 async def load8_index32(dut):
     """Test vl*xei32_v_u8 usage accessible from intrinsics."""
     def make_test_case(impl: str, vl: int):