Test segmented store8 index8 Segment support is added to the control path, non-segmented cases are now handled as 1-segment. Adding additional logic to avoid overlapping scatters - they make the tests flaky. Change-Id: Ie78eb663d7c3772902c47000eb70d8380868be52
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD index b7d8482..eb70a5c 100644 --- a/tests/cocotb/BUILD +++ b/tests/cocotb/BUILD
@@ -203,6 +203,7 @@ "load16_segment2_stride6_m1", "store_unit_masked", "store8_index8", + "store8_index8_seg", "store16_index8", "store16_index16", "store32_index8",
diff --git a/tests/cocotb/rvv/load_store/BUILD b/tests/cocotb/rvv/load_store/BUILD index 1ced5fc..9719dcb 100644 --- a/tests/cocotb/rvv/load_store/BUILD +++ b/tests/cocotb/rvv/load_store/BUILD
@@ -113,6 +113,9 @@ "store8_index8": { "srcs": ["store8_index8.cc"], }, + "store8_index8_seg": { + "srcs": ["store8_index8_seg.cc"], + }, "store16_index8": { "srcs": ["store16_index8.cc"], }, @@ -177,6 +180,7 @@ ":load32_stride8_m1.elf", ":load32_stride8_m1_partial.elf", ":store8_index8.elf", + ":store8_index8_seg.elf", ":store16_index8.elf", ":store16_index16.elf", ":store32_index8.elf",
diff --git a/tests/cocotb/rvv/load_store/store8_index8_seg.cc b/tests/cocotb/rvv/load_store/store8_index8_seg.cc new file mode 100644 index 0000000..1d659d0 --- /dev/null +++ b/tests/cocotb/rvv/load_store/store8_index8_seg.cc
@@ -0,0 +1,535 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <riscv_vector.h> +#include <stdint.h> + +namespace { +constexpr size_t scatter_count = 128; +// Double sized so we can check trailing regions are not written. +constexpr size_t buf_size = 512; +} // namespace + +size_t vl __attribute__((section(".data"))) = 16; +// Indices are always unsigned. +uint8_t index_buf[scatter_count] __attribute__((section(".data"))); +// These instructions don't differentiate signed/unsigned so we only need to +// test one. The types come from intrinsic level. +uint8_t in_buf[scatter_count] __attribute__((section(".data"))); +uint8_t out_buf[buf_size] __attribute__((section(".data"))); + +extern "C" { +// Unordered, segment 2 +__attribute__((used, retain)) void vsuxseg2ei8_v_u8mf4x2() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x2(__riscv_vle8_v_u8mf4(in_buf, vl), + __riscv_vle8_v_u8mf4(in_buf + vl, vl)); + __riscv_vsuxseg2ei8_v_u8mf4x2(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg2ei8_v_u8mf2x2() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x2(__riscv_vle8_v_u8mf2(in_buf, vl), + __riscv_vle8_v_u8mf2(in_buf + vl, vl)); + __riscv_vsuxseg2ei8_v_u8mf2x2(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg2ei8_v_u8m1x2() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x2(__riscv_vle8_v_u8m1(in_buf, vl), + __riscv_vle8_v_u8m1(in_buf + vl, vl)); + __riscv_vsuxseg2ei8_v_u8m1x2(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg2ei8_v_u8m2x2() { + auto indices = __riscv_vle8_v_u8m2(index_buf, vl); + auto data = __riscv_vcreate_v_u8m2x2(__riscv_vle8_v_u8m2(in_buf, vl), + __riscv_vle8_v_u8m2(in_buf + vl, vl)); + __riscv_vsuxseg2ei8_v_u8m2x2(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg2ei8_v_u8m4x2() { + auto indices = __riscv_vle8_v_u8m4(index_buf, vl); + auto data = __riscv_vcreate_v_u8m4x2(__riscv_vle8_v_u8m4(in_buf, vl), + __riscv_vle8_v_u8m4(in_buf + vl, vl)); + __riscv_vsuxseg2ei8_v_u8m4x2(out_buf, indices, data, vl); +} + +// Unordered, segment 3 +__attribute__((used, retain)) void vsuxseg3ei8_v_u8mf4x3() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x3( + __riscv_vle8_v_u8mf4(in_buf, vl), __riscv_vle8_v_u8mf4(in_buf + vl, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 2, vl)); + __riscv_vsuxseg3ei8_v_u8mf4x3(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg3ei8_v_u8mf2x3() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x3( + __riscv_vle8_v_u8mf2(in_buf, vl), __riscv_vle8_v_u8mf2(in_buf + vl, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 2, vl)); + __riscv_vsuxseg3ei8_v_u8mf2x3(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg3ei8_v_u8m1x3() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x3( + __riscv_vle8_v_u8m1(in_buf, vl), __riscv_vle8_v_u8m1(in_buf + vl, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 2, vl)); + __riscv_vsuxseg3ei8_v_u8m1x3(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg3ei8_v_u8m2x3() { + auto indices = __riscv_vle8_v_u8m2(index_buf, vl); + auto data = __riscv_vcreate_v_u8m2x3( + __riscv_vle8_v_u8m2(in_buf, vl), __riscv_vle8_v_u8m2(in_buf + vl, vl), + __riscv_vle8_v_u8m2(in_buf + vl * 2, vl)); + __riscv_vsuxseg3ei8_v_u8m2x3(out_buf, indices, data, vl); +} + +// Unordered, segment 4 +__attribute__((used, retain)) void vsuxseg4ei8_v_u8mf4x4() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x4( + __riscv_vle8_v_u8mf4(in_buf, vl), __riscv_vle8_v_u8mf4(in_buf + vl, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 3, vl)); + __riscv_vsuxseg4ei8_v_u8mf4x4(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg4ei8_v_u8mf2x4() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x4( + __riscv_vle8_v_u8mf2(in_buf, vl), __riscv_vle8_v_u8mf2(in_buf + vl, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 3, vl)); + __riscv_vsuxseg4ei8_v_u8mf2x4(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg4ei8_v_u8m1x4() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x4( + __riscv_vle8_v_u8m1(in_buf, vl), __riscv_vle8_v_u8m1(in_buf + vl, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 2, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 3, vl)); + __riscv_vsuxseg4ei8_v_u8m1x4(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg4ei8_v_u8m2x4() { + auto indices = __riscv_vle8_v_u8m2(index_buf, vl); + auto data = __riscv_vcreate_v_u8m2x4( + __riscv_vle8_v_u8m2(in_buf, vl), __riscv_vle8_v_u8m2(in_buf + vl, vl), + __riscv_vle8_v_u8m2(in_buf + vl * 2, vl), + __riscv_vle8_v_u8m2(in_buf + vl * 3, vl)); + __riscv_vsuxseg4ei8_v_u8m2x4(out_buf, indices, data, vl); +} + +// Unordered, segment 5 +__attribute__((used, retain)) void vsuxseg5ei8_v_u8mf4x5() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x5( + __riscv_vle8_v_u8mf4(in_buf, vl), __riscv_vle8_v_u8mf4(in_buf + vl, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 4, vl)); + __riscv_vsuxseg5ei8_v_u8mf4x5(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg5ei8_v_u8mf2x5() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x5( + __riscv_vle8_v_u8mf2(in_buf, vl), __riscv_vle8_v_u8mf2(in_buf + vl, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 4, vl)); + __riscv_vsuxseg5ei8_v_u8mf2x5(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg5ei8_v_u8m1x5() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x5( + __riscv_vle8_v_u8m1(in_buf, vl), __riscv_vle8_v_u8m1(in_buf + vl, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 2, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 3, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 4, vl)); + __riscv_vsuxseg5ei8_v_u8m1x5(out_buf, indices, data, vl); +} + +// Unordered, segment 6 +__attribute__((used, retain)) void vsuxseg6ei8_v_u8mf4x6() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x6( + __riscv_vle8_v_u8mf4(in_buf, vl), __riscv_vle8_v_u8mf4(in_buf + vl, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 4, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 5, vl)); + __riscv_vsuxseg6ei8_v_u8mf4x6(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg6ei8_v_u8mf2x6() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x6( + __riscv_vle8_v_u8mf2(in_buf, vl), __riscv_vle8_v_u8mf2(in_buf + vl, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 4, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 5, vl)); + __riscv_vsuxseg6ei8_v_u8mf2x6(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg6ei8_v_u8m1x6() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x6( + __riscv_vle8_v_u8m1(in_buf, vl), __riscv_vle8_v_u8m1(in_buf + vl, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 2, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 3, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 4, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 5, vl)); + __riscv_vsuxseg6ei8_v_u8m1x6(out_buf, indices, data, vl); +} + +// Unordered, segment 7 +__attribute__((used, retain)) void vsuxseg7ei8_v_u8mf4x7() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x7( + __riscv_vle8_v_u8mf4(in_buf, vl), __riscv_vle8_v_u8mf4(in_buf + vl, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 4, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 5, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 6, vl)); + __riscv_vsuxseg7ei8_v_u8mf4x7(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg7ei8_v_u8mf2x7() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x7( + __riscv_vle8_v_u8mf2(in_buf, vl), __riscv_vle8_v_u8mf2(in_buf + vl, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 4, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 5, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 6, vl)); + __riscv_vsuxseg7ei8_v_u8mf2x7(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg7ei8_v_u8m1x7() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x7( + __riscv_vle8_v_u8m1(in_buf, vl), __riscv_vle8_v_u8m1(in_buf + vl, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 2, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 3, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 4, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 5, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 6, vl)); + __riscv_vsuxseg7ei8_v_u8m1x7(out_buf, indices, data, vl); +} + +// Unordered, segment 8 +__attribute__((used, retain)) void vsuxseg8ei8_v_u8mf4x8() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x8( + __riscv_vle8_v_u8mf4(in_buf, vl), __riscv_vle8_v_u8mf4(in_buf + vl, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 4, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 5, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 6, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 7, vl)); + __riscv_vsuxseg8ei8_v_u8mf4x8(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg8ei8_v_u8mf2x8() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x8( + __riscv_vle8_v_u8mf2(in_buf, vl), __riscv_vle8_v_u8mf2(in_buf + vl, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 4, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 5, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 6, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 7, vl)); + __riscv_vsuxseg8ei8_v_u8mf2x8(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsuxseg8ei8_v_u8m1x8() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x8( + __riscv_vle8_v_u8m1(in_buf, vl), __riscv_vle8_v_u8m1(in_buf + vl, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 2, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 3, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 4, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 5, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 6, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 7, vl)); + __riscv_vsuxseg8ei8_v_u8m1x8(out_buf, indices, data, vl); +} + +// Ordered, segment 2 +__attribute__((used, retain)) void vsoxseg2ei8_v_u8mf4x2() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x2(__riscv_vle8_v_u8mf4(in_buf, vl), + __riscv_vle8_v_u8mf4(in_buf + vl, vl)); + __riscv_vsoxseg2ei8_v_u8mf4x2(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg2ei8_v_u8mf2x2() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x2(__riscv_vle8_v_u8mf2(in_buf, vl), + __riscv_vle8_v_u8mf2(in_buf + vl, vl)); + __riscv_vsoxseg2ei8_v_u8mf2x2(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg2ei8_v_u8m1x2() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x2(__riscv_vle8_v_u8m1(in_buf, vl), + __riscv_vle8_v_u8m1(in_buf + vl, vl)); + __riscv_vsoxseg2ei8_v_u8m1x2(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg2ei8_v_u8m2x2() { + auto indices = __riscv_vle8_v_u8m2(index_buf, vl); + auto data = __riscv_vcreate_v_u8m2x2(__riscv_vle8_v_u8m2(in_buf, vl), + __riscv_vle8_v_u8m2(in_buf + vl, vl)); + __riscv_vsoxseg2ei8_v_u8m2x2(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg2ei8_v_u8m4x2() { + auto indices = __riscv_vle8_v_u8m4(index_buf, vl); + auto data = __riscv_vcreate_v_u8m4x2(__riscv_vle8_v_u8m4(in_buf, vl), + __riscv_vle8_v_u8m4(in_buf + vl, vl)); + __riscv_vsoxseg2ei8_v_u8m4x2(out_buf, indices, data, vl); +} + +// Ordered, segment 3 +__attribute__((used, retain)) void vsoxseg3ei8_v_u8mf4x3() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x3( + __riscv_vle8_v_u8mf4(in_buf, vl), __riscv_vle8_v_u8mf4(in_buf + vl, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 2, vl)); + __riscv_vsoxseg3ei8_v_u8mf4x3(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg3ei8_v_u8mf2x3() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x3( + __riscv_vle8_v_u8mf2(in_buf, vl), __riscv_vle8_v_u8mf2(in_buf + vl, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 2, vl)); + __riscv_vsoxseg3ei8_v_u8mf2x3(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg3ei8_v_u8m1x3() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x3( + __riscv_vle8_v_u8m1(in_buf, vl), __riscv_vle8_v_u8m1(in_buf + vl, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 2, vl)); + __riscv_vsoxseg3ei8_v_u8m1x3(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg3ei8_v_u8m2x3() { + auto indices = __riscv_vle8_v_u8m2(index_buf, vl); + auto data = __riscv_vcreate_v_u8m2x3( + __riscv_vle8_v_u8m2(in_buf, vl), __riscv_vle8_v_u8m2(in_buf + vl, vl), + __riscv_vle8_v_u8m2(in_buf + vl * 2, vl)); + __riscv_vsoxseg3ei8_v_u8m2x3(out_buf, indices, data, vl); +} + +// Ordered, segment 4 +__attribute__((used, retain)) void vsoxseg4ei8_v_u8mf4x4() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x4( + __riscv_vle8_v_u8mf4(in_buf, vl), __riscv_vle8_v_u8mf4(in_buf + vl, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 3, vl)); + __riscv_vsoxseg4ei8_v_u8mf4x4(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg4ei8_v_u8mf2x4() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x4( + __riscv_vle8_v_u8mf2(in_buf, vl), __riscv_vle8_v_u8mf2(in_buf + vl, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 3, vl)); + __riscv_vsoxseg4ei8_v_u8mf2x4(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg4ei8_v_u8m1x4() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x4( + __riscv_vle8_v_u8m1(in_buf, vl), __riscv_vle8_v_u8m1(in_buf + vl, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 2, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 3, vl)); + __riscv_vsoxseg4ei8_v_u8m1x4(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg4ei8_v_u8m2x4() { + auto indices = __riscv_vle8_v_u8m2(index_buf, vl); + auto data = __riscv_vcreate_v_u8m2x4( + __riscv_vle8_v_u8m2(in_buf, vl), __riscv_vle8_v_u8m2(in_buf + vl, vl), + __riscv_vle8_v_u8m2(in_buf + vl * 2, vl), + __riscv_vle8_v_u8m2(in_buf + vl * 3, vl)); + __riscv_vsoxseg4ei8_v_u8m2x4(out_buf, indices, data, vl); +} + +// Ordered, segment 5 +__attribute__((used, retain)) void vsoxseg5ei8_v_u8mf4x5() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x5( + __riscv_vle8_v_u8mf4(in_buf, vl), __riscv_vle8_v_u8mf4(in_buf + vl, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 4, vl)); + __riscv_vsoxseg5ei8_v_u8mf4x5(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg5ei8_v_u8mf2x5() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x5( + __riscv_vle8_v_u8mf2(in_buf, vl), __riscv_vle8_v_u8mf2(in_buf + vl, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 4, vl)); + __riscv_vsoxseg5ei8_v_u8mf2x5(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg5ei8_v_u8m1x5() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x5( + __riscv_vle8_v_u8m1(in_buf, vl), __riscv_vle8_v_u8m1(in_buf + vl, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 2, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 3, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 4, vl)); + __riscv_vsoxseg5ei8_v_u8m1x5(out_buf, indices, data, vl); +} + +// Ordered, segment 6 +__attribute__((used, retain)) void vsoxseg6ei8_v_u8mf4x6() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x6( + __riscv_vle8_v_u8mf4(in_buf, vl), __riscv_vle8_v_u8mf4(in_buf + vl, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 4, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 5, vl)); + __riscv_vsoxseg6ei8_v_u8mf4x6(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg6ei8_v_u8mf2x6() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x6( + __riscv_vle8_v_u8mf2(in_buf, vl), __riscv_vle8_v_u8mf2(in_buf + vl, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 4, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 5, vl)); + __riscv_vsoxseg6ei8_v_u8mf2x6(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg6ei8_v_u8m1x6() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x6( + __riscv_vle8_v_u8m1(in_buf, vl), __riscv_vle8_v_u8m1(in_buf + vl, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 2, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 3, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 4, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 5, vl)); + __riscv_vsoxseg6ei8_v_u8m1x6(out_buf, indices, data, vl); +} + +// Ordered, segment 7 +__attribute__((used, retain)) void vsoxseg7ei8_v_u8mf4x7() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x7( + __riscv_vle8_v_u8mf4(in_buf, vl), __riscv_vle8_v_u8mf4(in_buf + vl, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 4, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 5, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 6, vl)); + __riscv_vsoxseg7ei8_v_u8mf4x7(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg7ei8_v_u8mf2x7() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x7( + __riscv_vle8_v_u8mf2(in_buf, vl), __riscv_vle8_v_u8mf2(in_buf + vl, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 4, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 5, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 6, vl)); + __riscv_vsoxseg7ei8_v_u8mf2x7(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg7ei8_v_u8m1x7() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x7( + __riscv_vle8_v_u8m1(in_buf, vl), __riscv_vle8_v_u8m1(in_buf + vl, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 2, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 3, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 4, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 5, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 6, vl)); + __riscv_vsoxseg7ei8_v_u8m1x7(out_buf, indices, data, vl); +} + +// Ordered, segment 8 +__attribute__((used, retain)) void vsoxseg8ei8_v_u8mf4x8() { + auto indices = __riscv_vle8_v_u8mf4(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf4x8( + __riscv_vle8_v_u8mf4(in_buf, vl), __riscv_vle8_v_u8mf4(in_buf + vl, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 4, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 5, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 6, vl), + __riscv_vle8_v_u8mf4(in_buf + vl * 7, vl)); + __riscv_vsoxseg8ei8_v_u8mf4x8(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg8ei8_v_u8mf2x8() { + auto indices = __riscv_vle8_v_u8mf2(index_buf, vl); + auto data = __riscv_vcreate_v_u8mf2x8( + __riscv_vle8_v_u8mf2(in_buf, vl), __riscv_vle8_v_u8mf2(in_buf + vl, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 2, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 3, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 4, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 5, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 6, vl), + __riscv_vle8_v_u8mf2(in_buf + vl * 7, vl)); + __riscv_vsoxseg8ei8_v_u8mf2x8(out_buf, indices, data, vl); +} + +__attribute__((used, retain)) void vsoxseg8ei8_v_u8m1x8() { + auto indices = __riscv_vle8_v_u8m1(index_buf, vl); + auto data = __riscv_vcreate_v_u8m1x8( + __riscv_vle8_v_u8m1(in_buf, vl), __riscv_vle8_v_u8m1(in_buf + vl, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 2, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 3, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 4, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 5, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 6, vl), + __riscv_vle8_v_u8m1(in_buf + vl * 7, vl)); + __riscv_vsoxseg8ei8_v_u8m1x8(out_buf, indices, data, vl); +} +} + +void (*impl)() __attribute__((section(".data"))) = &vsuxseg2ei8_v_u8m1x2; + +int main(int argc, char** argv) { + impl(); + return 0; +}
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py index ce818e0..39094b7 100644 --- a/tests/cocotb/rvv_load_store_test.py +++ b/tests/cocotb/rvv_load_store_test.py
@@ -186,14 +186,14 @@ assert (actual_outputs == expected_outputs).all(), debug_msg -async def vector_store_indexed( +async def vector_store_segmented_indexed( dut, elf_name: str, - cases: list[dict], # keys: impl, vl, out_size. + cases: list[dict], # keys: impl, vl, segments, out_size. data_dtype, index_dtype, ): - """RVV load-store test template for indexed stores. + """RVV load-store test template for segmented indexed stores. Each test loads indices and data and performs a scatter operation. """ @@ -209,19 +209,49 @@ for c in tqdm.tqdm(cases): impl = c['impl'] vl = c['vl'] + segments = c['segments'] out_size = c['out_size'] - in_bytes = np.dtype(data_dtype).itemsize * vl + struct_bytes = np.dtype(data_dtype).itemsize * segments # Don't go beyond the buffer. index_max = min(np.iinfo(index_dtype).max, out_size) - index_max = index_max - np.dtype(index_dtype).itemsize + assert vl * struct_bytes <= index_max + index_max = index_max - struct_bytes # TODO(davidgao): currently assuming the vl is supported. # We'll eventually want to test unsupported vl. - indices = rng.integers(0, index_max+1, vl, dtype=index_dtype) - # Index is in bytes so input needs to be in bytes. - input_data = rng.integers(0, np.iinfo(data_dtype).max + 1, vl, + indices = rng.integers(0, index_max + 1, vl, dtype=index_dtype) + # Deal with overlapping indices by rerunning them + exclusion_set = set() + retries = 0 + for i in range(vl): + # If the scatter range is too dense and we're struggling to find + # space, the test could timeout and become flaky. Flag it here + # to be reconfigured. + assert retries < 10000 + while (indices[i] in exclusion_set or \ + indices[i] + struct_bytes - 1 in exclusion_set): + indices[i] = rng.integers(0, index_max + 1, 1)[0] + retries = retries + 1 + exclusion_set = exclusion_set.union( + range(indices[i], indices[i] + struct_bytes)) + input_data = rng.integers(0, np.iinfo(data_dtype).max + 1, + segments * vl, dtype=data_dtype) - output_data = np.zeros(out_size, dtype=data_dtype) + # Index is in bytes so output needs to be in bytes. + output_data = np.zeros( + out_size * np.dtype(data_dtype).itemsize, + dtype=np.uint8) + # Compute expected outputs. Note that indices are in bytes for all stores. + expected_outputs = output_data.copy() + elem_size = np.dtype(data_dtype).itemsize + input_data_bytes = input_data.view(np.uint8) + indices_in_use = indices.astype(np.uint32) + indices_in_use = np.arange(segments).reshape(-1, 1, 1) * elem_size + \ + indices_in_use.reshape(1, -1, 1) + \ + np.arange(elem_size).reshape(1, 1, -1) + indices_in_use = indices_in_use.reshape(-1) + np.put_along_axis(expected_outputs, indices_in_use, input_data_bytes, None) + expected_outputs = expected_outputs.view(data_dtype) await fixture.write_ptr('impl', impl) await fixture.write_word('vl', vl) @@ -234,15 +264,6 @@ actual_outputs = (await fixture.read( 'out_buf', out_size * np.dtype(data_dtype).itemsize)).view(data_dtype) - # Compute expected outputs. Note that indices are in bytes for all stores. - expected_outputs = output_data.copy().view(np.uint8) - elem_size = np.dtype(data_dtype).itemsize - input_data_bytes = input_data.view(np.uint8) - indices_bytes = np.repeat(indices, elem_size) + \ - np.tile(np.arange(elem_size), len(indices)) - np.put_along_axis(expected_outputs, indices_bytes, input_data_bytes, None) - expected_outputs = expected_outputs.view(data_dtype) - debug_msg = str({ 'impl': impl, 'input': input_data, @@ -1773,10 +1794,11 @@ return { 'impl': impl, 'vl': vl, + 'segments': 1, 'out_size': 512, } - await vector_store_indexed( + await vector_store_segmented_indexed( dut = dut, elf_name = 'store8_index8.elf', cases = [ @@ -1813,16 +1835,152 @@ @cocotb.test() +async def store8_index8_seg(dut): + """Test vs*xseg*ei8_v_u8 usage accessible from intrinsics.""" + def make_test_case(impl: str, vl: int, segs: int): + return { + 'impl': impl, + 'vl': vl, + 'segments': segs, + 'out_size': 512, + } + + await vector_store_segmented_indexed( + dut = dut, + elf_name = 'store8_index8_seg.elf', + cases = [ + # Unordered, segment 2 + make_test_case('vsuxseg2ei8_v_u8mf4x2', vl = 4, segs = 2), + make_test_case('vsuxseg2ei8_v_u8mf4x2', vl = 3, segs = 2), + make_test_case('vsuxseg2ei8_v_u8mf2x2', vl = 8, segs = 2), + make_test_case('vsuxseg2ei8_v_u8mf2x2', vl = 7, segs = 2), + make_test_case('vsuxseg2ei8_v_u8m1x2', vl = 16, segs = 2), + make_test_case('vsuxseg2ei8_v_u8m1x2', vl = 15, segs = 2), + make_test_case('vsuxseg2ei8_v_u8m2x2', vl = 32, segs = 2), + make_test_case('vsuxseg2ei8_v_u8m2x2', vl = 31, segs = 2), + make_test_case('vsuxseg2ei8_v_u8m4x2', vl = 64, segs = 2), + make_test_case('vsuxseg2ei8_v_u8m4x2', vl = 63, segs = 2), + # Unordered, segment 3 + make_test_case('vsuxseg3ei8_v_u8mf4x3', vl = 4, segs = 3), + make_test_case('vsuxseg3ei8_v_u8mf4x3', vl = 3, segs = 3), + make_test_case('vsuxseg3ei8_v_u8mf2x3', vl = 8, segs = 3), + make_test_case('vsuxseg3ei8_v_u8mf2x3', vl = 7, segs = 3), + make_test_case('vsuxseg3ei8_v_u8m1x3', vl = 16, segs = 3), + make_test_case('vsuxseg3ei8_v_u8m1x3', vl = 15, segs = 3), + make_test_case('vsuxseg3ei8_v_u8m2x3', vl = 32, segs = 3), + make_test_case('vsuxseg3ei8_v_u8m2x3', vl = 31, segs = 3), + # Unordered, segment 4 + make_test_case('vsuxseg4ei8_v_u8mf4x4', vl = 4, segs = 4), + make_test_case('vsuxseg4ei8_v_u8mf4x4', vl = 3, segs = 4), + make_test_case('vsuxseg4ei8_v_u8mf2x4', vl = 8, segs = 4), + make_test_case('vsuxseg4ei8_v_u8mf2x4', vl = 7, segs = 4), + make_test_case('vsuxseg4ei8_v_u8m1x4', vl = 16, segs = 4), + make_test_case('vsuxseg4ei8_v_u8m1x4', vl = 15, segs = 4), + make_test_case('vsuxseg4ei8_v_u8m2x4', vl = 32, segs = 4), + make_test_case('vsuxseg4ei8_v_u8m2x4', vl = 31, segs = 4), + # Unordered, segment 5 + make_test_case('vsuxseg5ei8_v_u8mf4x5', vl = 4, segs = 5), + make_test_case('vsuxseg5ei8_v_u8mf4x5', vl = 3, segs = 5), + make_test_case('vsuxseg5ei8_v_u8mf2x5', vl = 8, segs = 5), + make_test_case('vsuxseg5ei8_v_u8mf2x5', vl = 7, segs = 5), + make_test_case('vsuxseg5ei8_v_u8m1x5', vl = 16, segs = 5), + make_test_case('vsuxseg5ei8_v_u8m1x5', vl = 15, segs = 5), + # Unordered, segment 6 + make_test_case('vsuxseg6ei8_v_u8mf4x6', vl = 4, segs = 6), + make_test_case('vsuxseg6ei8_v_u8mf4x6', vl = 3, segs = 6), + make_test_case('vsuxseg6ei8_v_u8mf2x6', vl = 8, segs = 6), + make_test_case('vsuxseg6ei8_v_u8mf2x6', vl = 7, segs = 6), + make_test_case('vsuxseg6ei8_v_u8m1x6', vl = 16, segs = 6), + make_test_case('vsuxseg6ei8_v_u8m1x6', vl = 15, segs = 6), + # Unordered, segment 7 + make_test_case('vsuxseg7ei8_v_u8mf4x7', vl = 4, segs = 7), + make_test_case('vsuxseg7ei8_v_u8mf4x7', vl = 3, segs = 7), + make_test_case('vsuxseg7ei8_v_u8mf2x7', vl = 8, segs = 7), + make_test_case('vsuxseg7ei8_v_u8mf2x7', vl = 7, segs = 7), + make_test_case('vsuxseg7ei8_v_u8m1x7', vl = 16, segs = 7), + make_test_case('vsuxseg7ei8_v_u8m1x7', vl = 15, segs = 7), + # Unordered, segment 8 + make_test_case('vsuxseg8ei8_v_u8mf4x8', vl = 4, segs = 8), + make_test_case('vsuxseg8ei8_v_u8mf4x8', vl = 3, segs = 8), + make_test_case('vsuxseg8ei8_v_u8mf2x8', vl = 8, segs = 8), + make_test_case('vsuxseg8ei8_v_u8mf2x8', vl = 7, segs = 8), + make_test_case('vsuxseg8ei8_v_u8m1x8', vl = 16, segs = 8), + make_test_case('vsuxseg8ei8_v_u8m1x8', vl = 15, segs = 8), + # Ordered, segment 2 + make_test_case('vsoxseg2ei8_v_u8mf4x2', vl = 4, segs = 2), + make_test_case('vsoxseg2ei8_v_u8mf4x2', vl = 3, segs = 2), + make_test_case('vsoxseg2ei8_v_u8mf2x2', vl = 8, segs = 2), + make_test_case('vsoxseg2ei8_v_u8mf2x2', vl = 7, segs = 2), + make_test_case('vsoxseg2ei8_v_u8m1x2', vl = 16, segs = 2), + make_test_case('vsoxseg2ei8_v_u8m1x2', vl = 15, segs = 2), + make_test_case('vsoxseg2ei8_v_u8m2x2', vl = 32, segs = 2), + make_test_case('vsoxseg2ei8_v_u8m2x2', vl = 31, segs = 2), + make_test_case('vsoxseg2ei8_v_u8m4x2', vl = 64, segs = 2), + make_test_case('vsoxseg2ei8_v_u8m4x2', vl = 63, segs = 2), + # Ordered, segment 3 + make_test_case('vsoxseg3ei8_v_u8mf4x3', vl = 4, segs = 3), + make_test_case('vsoxseg3ei8_v_u8mf4x3', vl = 3, segs = 3), + make_test_case('vsoxseg3ei8_v_u8mf2x3', vl = 8, segs = 3), + make_test_case('vsoxseg3ei8_v_u8mf2x3', vl = 7, segs = 3), + make_test_case('vsoxseg3ei8_v_u8m1x3', vl = 16, segs = 3), + make_test_case('vsoxseg3ei8_v_u8m1x3', vl = 15, segs = 3), + make_test_case('vsoxseg3ei8_v_u8m2x3', vl = 32, segs = 3), + make_test_case('vsoxseg3ei8_v_u8m2x3', vl = 31, segs = 3), + # Ordered, segment 4 + make_test_case('vsoxseg4ei8_v_u8mf4x4', vl = 4, segs = 4), + make_test_case('vsoxseg4ei8_v_u8mf4x4', vl = 3, segs = 4), + make_test_case('vsoxseg4ei8_v_u8mf2x4', vl = 8, segs = 4), + make_test_case('vsoxseg4ei8_v_u8mf2x4', vl = 7, segs = 4), + make_test_case('vsoxseg4ei8_v_u8m1x4', vl = 16, segs = 4), + make_test_case('vsoxseg4ei8_v_u8m1x4', vl = 15, segs = 4), + make_test_case('vsoxseg4ei8_v_u8m2x4', vl = 32, segs = 4), + make_test_case('vsoxseg4ei8_v_u8m2x4', vl = 31, segs = 4), + # Ordered, segment 5 + make_test_case('vsoxseg5ei8_v_u8mf4x5', vl = 4, segs = 5), + make_test_case('vsoxseg5ei8_v_u8mf4x5', vl = 3, segs = 5), + make_test_case('vsoxseg5ei8_v_u8mf2x5', vl = 8, segs = 5), + make_test_case('vsoxseg5ei8_v_u8mf2x5', vl = 7, segs = 5), + make_test_case('vsoxseg5ei8_v_u8m1x5', vl = 16, segs = 5), + make_test_case('vsoxseg5ei8_v_u8m1x5', vl = 15, segs = 5), + # Ordered, segment 6 + make_test_case('vsoxseg6ei8_v_u8mf4x6', vl = 4, segs = 6), + make_test_case('vsoxseg6ei8_v_u8mf4x6', vl = 3, segs = 6), + make_test_case('vsoxseg6ei8_v_u8mf2x6', vl = 8, segs = 6), + make_test_case('vsoxseg6ei8_v_u8mf2x6', vl = 7, segs = 6), + make_test_case('vsoxseg6ei8_v_u8m1x6', vl = 16, segs = 6), + make_test_case('vsoxseg6ei8_v_u8m1x6', vl = 15, segs = 6), + # Ordered, segment 7 + make_test_case('vsoxseg7ei8_v_u8mf4x7', vl = 4, segs = 7), + make_test_case('vsoxseg7ei8_v_u8mf4x7', vl = 3, segs = 7), + make_test_case('vsoxseg7ei8_v_u8mf2x7', vl = 8, segs = 7), + make_test_case('vsoxseg7ei8_v_u8mf2x7', vl = 7, segs = 7), + make_test_case('vsoxseg7ei8_v_u8m1x7', vl = 16, segs = 7), + make_test_case('vsoxseg7ei8_v_u8m1x7', vl = 15, segs = 7), + # Ordered, segment 8 + make_test_case('vsoxseg8ei8_v_u8mf4x8', vl = 4, segs = 8), + make_test_case('vsoxseg8ei8_v_u8mf4x8', vl = 3, segs = 8), + make_test_case('vsoxseg8ei8_v_u8mf2x8', vl = 8, segs = 8), + make_test_case('vsoxseg8ei8_v_u8mf2x8', vl = 7, segs = 8), + make_test_case('vsoxseg8ei8_v_u8m1x8', vl = 16, segs = 8), + make_test_case('vsoxseg8ei8_v_u8m1x8', vl = 15, segs = 8), + ], + data_dtype = np.uint8, + index_dtype = np.uint8, + ) + + +@cocotb.test() async def store16_index8(dut): """Test vs*xei8_v_u16 usage accessible from intrinsics.""" def make_test_case(impl: str, vl: int): return { 'impl': impl, 'vl': vl, + 'segments': 1, 'out_size': 256, } - await vector_store_indexed( + await vector_store_segmented_indexed( dut = dut, elf_name = 'store16_index8.elf', cases = [ @@ -1861,10 +2019,11 @@ return { 'impl': impl, 'vl': vl, + 'segments': 1, 'out_size': 16000, } - await vector_store_indexed( + await vector_store_segmented_indexed( dut = dut, elf_name = 'store16_index16.elf', cases = [ @@ -1903,10 +2062,11 @@ return { 'impl': impl, 'vl': vl, + 'segments': 1, 'out_size': 257, } - await vector_store_indexed( + await vector_store_segmented_indexed( dut = dut, elf_name = 'store32_index8.elf', cases = [ @@ -1941,10 +2101,11 @@ return { 'impl': impl, 'vl': vl, + 'segments': 1, 'out_size': 4000, } - await vector_store_indexed( + await vector_store_segmented_indexed( dut = dut, elf_name = 'store32_index16.elf', cases = [ @@ -1979,10 +2140,11 @@ return { 'impl': impl, 'vl': vl, + 'segments': 1, 'out_size': 8000, } - await vector_store_indexed( + await vector_store_segmented_indexed( dut = dut, elf_name = 'store32_index32.elf', cases = [