sw/vec_iree: Update RVV implementations to the latest RVV intrinsics spec

The latest RVV intrinsics API prefixes every vector intrinsic with
__riscv_ and drops the explicit destination operand from the reduction
intrinsics. This is needed when we update the riscv32-unknown-elf-gdb
toolchain.
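
For reference, a minimal sketch of the new naming, mirroring the
dot-product loop in audio_prep/mfcc.c below (dot_u32 is an illustrative
helper name, not part of this change):

    #include <riscv_vector.h>

    // Dot product of two uint32 buffers with the __riscv_-prefixed,
    // overloaded intrinsics.
    static uint32_t dot_u32(const uint32_t* u, const uint32_t* w, size_t n) {
      uint32_t sum = 0;
      size_t vl = 0;
      for (size_t i = 0; i < n; i += vl) {
        vl = __riscv_vsetvl_e32m8(n - i);
        vuint32m8_t vu = __riscv_vle32_v_u32m8(u + i, vl);  // load
        vuint32m8_t vw = __riscv_vle32_v_u32m8(w + i, vl);  // load
        vuint32m8_t vx = __riscv_vmul(vu, vw, vl);          // multiply
        vuint32m1_t vs = __riscv_vmv_v_x_u32m1(0, vl);      // init accumulator
        vs = __riscv_vredsum(vx, vs, vl);                   // sum (no dest operand)
        sum += __riscv_vmv_x(vs);                           // extract element 0
      }
      return sum;
    }

The same prefix-and-signature pattern applies throughout the diff.
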
Change-Id: I82c6e6d4a3d080fc9aec274be435efc7a497c2bc
diff --git a/audio_prep/mfcc.c b/audio_prep/mfcc.c
index 0a137e6..994ffcb 100644
--- a/audio_prep/mfcc.c
+++ b/audio_prep/mfcc.c
@@ -39,13 +39,13 @@
vuint32m1_t v_sum;
uint32_t sum = 0;
for (size_t i = 0; i < n; i += vl) {
- vl = vsetvl_e32m8(n - i);
- vu = vle32_v_u32m8(u + i, vl); // load
- vw = vle32_v_u32m8(w + i, vl); // load
- vx = vmul(vu, vw, vl); // multiply
- v_sum = vmv_s(v_sum, 0, vl); // init
- v_sum = vredsum(v_sum, vx, v_sum, vl); // sum
- sum += vmv_x(v_sum);
+ vl = __riscv_vsetvl_e32m8(n - i);
+ vu = __riscv_vle32_v_u32m8(u + i, vl); // load
+ vw = __riscv_vle32_v_u32m8(w + i, vl); // load
+ vx = __riscv_vmul(vu, vw, vl); // multiply
+ v_sum = __riscv_vmv_v_x_u32m1(0, vl); // init
+ v_sum = __riscv_vredsum(vx, v_sum, vl); // sum
+ sum += __riscv_vmv_x(v_sum);
}
return sum;
}
diff --git a/risp4ml/isp_stages/blc_rvv.c b/risp4ml/isp_stages/blc_rvv.c
index 2cb3092..551c817 100644
--- a/risp4ml/isp_stages/blc_rvv.c
+++ b/risp4ml/isp_stages/blc_rvv.c
@@ -34,9 +34,9 @@
uint16_t* x;
for (size_t i = 0; i < n; i += vl) {
x = img->data + i;
- vl = vsetvl_e16m8(n - i);
- vx = vle16_v_u16m8(x, vl); // load
- vx = vssubu(vx, offset, vl); // subtract
- vse16(x, vx, vl); // save
+ vl = __riscv_vsetvl_e16m8(n - i);
+ vx = __riscv_vle16_v_u16m8(x, vl); // load
+ vx = __riscv_vssubu(vx, offset, vl); // subtract
+ __riscv_vse16(x, vx, vl); // save
}
}
diff --git a/risp4ml/isp_stages/demosaic_rvv.c b/risp4ml/isp_stages/demosaic_rvv.c
index 4709a93..0f3c02a 100644
--- a/risp4ml/isp_stages/demosaic_rvv.c
+++ b/risp4ml/isp_stages/demosaic_rvv.c
@@ -96,48 +96,62 @@
x_offset[2] = x + 1;
ptrdiff_t stride = 2 * sizeof(uint16_t);
size_t avl = (input->width - 1 - x) / 2;
- vl = vsetvl_e16m4(avl);
+ vl = __riscv_vsetvl_e16m4(avl);
if (n + (y & 0x1) == 2) { // kR or kB
// ch0
- vx = vlse16_v_u16m4(line_in[1] + x_offset[1], stride, vl); // load
- vsse16(line_out[0] + x, stride, vx, vl); // save
+ vx = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[1], stride,
+ vl); // load
+ __riscv_vsse16(line_out[0] + x, stride, vx, vl); // save
// ch1
- vx = vlse16_v_u16m4(line_in[0] + x_offset[1], stride, vl); // load
- vy = vlse16_v_u16m4(line_in[2] + x_offset[1], stride, vl); // load
- vz = vwaddu_vv(vx, vy, vl); // add
- vy = vlse16_v_u16m4(line_in[1] + x_offset[0], stride, vl); // load
- vz = vwaddu_wv(vz, vy, vl); // add
- vy = vlse16_v_u16m4(line_in[1] + x_offset[2], stride, vl); // load
- vz = vwaddu_wv(vz, vy, vl); // add
- vx = vnsrl(vz, 2, vl); // 1/4
- vsse16(line_out[1] + x, stride, vx, vl); // save
+ vx = __riscv_vlse16_v_u16m4(line_in[0] + x_offset[1], stride,
+ vl); // load
+ vy = __riscv_vlse16_v_u16m4(line_in[2] + x_offset[1], stride,
+ vl); // load
+ vz = __riscv_vwaddu_vv(vx, vy, vl); // add
+ vy = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[0], stride,
+ vl); // load
+ vz = __riscv_vwaddu_wv(vz, vy, vl); // add
+ vy = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[2], stride,
+ vl); // load
+ vz = __riscv_vwaddu_wv(vz, vy, vl); // add
+ vx = __riscv_vnsrl(vz, 2, vl); // 1/4
+ __riscv_vsse16(line_out[1] + x, stride, vx, vl); // save
// ch2
- vx = vlse16_v_u16m4(line_in[0] + x_offset[0], stride, vl); // load
- vy = vlse16_v_u16m4(line_in[0] + x_offset[2], stride, vl); // load
- vz = vwaddu_vv(vx, vy, vl); // add
- vy = vlse16_v_u16m4(line_in[2] + x_offset[0], stride, vl); // load
- vz = vwaddu_wv(vz, vy, vl); // add
- vy = vlse16_v_u16m4(line_in[2] + x_offset[2], stride, vl); // load
- vz = vwaddu_wv(vz, vy, vl); // add
- vx = vnsrl(vz, 2, vl); // 1/4
- vsse16(line_out[2] + x, stride, vx, vl); // save
+ vx = __riscv_vlse16_v_u16m4(line_in[0] + x_offset[0], stride,
+ vl); // load
+ vy = __riscv_vlse16_v_u16m4(line_in[0] + x_offset[2], stride,
+ vl); // load
+ vz = __riscv_vwaddu_vv(vx, vy, vl); // add
+ vy = __riscv_vlse16_v_u16m4(line_in[2] + x_offset[0], stride,
+ vl); // load
+ vz = __riscv_vwaddu_wv(vz, vy, vl); // add
+ vy = __riscv_vlse16_v_u16m4(line_in[2] + x_offset[2], stride,
+ vl); // load
+ vz = __riscv_vwaddu_wv(vz, vy, vl); // add
+ vx = __riscv_vnsrl(vz, 2, vl); // 1/4
+ __riscv_vsse16(line_out[2] + x, stride, vx, vl); // save
} else { // kGr or kRb
// ch0
- vx = vlse16_v_u16m4(line_in[1] + x_offset[0], stride, vl); // load
- vy = vlse16_v_u16m4(line_in[1] + x_offset[2], stride, vl); // load
- vz = vwaddu_vv(vx, vy, vl); // add
- vx = vnsrl(vz, 1, vl); // 1/2
- vsse16(line_out[0] + x, stride, vx, vl); // save
+ vx = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[0], stride,
+ vl); // load
+ vy = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[2], stride,
+ vl); // load
+ vz = __riscv_vwaddu_vv(vx, vy, vl); // add
+ vx = __riscv_vnsrl(vz, 1, vl); // 1/2
+ __riscv_vsse16(line_out[0] + x, stride, vx, vl); // save
// ch1
- vx = vlse16_v_u16m4(line_in[1] + x_offset[1], stride, vl); // load
- vsse16(line_out[1] + x, stride, vx, vl); // save
+ vx = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[1], stride,
+ vl); // load
+ __riscv_vsse16(line_out[1] + x, stride, vx, vl); // save
// ch2
- vx = vlse16_v_u16m4(line_in[0] + x_offset[1], stride, vl); // load
- vy = vlse16_v_u16m4(line_in[2] + x_offset[1], stride, vl); // load
- vz = vwaddu_vv(vx, vy, vl); // add
- vx = vnsrl(vz, 1, vl); // 1/2
- vsse16(line_out[2] + x, stride, vx, vl); // save
+ vx = __riscv_vlse16_v_u16m4(line_in[0] + x_offset[1], stride,
+ vl); // load
+ vy = __riscv_vlse16_v_u16m4(line_in[2] + x_offset[1], stride,
+ vl); // load
+ vz = __riscv_vwaddu_vv(vx, vy, vl); // add
+ vx = __riscv_vnsrl(vz, 1, vl); // 1/2
+ __riscv_vsse16(line_out[2] + x, stride, vx, vl); // save
}
}
}
diff --git a/risp4ml/isp_stages/dg_rvv.c b/risp4ml/isp_stages/dg_rvv.c
index a62dede..2488f64 100644
--- a/risp4ml/isp_stages/dg_rvv.c
+++ b/risp4ml/isp_stages/dg_rvv.c
@@ -40,12 +40,12 @@
uint16_t* x;
for (size_t i = 0; i < n; i += vl) {
x = img->data + i;
- vl = vsetvl_e16m4(n - i);
- vx = vle16_v_u16m4(x, vl); // load
- vy = vwmulu(vx, gain, vl); // multiply
- vy = vadd(vy, offset, vl); // add
- vy = vminu(vy, max_val, vl); // clamp
- vx = vnsrl(vy, kDgFractional, vl); // bit shift
- vse16(x, vx, vl); // save
+ vl = __riscv_vsetvl_e16m4(n - i);
+ vx = __riscv_vle16_v_u16m4(x, vl); // load
+ vy = __riscv_vwmulu(vx, gain, vl); // multiply
+ vy = __riscv_vadd(vy, offset, vl); // add
+ vy = __riscv_vminu(vy, max_val, vl); // clamp
+ vx = __riscv_vnsrl(vy, kDgFractional, vl); // bit shift
+ __riscv_vse16(x, vx, vl); // save
}
}
diff --git a/risp4ml/isp_stages/downscale_rvv.c b/risp4ml/isp_stages/downscale_rvv.c
index 9098148..849e9f6 100644
--- a/risp4ml/isp_stages/downscale_rvv.c
+++ b/risp4ml/isp_stages/downscale_rvv.c
@@ -66,95 +66,96 @@
uint8_t* out = output->data + c;
for (size_t i = 0; i < n; i += vl) {
- vl = vsetvl_e16m4(n - i);
- vid = vid_v_u32m8(vl);
- vid = vadd(vid, i, vl);
+ vl = __riscv_vsetvl_e16m4(n - i);
+ vid = __riscv_vid_v_u32m8(vl);
+ vid = __riscv_vadd(vid, i, vl);
- vy = vdivu(vid, output->width, vl);
- vx = vremu(vid, output->width, vl);
+ vy = __riscv_vdivu(vid, output->width, vl);
+ vx = __riscv_vremu(vid, output->width, vl);
// find 4 neighboring points
// four neighboring coordinates are
// [y_l, x_l], [y_l, x_h], [y_h, x_l], [y_h, x_h]
- vx_l = vmul(vx, input_w_1, vl);
- vx_l = vdivu(vx_l, w_1, vl);
- vx_h = vadd(vx_l, 1, vl);
- vx_h = vminu(vx_h, input_w_1, vl); // clamp
+ vx_l = __riscv_vmul(vx, input_w_1, vl);
+ vx_l = __riscv_vdivu(vx_l, w_1, vl);
+ vx_h = __riscv_vadd(vx_l, 1, vl);
+ vx_h = __riscv_vminu(vx_h, input_w_1, vl); // clamp
- vy_l = vmul(vy, input_h_1, vl);
- vy_l = vdivu(vy_l, h_1, vl);
- vy_h = vadd(vy_l, 1, vl);
- vy_h = vminu(vy_h, input_h_1, vl); // clamp
+ vy_l = __riscv_vmul(vy, input_h_1, vl);
+ vy_l = __riscv_vdivu(vy_l, h_1, vl);
+ vy_h = __riscv_vadd(vy_l, 1, vl);
+ vy_h = __riscv_vminu(vy_h, input_h_1, vl); // clamp
// load values of four neighboring points: A, B C, D
- vz = vmul(vy_l, input_width, vl);
- vz = vadd(vz, vx_l, vl);
- vz = vsll(vz, 1, vl); // *2
- vo_16b = vluxei32(in, vz, vl);
- va = vwaddu_vx(vo_16b, 0, vl);
+ vz = __riscv_vmul(vy_l, input_width, vl);
+ vz = __riscv_vadd(vz, vx_l, vl);
+ vz = __riscv_vsll(vz, 1, vl); // *2
+ vo_16b = __riscv_vluxei32(in, vz, vl);
+ va = __riscv_vwaddu_vx(vo_16b, 0, vl);
- vz = vmul(vy_l, input_width, vl);
- vz = vadd(vz, vx_h, vl);
- vz = vsll(vz, 1, vl); // *2
- vo_16b = vluxei32(in, vz, vl);
- vb = vwaddu_vx(vo_16b, 0, vl);
+ vz = __riscv_vmul(vy_l, input_width, vl);
+ vz = __riscv_vadd(vz, vx_h, vl);
+ vz = __riscv_vsll(vz, 1, vl); // *2
+ vo_16b = __riscv_vluxei32(in, vz, vl);
+ vb = __riscv_vwaddu_vx(vo_16b, 0, vl);
- vz = vmul(vy_h, input_width, vl);
- vz = vadd(vz, vx_l, vl);
- vz = vsll(vz, 1, vl); // *2
- vo_16b = vluxei32(in, vz, vl);
- vc = vwaddu_vx(vo_16b, 0, vl);
+ vz = __riscv_vmul(vy_h, input_width, vl);
+ vz = __riscv_vadd(vz, vx_l, vl);
+ vz = __riscv_vsll(vz, 1, vl); // *2
+ vo_16b = __riscv_vluxei32(in, vz, vl);
+ vc = __riscv_vwaddu_vx(vo_16b, 0, vl);
- vz = vmul(vy_h, input_width, vl);
- vz = vadd(vz, vx_h, vl);
- vz = vsll(vz, 1, vl); // *2
- vo_16b = vluxei32(in, vz, vl);
- vd = vwaddu_vx(vo_16b, 0, vl);
+ vz = __riscv_vmul(vy_h, input_width, vl);
+ vz = __riscv_vadd(vz, vx_h, vl);
+ vz = __riscv_vsll(vz, 1, vl); // *2
+ vo_16b = __riscv_vluxei32(in, vz, vl);
+ vd = __riscv_vwaddu_vx(vo_16b, 0, vl);
// compute weights of four neighboring points: wx, wy, 1-wx, 1-wy
- vp = vmul(vx, input_w_1, vl);
- vq = vmul(vx_l, w_1, vl);
- vp = vssubu(vp, vq, vl);
- vp = vsll(vp, params.scale_precision, vl);
- vx_weight = vdivu(vp, w_1, vl);
+ vp = __riscv_vmul(vx, input_w_1, vl);
+ vq = __riscv_vmul(vx_l, w_1, vl);
+ vp = __riscv_vssubu(vp, vq, vl);
+ vp = __riscv_vsll(vp, params.scale_precision, vl);
+ vx_weight = __riscv_vdivu(vp, w_1, vl);
- vp = vmul(vy, input_h_1, vl);
- vq = vmul(vy_l, h_1, vl);
- vp = vssubu(vp, vq, vl);
- vp = vsll(vp, params.scale_precision, vl);
- vy_weight = vdivu(vp, h_1, vl);
+ vp = __riscv_vmul(vy, input_h_1, vl);
+ vq = __riscv_vmul(vy_l, h_1, vl);
+ vp = __riscv_vssubu(vp, vq, vl);
+ vp = __riscv_vsll(vp, params.scale_precision, vl);
+ vy_weight = __riscv_vdivu(vp, h_1, vl);
- vx_weight_1minus = vrsub(vx_weight, params.scale_fixed_one, vl);
- vy_weight_1minus = vrsub(vy_weight, params.scale_fixed_one, vl);
+ vx_weight_1minus = __riscv_vrsub(vx_weight, params.scale_fixed_one, vl);
+ vy_weight_1minus = __riscv_vrsub(vy_weight, params.scale_fixed_one, vl);
// Bilinear Interpolation Formula:
// out = A*(1-wx)*(1-wy) + B*wx*(1-wy)
// + C*(1-wx)*wy + D*wx*wy
- vo = vmul(va, vx_weight_1minus, vl);
- vo = vsrl(vo, params.scale_precision, vl);
- vo = vmul(vo, vy_weight_1minus, vl);
+ vo = __riscv_vmul(va, vx_weight_1minus, vl);
+ vo = __riscv_vsrl(vo, params.scale_precision, vl);
+ vo = __riscv_vmul(vo, vy_weight_1minus, vl);
- vp = vmul(vb, vx_weight, vl);
- vp = vsrl(vp, params.scale_precision, vl);
- vp = vmul(vp, vy_weight_1minus, vl);
- vo = vadd(vo, vp, vl);
+ vp = __riscv_vmul(vb, vx_weight, vl);
+ vp = __riscv_vsrl(vp, params.scale_precision, vl);
+ vp = __riscv_vmul(vp, vy_weight_1minus, vl);
+ vo = __riscv_vadd(vo, vp, vl);
- vp = vmul(vc, vx_weight_1minus, vl);
- vp = vsrl(vp, params.scale_precision, vl);
- vp = vmul(vp, vy_weight, vl);
- vo = vadd(vo, vp, vl);
+ vp = __riscv_vmul(vc, vx_weight_1minus, vl);
+ vp = __riscv_vsrl(vp, params.scale_precision, vl);
+ vp = __riscv_vmul(vp, vy_weight, vl);
+ vo = __riscv_vadd(vo, vp, vl);
- vp = vmul(vd, vx_weight, vl);
- vp = vsrl(vp, params.scale_precision, vl);
- vp = vmul(vp, vy_weight, vl);
- vo = vadd(vo, vp, vl);
+ vp = __riscv_vmul(vd, vx_weight, vl);
+ vp = __riscv_vsrl(vp, params.scale_precision, vl);
+ vp = __riscv_vmul(vp, vy_weight, vl);
+ vo = __riscv_vadd(vo, vp, vl);
// bit shift from 32bits to 8bits
- vo_16b = vnsrl(vo, params.scale_precision, vl);
- vo_8b = vnsrl(vo_16b, kRawPipelineBpp - kPipeOutputBpp, vl);
+ vo_16b = __riscv_vnsrl(vo, params.scale_precision, vl);
+ vo_8b = __riscv_vnsrl(vo_16b, kRawPipelineBpp - kPipeOutputBpp, vl);
// save
- vsse8(out + i * output->num_channels, output->num_channels, vo_8b, vl);
+ __riscv_vsse8(out + i * output->num_channels, output->num_channels, vo_8b,
+ vl);
}
}
}
diff --git a/risp4ml/isp_stages/gamma_rvv.c b/risp4ml/isp_stages/gamma_rvv.c
index e04025d..dab4f91 100644
--- a/risp4ml/isp_stages/gamma_rvv.c
+++ b/risp4ml/isp_stages/gamma_rvv.c
@@ -247,23 +247,23 @@
for (size_t i = 0; i < n; i += vl) {
x = img->data + i;
- vl = vsetvl_e16m8(n - i);
- vx = vle16_v_u16m8(x, vl); // load
- mask = vmsne(vx, kRgbPipelineMaxVal,
- vl); // mask to exclude kRgbPipelineMaxVal
+ vl = __riscv_vsetvl_e16m8(n - i);
+ vx = __riscv_vle16_v_u16m8(x, vl); // load
+ mask = __riscv_vmsne(vx, kRgbPipelineMaxVal,
+ vl); // mask to exclude kRgbPipelineMaxVal
- vy = vsrl(vx, kGammaShiftBits, vl); // 1/32
- vy = vsll(vy, 1, vl); // *2
- v_offset = vand(vx, kGammaSpacing - 1, vl); // offset within bin
+ vy = __riscv_vsrl(vx, kGammaShiftBits, vl); // 1/32
+ vy = __riscv_vsll(vy, 1, vl); // *2
+ v_offset = __riscv_vand(vx, kGammaSpacing - 1, vl); // offset within bin
- vx = vluxei16(gamma_params.lut, vy, vl); // left
- vy = vluxei16(gamma_params.lut + 1, vy, vl); // right
+ vx = __riscv_vluxei16(gamma_params.lut, vy, vl); // left
+ vy = __riscv_vluxei16(gamma_params.lut + 1, vy, vl); // right
- vy = vsub(vy, vx, vl); // right - left
- vy = vmul(vy, v_offset, vl); // multiply offset_within_bin
- vy = vsrl(vy, kGammaShiftBits, vl); // 1/32
- vx = vadd(vx, vy, vl); // add
+ vy = __riscv_vsub(vy, vx, vl); // right - left
+ vy = __riscv_vmul(vy, v_offset, vl); // multiply offset_within_bin
+ vy = __riscv_vsrl(vy, kGammaShiftBits, vl); // 1/32
+ vx = __riscv_vadd(vx, vy, vl); // add
- vse16(mask, x, vx, vl); // save
+ __riscv_vse16(mask, x, vx, vl); // save
}
}
diff --git a/risp4ml/isp_stages/wbg_rvv.c b/risp4ml/isp_stages/wbg_rvv.c
index 1bc1187..3374de6 100644
--- a/risp4ml/isp_stages/wbg_rvv.c
+++ b/risp4ml/isp_stages/wbg_rvv.c
@@ -48,12 +48,13 @@
for (uint8_t n = 0; n < 2; n++) {
for (uint16_t x = n; x < img->width; x += 2 * vl) {
size_t avl = (img->width + 1 - x) / 2;
- vl = vsetvl_e16m8(avl);
- vx = vlse16_v_u16m8(line + x, 2 * sizeof(uint16_t), vl); // load
+ vl = __riscv_vsetvl_e16m8(avl);
+ vx =
+ __riscv_vlse16_v_u16m8(line + x, 2 * sizeof(uint16_t), vl); // load
- vy = vmv_v_x_u32m1(0, vl); // init
- vy = vwredsumu(vy, vx, vy, vl); // sum
- uint32_t sum = vmv_x(vy);
+ vy = __riscv_vmv_v_x_u32m1(0, vl); // init
+ vy = __riscv_vwredsumu(vx, vy, vl); // sum
+ uint32_t sum = __riscv_vmv_x(vy);
if ((y & 0x1) == 0 && n == 0) {
sum_of_reds += sum;
} else if ((y & 0x1) == 1 && n == 1) {
@@ -108,13 +109,14 @@
gain = (y & 0x1) ? wbg_params.gains[2 + n] : wbg_params.gains[n];
for (uint16_t x = n; x < img->width; x += 2 * vl) {
size_t avl = (img->width + 1 - x) / 2;
- vl = vsetvl_e16m4(avl);
- vx = vlse16_v_u16m4(line + x, 2 * sizeof(uint16_t), vl); // load
- vy = vwmulu(vx, gain, vl); // multiply
- vy = vadd(vy, offset, vl); // add
- vy = vminu(vy, max_val, vl); // clamp
- vx = vnsrl(vy, kWbgFractional, vl); // bit shift
- vsse16(line + x, 2 * sizeof(uint16_t), vx, vl); // save
+ vl = __riscv_vsetvl_e16m4(avl);
+ vx =
+ __riscv_vlse16_v_u16m4(line + x, 2 * sizeof(uint16_t), vl); // load
+ vy = __riscv_vwmulu(vx, gain, vl); // multiply
+ vy = __riscv_vadd(vy, offset, vl); // add
+ vy = __riscv_vminu(vy, max_val, vl); // clamp
+ vx = __riscv_vnsrl(vy, kWbgFractional, vl); // bit shift
+ __riscv_vsse16(line + x, 2 * sizeof(uint16_t), vx, vl); // save
}
}
}
diff --git a/vmvx_ukernel/elementwise.c b/vmvx_ukernel/elementwise.c
index c568dbb..354e7e8 100644
--- a/vmvx_ukernel/elementwise.c
+++ b/vmvx_ukernel/elementwise.c
@@ -143,53 +143,53 @@
iree_uk_index_t out_stride, size_t vl) {
iree_uk_x32b_opcode_type_t op_type = get_iree_uk_x32b_op_type(opcode);
if (op_type == IREE_UK_X32B_UI) {
- vuint32m8_t vx = vlse32_v_u32m8(lhs, lhs_stride, vl); // load
- vuint32m8_t vy = vlse32_v_u32m8(rhs, rhs_stride, vl); // load
+ vuint32m8_t vx = __riscv_vlse32_v_u32m8(lhs, lhs_stride, vl); // load
+ vuint32m8_t vy = __riscv_vlse32_v_u32m8(rhs, rhs_stride, vl); // load
switch (opcode) {
case IREE_UK_X32B_ADDI:
- vx = vadd(vx, vy, vl);
+ vx = __riscv_vadd(vx, vy, vl);
break;
case IREE_UK_X32B_ANDI:
- vx = vand(vx, vy, vl);
+ vx = __riscv_vand(vx, vy, vl);
break;
case IREE_UK_X32B_DIVUI:
- vx = vdivu(vx, vy, vl);
+ vx = __riscv_vdivu(vx, vy, vl);
break;
case IREE_UK_X32B_MULI:
- vx = vmul(vx, vy, vl);
+ vx = __riscv_vmul(vx, vy, vl);
break;
case IREE_UK_X32B_ORI:
- vx = vor(vx, vy, vl);
+ vx = __riscv_vor(vx, vy, vl);
break;
case IREE_UK_X32B_SHLI:
- vx = vsll(vx, vy, vl);
+ vx = __riscv_vsll(vx, vy, vl);
break;
case IREE_UK_X32B_SHRUI:
- vx = vsrl(vx, vy, vl);
+ vx = __riscv_vsrl(vx, vy, vl);
break;
case IREE_UK_X32B_XORI:
- vx = vor(vx, vy, vl);
+ vx = __riscv_vxor(vx, vy, vl);
break;
case IREE_UK_X32B_SUBI:
- vx = vsub(vx, vy, vl);
+ vx = __riscv_vsub(vx, vy, vl);
break;
default:
*result_code = 1;
}
- vsse32(out, out_stride, vx, vl); // save
+ __riscv_vsse32(out, out_stride, vx, vl); // save
} else if (op_type == IREE_UK_X32B_SI) {
vint32m8_t vx =
- vlse32_v_i32m8((iree_uk_int32_t*)lhs, lhs_stride, vl); // load
+ __riscv_vlse32_v_i32m8((iree_uk_int32_t*)lhs, lhs_stride, vl); // load
vint32m8_t vy =
- vlse32_v_i32m8((iree_uk_int32_t*)rhs, rhs_stride, vl); // load
+ __riscv_vlse32_v_i32m8((iree_uk_int32_t*)rhs, rhs_stride, vl); // load
switch (opcode) {
case IREE_UK_X32B_DIVSI:
- vx = vdiv(vx, vy, vl);
+ vx = __riscv_vdiv(vx, vy, vl);
break;
default:
*result_code = 1;
}
- vsse32((iree_uk_int32_t*)out, out_stride, vx, vl); // save
+ __riscv_vsse32((iree_uk_int32_t*)out, out_stride, vx, vl); // save
} else {
*result_code = 1;
}
@@ -316,7 +316,7 @@
if (size0 < size1) {
for (iree_uk_index_t i = 0; i < size0; ++i) {
for (iree_uk_index_t j = 0; j < size1; j += vl) {
- vl = vsetvl_e32m8(size1 - j);
+ vl = __riscv_vsetvl_e32m8(size1 - j);
iree_uk_rvv_x32b_op(opcode, &result_code,
&lhs[i * lhs_stride0 + j * lhs_stride1],
lhs_stride1 * sizeof(uint32_t),
@@ -329,7 +329,7 @@
} else {
for (iree_uk_index_t j = 0; j < size1; ++j) {
for (iree_uk_index_t i = 0; i < size0; i += vl) {
- vl = vsetvl_e32m8(size0 - i);
+ vl = __riscv_vsetvl_e32m8(size0 - i);
iree_uk_rvv_x32b_op(opcode, &result_code,
&lhs[i * lhs_stride0 + j * lhs_stride1],
lhs_stride0 * sizeof(uint32_t),
diff --git a/vmvx_ukernel/mmt4d_tile.c b/vmvx_ukernel/mmt4d_tile.c
index 28d8145..7f00451 100644
--- a/vmvx_ukernel/mmt4d_tile.c
+++ b/vmvx_ukernel/mmt4d_tile.c
@@ -29,13 +29,13 @@
vint32m1_t v_sum;
iree_uk_int32_t sum = 0;
for (size_t i = 0; i < n; i += vl) {
- vl = vsetvl_e8m4(n - i);
- vu = vle8_v_i8m4(u + i, vl); // load
- vw = vle8_v_i8m4(w + i, vl); // load
- vx = vwmul(vu, vw, vl); // multiply
- v_sum = vmv_s(v_sum, 0, vl); // init
- v_sum = vwredsum(v_sum, vx, v_sum, vl); // sum
- sum += vmv_x(v_sum);
+ vl = __riscv_vsetvl_e8m4(n - i);
+ vu = __riscv_vle8_v_i8m4(u + i, vl); // load
+ vw = __riscv_vle8_v_i8m4(w + i, vl); // load
+ vx = __riscv_vwmul(vu, vw, vl); // multiply
+ v_sum = __riscv_vmv_v_x_i32m1(0, vl); // init
+ v_sum = __riscv_vwredsum(vx, v_sum, vl); // sum
+ sum += __riscv_vmv_x(v_sum);
}
return sum;
}