sw/vec_iree: Update RVV implementations to the latest RVV intrinsics spec

Needed when we update the riscv32-unknown-elf-gdb toolchain.

Change-Id: I82c6e6d4a3d080fc9aec274be435efc7a497c2bc
diff --git a/audio_prep/mfcc.c b/audio_prep/mfcc.c
index 0a137e6..994ffcb 100644
--- a/audio_prep/mfcc.c
+++ b/audio_prep/mfcc.c
@@ -39,13 +39,13 @@
   vuint32m1_t v_sum;
   uint32_t sum = 0;
   for (size_t i = 0; i < n; i += vl) {
-    vl = vsetvl_e32m8(n - i);
-    vu = vle32_v_u32m8(u + i, vl);          // load
-    vw = vle32_v_u32m8(w + i, vl);          // load
-    vx = vmul(vu, vw, vl);                  // multiply
-    v_sum = vmv_s(v_sum, 0, vl);            // init
-    v_sum = vredsum(v_sum, vx, v_sum, vl);  // sum
-    sum += vmv_x(v_sum);
+    vl = __riscv_vsetvl_e32m8(n - i);
+    vu = __riscv_vle32_v_u32m8(u + i, vl);   // load
+    vw = __riscv_vle32_v_u32m8(w + i, vl);   // load
+    vx = __riscv_vmul(vu, vw, vl);           // multiply
+    v_sum = __riscv_vmv_v_x_u32m1(0, vl);    // init
+    v_sum = __riscv_vredsum(vx, v_sum, vl);  // sum
+    sum += __riscv_vmv_x(v_sum);
   }
   return sum;
 }
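For reference, a minimal standalone sketch of the same u32 dot product written against the updated intrinsics (the helper name dot_u32 and the includes are illustrative, not part of the patch). Two spec changes are visible here: every intrinsic gains the __riscv_ prefix, and the reduction no longer takes a separate destination operand, so the scalar accumulator is initialized with vmv_v_x instead of vmv_s.

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

// Illustrative sketch only -- mirrors the mfcc.c loop above.
static uint32_t dot_u32(const uint32_t* u, const uint32_t* w, size_t n) {
  uint32_t sum = 0;
  size_t vl;
  for (size_t i = 0; i < n; i += vl) {
    vl = __riscv_vsetvl_e32m8(n - i);
    vuint32m8_t vu = __riscv_vle32_v_u32m8(u + i, vl);  // load
    vuint32m8_t vw = __riscv_vle32_v_u32m8(w + i, vl);  // load
    vuint32m8_t vx = __riscv_vmul(vu, vw, vl);          // multiply
    vuint32m1_t vs = __riscv_vmv_v_x_u32m1(0, vl);      // zero accumulator
    vs = __riscv_vredsum(vx, vs, vl);                   // reduce: no dest operand
    sum += __riscv_vmv_x(vs);                           // read scalar result
  }
  return sum;
}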
diff --git a/risp4ml/isp_stages/blc_rvv.c b/risp4ml/isp_stages/blc_rvv.c
index 2cb3092..551c817 100644
--- a/risp4ml/isp_stages/blc_rvv.c
+++ b/risp4ml/isp_stages/blc_rvv.c
@@ -34,9 +34,9 @@
   uint16_t* x;
   for (size_t i = 0; i < n; i += vl) {
     x = img->data + i;
-    vl = vsetvl_e16m8(n - i);
-    vx = vle16_v_u16m8(x, vl);    // load
-    vx = vssubu(vx, offset, vl);  // subtract
-    vse16(x, vx, vl);             // save
+    vl = __riscv_vsetvl_e16m8(n - i);
+    vx = __riscv_vle16_v_u16m8(x, vl);    // load
+    vx = __riscv_vssubu(vx, offset, vl);  // subtract
+    __riscv_vse16(x, vx, vl);             // save
   }
 }
diff --git a/risp4ml/isp_stages/demosaic_rvv.c b/risp4ml/isp_stages/demosaic_rvv.c
index 4709a93..0f3c02a 100644
--- a/risp4ml/isp_stages/demosaic_rvv.c
+++ b/risp4ml/isp_stages/demosaic_rvv.c
@@ -96,48 +96,62 @@
       x_offset[2] = x + 1;
       ptrdiff_t stride = 2 * sizeof(uint16_t);
       size_t avl = (input->width - 1 - x) / 2;
-      vl = vsetvl_e16m4(avl);
+      vl = __riscv_vsetvl_e16m4(avl);
 
       if (n + (y & 0x1) == 2) {  // kR or kB
         // ch0
-        vx = vlse16_v_u16m4(line_in[1] + x_offset[1], stride, vl);  // load
-        vsse16(line_out[0] + x, stride, vx, vl);                    // save
+        vx = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[1], stride,
+                                    vl);                  // load
+        __riscv_vsse16(line_out[0] + x, stride, vx, vl);  // save
         // ch1
-        vx = vlse16_v_u16m4(line_in[0] + x_offset[1], stride, vl);  // load
-        vy = vlse16_v_u16m4(line_in[2] + x_offset[1], stride, vl);  // load
-        vz = vwaddu_vv(vx, vy, vl);                                 // add
-        vy = vlse16_v_u16m4(line_in[1] + x_offset[0], stride, vl);  // load
-        vz = vwaddu_wv(vz, vy, vl);                                 // add
-        vy = vlse16_v_u16m4(line_in[1] + x_offset[2], stride, vl);  // load
-        vz = vwaddu_wv(vz, vy, vl);                                 // add
-        vx = vnsrl(vz, 2, vl);                                      // 1/4
-        vsse16(line_out[1] + x, stride, vx, vl);                    // save
+        vx = __riscv_vlse16_v_u16m4(line_in[0] + x_offset[1], stride,
+                                    vl);                  // load
+        vy = __riscv_vlse16_v_u16m4(line_in[2] + x_offset[1], stride,
+                                    vl);                  // load
+        vz = __riscv_vwaddu_vv(vx, vy, vl);               // add
+        vy = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[0], stride,
+                                    vl);                  // load
+        vz = __riscv_vwaddu_wv(vz, vy, vl);               // add
+        vy = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[2], stride,
+                                    vl);                  // load
+        vz = __riscv_vwaddu_wv(vz, vy, vl);               // add
+        vx = __riscv_vnsrl(vz, 2, vl);                    // 1/4
+        __riscv_vsse16(line_out[1] + x, stride, vx, vl);  // save
         // ch2
-        vx = vlse16_v_u16m4(line_in[0] + x_offset[0], stride, vl);  // load
-        vy = vlse16_v_u16m4(line_in[0] + x_offset[2], stride, vl);  // load
-        vz = vwaddu_vv(vx, vy, vl);                                 // add
-        vy = vlse16_v_u16m4(line_in[2] + x_offset[0], stride, vl);  // load
-        vz = vwaddu_wv(vz, vy, vl);                                 // add
-        vy = vlse16_v_u16m4(line_in[2] + x_offset[2], stride, vl);  // load
-        vz = vwaddu_wv(vz, vy, vl);                                 // add
-        vx = vnsrl(vz, 2, vl);                                      // 1/4
-        vsse16(line_out[2] + x, stride, vx, vl);                    // save
+        vx = __riscv_vlse16_v_u16m4(line_in[0] + x_offset[0], stride,
+                                    vl);                  // load
+        vy = __riscv_vlse16_v_u16m4(line_in[0] + x_offset[2], stride,
+                                    vl);                  // load
+        vz = __riscv_vwaddu_vv(vx, vy, vl);               // add
+        vy = __riscv_vlse16_v_u16m4(line_in[2] + x_offset[0], stride,
+                                    vl);                  // load
+        vz = __riscv_vwaddu_wv(vz, vy, vl);               // add
+        vy = __riscv_vlse16_v_u16m4(line_in[2] + x_offset[2], stride,
+                                    vl);                  // load
+        vz = __riscv_vwaddu_wv(vz, vy, vl);               // add
+        vx = __riscv_vnsrl(vz, 2, vl);                    // 1/4
+        __riscv_vsse16(line_out[2] + x, stride, vx, vl);  // save
       } else {  // kGr or kRb
         // ch0
-        vx = vlse16_v_u16m4(line_in[1] + x_offset[0], stride, vl);  // load
-        vy = vlse16_v_u16m4(line_in[1] + x_offset[2], stride, vl);  // load
-        vz = vwaddu_vv(vx, vy, vl);                                 // add
-        vx = vnsrl(vz, 1, vl);                                      // 1/2
-        vsse16(line_out[0] + x, stride, vx, vl);                    // save
+        vx = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[0], stride,
+                                    vl);                  // load
+        vy = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[2], stride,
+                                    vl);                  // load
+        vz = __riscv_vwaddu_vv(vx, vy, vl);               // add
+        vx = __riscv_vnsrl(vz, 1, vl);                    // 1/2
+        __riscv_vsse16(line_out[0] + x, stride, vx, vl);  // save
         // ch1
-        vx = vlse16_v_u16m4(line_in[1] + x_offset[1], stride, vl);  // load
-        vsse16(line_out[1] + x, stride, vx, vl);                    // save
+        vx = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[1], stride,
+                                    vl);                  // load
+        __riscv_vsse16(line_out[1] + x, stride, vx, vl);  // save
         // ch2
-        vx = vlse16_v_u16m4(line_in[0] + x_offset[1], stride, vl);  // load
-        vy = vlse16_v_u16m4(line_in[2] + x_offset[1], stride, vl);  // load
-        vz = vwaddu_vv(vx, vy, vl);                                 // add
-        vx = vnsrl(vz, 1, vl);                                      // 1/2
-        vsse16(line_out[2] + x, stride, vx, vl);                    // save
+        vx = __riscv_vlse16_v_u16m4(line_in[0] + x_offset[1], stride,
+                                    vl);                  // load
+        vy = __riscv_vlse16_v_u16m4(line_in[2] + x_offset[1], stride,
+                                    vl);                  // load
+        vz = __riscv_vwaddu_vv(vx, vy, vl);               // add
+        vx = __riscv_vnsrl(vz, 1, vl);                    // 1/2
+        __riscv_vsse16(line_out[2] + x, stride, vx, vl);  // save
       }
     }
   }
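For reference, the missing-channel interpolation above is a 4-neighbor (or 2-neighbor) average built from the widening-add intrinsics. A minimal standalone sketch of the 4-neighbor case under the same e16m4 configuration; the helper name average4_u16 is illustrative, not from the patch.

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

// Illustrative sketch only: average of four u16 vectors without overflow.
// vwaddu_vv widens the first pair into a u32 accumulator, vwaddu_wv keeps
// adding u16 vectors to that wide accumulator, and vnsrl narrows back to u16
// while dividing by 4.
static vuint16m4_t average4_u16(vuint16m4_t a, vuint16m4_t b, vuint16m4_t c,
                                vuint16m4_t d, size_t vl) {
  vuint32m8_t acc = __riscv_vwaddu_vv(a, b, vl);  // a + b (u32)
  acc = __riscv_vwaddu_wv(acc, c, vl);            // + c
  acc = __riscv_vwaddu_wv(acc, d, vl);            // + d
  return __riscv_vnsrl(acc, 2, vl);               // (a+b+c+d) >> 2
}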
diff --git a/risp4ml/isp_stages/dg_rvv.c b/risp4ml/isp_stages/dg_rvv.c
index a62dede..2488f64 100644
--- a/risp4ml/isp_stages/dg_rvv.c
+++ b/risp4ml/isp_stages/dg_rvv.c
@@ -40,12 +40,12 @@
   uint16_t* x;
   for (size_t i = 0; i < n; i += vl) {
     x = img->data + i;
-    vl = vsetvl_e16m4(n - i);
-    vx = vle16_v_u16m4(x, vl);          // load
-    vy = vwmulu(vx, gain, vl);          // multiply
-    vy = vadd(vy, offset, vl);          // add
-    vy = vminu(vy, max_val, vl);        // clamp
-    vx = vnsrl(vy, kDgFractional, vl);  // bit shift
-    vse16(x, vx, vl);                   // save
+    vl = __riscv_vsetvl_e16m4(n - i);
+    vx = __riscv_vle16_v_u16m4(x, vl);          // load
+    vy = __riscv_vwmulu(vx, gain, vl);          // multiply
+    vy = __riscv_vadd(vy, offset, vl);          // add
+    vy = __riscv_vminu(vy, max_val, vl);        // clamp
+    vx = __riscv_vnsrl(vy, kDgFractional, vl);  // bit shift
+    __riscv_vse16(x, vx, vl);                   // save
   }
 }
diff --git a/risp4ml/isp_stages/downscale_rvv.c b/risp4ml/isp_stages/downscale_rvv.c
index 9098148..849e9f6 100644
--- a/risp4ml/isp_stages/downscale_rvv.c
+++ b/risp4ml/isp_stages/downscale_rvv.c
@@ -66,95 +66,96 @@
     uint8_t* out = output->data + c;
 
     for (size_t i = 0; i < n; i += vl) {
-      vl = vsetvl_e16m4(n - i);
-      vid = vid_v_u32m8(vl);
-      vid = vadd(vid, i, vl);
+      vl = __riscv_vsetvl_e16m4(n - i);
+      vid = __riscv_vid_v_u32m8(vl);
+      vid = __riscv_vadd(vid, i, vl);
 
-      vy = vdivu(vid, output->width, vl);
-      vx = vremu(vid, output->width, vl);
+      vy = __riscv_vdivu(vid, output->width, vl);
+      vx = __riscv_vremu(vid, output->width, vl);
 
       // find 4 neighboring points
       // four neighboring coordinates are
       // [y_l, x_l], [y_l, x_h], [y_h, x_l], [y_h, x_h]
-      vx_l = vmul(vx, input_w_1, vl);
-      vx_l = vdivu(vx_l, w_1, vl);
-      vx_h = vadd(vx_l, 1, vl);
-      vx_h = vminu(vx_h, input_w_1, vl);  // clamp
+      vx_l = __riscv_vmul(vx, input_w_1, vl);
+      vx_l = __riscv_vdivu(vx_l, w_1, vl);
+      vx_h = __riscv_vadd(vx_l, 1, vl);
+      vx_h = __riscv_vminu(vx_h, input_w_1, vl);  // clamp
 
-      vy_l = vmul(vy, input_h_1, vl);
-      vy_l = vdivu(vy_l, h_1, vl);
-      vy_h = vadd(vy_l, 1, vl);
-      vy_h = vminu(vy_h, input_h_1, vl);  // clamp
+      vy_l = __riscv_vmul(vy, input_h_1, vl);
+      vy_l = __riscv_vdivu(vy_l, h_1, vl);
+      vy_h = __riscv_vadd(vy_l, 1, vl);
+      vy_h = __riscv_vminu(vy_h, input_h_1, vl);  // clamp
 
       // load values of four neighboring points: A, B, C, D
-      vz = vmul(vy_l, input_width, vl);
-      vz = vadd(vz, vx_l, vl);
-      vz = vsll(vz, 1, vl);  // *2
-      vo_16b = vluxei32(in, vz, vl);
-      va = vwaddu_vx(vo_16b, 0, vl);
+      vz = __riscv_vmul(vy_l, input_width, vl);
+      vz = __riscv_vadd(vz, vx_l, vl);
+      vz = __riscv_vsll(vz, 1, vl);  // *2
+      vo_16b = __riscv_vluxei32(in, vz, vl);
+      va = __riscv_vwaddu_vx(vo_16b, 0, vl);
 
-      vz = vmul(vy_l, input_width, vl);
-      vz = vadd(vz, vx_h, vl);
-      vz = vsll(vz, 1, vl);  // *2
-      vo_16b = vluxei32(in, vz, vl);
-      vb = vwaddu_vx(vo_16b, 0, vl);
+      vz = __riscv_vmul(vy_l, input_width, vl);
+      vz = __riscv_vadd(vz, vx_h, vl);
+      vz = __riscv_vsll(vz, 1, vl);  // *2
+      vo_16b = __riscv_vluxei32(in, vz, vl);
+      vb = __riscv_vwaddu_vx(vo_16b, 0, vl);
 
-      vz = vmul(vy_h, input_width, vl);
-      vz = vadd(vz, vx_l, vl);
-      vz = vsll(vz, 1, vl);  // *2
-      vo_16b = vluxei32(in, vz, vl);
-      vc = vwaddu_vx(vo_16b, 0, vl);
+      vz = __riscv_vmul(vy_h, input_width, vl);
+      vz = __riscv_vadd(vz, vx_l, vl);
+      vz = __riscv_vsll(vz, 1, vl);  // *2
+      vo_16b = __riscv_vluxei32(in, vz, vl);
+      vc = __riscv_vwaddu_vx(vo_16b, 0, vl);
 
-      vz = vmul(vy_h, input_width, vl);
-      vz = vadd(vz, vx_h, vl);
-      vz = vsll(vz, 1, vl);  // *2
-      vo_16b = vluxei32(in, vz, vl);
-      vd = vwaddu_vx(vo_16b, 0, vl);
+      vz = __riscv_vmul(vy_h, input_width, vl);
+      vz = __riscv_vadd(vz, vx_h, vl);
+      vz = __riscv_vsll(vz, 1, vl);  // *2
+      vo_16b = __riscv_vluxei32(in, vz, vl);
+      vd = __riscv_vwaddu_vx(vo_16b, 0, vl);
 
       // compute weights of four neighboring points: wx, wy, 1-wx, 1-wy
-      vp = vmul(vx, input_w_1, vl);
-      vq = vmul(vx_l, w_1, vl);
-      vp = vssubu(vp, vq, vl);
-      vp = vsll(vp, params.scale_precision, vl);
-      vx_weight = vdivu(vp, w_1, vl);
+      vp = __riscv_vmul(vx, input_w_1, vl);
+      vq = __riscv_vmul(vx_l, w_1, vl);
+      vp = __riscv_vssubu(vp, vq, vl);
+      vp = __riscv_vsll(vp, params.scale_precision, vl);
+      vx_weight = __riscv_vdivu(vp, w_1, vl);
 
-      vp = vmul(vy, input_h_1, vl);
-      vq = vmul(vy_l, h_1, vl);
-      vp = vssubu(vp, vq, vl);
-      vp = vsll(vp, params.scale_precision, vl);
-      vy_weight = vdivu(vp, h_1, vl);
+      vp = __riscv_vmul(vy, input_h_1, vl);
+      vq = __riscv_vmul(vy_l, h_1, vl);
+      vp = __riscv_vssubu(vp, vq, vl);
+      vp = __riscv_vsll(vp, params.scale_precision, vl);
+      vy_weight = __riscv_vdivu(vp, h_1, vl);
 
-      vx_weight_1minus = vrsub(vx_weight, params.scale_fixed_one, vl);
-      vy_weight_1minus = vrsub(vy_weight, params.scale_fixed_one, vl);
+      vx_weight_1minus = __riscv_vrsub(vx_weight, params.scale_fixed_one, vl);
+      vy_weight_1minus = __riscv_vrsub(vy_weight, params.scale_fixed_one, vl);
 
       // Bilinear Interpolation Formula:
       // out = A*(1-wx)*(1-wy) + B*wx*(1-wy)
       //     + C*(1-wx)*wy + D*wx*wy
-      vo = vmul(va, vx_weight_1minus, vl);
-      vo = vsrl(vo, params.scale_precision, vl);
-      vo = vmul(vo, vy_weight_1minus, vl);
+      vo = __riscv_vmul(va, vx_weight_1minus, vl);
+      vo = __riscv_vsrl(vo, params.scale_precision, vl);
+      vo = __riscv_vmul(vo, vy_weight_1minus, vl);
 
-      vp = vmul(vb, vx_weight, vl);
-      vp = vsrl(vp, params.scale_precision, vl);
-      vp = vmul(vp, vy_weight_1minus, vl);
-      vo = vadd(vo, vp, vl);
+      vp = __riscv_vmul(vb, vx_weight, vl);
+      vp = __riscv_vsrl(vp, params.scale_precision, vl);
+      vp = __riscv_vmul(vp, vy_weight_1minus, vl);
+      vo = __riscv_vadd(vo, vp, vl);
 
-      vp = vmul(vc, vx_weight_1minus, vl);
-      vp = vsrl(vp, params.scale_precision, vl);
-      vp = vmul(vp, vy_weight, vl);
-      vo = vadd(vo, vp, vl);
+      vp = __riscv_vmul(vc, vx_weight_1minus, vl);
+      vp = __riscv_vsrl(vp, params.scale_precision, vl);
+      vp = __riscv_vmul(vp, vy_weight, vl);
+      vo = __riscv_vadd(vo, vp, vl);
 
-      vp = vmul(vd, vx_weight, vl);
-      vp = vsrl(vp, params.scale_precision, vl);
-      vp = vmul(vp, vy_weight, vl);
-      vo = vadd(vo, vp, vl);
+      vp = __riscv_vmul(vd, vx_weight, vl);
+      vp = __riscv_vsrl(vp, params.scale_precision, vl);
+      vp = __riscv_vmul(vp, vy_weight, vl);
+      vo = __riscv_vadd(vo, vp, vl);
 
       // bit shift from 32bits to 8bits
-      vo_16b = vnsrl(vo, params.scale_precision, vl);
-      vo_8b = vnsrl(vo_16b, kRawPipelineBpp - kPipeOutputBpp, vl);
+      vo_16b = __riscv_vnsrl(vo, params.scale_precision, vl);
+      vo_8b = __riscv_vnsrl(vo_16b, kRawPipelineBpp - kPipeOutputBpp, vl);
 
       // save
-      vsse8(out + i * output->num_channels, output->num_channels, vo_8b, vl);
+      __riscv_vsse8(out + i * output->num_channels, output->num_channels, vo_8b,
+                    vl);
     }
   }
 }
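For reference, the repeated vmul/vadd/vsll/vluxei32 sequence above is an indexed gather: vluxei32 takes byte offsets, so the element index (row * width + column) is shifted left by 1 to account for sizeof(uint16_t). A minimal standalone sketch; the helper name gather_pixels_u16 is illustrative, not from the patch.

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

// Illustrative sketch only: gather one vector of uint16_t pixels given
// per-lane row and column indices.
static vuint16m4_t gather_pixels_u16(const uint16_t* base, vuint32m8_t vrow,
                                     vuint32m8_t vcol, uint32_t width,
                                     size_t vl) {
  vuint32m8_t voff = __riscv_vmul(vrow, width, vl);  // row * width
  voff = __riscv_vadd(voff, vcol, vl);               // + column (element index)
  voff = __riscv_vsll(voff, 1, vl);                  // element index -> byte offset
  return __riscv_vluxei32(base, voff, vl);           // unordered indexed load
}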
diff --git a/risp4ml/isp_stages/gamma_rvv.c b/risp4ml/isp_stages/gamma_rvv.c
index e04025d..dab4f91 100644
--- a/risp4ml/isp_stages/gamma_rvv.c
+++ b/risp4ml/isp_stages/gamma_rvv.c
@@ -247,23 +247,23 @@
   for (size_t i = 0; i < n; i += vl) {
     x = img->data + i;
 
-    vl = vsetvl_e16m8(n - i);
-    vx = vle16_v_u16m8(x, vl);  // load
-    mask = vmsne(vx, kRgbPipelineMaxVal,
-                 vl);  // mask to exclude kRgbPipelineMaxVal
+    vl = __riscv_vsetvl_e16m8(n - i);
+    vx = __riscv_vle16_v_u16m8(x, vl);  // load
+    mask = __riscv_vmsne(vx, kRgbPipelineMaxVal,
+                         vl);  // mask to exclude kRgbPipelineMaxVal
 
-    vy = vsrl(vx, kGammaShiftBits, vl);          // 1/32
-    vy = vsll(vy, 1, vl);                        // *2
-    v_offset = vand(vx, kGammaSpacing - 1, vl);  // offset within bin
+    vy = __riscv_vsrl(vx, kGammaShiftBits, vl);          // 1/32
+    vy = __riscv_vsll(vy, 1, vl);                        // *2
+    v_offset = __riscv_vand(vx, kGammaSpacing - 1, vl);  // offset within bin
 
-    vx = vluxei16(gamma_params.lut, vy, vl);      // left
-    vy = vluxei16(gamma_params.lut + 1, vy, vl);  // right
+    vx = __riscv_vluxei16(gamma_params.lut, vy, vl);      // left
+    vy = __riscv_vluxei16(gamma_params.lut + 1, vy, vl);  // right
 
-    vy = vsub(vy, vx, vl);               // right - left
-    vy = vmul(vy, v_offset, vl);         // multiply offset_within_bin
-    vy = vsrl(vy, kGammaShiftBits, vl);  // 1/32
-    vx = vadd(vx, vy, vl);               // add
+    vy = __riscv_vsub(vy, vx, vl);               // right - left
+    vy = __riscv_vmul(vy, v_offset, vl);         // multiply offset_within_bin
+    vy = __riscv_vsrl(vy, kGammaShiftBits, vl);  // 1/32
+    vx = __riscv_vadd(vx, vy, vl);               // add
 
-    vse16(mask, x, vx, vl);  // save
+    __riscv_vse16(mask, x, vx, vl);  // save
   }
 }
diff --git a/risp4ml/isp_stages/wbg_rvv.c b/risp4ml/isp_stages/wbg_rvv.c
index 1bc1187..3374de6 100644
--- a/risp4ml/isp_stages/wbg_rvv.c
+++ b/risp4ml/isp_stages/wbg_rvv.c
@@ -48,12 +48,13 @@
   for (uint8_t n = 0; n < 2; n++) {
     for (uint16_t x = n; x < img->width; x += 2 * vl) {
       size_t avl = (img->width + 1 - x) / 2;
-      vl = vsetvl_e16m8(avl);
-      vx = vlse16_v_u16m8(line + x, 2 * sizeof(uint16_t), vl);  // load
+      vl = __riscv_vsetvl_e16m8(avl);
+      vx =
+          __riscv_vlse16_v_u16m8(line + x, 2 * sizeof(uint16_t), vl);  // load
 
-      vy = vmv_v_x_u32m1(0, vl);       // init
-      vy = vwredsumu(vy, vx, vy, vl);  // sum
-      uint32_t sum = vmv_x(vy);
+      vy = __riscv_vmv_v_x_u32m1(0, vl);   // init
+      vy = __riscv_vwredsumu(vx, vy, vl);  // sum
+      uint32_t sum = __riscv_vmv_x(vy);
       if ((y & 0x1) == 0 && n == 0) {
         sum_of_reds += sum;
       } else if ((y & 0x1) == 1 && n == 1) {
@@ -108,13 +109,14 @@
     gain = (y & 0x1) ? wbg_params.gains[2 + n] : wbg_params.gains[n];
     for (uint16_t x = n; x < img->width; x += 2 * vl) {
       size_t avl = (img->width + 1 - x) / 2;
-      vl = vsetvl_e16m4(avl);
-      vx = vlse16_v_u16m4(line + x, 2 * sizeof(uint16_t), vl);  // load
-      vy = vwmulu(vx, gain, vl);                                // multiply
-      vy = vadd(vy, offset, vl);                                // add
-      vy = vminu(vy, max_val, vl);                              // clamp
-      vx = vnsrl(vy, kWbgFractional, vl);                       // bit shift
-      vsse16(line + x, 2 * sizeof(uint16_t), vx, vl);           // save
+      vl = __riscv_vsetvl_e16m4(avl);
+      vx =
+          __riscv_vlse16_v_u16m4(line + x, 2 * sizeof(uint16_t), vl);  // load
+      vy = __riscv_vwmulu(vx, gain, vl);                        // multiply
+      vy = __riscv_vadd(vy, offset, vl);                        // add
+      vy = __riscv_vminu(vy, max_val, vl);                      // clamp
+      vx = __riscv_vnsrl(vy, kWbgFractional, vl);               // bit shift
+      __riscv_vsse16(line + x, 2 * sizeof(uint16_t), vx, vl);   // save
    }
  }
}
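For reference, the per-plane sum above uses the widening reduction, whose updated form drops the destination operand just like vredsum. A minimal standalone sketch of summing every other uint16_t of a line (one Bayer color plane); the helper name sum_even_u16 is illustrative, not from the patch.

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

// Illustrative sketch only: strided load of one color plane, widening sum.
static uint32_t sum_even_u16(const uint16_t* line, size_t pairs) {
  uint32_t sum = 0;
  size_t vl;
  for (size_t i = 0; i < pairs; i += vl) {
    vl = __riscv_vsetvl_e16m8(pairs - i);
    vuint16m8_t vx = __riscv_vlse16_v_u16m8(line + 2 * i,
                                            2 * sizeof(uint16_t), vl);
    vuint32m1_t vy = __riscv_vmv_v_x_u32m1(0, vl);  // zero accumulator
    vy = __riscv_vwredsumu(vx, vy, vl);             // u16 -> u32 widening sum
    sum += __riscv_vmv_x(vy);
  }
  return sum;
}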
diff --git a/vmvx_ukernel/elementwise.c b/vmvx_ukernel/elementwise.c
index c568dbb..354e7e8 100644
--- a/vmvx_ukernel/elementwise.c
+++ b/vmvx_ukernel/elementwise.c
@@ -143,53 +143,53 @@
                                  iree_uk_index_t out_stride, size_t vl) {
   iree_uk_x32b_opcode_type_t op_type = get_iree_uk_x32b_op_type(opcode);
   if (op_type == IREE_UK_X32B_UI) {
-    vuint32m8_t vx = vlse32_v_u32m8(lhs, lhs_stride, vl);  // load
-    vuint32m8_t vy = vlse32_v_u32m8(rhs, rhs_stride, vl);  // load
+    vuint32m8_t vx = __riscv_vlse32_v_u32m8(lhs, lhs_stride, vl);  // load
+    vuint32m8_t vy = __riscv_vlse32_v_u32m8(rhs, rhs_stride, vl);  // load
     switch (opcode) {
       case IREE_UK_X32B_ADDI:
-        vx = vadd(vx, vy, vl);
+        vx = __riscv_vadd(vx, vy, vl);
         break;
       case IREE_UK_X32B_ANDI:
-        vx = vand(vx, vy, vl);
+        vx = __riscv_vand(vx, vy, vl);
         break;
       case IREE_UK_X32B_DIVUI:
-        vx = vdivu(vx, vy, vl);
+        vx = __riscv_vdivu(vx, vy, vl);
         break;
       case IREE_UK_X32B_MULI:
-        vx = vmul(vx, vy, vl);
+        vx = __riscv_vmul(vx, vy, vl);
         break;
       case IREE_UK_X32B_ORI:
-        vx = vor(vx, vy, vl);
+        vx = __riscv_vor(vx, vy, vl);
         break;
       case IREE_UK_X32B_SHLI:
-        vx = vsll(vx, vy, vl);
+        vx = __riscv_vsll(vx, vy, vl);
         break;
       case IREE_UK_X32B_SHRUI:
-        vx = vsrl(vx, vy, vl);
+        vx = __riscv_vsrl(vx, vy, vl);
         break;
       case IREE_UK_X32B_XORI:
-        vx = vor(vx, vy, vl);
+        vx = __riscv_vxor(vx, vy, vl);
         break;
       case IREE_UK_X32B_SUBI:
-        vx = vsub(vx, vy, vl);
+        vx = __riscv_vsub(vx, vy, vl);
         break;
       default:
        *result_code = 1;
    }
-    vsse32(out, out_stride, vx, vl);  // save
+    __riscv_vsse32(out, out_stride, vx, vl);  // save
  } else if (op_type == IREE_UK_X32B_SI) {
    vint32m8_t vx =
-        vlse32_v_i32m8((iree_uk_int32_t*)lhs, lhs_stride, vl);  // load
+        __riscv_vlse32_v_i32m8((iree_uk_int32_t*)lhs, lhs_stride, vl);  // load
    vint32m8_t vy =
-        vlse32_v_i32m8((iree_uk_int32_t*)rhs, rhs_stride, vl);  // load
+        __riscv_vlse32_v_i32m8((iree_uk_int32_t*)rhs, rhs_stride, vl);  // load
    switch (opcode) {
      case IREE_UK_X32B_DIVSI:
-        vx = vdiv(vx, vy, vl);
+        vx = __riscv_vdiv(vx, vy, vl);
        break;
      default:
        *result_code = 1;
    }
-    vsse32((iree_uk_int32_t*)out, out_stride, vx, vl);  // save
+    __riscv_vsse32((iree_uk_int32_t*)out, out_stride, vx, vl);  // save
  } else {
    *result_code = 1;
  }
@@ -316,7 +316,7 @@
   if (size0 < size1) {
     for (iree_uk_index_t i = 0; i < size0; ++i) {
       for (iree_uk_index_t j = 0; j < size1; j += vl) {
-        vl = vsetvl_e32m8(size1 - j);
+        vl = __riscv_vsetvl_e32m8(size1 - j);
         iree_uk_rvv_x32b_op(opcode, &result_code,
                             &lhs[i * lhs_stride0 + j * lhs_stride1],
                             lhs_stride1 * sizeof(uint32_t),
@@ -329,7 +329,7 @@
   } else {
     for (iree_uk_index_t j = 0; j < size1; ++j) {
       for (iree_uk_index_t i = 0; i < size0; i += vl) {
-        vl = vsetvl_e32m8(size0 - i);
+        vl = __riscv_vsetvl_e32m8(size0 - i);
         iree_uk_rvv_x32b_op(opcode, &result_code,
                             &lhs[i * lhs_stride0 + j * lhs_stride1],
                             lhs_stride0 * sizeof(uint32_t),
diff --git a/vmvx_ukernel/mmt4d_tile.c b/vmvx_ukernel/mmt4d_tile.c
index 28d8145..7f00451 100644
--- a/vmvx_ukernel/mmt4d_tile.c
+++ b/vmvx_ukernel/mmt4d_tile.c
@@ -29,13 +29,13 @@
   vint32m1_t v_sum;
   iree_uk_int32_t sum = 0;
   for (size_t i = 0; i < n; i += vl) {
-    vl = vsetvl_e8m4(n - i);
-    vu = vle8_v_i8m4(u + i, vl);             // load
-    vw = vle8_v_i8m4(w + i, vl);             // load
-    vx = vwmul(vu, vw, vl);                  // multiply
-    v_sum = vmv_s(v_sum, 0, vl);             // init
-    v_sum = vwredsum(v_sum, vx, v_sum, vl);  // sum
-    sum += vmv_x(v_sum);
+    vl = __riscv_vsetvl_e8m4(n - i);
+    vu = __riscv_vle8_v_i8m4(u + i, vl);      // load
+    vw = __riscv_vle8_v_i8m4(w + i, vl);      // load
+    vx = __riscv_vwmul(vu, vw, vl);           // multiply
+    v_sum = __riscv_vmv_v_x_i32m1(0, vl);     // init
+    v_sum = __riscv_vwredsum(vx, v_sum, vl);  // sum
+    sum += __riscv_vmv_x(v_sum);
   }
   return sum;
 }
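For reference, a minimal standalone sketch of the mmt4d int8 dot product against the updated intrinsics (the helper name dot_i8 and the plain int32_t return type are illustrative, not from the patch): vwmul widens i8 x i8 to i16, and vwredsum accumulates the i16 products into a single i32 lane without the extra destination operand required by the old spec.

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

// Illustrative sketch only -- mirrors the mmt4d_tile.c loop above.
static int32_t dot_i8(const int8_t* u, const int8_t* w, size_t n) {
  int32_t sum = 0;
  size_t vl;
  for (size_t i = 0; i < n; i += vl) {
    vl = __riscv_vsetvl_e8m4(n - i);
    vint8m4_t vu = __riscv_vle8_v_i8m4(u + i, vl);  // load
    vint8m4_t vw = __riscv_vle8_v_i8m4(w + i, vl);  // load
    vint16m8_t vx = __riscv_vwmul(vu, vw, vl);      // widening multiply
    vint32m1_t vs = __riscv_vmv_v_x_i32m1(0, vl);   // zero accumulator
    vs = __riscv_vwredsum(vx, vs, vl);              // widening reduction
    sum += __riscv_vmv_x(vs);                       // read scalar result
  }
  return sum;
}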