sw/vec_iree: Update RVV implementations to the latest RVV intrinsics spec

All vector intrinsics are renamed to their __riscv_-prefixed spellings, and
the reduction and scalar-move intrinsics are moved to their updated argument
lists. This is needed when we update the riscv32-unknown-elf-gdb toolchain.
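
For reference, the rename follows the pattern sketched below (illustrative
only, not code from this change; add_one_u32 is a hypothetical helper and
assumes a toolchain that provides the __riscv_-prefixed intrinsics through
<riscv_vector.h>):

  #include <riscv_vector.h>
  #include <stddef.h>
  #include <stdint.h>

  // Sketch only: adds 1 to every element of data[0..n) with the new names.
  static void add_one_u32(uint32_t* data, size_t n) {
    size_t vl;
    for (size_t i = 0; i < n; i += vl) {
      vl = __riscv_vsetvl_e32m8(n - i);                      // was vsetvl_e32m8
      vuint32m8_t vx = __riscv_vle32_v_u32m8(data + i, vl);  // was vle32_v_u32m8
      vx = __riscv_vadd(vx, 1, vl);                          // was vadd
      __riscv_vse32(data + i, vx, vl);                       // was vse32
    }
  }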

Change-Id: I82c6e6d4a3d080fc9aec274be435efc7a497c2bc
diff --git a/audio_prep/mfcc.c b/audio_prep/mfcc.c
index 0a137e6..994ffcb 100644
--- a/audio_prep/mfcc.c
+++ b/audio_prep/mfcc.c
@@ -39,13 +39,13 @@
   vuint32m1_t v_sum;
   uint32_t sum = 0;
   for (size_t i = 0; i < n; i += vl) {
-    vl = vsetvl_e32m8(n - i);
-    vu = vle32_v_u32m8(u + i, vl);          // load
-    vw = vle32_v_u32m8(w + i, vl);          // load
-    vx = vmul(vu, vw, vl);                  // multiply
-    v_sum = vmv_s(v_sum, 0, vl);            // init
-    v_sum = vredsum(v_sum, vx, v_sum, vl);  // sum
-    sum += vmv_x(v_sum);
+    vl = __riscv_vsetvl_e32m8(n - i);
+    vu = __riscv_vle32_v_u32m8(u + i, vl);   // load
+    vw = __riscv_vle32_v_u32m8(w + i, vl);   // load
+    vx = __riscv_vmul(vu, vw, vl);           // multiply
+    v_sum = __riscv_vmv_v_x_u32m1(0, vl);    // init
+    v_sum = __riscv_vredsum(vx, v_sum, vl);  // sum
+    sum += __riscv_vmv_x(v_sum);
   }
   return sum;
 }
diff --git a/risp4ml/isp_stages/blc_rvv.c b/risp4ml/isp_stages/blc_rvv.c
index 2cb3092..551c817 100644
--- a/risp4ml/isp_stages/blc_rvv.c
+++ b/risp4ml/isp_stages/blc_rvv.c
@@ -34,9 +34,9 @@
   uint16_t* x;
   for (size_t i = 0; i < n; i += vl) {
     x = img->data + i;
-    vl = vsetvl_e16m8(n - i);
-    vx = vle16_v_u16m8(x, vl);    // load
-    vx = vssubu(vx, offset, vl);  // subtract
-    vse16(x, vx, vl);             // save
+    vl = __riscv_vsetvl_e16m8(n - i);
+    vx = __riscv_vle16_v_u16m8(x, vl);    // load
+    vx = __riscv_vssubu(vx, offset, vl);  // subtract
+    __riscv_vse16(x, vx, vl);             // save
   }
 }
diff --git a/risp4ml/isp_stages/demosaic_rvv.c b/risp4ml/isp_stages/demosaic_rvv.c
index 4709a93..0f3c02a 100644
--- a/risp4ml/isp_stages/demosaic_rvv.c
+++ b/risp4ml/isp_stages/demosaic_rvv.c
@@ -96,48 +96,62 @@
         x_offset[2] = x + 1;
         ptrdiff_t stride = 2 * sizeof(uint16_t);
         size_t avl = (input->width - 1 - x) / 2;
-        vl = vsetvl_e16m4(avl);
+        vl = __riscv_vsetvl_e16m4(avl);
 
         if (n + (y & 0x1) == 2) {  // kR or kB
           // ch0
-          vx = vlse16_v_u16m4(line_in[1] + x_offset[1], stride, vl);  // load
-          vsse16(line_out[0] + x, stride, vx, vl);                    // save
+          vx = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[1], stride,
+                                      vl);                  // load
+          __riscv_vsse16(line_out[0] + x, stride, vx, vl);  // save
           // ch1
-          vx = vlse16_v_u16m4(line_in[0] + x_offset[1], stride, vl);  // load
-          vy = vlse16_v_u16m4(line_in[2] + x_offset[1], stride, vl);  // load
-          vz = vwaddu_vv(vx, vy, vl);                                 // add
-          vy = vlse16_v_u16m4(line_in[1] + x_offset[0], stride, vl);  // load
-          vz = vwaddu_wv(vz, vy, vl);                                 // add
-          vy = vlse16_v_u16m4(line_in[1] + x_offset[2], stride, vl);  // load
-          vz = vwaddu_wv(vz, vy, vl);                                 // add
-          vx = vnsrl(vz, 2, vl);                                      // 1/4
-          vsse16(line_out[1] + x, stride, vx, vl);                    // save
+          vx = __riscv_vlse16_v_u16m4(line_in[0] + x_offset[1], stride,
+                                      vl);  // load
+          vy = __riscv_vlse16_v_u16m4(line_in[2] + x_offset[1], stride,
+                                      vl);     // load
+          vz = __riscv_vwaddu_vv(vx, vy, vl);  // add
+          vy = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[0], stride,
+                                      vl);     // load
+          vz = __riscv_vwaddu_wv(vz, vy, vl);  // add
+          vy = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[2], stride,
+                                      vl);                  // load
+          vz = __riscv_vwaddu_wv(vz, vy, vl);               // add
+          vx = __riscv_vnsrl(vz, 2, vl);                    // 1/4
+          __riscv_vsse16(line_out[1] + x, stride, vx, vl);  // save
           // ch2
-          vx = vlse16_v_u16m4(line_in[0] + x_offset[0], stride, vl);  // load
-          vy = vlse16_v_u16m4(line_in[0] + x_offset[2], stride, vl);  // load
-          vz = vwaddu_vv(vx, vy, vl);                                 // add
-          vy = vlse16_v_u16m4(line_in[2] + x_offset[0], stride, vl);  // load
-          vz = vwaddu_wv(vz, vy, vl);                                 // add
-          vy = vlse16_v_u16m4(line_in[2] + x_offset[2], stride, vl);  // load
-          vz = vwaddu_wv(vz, vy, vl);                                 // add
-          vx = vnsrl(vz, 2, vl);                                      // 1/4
-          vsse16(line_out[2] + x, stride, vx, vl);                    // save
+          vx = __riscv_vlse16_v_u16m4(line_in[0] + x_offset[0], stride,
+                                      vl);  // load
+          vy = __riscv_vlse16_v_u16m4(line_in[0] + x_offset[2], stride,
+                                      vl);     // load
+          vz = __riscv_vwaddu_vv(vx, vy, vl);  // add
+          vy = __riscv_vlse16_v_u16m4(line_in[2] + x_offset[0], stride,
+                                      vl);     // load
+          vz = __riscv_vwaddu_wv(vz, vy, vl);  // add
+          vy = __riscv_vlse16_v_u16m4(line_in[2] + x_offset[2], stride,
+                                      vl);                  // load
+          vz = __riscv_vwaddu_wv(vz, vy, vl);               // add
+          vx = __riscv_vnsrl(vz, 2, vl);                    // 1/4
+          __riscv_vsse16(line_out[2] + x, stride, vx, vl);  // save
         } else {  // kGr or kRb
           // ch0
-          vx = vlse16_v_u16m4(line_in[1] + x_offset[0], stride, vl);  // load
-          vy = vlse16_v_u16m4(line_in[1] + x_offset[2], stride, vl);  // load
-          vz = vwaddu_vv(vx, vy, vl);                                 // add
-          vx = vnsrl(vz, 1, vl);                                      // 1/2
-          vsse16(line_out[0] + x, stride, vx, vl);                    // save
+          vx = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[0], stride,
+                                      vl);  // load
+          vy = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[2], stride,
+                                      vl);                  // load
+          vz = __riscv_vwaddu_vv(vx, vy, vl);               // add
+          vx = __riscv_vnsrl(vz, 1, vl);                    // 1/2
+          __riscv_vsse16(line_out[0] + x, stride, vx, vl);  // save
           // ch1
-          vx = vlse16_v_u16m4(line_in[1] + x_offset[1], stride, vl);  // load
-          vsse16(line_out[1] + x, stride, vx, vl);                    // save
+          vx = __riscv_vlse16_v_u16m4(line_in[1] + x_offset[1], stride,
+                                      vl);                  // load
+          __riscv_vsse16(line_out[1] + x, stride, vx, vl);  // save
           // ch2
-          vx = vlse16_v_u16m4(line_in[0] + x_offset[1], stride, vl);  // load
-          vy = vlse16_v_u16m4(line_in[2] + x_offset[1], stride, vl);  // load
-          vz = vwaddu_vv(vx, vy, vl);                                 // add
-          vx = vnsrl(vz, 1, vl);                                      // 1/2
-          vsse16(line_out[2] + x, stride, vx, vl);                    // save
+          vx = __riscv_vlse16_v_u16m4(line_in[0] + x_offset[1], stride,
+                                      vl);  // load
+          vy = __riscv_vlse16_v_u16m4(line_in[2] + x_offset[1], stride,
+                                      vl);                  // load
+          vz = __riscv_vwaddu_vv(vx, vy, vl);               // add
+          vx = __riscv_vnsrl(vz, 1, vl);                    // 1/2
+          __riscv_vsse16(line_out[2] + x, stride, vx, vl);  // save
         }
       }
     }
diff --git a/risp4ml/isp_stages/dg_rvv.c b/risp4ml/isp_stages/dg_rvv.c
index a62dede..2488f64 100644
--- a/risp4ml/isp_stages/dg_rvv.c
+++ b/risp4ml/isp_stages/dg_rvv.c
@@ -40,12 +40,12 @@
   uint16_t* x;
   for (size_t i = 0; i < n; i += vl) {
     x = img->data + i;
-    vl = vsetvl_e16m4(n - i);
-    vx = vle16_v_u16m4(x, vl);          // load
-    vy = vwmulu(vx, gain, vl);          // multiply
-    vy = vadd(vy, offset, vl);          // add
-    vy = vminu(vy, max_val, vl);        // clamp
-    vx = vnsrl(vy, kDgFractional, vl);  // bit shift
-    vse16(x, vx, vl);                   // save
+    vl = __riscv_vsetvl_e16m4(n - i);
+    vx = __riscv_vle16_v_u16m4(x, vl);          // load
+    vy = __riscv_vwmulu(vx, gain, vl);          // multiply
+    vy = __riscv_vadd(vy, offset, vl);          // add
+    vy = __riscv_vminu(vy, max_val, vl);        // clamp
+    vx = __riscv_vnsrl(vy, kDgFractional, vl);  // bit shift
+    __riscv_vse16(x, vx, vl);                   // save
   }
 }
diff --git a/risp4ml/isp_stages/downscale_rvv.c b/risp4ml/isp_stages/downscale_rvv.c
index 9098148..849e9f6 100644
--- a/risp4ml/isp_stages/downscale_rvv.c
+++ b/risp4ml/isp_stages/downscale_rvv.c
@@ -66,95 +66,96 @@
     uint8_t* out = output->data + c;
 
     for (size_t i = 0; i < n; i += vl) {
-      vl = vsetvl_e16m4(n - i);
-      vid = vid_v_u32m8(vl);
-      vid = vadd(vid, i, vl);
+      vl = __riscv_vsetvl_e16m4(n - i);
+      vid = __riscv_vid_v_u32m8(vl);
+      vid = __riscv_vadd(vid, i, vl);
 
-      vy = vdivu(vid, output->width, vl);
-      vx = vremu(vid, output->width, vl);
+      vy = __riscv_vdivu(vid, output->width, vl);
+      vx = __riscv_vremu(vid, output->width, vl);
 
       // find 4 neighboring points
       // four neighboring coordinates are
       // [y_l, x_l], [y_l, x_h], [y_h, x_l], [y_h, x_h]
-      vx_l = vmul(vx, input_w_1, vl);
-      vx_l = vdivu(vx_l, w_1, vl);
-      vx_h = vadd(vx_l, 1, vl);
-      vx_h = vminu(vx_h, input_w_1, vl);  // clamp
+      vx_l = __riscv_vmul(vx, input_w_1, vl);
+      vx_l = __riscv_vdivu(vx_l, w_1, vl);
+      vx_h = __riscv_vadd(vx_l, 1, vl);
+      vx_h = __riscv_vminu(vx_h, input_w_1, vl);  // clamp
 
-      vy_l = vmul(vy, input_h_1, vl);
-      vy_l = vdivu(vy_l, h_1, vl);
-      vy_h = vadd(vy_l, 1, vl);
-      vy_h = vminu(vy_h, input_h_1, vl);  // clamp
+      vy_l = __riscv_vmul(vy, input_h_1, vl);
+      vy_l = __riscv_vdivu(vy_l, h_1, vl);
+      vy_h = __riscv_vadd(vy_l, 1, vl);
+      vy_h = __riscv_vminu(vy_h, input_h_1, vl);  // clamp
 
       // load values of four neighboring points: A, B, C, D
-      vz = vmul(vy_l, input_width, vl);
-      vz = vadd(vz, vx_l, vl);
-      vz = vsll(vz, 1, vl);  // *2
-      vo_16b = vluxei32(in, vz, vl);
-      va = vwaddu_vx(vo_16b, 0, vl);
+      vz = __riscv_vmul(vy_l, input_width, vl);
+      vz = __riscv_vadd(vz, vx_l, vl);
+      vz = __riscv_vsll(vz, 1, vl);  // *2
+      vo_16b = __riscv_vluxei32(in, vz, vl);
+      va = __riscv_vwaddu_vx(vo_16b, 0, vl);
 
-      vz = vmul(vy_l, input_width, vl);
-      vz = vadd(vz, vx_h, vl);
-      vz = vsll(vz, 1, vl);  // *2
-      vo_16b = vluxei32(in, vz, vl);
-      vb = vwaddu_vx(vo_16b, 0, vl);
+      vz = __riscv_vmul(vy_l, input_width, vl);
+      vz = __riscv_vadd(vz, vx_h, vl);
+      vz = __riscv_vsll(vz, 1, vl);  // *2
+      vo_16b = __riscv_vluxei32(in, vz, vl);
+      vb = __riscv_vwaddu_vx(vo_16b, 0, vl);
 
-      vz = vmul(vy_h, input_width, vl);
-      vz = vadd(vz, vx_l, vl);
-      vz = vsll(vz, 1, vl);  // *2
-      vo_16b = vluxei32(in, vz, vl);
-      vc = vwaddu_vx(vo_16b, 0, vl);
+      vz = __riscv_vmul(vy_h, input_width, vl);
+      vz = __riscv_vadd(vz, vx_l, vl);
+      vz = __riscv_vsll(vz, 1, vl);  // *2
+      vo_16b = __riscv_vluxei32(in, vz, vl);
+      vc = __riscv_vwaddu_vx(vo_16b, 0, vl);
 
-      vz = vmul(vy_h, input_width, vl);
-      vz = vadd(vz, vx_h, vl);
-      vz = vsll(vz, 1, vl);  // *2
-      vo_16b = vluxei32(in, vz, vl);
-      vd = vwaddu_vx(vo_16b, 0, vl);
+      vz = __riscv_vmul(vy_h, input_width, vl);
+      vz = __riscv_vadd(vz, vx_h, vl);
+      vz = __riscv_vsll(vz, 1, vl);  // *2
+      vo_16b = __riscv_vluxei32(in, vz, vl);
+      vd = __riscv_vwaddu_vx(vo_16b, 0, vl);
 
       // compute weights of four neighboring points: wx, wy, 1-wx, 1-wy
-      vp = vmul(vx, input_w_1, vl);
-      vq = vmul(vx_l, w_1, vl);
-      vp = vssubu(vp, vq, vl);
-      vp = vsll(vp, params.scale_precision, vl);
-      vx_weight = vdivu(vp, w_1, vl);
+      vp = __riscv_vmul(vx, input_w_1, vl);
+      vq = __riscv_vmul(vx_l, w_1, vl);
+      vp = __riscv_vssubu(vp, vq, vl);
+      vp = __riscv_vsll(vp, params.scale_precision, vl);
+      vx_weight = __riscv_vdivu(vp, w_1, vl);
 
-      vp = vmul(vy, input_h_1, vl);
-      vq = vmul(vy_l, h_1, vl);
-      vp = vssubu(vp, vq, vl);
-      vp = vsll(vp, params.scale_precision, vl);
-      vy_weight = vdivu(vp, h_1, vl);
+      vp = __riscv_vmul(vy, input_h_1, vl);
+      vq = __riscv_vmul(vy_l, h_1, vl);
+      vp = __riscv_vssubu(vp, vq, vl);
+      vp = __riscv_vsll(vp, params.scale_precision, vl);
+      vy_weight = __riscv_vdivu(vp, h_1, vl);
 
-      vx_weight_1minus = vrsub(vx_weight, params.scale_fixed_one, vl);
-      vy_weight_1minus = vrsub(vy_weight, params.scale_fixed_one, vl);
+      vx_weight_1minus = __riscv_vrsub(vx_weight, params.scale_fixed_one, vl);
+      vy_weight_1minus = __riscv_vrsub(vy_weight, params.scale_fixed_one, vl);
 
       // Bilinear Interpolation Formula:
       // out = A*(1-wx)*(1-wy) + B*wx*(1-wy)
       //     + C*(1-wx)*wy + D*wx*wy
-      vo = vmul(va, vx_weight_1minus, vl);
-      vo = vsrl(vo, params.scale_precision, vl);
-      vo = vmul(vo, vy_weight_1minus, vl);
+      vo = __riscv_vmul(va, vx_weight_1minus, vl);
+      vo = __riscv_vsrl(vo, params.scale_precision, vl);
+      vo = __riscv_vmul(vo, vy_weight_1minus, vl);
 
-      vp = vmul(vb, vx_weight, vl);
-      vp = vsrl(vp, params.scale_precision, vl);
-      vp = vmul(vp, vy_weight_1minus, vl);
-      vo = vadd(vo, vp, vl);
+      vp = __riscv_vmul(vb, vx_weight, vl);
+      vp = __riscv_vsrl(vp, params.scale_precision, vl);
+      vp = __riscv_vmul(vp, vy_weight_1minus, vl);
+      vo = __riscv_vadd(vo, vp, vl);
 
-      vp = vmul(vc, vx_weight_1minus, vl);
-      vp = vsrl(vp, params.scale_precision, vl);
-      vp = vmul(vp, vy_weight, vl);
-      vo = vadd(vo, vp, vl);
+      vp = __riscv_vmul(vc, vx_weight_1minus, vl);
+      vp = __riscv_vsrl(vp, params.scale_precision, vl);
+      vp = __riscv_vmul(vp, vy_weight, vl);
+      vo = __riscv_vadd(vo, vp, vl);
 
-      vp = vmul(vd, vx_weight, vl);
-      vp = vsrl(vp, params.scale_precision, vl);
-      vp = vmul(vp, vy_weight, vl);
-      vo = vadd(vo, vp, vl);
+      vp = __riscv_vmul(vd, vx_weight, vl);
+      vp = __riscv_vsrl(vp, params.scale_precision, vl);
+      vp = __riscv_vmul(vp, vy_weight, vl);
+      vo = __riscv_vadd(vo, vp, vl);
 
       // bit shift from 32bits to 8bits
-      vo_16b = vnsrl(vo, params.scale_precision, vl);
-      vo_8b = vnsrl(vo_16b, kRawPipelineBpp - kPipeOutputBpp, vl);
+      vo_16b = __riscv_vnsrl(vo, params.scale_precision, vl);
+      vo_8b = __riscv_vnsrl(vo_16b, kRawPipelineBpp - kPipeOutputBpp, vl);
 
       // save
-      vsse8(out + i * output->num_channels, output->num_channels, vo_8b, vl);
+      __riscv_vsse8(out + i * output->num_channels, output->num_channels, vo_8b,
+                    vl);
     }
   }
 }
diff --git a/risp4ml/isp_stages/gamma_rvv.c b/risp4ml/isp_stages/gamma_rvv.c
index e04025d..dab4f91 100644
--- a/risp4ml/isp_stages/gamma_rvv.c
+++ b/risp4ml/isp_stages/gamma_rvv.c
@@ -247,23 +247,23 @@
   for (size_t i = 0; i < n; i += vl) {
     x = img->data + i;
 
-    vl = vsetvl_e16m8(n - i);
-    vx = vle16_v_u16m8(x, vl);  // load
-    mask = vmsne(vx, kRgbPipelineMaxVal,
-                 vl);  // mask to exclude kRgbPipelineMaxVal
+    vl = __riscv_vsetvl_e16m8(n - i);
+    vx = __riscv_vle16_v_u16m8(x, vl);  // load
+    mask = __riscv_vmsne(vx, kRgbPipelineMaxVal,
+                         vl);  // mask to exclude kRgbPipelineMaxVal
 
-    vy = vsrl(vx, kGammaShiftBits, vl);          // 1/32
-    vy = vsll(vy, 1, vl);                        // *2
-    v_offset = vand(vx, kGammaSpacing - 1, vl);  // offset within bin
+    vy = __riscv_vsrl(vx, kGammaShiftBits, vl);          // 1/32
+    vy = __riscv_vsll(vy, 1, vl);                        // *2
+    v_offset = __riscv_vand(vx, kGammaSpacing - 1, vl);  // offset within bin
 
-    vx = vluxei16(gamma_params.lut, vy, vl);      // left
-    vy = vluxei16(gamma_params.lut + 1, vy, vl);  // right
+    vx = __riscv_vluxei16(gamma_params.lut, vy, vl);      // left
+    vy = __riscv_vluxei16(gamma_params.lut + 1, vy, vl);  // right
 
-    vy = vsub(vy, vx, vl);               // right - left
-    vy = vmul(vy, v_offset, vl);         // multiply offset_within_bin
-    vy = vsrl(vy, kGammaShiftBits, vl);  // 1/32
-    vx = vadd(vx, vy, vl);               // add
+    vy = __riscv_vsub(vy, vx, vl);               // right - left
+    vy = __riscv_vmul(vy, v_offset, vl);         // multiply offset_within_bin
+    vy = __riscv_vsrl(vy, kGammaShiftBits, vl);  // 1/32
+    vx = __riscv_vadd(vx, vy, vl);               // add
 
-    vse16(mask, x, vx, vl);  // save
+    __riscv_vse16(mask, x, vx, vl);  // save
   }
 }
diff --git a/risp4ml/isp_stages/wbg_rvv.c b/risp4ml/isp_stages/wbg_rvv.c
index 1bc1187..3374de6 100644
--- a/risp4ml/isp_stages/wbg_rvv.c
+++ b/risp4ml/isp_stages/wbg_rvv.c
@@ -48,12 +48,13 @@
     for (uint8_t n = 0; n < 2; n++) {
       for (uint16_t x = n; x < img->width; x += 2 * vl) {
         size_t avl = (img->width + 1 - x) / 2;
-        vl = vsetvl_e16m8(avl);
-        vx = vlse16_v_u16m8(line + x, 2 * sizeof(uint16_t), vl);  // load
+        vl = __riscv_vsetvl_e16m8(avl);
+        vx =
+            __riscv_vlse16_v_u16m8(line + x, 2 * sizeof(uint16_t), vl);  // load
 
-        vy = vmv_v_x_u32m1(0, vl);       // init
-        vy = vwredsumu(vy, vx, vy, vl);  // sum
-        uint32_t sum = vmv_x(vy);
+        vy = __riscv_vmv_v_x_u32m1(0, vl);   // init
+        vy = __riscv_vwredsumu(vx, vy, vl);  // sum
+        uint32_t sum = __riscv_vmv_x(vy);
         if ((y & 0x1) == 0 && n == 0) {
           sum_of_reds += sum;
         } else if ((y & 0x1) == 1 && n == 1) {
@@ -108,13 +109,14 @@
       gain = (y & 0x1) ? wbg_params.gains[2 + n] : wbg_params.gains[n];
       for (uint16_t x = n; x < img->width; x += 2 * vl) {
         size_t avl = (img->width + 1 - x) / 2;
-        vl = vsetvl_e16m4(avl);
-        vx = vlse16_v_u16m4(line + x, 2 * sizeof(uint16_t), vl);  // load
-        vy = vwmulu(vx, gain, vl);                                // multiply
-        vy = vadd(vy, offset, vl);                                // add
-        vy = vminu(vy, max_val, vl);                              // clamp
-        vx = vnsrl(vy, kWbgFractional, vl);                       // bit shift
-        vsse16(line + x, 2 * sizeof(uint16_t), vx, vl);           // save
+        vl = __riscv_vsetvl_e16m4(avl);
+        vx =
+            __riscv_vlse16_v_u16m4(line + x, 2 * sizeof(uint16_t), vl);  // load
+        vy = __riscv_vwmulu(vx, gain, vl);                       // multiply
+        vy = __riscv_vadd(vy, offset, vl);                       // add
+        vy = __riscv_vminu(vy, max_val, vl);                     // clamp
+        vx = __riscv_vnsrl(vy, kWbgFractional, vl);              // bit shift
+        __riscv_vsse16(line + x, 2 * sizeof(uint16_t), vx, vl);  // save
       }
     }
   }
diff --git a/vmvx_ukernel/elementwise.c b/vmvx_ukernel/elementwise.c
index c568dbb..354e7e8 100644
--- a/vmvx_ukernel/elementwise.c
+++ b/vmvx_ukernel/elementwise.c
@@ -143,53 +143,53 @@
                                 iree_uk_index_t out_stride, size_t vl) {
   iree_uk_x32b_opcode_type_t op_type = get_iree_uk_x32b_op_type(opcode);
   if (op_type == IREE_UK_X32B_UI) {
-    vuint32m8_t vx = vlse32_v_u32m8(lhs, lhs_stride, vl);  // load
-    vuint32m8_t vy = vlse32_v_u32m8(rhs, rhs_stride, vl);  // load
+    vuint32m8_t vx = __riscv_vlse32_v_u32m8(lhs, lhs_stride, vl);  // load
+    vuint32m8_t vy = __riscv_vlse32_v_u32m8(rhs, rhs_stride, vl);  // load
     switch (opcode) {
       case IREE_UK_X32B_ADDI:
-        vx = vadd(vx, vy, vl);
+        vx = __riscv_vadd(vx, vy, vl);
         break;
       case IREE_UK_X32B_ANDI:
-        vx = vand(vx, vy, vl);
+        vx = __riscv_vand(vx, vy, vl);
         break;
       case IREE_UK_X32B_DIVUI:
-        vx = vdivu(vx, vy, vl);
+        vx = __riscv_vdivu(vx, vy, vl);
         break;
       case IREE_UK_X32B_MULI:
-        vx = vmul(vx, vy, vl);
+        vx = __riscv_vmul(vx, vy, vl);
         break;
       case IREE_UK_X32B_ORI:
-        vx = vor(vx, vy, vl);
+        vx = __riscv_vor(vx, vy, vl);
         break;
       case IREE_UK_X32B_SHLI:
-        vx = vsll(vx, vy, vl);
+        vx = __riscv_vsll(vx, vy, vl);
         break;
       case IREE_UK_X32B_SHRUI:
-        vx = vsrl(vx, vy, vl);
+        vx = __riscv_vsrl(vx, vy, vl);
         break;
       case IREE_UK_X32B_XORI:
-        vx = vor(vx, vy, vl);
+        vx = __riscv_vxor(vx, vy, vl);
         break;
       case IREE_UK_X32B_SUBI:
-        vx = vsub(vx, vy, vl);
+        vx = __riscv_vsub(vx, vy, vl);
         break;
       default:
         *result_code = 1;
     }
-    vsse32(out, out_stride, vx, vl);  // save
+    __riscv_vsse32(out, out_stride, vx, vl);  // save
   } else if (op_type == IREE_UK_X32B_SI) {
     vint32m8_t vx =
-        vlse32_v_i32m8((iree_uk_int32_t*)lhs, lhs_stride, vl);  // load
+        __riscv_vlse32_v_i32m8((iree_uk_int32_t*)lhs, lhs_stride, vl);  // load
     vint32m8_t vy =
-        vlse32_v_i32m8((iree_uk_int32_t*)rhs, rhs_stride, vl);  // load
+        __riscv_vlse32_v_i32m8((iree_uk_int32_t*)rhs, rhs_stride, vl);  // load
     switch (opcode) {
       case IREE_UK_X32B_DIVSI:
-        vx = vdiv(vx, vy, vl);
+        vx = __riscv_vdiv(vx, vy, vl);
         break;
       default:
         *result_code = 1;
     }
-    vsse32((iree_uk_int32_t*)out, out_stride, vx, vl);  // save
+    __riscv_vsse32((iree_uk_int32_t*)out, out_stride, vx, vl);  // save
   } else {
     *result_code = 1;
   }
@@ -316,7 +316,7 @@
     if (size0 < size1) {
       for (iree_uk_index_t i = 0; i < size0; ++i) {
         for (iree_uk_index_t j = 0; j < size1; j += vl) {
-          vl = vsetvl_e32m8(size1 - j);
+          vl = __riscv_vsetvl_e32m8(size1 - j);
           iree_uk_rvv_x32b_op(opcode, &result_code,
                               &lhs[i * lhs_stride0 + j * lhs_stride1],
                               lhs_stride1 * sizeof(uint32_t),
@@ -329,7 +329,7 @@
     } else {
       for (iree_uk_index_t j = 0; j < size1; ++j) {
         for (iree_uk_index_t i = 0; i < size0; i += vl) {
-          vl = vsetvl_e32m8(size0 - i);
+          vl = __riscv_vsetvl_e32m8(size0 - i);
           iree_uk_rvv_x32b_op(opcode, &result_code,
                               &lhs[i * lhs_stride0 + j * lhs_stride1],
                               lhs_stride0 * sizeof(uint32_t),
diff --git a/vmvx_ukernel/mmt4d_tile.c b/vmvx_ukernel/mmt4d_tile.c
index 28d8145..7f00451 100644
--- a/vmvx_ukernel/mmt4d_tile.c
+++ b/vmvx_ukernel/mmt4d_tile.c
@@ -29,13 +29,13 @@
   vint32m1_t v_sum;
   iree_uk_int32_t sum = 0;
   for (size_t i = 0; i < n; i += vl) {
-    vl = vsetvl_e8m4(n - i);
-    vu = vle8_v_i8m4(u + i, vl);             // load
-    vw = vle8_v_i8m4(w + i, vl);             // load
-    vx = vwmul(vu, vw, vl);                  // multiply
-    v_sum = vmv_s(v_sum, 0, vl);             // init
-    v_sum = vwredsum(v_sum, vx, v_sum, vl);  // sum
-    sum += vmv_x(v_sum);
+    vl = __riscv_vsetvl_e8m4(n - i);
+    vu = __riscv_vle8_v_i8m4(u + i, vl);      // load
+    vw = __riscv_vle8_v_i8m4(w + i, vl);      // load
+    vx = __riscv_vwmul(vu, vw, vl);           // multiply
+    v_sum = __riscv_vmv_v_x_i32m1(0, vl);     // init
+    v_sum = __riscv_vwredsum(vx, v_sum, vl);  // sum
+    sum += __riscv_vmv_x(v_sum);
   }
   return sum;
 }