| /* |
| * Copyright 2023 Google LLC |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "tests/cv/gaussian.h" |
| |
#include <cstdint>
#include <cstdio>
#include <cstdlib>
| |
| #include "crt/kelvin.h" |
| |
// Note: the separable kernel is applied vertical pass first, then horizontal.
// Running H then V instead, with the intermediate horizontal result retained
// across rows, may reduce compute further.
| |
| namespace kelvin::cv { |
| |
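// Vertical pass of the 5x5 binomial Gaussian: applies the 1-4-6-4-1 taps down
// five input rows with widening (uint16 -> uint32) arithmetic. The widening
// ops split even and odd lanes into consecutive destination registers, so
// output0 receives the even columns and output1 the odd columns, each
// vl_input / 2 elements per iteration.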
| static void GaussianVerticalKernel(int num_cols, const uint16_t* input0, |
| const uint16_t* input1, |
| const uint16_t* input2, |
| const uint16_t* input3, |
| const uint16_t* input4, bool is_stripmine, |
| uint32_t* output0, uint32_t* output1) { |
| uint32_t vl_input, vl_output; |
| while (num_cols > 0) { |
| if (is_stripmine) { |
| getvl_h_x_m(vl_input, num_cols); |
| num_cols -= vl_input; |
| vl_output = vl_input / 2; |
| vld_h_lp_xx_m(vm8, input0, vl_input); |
| vld_h_lp_xx_m(vm12, input4, vl_input); |
| vld_h_lp_xx_m(vm9, input1, vl_input); |
| vld_h_lp_xx_m(vm10, input2, vl_input); |
| vld_h_lp_xx_m(vm11, input3, vl_input); |
| |
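      // The widening ops write even lanes to the named register group and odd
      // lanes to the following one (vm0/vm1, vm2/vm3, ...). vadd3 accumulates
      // dest += src1 + src2, yielding row0 + 4*row1 + 6*row2 + 4*row3 + row4
      // per lane.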
| vaddw_w_u_vv_m(vm0, vm8, vm12); |
| vmulw_w_u_vx_m(vm2, vm9, 4); |
| vmulw_w_u_vx_m(vm4, vm10, 6); |
| vmulw_w_u_vx_m(vm6, vm11, 4); |
| |
| vadd3_w_vv_m(vm0, vm2, vm4); |
| vadd3_w_vv_m(vm1, vm3, vm5); |
| vadd_w_vv_m(vm0, vm0, vm6); |
| vadd_w_vv_m(vm1, vm1, vm7); |
| |
| vst_w_lp_xx_m(vm0, output0, vl_output); |
| vst_w_lp_xx_m(vm1, output1, vl_output); |
| } else { |
| getvl_h_x(vl_input, num_cols); |
| num_cols -= vl_input; |
| vl_output = vl_input / 2; |
| vld_h_lp_xx(v10, input0, vl_input); |
| vld_h_lp_xx(v11, input1, vl_input); |
| vld_h_lp_xx(v12, input2, vl_input); |
| vld_h_lp_xx(v13, input3, vl_input); |
| vld_h_lp_xx(v14, input4, vl_input); |
| |
| vaddw_w_u_vv(v16, v10, v14); |
| vmulw_w_u_vx(v18, v11, 4); |
| vmulw_w_u_vx(v20, v12, 6); |
| vmulw_w_u_vx(v22, v13, 4); |
| |
| vadd3_w_vv(v16, v18, v20); |
| vadd3_w_vv(v17, v19, v21); |
| vadd_w_vv(v16, v16, v22); |
| vadd_w_vv(v17, v17, v23); |
| |
| vst_w_lp_xx(v16, output0, vl_output); |
| vst_w_lp_xx(v17, output1, vl_output); |
| } |
| } |
| } |
| |
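// Horizontal pass: consumes the de-interleaved even (input0) and odd (input1)
// column streams, forms the -1/+1 column neighbors with register slides,
// applies the 1-4-6-4-1 taps, then rounds and narrows back to uint16. Both
// inputs must carry one element of replicated border on each side and enough
// left padding for the priming tile load (see gaussian() below).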
| static void GaussianHorizontalKernel(int num_cols, const uint32_t* input0, |
| const uint32_t* input1, bool is_stripmine, |
| uint16_t* output) { |
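  // Register aliases: PREV/CURR/NEXT form a sliding window of vector tiles;
  // P*/N* are CURR shifted by one element toward the previous/next column;
  // Rm*/R*/T* are accumulators and SN holds the narrowed result.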
| #define PREV0 vm8 |
| #define PREV1 vm9 |
| #define CURR0 vm10 |
| #define CURR1 vm11 |
| #define NEXT0 vm12 |
| #define NEXT1 vm13 |
| #define P0 vm4 |
| #define P1 vm5 |
| #define N0 vm6 |
| #define N1 vm7 |
| #define SN vm14 |
| |
| #define Rm0 vm0 |
| #define Rm1 vm1 |
| #define R0 v4 |
| #define R1 v5 |
| #define T0 vm2 |
| #define T1 vm3 |
| |
| uint32_t vl_input, vl_output; |
| |
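  // Prime the window: PREV is loaded from one tile before input0/input1,
  // inside the padding allocated by the caller; only its last element is
  // consumed by the first slide. The _p_ loads post-increment the pointers.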
| if (is_stripmine) { |
| getmaxvl_w_m(vl_input); |
| |
| vld_w_x_m(PREV0, input0 - vl_input); |
| vld_w_x_m(PREV1, input1 - vl_input); |
| vld_w_p_x_m(CURR0, input0); |
| vld_w_p_x_m(CURR1, input1); |
| } else { |
| getmaxvl_w(vl_input); |
| |
| vld_w_x(PREV0, input0 - vl_input); |
| vld_w_x(PREV1, input1 - vl_input); |
| vld_w_p_x(CURR0, input0); |
| vld_w_p_x(CURR1, input1); |
| } |
| |
| while (num_cols > 0) { |
| if (is_stripmine) { |
| getvl_h_x_m(vl_output, num_cols); |
| num_cols -= vl_output; |
| |
| vld_w_p_x_m(NEXT0, input0); |
| vld_w_p_x_m(NEXT1, input1); |
| |
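      // Slide across the PREV:CURR and CURR:NEXT pairs to form CURR's -1 (P*)
      // and +1 (N*) element neighbors.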
| vslidehp_w_1_vv_m(P0, PREV0, CURR0); |
| vslidehp_w_1_vv_m(P1, PREV1, CURR1); |
| vslidehn_w_1_vv_m(N0, CURR0, NEXT0); |
| vslidehn_w_1_vv_m(N1, CURR1, NEXT1); |
| |
      // Even/odd 1-4-6-4-1 taps, with T0/T1 as additional accumulators to
      // shorten the dependency chains:
      //   even: Rm0 = P0 + 4*P1 + 6*CURR0 + 4*CURR1 + N0
      //   odd:  Rm1 = P1 + 4*CURR0 + 6*CURR1 + 4*N0 + N1
| vmul_w_vx_m(Rm0, P1, 4); |
| vmul_w_vx_m(Rm1, CURR0, 4); |
| vadd_w_vv_m(T0, P0, N0); |
| vadd_w_vv_m(T1, P1, N1); |
| vmacc_w_vx_m(Rm0, CURR0, 6); |
| vmacc_w_vx_m(Rm1, CURR1, 6); |
| vmacc_w_vx_m(T0, CURR1, 4); |
| vmacc_w_vx_m(T1, N0, 4); |
| vadd_w_vv_m(Rm0, Rm0, T0); |
| vadd_w_vv_m(Rm1, Rm1, T1); |
| |
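      // The vertical and horizontal 1-4-6-4-1 passes each scale by 16, so
      // round-shift right by 8 while narrowing the (Rm0, Rm1) pair back to
      // saturated unsigned halfwords.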
| vsransu_h_r_vx_m(SN, Rm0, 8); |
| |
| vst_h_lp_xx_m(SN, output, vl_output); |
| |
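      // Advance the sliding window by one tile.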
| vmv_v_m(PREV0, CURR0); |
| vmv_v_m(PREV1, CURR1); |
| vmv_v_m(CURR0, NEXT0); |
| vmv_v_m(CURR1, NEXT1); |
| } else { |
| getvl_h_x(vl_output, num_cols); |
| num_cols -= vl_output; |
| |
| vld_w_p_x(NEXT0, input0); |
| vld_w_p_x(NEXT1, input1); |
| |
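      // Slide to form the -1 (P*) and +1 (N*) element neighbors of CURR.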
| vslidep_w_1_vv(P0, PREV0, CURR0); |
| vslidep_w_1_vv(P1, PREV1, CURR1); |
| vsliden_w_1_vv(N0, CURR0, NEXT0); |
| vsliden_w_1_vv(N1, CURR1, NEXT1); |
| |
      // even: R0 = P0 + 4*P1 + 6*CURR0 + 4*CURR1 + N0
| vadd_w_vv(R0, P0, N0); |
| vmacc_w_vx(R0, P1, 4); |
| vmacc_w_vx(R0, CURR0, 6); |
| vmacc_w_vx(R0, CURR1, 4); |
| |
      // odd: R1 = P1 + 4*CURR0 + 6*CURR1 + 4*N0 + N1
| vadd_w_vv(R1, P1, N1); |
| vmacc_w_vx(R1, CURR0, 4); |
| vmacc_w_vx(R1, CURR1, 6); |
| vmacc_w_vx(R1, N0, 4); |
| |
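      // R1 sits in the register after R0 (v5 follows v4), so narrowing the
      // pair re-interleaves the even/odd results; round-shift by 8 undoes the
      // combined 16 * 16 kernel gain.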
| vsransu_h_r_vx(SN, R0, 8); |
| |
| vst_h_lp_xx(SN, output, vl_output); |
| |
| vmv_v(PREV0, CURR0); |
| vmv_v(PREV1, CURR1); |
| vmv_v(CURR0, NEXT0); |
| vmv_v(CURR1, NEXT1); |
| } |
| } |
| } |
| |
#define ARRAYSIZE(x) (sizeof(x) / sizeof((x)[0]))
| |
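// Computes one output row of a 5x5 binomial Gaussian blur from five input
// rows. The vertical pass widens to uint32 and de-interleaves even/odd
// columns; the horizontal pass re-interleaves, rounds, and narrows to uint16.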
| void gaussian(int num_cols, const uint16_t* input0_row, |
| const uint16_t* input1_row, const uint16_t* input2_row, |
| const uint16_t* input3_row, const uint16_t* input4_row, |
| bool is_stripmine, uint16_t* output_row) { |
| int vlenw; |
| getmaxvl_w(vlenw); |
  const int interleave_num = num_cols / 2 - 1;  // last index of each even/odd half-row
  // Intermediate buffers hold one de-interleaved half-row each (up to 1024
  // elements), padded by vlenw words on both sides for the horizontal
  // kernel's priming tile load and the border replication below.
  uint32_t temp0_data_unpadded[1024 + 2 * vlenw] __attribute__((aligned(64)));
  uint32_t temp1_data_unpadded[1024 + 2 * vlenw] __attribute__((aligned(64)));
  uint32_t* temp0_data = temp0_data_unpadded + vlenw;
  uint32_t* temp1_data = temp1_data_unpadded + vlenw;
| |
  // The vertical kernel fills indices [0, interleave_num] of each buffer, and
  // the border replication below touches indices -1 and interleave_num + 1,
  // so validate the padded buffers before writing them.
  if (temp0_data <= temp0_data_unpadded ||
      (temp0_data - temp0_data_unpadded) + interleave_num + 1 >=
          ARRAYSIZE(temp0_data_unpadded)) {
    printf("**error**: temp0_data out of bound\n");
    exit(1);
  }
  if (temp1_data <= temp1_data_unpadded ||
      (temp1_data - temp1_data_unpadded) + interleave_num + 1 >=
          ARRAYSIZE(temp1_data_unpadded)) {
    printf("**error**: temp1_data out of bound\n");
    exit(1);
  }

  GaussianVerticalKernel(num_cols, input0_row, input1_row, input2_row,
                         input3_row, input4_row, is_stripmine, temp0_data,
                         temp1_data);
| temp0_data[-1] = temp0_data[0]; |
| temp1_data[-1] = temp0_data[0]; |
| temp0_data[interleave_num + 1] = temp1_data[interleave_num]; |
| temp1_data[interleave_num + 1] = temp1_data[interleave_num]; |
| GaussianHorizontalKernel(num_cols, temp0_data, temp1_data, is_stripmine, |
| output_row); |
| } |
| |
}  // namespace kelvin::cv