// blob: 95f96f3d0b6c0eb1ac84eee0b804c36a5bed0906 [file] [log] [blame]
// Copyright 2023 Google LLC
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#include "tests/cv/shift_gaussian.h"
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include "crt/kelvin.h"
// Note: the separable kernel is applied vertically first, then horizontally.
// Applying H then V, with the intermediate horizontal result retained, may
// reduce compute further.
namespace kelvin::cv {
// Vertical pass of the separable filter: combines five 8-bit input rows into
// one row of 16-bit sums, weighting the rows 1, 4, 6, 4, 1 (input0..input4).
//
//   num_cols      number of columns still to process; consumed in chunks of
//                 vl_input per loop iteration.
//   input0..4     the five source rows (8-bit samples).
//   is_stripmine  selects the `_m` intrinsic variants (register groups based
//                 at v0/v8/v16/...) instead of the single-register variants.
//   output        destination row of 16-bit results.
//
// NOTE(review): the intrinsic macros are defined in crt/kelvin.h, which is not
// visible here. The per-line descriptions below are inferred from the
// mnemonics and from how the results are consumed -- confirm against the ISA
// documentation before relying on them.
static void GaussianVerticalKernel(int num_cols, const uint8_t* input0,
const uint8_t* input1, const uint8_t* input2,
const uint8_t* input3, const uint8_t* input4,
bool is_stripmine, uint16_t* output) {
uint32_t vl_input, vl_output_0, vl_output_1;
while (num_cols > 0) {
if (is_stripmine) {
// vl_input: byte elements handled this iteration; vl_output_0: halfword
// elements for the first store. Both are presumably capped by num_cols.
getvl_b_x_m(vl_input, num_cols);
getvl_h_x_m(vl_output_0, num_cols);
num_cols -= vl_input;
// Whatever does not fit in the first halfword store goes to the second.
vl_output_1 = vl_input - vl_output_0;
// Load vl_input bytes from each row. Nothing else in this loop advances
// the row pointers, so the `_lp` loads presumably post-increment them.
vld_b_lp_xx_m(v32, input0, vl_input);
vld_b_lp_xx_m(v36, input1, vl_input);
vld_b_lp_xx_m(v40, input2, vl_input);
vld_b_lp_xx_m(v44, input3, vl_input);
vld_b_lp_xx_m(v48, input4, vl_input);
// Outer rows (weight 1): row0 + row4, presumably widened to 16 bits. The
// widened result is consumed below as two groups (v0 and v4).
vaddw_h_u_vv_m(v0, v32, v48);
// Inner rows scaled by 4, 6, 4; results consumed as v8/v12, v16/v20,
// v24/v28.
vmulw_h_u_vx_m(v8, v36, 4);
vmulw_h_u_vx_m(v16, v40, 6);
vmulw_h_u_vx_m(v24, v44, 4);
// Accumulate the scaled rows into v0 / v4 (vadd3 presumably computes
// dst += a + b), then add the remaining row-3 term.
vadd3_h_vv_m(v0, v8, v16);
vadd3_h_vv_m(v4, v12, v20);
vadd_h_vv_m(v0, v0, v24);
vadd_h_vv_m(v4, v4, v28);
// Presumably re-interleaves the two widened halves back into column order;
// the result is stored from v16 and then v20.
vzip_h_vv_m(v16, v0, v4);
vst_h_lp_xx_m(v16, output, vl_output_0);
vst_h_lp_xx_m(v20, output, vl_output_1);
} else {
// Same sequence as above using the single-register intrinsic variants and
// a different register assignment (v10..v14 inputs, v16..v23 temporaries).
getvl_b_x(vl_input, num_cols);
getvl_h_x(vl_output_0, num_cols);
num_cols -= vl_input;
vl_output_1 = vl_input - vl_output_0;
vld_b_lp_xx(v10, input0, vl_input);
vld_b_lp_xx(v11, input1, vl_input);
vld_b_lp_xx(v12, input2, vl_input);
vld_b_lp_xx(v13, input3, vl_input);
vld_b_lp_xx(v14, input4, vl_input);
// row0 + row4, consumed below as v16/v17.
vaddw_h_u_vv(v16, v10, v14);
// Rows 1..3 scaled by 4, 6, 4; consumed as v18/v19, v20/v21, v22/v23.
vmulw_h_u_vx(v18, v11, 4);
vmulw_h_u_vx(v20, v12, 6);
vmulw_h_u_vx(v22, v13, 4);
vadd3_h_vv(v16, v18, v20);
vadd3_h_vv(v17, v19, v21);
vadd_h_vv(v16, v16, v22);
vadd_h_vv(v17, v17, v23);
// Result is stored from v0 and then v1.
vzip_h_vv(v0, v16, v17);
vst_h_lp_xx(v0, output, vl_output_0);
vst_h_lp_xx(v1, output, vl_output_1);
}
}
}
// Horizontal pass of the separable filter: a 5-tap filter with weights
// 1, 4, 6, 4, 1 applied along a row of 16-bit values.
//
//   num_cols      number of output columns to produce.
//   input         row of 16-bit values. The kernel loads from
//                 `input - vl_input` before the loop and loads one further
//                 vector (NEXT) on every iteration, so the caller must provide
//                 readable memory on both sides of the row.
//   is_stripmine  selects the `_m` intrinsic variants.
//   output        destination row; vl_output elements are stored per iteration.
//
// NOTE(review): the intrinsic macros are defined in crt/kelvin.h, which is not
// visible here. The per-line descriptions are inferred from the mnemonics --
// confirm against the ISA documentation.
static void GaussianHorizontalKernel(int num_cols, const uint16_t* input,
bool is_stripmine, uint16_t* output) {
// Register aliases for the sliding window and the shifted taps. They are never
// #undef'd, so they remain defined for the rest of the translation unit.
#define PREV v32
#define CURR v40
#define NEXT v48
#define P2 v16
#define P1 v20
#define N1 v24
#define N2 v28
#define RS v0
uint32_t vl_input, vl_output;
// Prime the window: PREV is the vector immediately before `input`, CURR the
// first vector of the row. vl_input is only used for this priming offset.
if (is_stripmine) {
getmaxvl_h_m(vl_input);
vld_h_x_m(PREV, input - vl_input);
// The `_p` load presumably post-increments `input`; nothing else advances it.
vld_h_p_x_m(CURR, input);
} else {
getmaxvl_h(vl_input);
vld_h_x(PREV, input - vl_input);
vld_h_p_x(CURR, input);
}
while (num_cols > 0) {
if (is_stripmine) {
getvl_h_x_m(vl_output, num_cols);
num_cols -= vl_output;
vld_h_p_x_m(NEXT, input);
// Build the four neighbour taps from the window: P2/P1 from PREV:CURR and
// N1/N2 from CURR:NEXT (presumably the values 2 and 1 columns before / after
// each element of CURR).
vslidehp_h_2_vv_m(P2, PREV, CURR);
vslidehp_h_1_vv_m(P1, PREV, CURR);
vslidehn_h_1_vv_m(N1, CURR, NEXT);
vslidehn_h_2_vv_m(N2, CURR, NEXT);
// RS = P2 + N2 + 4*P1 + 6*CURR + 4*N1 (vmacc presumably multiply-accumulates
// into its first operand).
vadd_h_vv_m(RS, P2, N2);
vmacc_h_vx_m(RS, P1, 4);
vmacc_h_vx_m(RS, CURR, 6);
vmacc_h_vx_m(RS, N1, 4);
vst_h_lp_xx_m(RS, output, vl_output);
// Slide the window forward by one vector.
vmv_v_m(PREV, CURR);
vmv_v_m(CURR, NEXT);
} else {
// Same sequence with the single-register variants; note this branch uses
// vslidep/vsliden where the stripmine branch uses vslidehp/vslidehn.
getvl_h_x(vl_output, num_cols);
num_cols -= vl_output;
vld_h_p_x(NEXT, input);
vslidep_h_2_vv(P2, PREV, CURR);
vslidep_h_1_vv(P1, PREV, CURR);
vsliden_h_1_vv(N1, CURR, NEXT);
vsliden_h_2_vv(N2, CURR, NEXT);
vadd_h_vv(RS, P2, N2);
vmacc_h_vx(RS, P1, 4);
vmacc_h_vx(RS, CURR, 6);
vmacc_h_vx(RS, N1, 4);
vst_h_lp_xx(RS, output, vl_output);
vmv_v(PREV, CURR);
vmv_v(CURR, NEXT);
}
}
}
// Filters one output row with the separable 1-4-6-4-1 kernel: the five input
// rows are first combined vertically into a 16-bit scratch row, the scratch
// row is edge-replicated by two columns on each side, and the horizontal pass
// then writes the result to output_row.
//
//   num_cols        number of columns in each row; must not exceed the
//                   1024-column scratch capacity. Values <= 0 are a no-op.
//   input0_row..4   the five vertically adjacent 8-bit source rows.
//   is_stripmine    forwarded to both kernels to select the intrinsic variant.
//   output_row      destination for num_cols 16-bit results.
//
// Invalid sizes print an "**error**" message and terminate via exit(1).
void shift_gaussian(int num_cols, const uint8_t* input0_row,
                    const uint8_t* input1_row, const uint8_t* input2_row,
                    const uint8_t* input3_row, const uint8_t* input4_row,
                    bool is_stripmine, uint16_t* output_row) {
  // Column capacity of the scratch row, excluding the padding on either side.
  const int kMaxNumCols = 1024;

  // Nothing to filter; also avoids replicating from uninitialised scratch.
  if (num_cols <= 0) {
    return;
  }
  // The vertical kernel writes num_cols elements into the scratch row, so a
  // wider row would overflow the stack buffer.
  if (num_cols > kMaxNumCols) {
    printf("**error**: num_cols exceeds temp_data capacity\n");
    exit(1);
  }

  int vlenh;
  getmaxvl_h(vlenh);
  // Edge replication writes temp_data[-2] and temp_data[-1], so the leading
  // padding (vlenh elements) must hold at least two. Checked before the
  // buffer is sized or written.
  if (vlenh < 2) {
    printf("**error**: temp_data out of bound\n");
    exit(1);
  }

  const int r = num_cols - 1;
  // Scratch row with vlenh elements of padding before and after the data.
  // NOTE(review): GaussianHorizontalKernel loads a whole vector from
  // `input - vl` and one vector beyond the last one it consumes; if the
  // stripmine vector length is larger than vlenh those loads would fall
  // outside this padding -- confirm against the intrinsic definitions.
  uint16_t temp_data_unpadded[kMaxNumCols + 2 * vlenh]
      __attribute__((aligned(64)));
  uint16_t* temp_data = temp_data_unpadded + vlenh;

  GaussianVerticalKernel(num_cols, input0_row, input1_row, input2_row,
                         input3_row, input4_row, is_stripmine, temp_data);

  // Replicate the first and last column two positions outward so the 5-tap
  // horizontal filter has valid neighbours at both row ends.
  temp_data[-1] = temp_data[0];
  temp_data[-2] = temp_data[0];
  temp_data[r + 1] = temp_data[r];
  temp_data[r + 2] = temp_data[r];

  GaussianHorizontalKernel(num_cols, temp_data, is_stripmine, output_row);
}
}; // namespace kelvin::cv