| // Copyright 2023 Google LLC |
| // Licensed under the Apache License, Version 2.0, see LICENSE for details. |
| // SPDX-License-Identifier: Apache-2.0 |
| |
| #include "tests/cv/extrema.h" |
| |
| #include <cstdint> |
| |
| #include "crt/kelvin.h" |
| |
| namespace kelvin::cv { |
| |
| void extrema(int num_cols, const int16_t* input[4][3], uint8_t* output0, |
| uint8_t* output1) { |
| #define prev00 v0 |
| #define prev01 v1 |
| #define prev02 v2 |
| #define prev10 v3 |
| #define prev11 v4 |
| #define prev12 v5 |
| #define prev20 v6 |
| #define prev21 v7 |
| #define prev22 v8 |
| #define prev30 v9 |
| #define prev31 v10 |
| #define prev32 v11 |
| #define curr00 v12 |
| #define curr01 v13 |
| #define curr02 v14 |
| #define curr10 v15 |
| #define curr11 v16 |
| #define curr12 v17 |
| #define curr20 v18 |
| #define curr21 v19 |
| #define curr22 v20 |
| #define curr30 v21 |
| #define curr31 v22 |
| #define curr32 v23 |
| #define next00 v24 |
| #define next01 v25 |
| #define next02 v26 |
| #define next10 v27 |
| #define next11 v28 |
| #define next12 v29 |
| #define next20 v30 |
| #define next21 v31 |
| #define next22 v32 |
| #define next30 v33 |
| #define next31 v34 |
| #define next32 v35 |
| #define elem v36 |
| #define tmin0 v37 |
| #define tmax0 v38 |
| #define tmin1 v39 |
| #define tmax1 v40 |
| #define rmin v41 |
| #define rmax v42 |
| #define value0 v43 |
| #define value1 v44 |
| #define result0 v45 |
| #define result1 v46 |
| |
| int16_t* ptr0 = const_cast<int16_t*>(input[0][0]); |
| int16_t* ptr1 = const_cast<int16_t*>(input[0][1]); |
| int16_t* ptr2 = const_cast<int16_t*>(input[0][2]); |
| int16_t* ptr3 = const_cast<int16_t*>(input[1][0]); |
| int16_t* ptr4 = const_cast<int16_t*>(input[1][1]); |
| int16_t* ptr5 = const_cast<int16_t*>(input[1][2]); |
| int16_t* ptr6 = const_cast<int16_t*>(input[2][0]); |
| int16_t* ptr7 = const_cast<int16_t*>(input[2][1]); |
| int16_t* ptr8 = const_cast<int16_t*>(input[2][2]); |
| int16_t* ptr9 = const_cast<int16_t*>(input[3][0]); |
| int16_t* ptra = const_cast<int16_t*>(input[3][1]); |
| int16_t* ptrb = const_cast<int16_t*>(input[3][2]); |
| |
| uint8_t* out0 = const_cast<uint8_t*>(output0); |
| uint8_t* out1 = const_cast<uint8_t*>(output1); |
| |
| vld_h_p_x(curr00, ptr0); |
| vld_h_p_x(curr01, ptr1); |
| vld_h_p_x(curr02, ptr2); |
| vld_h_p_x(curr10, ptr3); |
| vld_h_p_x(curr11, ptr4); |
| vld_h_p_x(curr12, ptr5); |
| vld_h_p_x(curr20, ptr6); |
| vld_h_p_x(curr21, ptr7); |
| vld_h_p_x(curr22, ptr8); |
| vld_h_p_x(curr30, ptr9); |
| vld_h_p_x(curr31, ptra); |
| vld_h_p_x(curr32, ptrb); |
| |
| int vlenh; |
| getmaxvl_h(vlenh); |
| |
| for (int i = 0; i < num_cols; i += vlenh) { |
| // Extrema compute. |
| #define minmax_p(param0, param1, param2) \ |
| vslidep_h_1_vv(elem, prev##param1##param2, curr##param1##param2); \ |
| vmin_h_vv(tmin##param0, tmin##param0, elem); \ |
| vmax_h_vv(tmax##param0, tmax##param0, elem); |
| |
| #define minmax_n(param0, param1, param2) \ |
| vsliden_h_1_vv(elem, curr##param1##param2, next##param1##param2); \ |
| vmin_h_vv(tmin##param0, tmin##param0, elem); \ |
| vmax_h_vv(tmax##param0, tmax##param0, elem); |
| |
| #define minmax_c(param0, param1, param2) \ |
| vmin_h_vv(tmin##param0, tmin##param0, prev##param1##param2); \ |
| vmax_h_vv(tmax##param0, tmax##param0, prev##param1##param2); |
| |
| // Common centers. |
| vmin_h_vv(tmin0, curr10, curr12); |
| vmax_h_vv(tmax0, curr10, curr12); |
| vmin_h_vv(tmin0, tmin0, curr20); |
| vmax_h_vv(tmax0, tmax0, curr20); |
| vmin_h_vv(tmin0, tmin0, curr22); |
| vmax_h_vv(tmax0, tmax0, curr22); |
| |
| // Common inner two layers. |
| vld_h_p_x(next10, ptr3); |
| vld_h_p_x(next11, ptr4); |
| vld_h_p_x(next12, ptr5); |
| minmax_p(0, 1, 0); |
| minmax_n(0, 1, 0); |
| minmax_p(0, 1, 1); |
| minmax_n(0, 1, 1); |
| minmax_p(0, 1, 2); |
| minmax_n(0, 1, 2); |
| vmv_v(prev10, curr10); |
| vmv_v(prev11, curr11); |
| vmv_v(prev12, curr12); |
| vmv_v(curr10, next10); |
| vmv_v(curr11, next11); |
| vmv_v(curr12, next12); |
| |
| vld_h_p_x(next20, ptr6); |
| vld_h_p_x(next21, ptr7); |
| vld_h_p_x(next22, ptr8); |
| minmax_p(0, 2, 0); |
| minmax_n(0, 2, 0); |
| minmax_p(0, 2, 1); |
| minmax_n(0, 2, 1); |
| minmax_p(0, 2, 2); |
| minmax_n(0, 2, 2); |
| vmv_v(prev20, curr20); |
| vmv_v(prev21, curr21); |
| vmv_v(prev22, curr22); |
| vmv_v(curr20, next20); |
| vmv_v(curr21, next21); |
| vmv_v(curr22, next22); |
| |
| // Shared state end. |
| vmv_v(tmax1, tmax0); |
| vmv_v(tmin1, tmin0); |
| |
| // [0,1,2] |
| vld_h_p_x(next00, ptr0); |
| vld_h_p_x(next01, ptr1); |
| vld_h_p_x(next02, ptr2); |
| minmax_p(0, 0, 0); |
| minmax_n(0, 0, 0); |
| minmax_p(0, 0, 1); |
| minmax_n(0, 0, 1); |
| minmax_p(0, 0, 2); |
| minmax_n(0, 0, 2); |
| vmv_v(prev00, curr00); |
| vmv_v(prev01, curr01); |
| vmv_v(prev02, curr02); |
| vmv_v(curr00, next00); |
| vmv_v(curr01, next01); |
| vmv_v(curr02, next02); |
| |
| minmax_c(0, 0, 0); |
| minmax_c(0, 0, 1); |
| minmax_c(0, 0, 2); |
| minmax_c(0, 2, 1); |
| |
| // [1,2,3] |
| vld_h_p_x(next30, ptr9); |
| vld_h_p_x(next31, ptra); |
| vld_h_p_x(next32, ptrb); |
| minmax_p(1, 3, 0); |
| minmax_n(1, 3, 0); |
| minmax_p(1, 3, 1); |
| minmax_n(1, 3, 1); |
| minmax_p(1, 3, 2); |
| minmax_n(1, 3, 2); |
| vmv_v(prev30, curr30); |
| vmv_v(prev31, curr31); |
| vmv_v(prev32, curr32); |
| vmv_v(curr30, next30); |
| vmv_v(curr31, next31); |
| vmv_v(curr32, next32); |
| |
| minmax_c(1, 1, 1); |
| minmax_c(1, 3, 0); |
| minmax_c(1, 3, 1); |
| minmax_c(1, 3, 2); |
| |
| // Compare center with min:max. |
| vmv_v(value0, prev11); |
| vmv_v(value1, prev21); |
| |
| vlt_h_vv(rmin, value0, tmin0); |
| vgt_h_vv(rmax, value0, tmax0); |
| vsll_h_vx(rmax, rmax, 1); |
| vor_vv(result0, rmax, rmin); |
| vevn_b_vv(result0, result0, result0); |
| vst_b_lp_xx(result0, out0, vlenh); |
| |
| vlt_h_vv(rmin, value1, tmin1); |
| vgt_h_vv(rmax, value1, tmax1); |
| vsll_h_vx(rmax, rmax, 1); |
| vor_vv(result1, rmax, rmin); |
| vevn_b_vv(result1, result1, result1); |
| vst_b_lp_xx(result1, out1, vlenh); |
| } |
| } |
| |
| }; // namespace kelvin::cv |