blob: 6c6ea206591adb5f07f5122e9382b5b43c20c051 [file] [log] [blame]
// Copyright 2023 Google LLC
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#include "tests/cv/upsample.h"
#include <cstdint>
#include "crt/kelvin.h"
namespace kelvin::cv {
void upsample(int num_output_cols, uint16_t* input0_row, uint16_t* input1_row,
uint16_t* output0_row, uint16_t* output1_row) {
// Mirror the edges using padded input buffers.
{
const int r = (num_output_cols / 2) - 1;
input0_row[-1] = input0_row[0];
input1_row[-1] = input1_row[0];
input0_row[r + 1] = input0_row[r];
input1_row[r + 1] = input1_row[r];
}
int vlenh;
getmaxvl_h(vlenh);
uint16_t* __restrict input0 =
reinterpret_cast<uint16_t*>((input0_row - vlenh));
uint16_t* __restrict input1 =
reinterpret_cast<uint16_t*>((input1_row - vlenh));
uint16_t* __restrict output0 = reinterpret_cast<uint16_t*>(output0_row);
uint16_t* __restrict output1 = reinterpret_cast<uint16_t*>(output1_row);
#define prev0 v0
#define prev1 v1
#define curr0 v2
#define curr1 v3
#define next0 v4
#define next1 v5
#define c0 curr0
#define c1 curr1
#define p0 v6
#define p0_0 v6
#define p0_1 v7
#define p1 v8
#define p1_0 v8
#define p1_1 v9
#define n0 v10
#define n0_0 v10
#define n0_1 v11
#define n1 v12
#define n1_0 v12
#define n1_1 v13
#define a v14
#define a_0 v14
#define a_1 v15
#define b v16
#define b_0 v16
#define b_1 v17
#define ae v18
#define ae_0 v18
#define ae_1 v19
#define be v20
#define be_0 v20
#define be_1 v21
#define ao v22
#define ao_0 v22
#define ao_1 v23
#define bo v24
#define bo_0 v24
#define bo_1 v25
#define re0 v26
#define re0_0 v26
#define re0_1 v27
#define re1 v28
#define re1_0 v28
#define re1_1 v29
#define ro0 v30
#define ro0_0 v30
#define ro0_1 v31
#define ro1 v32
#define ro1_0 v32
#define ro1_1 v33
#define out0 v34
#define out0_0 v34
#define out0_1 v35
#define out1 v36
#define out1_0 v36
#define out1_1 v37
vld_h_p_x(prev0, input0);
vld_h_p_x(prev1, input1);
vld_h_p_x(curr0, input0);
vld_h_p_x(curr1, input1);
for (int i = 0; i < num_output_cols; i += 2 * vlenh) {
vld_h_p_x(next0, input0);
vld_h_p_x(next1, input1);
vslidep_h_1_vv(p0, prev0, curr0);
vslidep_h_1_vv(p1, prev1, curr1);
vsliden_h_1_vv(n0, curr0, next0);
vsliden_h_1_vv(n1, curr1, next1);
vmulw_w_u_vx(ae, c0, 3);
vmulw_w_u_vx(be, c1, 3);
vacc_w_u_vv(ao, ae, n0);
vacc_w_u_vv(bo, be, n1);
vacc_w_u_vv(ae, ae, p0);
vacc_w_u_vv(be, be, p1);
vmvp_vv(re0, be_0, be_1);
vmvp_vv(re1, ae_0, ae_1);
vmvp_vv(ro0, bo_0, bo_1);
vmvp_vv(ro1, ao_0, ao_1);
vmacc_w_vx(re0_0, ae_0, 3);
vmacc_w_vx(re0_1, ae_1, 3);
vmacc_w_vx(re1_0, be_0, 3);
vmacc_w_vx(re1_1, be_1, 3);
vmacc_w_vx(ro0_0, ao_0, 3);
vmacc_w_vx(ro0_1, ao_1, 3);
vmacc_w_vx(ro1_0, bo_0, 3);
vmacc_w_vx(ro1_1, bo_1, 3);
vsransu_h_r_vx(re0, re0, 4);
vsransu_h_r_vx(re1, re1, 4);
vsransu_h_r_vx(ro0, ro0, 4);
vsransu_h_r_vx(ro1, ro1, 4);
vzip_h_vv(out0, re0, ro0);
vzip_h_vv(out1, re1, ro1);
vst_h_p_x(out0_0, output0);
vst_h_p_x(out0_1, output0);
vst_h_p_x(out1_0, output1);
vst_h_p_x(out1_1, output1);
vmvp_vv(prev0, curr0, curr1);
vmvp_vv(curr0, next0, next1);
}
}
}; // namespace kelvin::cv