tests/cv/upsample.cc - sw/kelvin - Git at Google

 /*
  * Copyright 2023 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "tests/cv/upsample.h"

 #include <cstdint>

 #include "crt/kelvin.h"

 namespace kelvin::cv {

 void upsample(int num_output_cols, uint16_t* input0_row, uint16_t* input1_row,
               uint16_t* output0_row, uint16_t* output1_row) {
   // Mirror the edges using padded input buffers.
   {
     const int r = (num_output_cols / 2) - 1;
     input0_row[-1] = input0_row[0];
     input1_row[-1] = input1_row[0];
     input0_row[r + 1] = input0_row[r];
     input1_row[r + 1] = input1_row[r];
   }

   int vlenh;
   getmaxvl_h(vlenh);

   uint16_t* __restrict input0 =
       reinterpret_cast<uint16_t*>((input0_row - vlenh));
   uint16_t* __restrict input1 =
       reinterpret_cast<uint16_t*>((input1_row - vlenh));
   uint16_t* __restrict output0 = reinterpret_cast<uint16_t*>(output0_row);
   uint16_t* __restrict output1 = reinterpret_cast<uint16_t*>(output1_row);

 #define prev0 v0
 #define prev1 v1
 #define curr0 v2
 #define curr1 v3
 #define next0 v4
 #define next1 v5
 #define c0 curr0
 #define c1 curr1
 #define p0 v6
 #define p0_0 v6
 #define p0_1 v7
 #define p1 v8
 #define p1_0 v8
 #define p1_1 v9
 #define n0 v10
 #define n0_0 v10
 #define n0_1 v11
 #define n1 v12
 #define n1_0 v12
 #define n1_1 v13
 #define a v14
 #define a_0 v14
 #define a_1 v15
 #define b v16
 #define b_0 v16
 #define b_1 v17
 #define ae v18
 #define ae_0 v18
 #define ae_1 v19
 #define be v20
 #define be_0 v20
 #define be_1 v21
 #define ao v22
 #define ao_0 v22
 #define ao_1 v23
 #define bo v24
 #define bo_0 v24
 #define bo_1 v25
 #define re0 v26
 #define re0_0 v26
 #define re0_1 v27
 #define re1 v28
 #define re1_0 v28
 #define re1_1 v29
 #define ro0 v30
 #define ro0_0 v30
 #define ro0_1 v31
 #define ro1 v32
 #define ro1_0 v32
 #define ro1_1 v33
 #define out0 v34
 #define out0_0 v34
 #define out0_1 v35
 #define out1 v36
 #define out1_0 v36
 #define out1_1 v37

   vld_h_p_x(prev0, input0);
   vld_h_p_x(prev1, input1);

   vld_h_p_x(curr0, input0);
   vld_h_p_x(curr1, input1);

   for (int i = 0; i < num_output_cols; i += 2 * vlenh) {
     vld_h_p_x(next0, input0);
     vld_h_p_x(next1, input1);

     vslidep_h_1_vv(p0, prev0, curr0);
     vslidep_h_1_vv(p1, prev1, curr1);
     vsliden_h_1_vv(n0, curr0, next0);
     vsliden_h_1_vv(n1, curr1, next1);

     vmulw_w_u_vx(ae, c0, 3);
     vmulw_w_u_vx(be, c1, 3);
     vacc_w_u_vv(ao, ae, n0);
     vacc_w_u_vv(bo, be, n1);
     vacc_w_u_vv(ae, ae, p0);
     vacc_w_u_vv(be, be, p1);

     vmvp_vv(re0, be_0, be_1);
     vmvp_vv(re1, ae_0, ae_1);
     vmvp_vv(ro0, bo_0, bo_1);
     vmvp_vv(ro1, ao_0, ao_1);

     vmacc_w_vx(re0_0, ae_0, 3);
     vmacc_w_vx(re0_1, ae_1, 3);
     vmacc_w_vx(re1_0, be_0, 3);
     vmacc_w_vx(re1_1, be_1, 3);
     vmacc_w_vx(ro0_0, ao_0, 3);
     vmacc_w_vx(ro0_1, ao_1, 3);
     vmacc_w_vx(ro1_0, bo_0, 3);
     vmacc_w_vx(ro1_1, bo_1, 3);

     vsransu_h_r_vx(re0, re0, 4);
     vsransu_h_r_vx(re1, re1, 4);
     vsransu_h_r_vx(ro0, ro0, 4);
     vsransu_h_r_vx(ro1, ro1, 4);

     vzip_h_vv(out0, re0, ro0);
     vzip_h_vv(out1, re1, ro1);
     vst_h_p_x(out0_0, output0);
     vst_h_p_x(out0_1, output0);
     vst_h_p_x(out1_0, output1);
     vst_h_p_x(out1_1, output1);

     vmvp_vv(prev0, curr0, curr1);
     vmvp_vv(curr0, next0, next1);
   }
 }

 };  // namespace kelvin::cv
	/*
	* Copyright 2023 Google LLC
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "tests/cv/upsample.h"

	#include <cstdint>

	#include "crt/kelvin.h"

	namespace kelvin::cv {

	void upsample(int num_output_cols, uint16_t* input0_row, uint16_t* input1_row,
	uint16_t* output0_row, uint16_t* output1_row) {
	// Mirror the edges using padded input buffers.
	{
	const int r = (num_output_cols / 2) - 1;
	input0_row[-1] = input0_row[0];
	input1_row[-1] = input1_row[0];
	input0_row[r + 1] = input0_row[r];
	input1_row[r + 1] = input1_row[r];
	}

	int vlenh;
	getmaxvl_h(vlenh);

	uint16_t* __restrict input0 =
	reinterpret_cast<uint16_t*>((input0_row - vlenh));
	uint16_t* __restrict input1 =
	reinterpret_cast<uint16_t*>((input1_row - vlenh));
	uint16_t* __restrict output0 = reinterpret_cast<uint16_t*>(output0_row);
	uint16_t* __restrict output1 = reinterpret_cast<uint16_t*>(output1_row);

	#define prev0 v0
	#define prev1 v1
	#define curr0 v2
	#define curr1 v3
	#define next0 v4
	#define next1 v5
	#define c0 curr0
	#define c1 curr1
	#define p0 v6
	#define p0_0 v6
	#define p0_1 v7
	#define p1 v8
	#define p1_0 v8
	#define p1_1 v9
	#define n0 v10
	#define n0_0 v10
	#define n0_1 v11
	#define n1 v12
	#define n1_0 v12
	#define n1_1 v13
	#define a v14
	#define a_0 v14
	#define a_1 v15
	#define b v16
	#define b_0 v16
	#define b_1 v17
	#define ae v18
	#define ae_0 v18
	#define ae_1 v19
	#define be v20
	#define be_0 v20
	#define be_1 v21
	#define ao v22
	#define ao_0 v22
	#define ao_1 v23
	#define bo v24
	#define bo_0 v24
	#define bo_1 v25
	#define re0 v26
	#define re0_0 v26
	#define re0_1 v27
	#define re1 v28
	#define re1_0 v28
	#define re1_1 v29
	#define ro0 v30
	#define ro0_0 v30
	#define ro0_1 v31
	#define ro1 v32
	#define ro1_0 v32
	#define ro1_1 v33
	#define out0 v34
	#define out0_0 v34
	#define out0_1 v35
	#define out1 v36
	#define out1_0 v36
	#define out1_1 v37

	vld_h_p_x(prev0, input0);
	vld_h_p_x(prev1, input1);

	vld_h_p_x(curr0, input0);
	vld_h_p_x(curr1, input1);

	for (int i = 0; i < num_output_cols; i += 2 * vlenh) {
	vld_h_p_x(next0, input0);
	vld_h_p_x(next1, input1);

	vslidep_h_1_vv(p0, prev0, curr0);
	vslidep_h_1_vv(p1, prev1, curr1);
	vsliden_h_1_vv(n0, curr0, next0);
	vsliden_h_1_vv(n1, curr1, next1);

	vmulw_w_u_vx(ae, c0, 3);
	vmulw_w_u_vx(be, c1, 3);
	vacc_w_u_vv(ao, ae, n0);
	vacc_w_u_vv(bo, be, n1);
	vacc_w_u_vv(ae, ae, p0);
	vacc_w_u_vv(be, be, p1);

	vmvp_vv(re0, be_0, be_1);
	vmvp_vv(re1, ae_0, ae_1);
	vmvp_vv(ro0, bo_0, bo_1);
	vmvp_vv(ro1, ao_0, ao_1);

	vmacc_w_vx(re0_0, ae_0, 3);
	vmacc_w_vx(re0_1, ae_1, 3);
	vmacc_w_vx(re1_0, be_0, 3);
	vmacc_w_vx(re1_1, be_1, 3);
	vmacc_w_vx(ro0_0, ao_0, 3);
	vmacc_w_vx(ro0_1, ao_1, 3);
	vmacc_w_vx(ro1_0, bo_0, 3);
	vmacc_w_vx(ro1_1, bo_1, 3);

	vsransu_h_r_vx(re0, re0, 4);
	vsransu_h_r_vx(re1, re1, 4);
	vsransu_h_r_vx(ro0, ro0, 4);
	vsransu_h_r_vx(ro1, ro1, 4);

	vzip_h_vv(out0, re0, ro0);
	vzip_h_vv(out1, re1, ro1);
	vst_h_p_x(out0_0, output0);
	vst_h_p_x(out0_1, output0);
	vst_h_p_x(out1_0, output1);
	vst_h_p_x(out1_1, output1);

	vmvp_vv(prev0, curr0, curr1);
	vmvp_vv(curr0, next0, next1);
	}
	}

	}; // namespace kelvin::cv