blob: 580b618948a9e7bb23cd08e7ab81851bd454a107 [file] [log] [blame]
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <riscv_vector.h>
#include "risp4ml/common/utils.h"
#include "risp4ml/isp_stages/downscale.h"
// Fixed-point format of the interpolation weights. These are enumeration
// constants rather than `static const` objects so that they are genuine
// integer constant expressions, which ISO C requires in the static
// initializer of `params` below.
enum {
  kScalePrecision = 10,                   // number of fractional bits
  kScaleFixedOne = 1 << kScalePrecision,  // the value 1.0 in that format
};

// Module-wide downscale configuration. Enabled by default; overwritten as a
// whole by set_downscale_param().
static DownscaleParams params = {
    .enable = true,
    .scale_precision = kScalePrecision,
    .scale_fixed_one = kScaleFixedOne,
};
// Overwrites the module-level downscale configuration with a copy of the
// structure that `in_params` points to. `in_params` is dereferenced
// unconditionally, so it must not be NULL.
void set_downscale_param(DownscaleParams* in_params) {
  params = *in_params;
}
// No-op: both arguments are ignored and no state is modified.
void set_downscale_factor(Image* input, ImageU8* output) {
  (void)input;
  (void)output;
}
// Basic bilinear downscale, vectorized with RISC-V Vector intrinsics.
// Implementation based on:
// https://chao-ji.github.io/jekyll/update/2018/07/19/BilinearResize.html
//
// Every output pixel (x, y) is mapped onto the input grid at
//   x * (in_w - 1) / (out_w - 1),  y * (in_h - 1) / (out_h - 1)
// and blended from the four input pixels surrounding that position. The
// weights are fixed-point values with params.scale_precision fractional bits.
// Input pixels are 16 bits wide; the blended value is narrowed to 8 bits and
// stored channel-interleaved, i.e. pixel i of channel c is written to
// output->data[i * num_channels + c].
//
// 'output' is modified by this function to store the output image.
// The function returns without touching 'output' when the stage is disabled
// or when either image has a zero dimension.
void downscale_process(Image* input, ImageU8* output) {
  if (!params.enable) {
    return;
  }

  const size_t n = output->height * output->width;
  if (n == 0 || input->width == 0 || input->height == 0) {
    // Nothing to write, or nothing to sample from. Without this guard a zero
    // input dimension would wrap the "- 1" values below to 0xFFFFFFFF.
    return;
  }

  const uint32_t input_width = input->width;
  const uint32_t input_w_1 = input->width - 1;
  const uint32_t input_h_1 = input->height - 1;
  const uint32_t out_width = output->width;
  const uint32_t w_1 = output->width - 1;
  const uint32_t h_1 = output->height - 1;

  // An output that is a single pixel wide (or high) has a zero span. Vector
  // division by zero does not trap; it produces all ones, which would end up
  // as wild gather offsets. Dividing by 1 instead is exact in that case,
  // because the only coordinate on that axis is 0.
  const uint32_t x_div = (w_1 != 0) ? w_1 : 1;
  const uint32_t y_div = (h_1 != 0) ? h_1 : 1;

  // Loop-invariant configuration.
  const size_t precision = params.scale_precision;
  const uint32_t fixed_one = params.scale_fixed_one;

  size_t vl = 0;
  for (uint16_t c = 0; c < output->num_channels; ++c) {
    // NOTE(review): the gathers below assume `in` points at the first pixel of
    // channel c and that consecutive rows are input->width pixels apart --
    // confirm against image_row().
    pixel_type_t* in = image_row(input, c, 0);
    uint8_t* out = output->data + c;

    for (size_t i = 0; i < n; i += vl) {
      // e16/m4 has the same SEW/LMUL ratio as the e32/m8 and e8/m2 vectors
      // used below, so this vl is valid for all three element widths.
      vl = __riscv_vsetvl_e16m4(n - i);

      // Flat output index of every lane, split into column x and row y.
      vuint32m8_t vid = __riscv_vid_v_u32m8(vl);
      vid = __riscv_vadd(vid, i, vl);
      vuint32m8_t vy = __riscv_vdivu(vid, out_width, vl);
      vuint32m8_t vx = __riscv_vremu(vid, out_width, vl);

      // find 4 neighboring points
      // four neighboring coordinates are
      // [y_l, x_l], [y_l, x_h], [y_h, x_l], [y_h, x_h]
      vuint32m8_t vx_l = __riscv_vmul(vx, input_w_1, vl);
      vx_l = __riscv_vdivu(vx_l, x_div, vl);
      vuint32m8_t vx_h = __riscv_vadd(vx_l, 1, vl);
      vx_h = __riscv_vminu(vx_h, input_w_1, vl);  // clamp to last column

      vuint32m8_t vy_l = __riscv_vmul(vy, input_h_1, vl);
      vy_l = __riscv_vdivu(vy_l, y_div, vl);
      vuint32m8_t vy_h = __riscv_vadd(vy_l, 1, vl);
      vy_h = __riscv_vminu(vy_h, input_h_1, vl);  // clamp to last row

      // load values of four neighboring points: A, B, C, D
      // Indexed loads take byte offsets, hence the shift by one (2 bytes per
      // 16-bit pixel). Each value is widened to 32 bits for the arithmetic.
      vuint32m8_t voff = __riscv_vmul(vy_l, input_width, vl);
      voff = __riscv_vadd(voff, vx_l, vl);
      voff = __riscv_vsll(voff, 1, vl);  // *2
      vuint16m4_t vpix = __riscv_vluxei32(in, voff, vl);
      vuint32m8_t va = __riscv_vwaddu_vx(vpix, 0, vl);  // A = [y_l, x_l]

      voff = __riscv_vmul(vy_l, input_width, vl);
      voff = __riscv_vadd(voff, vx_h, vl);
      voff = __riscv_vsll(voff, 1, vl);  // *2
      vpix = __riscv_vluxei32(in, voff, vl);
      vuint32m8_t vb = __riscv_vwaddu_vx(vpix, 0, vl);  // B = [y_l, x_h]

      voff = __riscv_vmul(vy_h, input_width, vl);
      voff = __riscv_vadd(voff, vx_l, vl);
      voff = __riscv_vsll(voff, 1, vl);  // *2
      vpix = __riscv_vluxei32(in, voff, vl);
      vuint32m8_t vc = __riscv_vwaddu_vx(vpix, 0, vl);  // C = [y_h, x_l]

      voff = __riscv_vmul(vy_h, input_width, vl);
      voff = __riscv_vadd(voff, vx_h, vl);
      voff = __riscv_vsll(voff, 1, vl);  // *2
      vpix = __riscv_vluxei32(in, voff, vl);
      vuint32m8_t vd = __riscv_vwaddu_vx(vpix, 0, vl);  // D = [y_h, x_h]

      // compute weights of four neighboring points: wx, wy, 1-wx, 1-wy
      // wx is the remainder of the x mapping, (x * in_w_1 - x_l * out_w_1),
      // scaled to fixed point and divided by the output span; same for wy.
      vuint32m8_t vp = __riscv_vmul(vx, input_w_1, vl);
      vuint32m8_t vq = __riscv_vmul(vx_l, x_div, vl);
      vp = __riscv_vssubu(vp, vq, vl);
      vp = __riscv_vsll(vp, precision, vl);
      vuint32m8_t vx_weight = __riscv_vdivu(vp, x_div, vl);

      vp = __riscv_vmul(vy, input_h_1, vl);
      vq = __riscv_vmul(vy_l, y_div, vl);
      vp = __riscv_vssubu(vp, vq, vl);
      vp = __riscv_vsll(vp, precision, vl);
      vuint32m8_t vy_weight = __riscv_vdivu(vp, y_div, vl);

      // vrsub computes (scalar - vector), i.e. 1.0 - w in fixed point.
      vuint32m8_t vx_weight_1minus = __riscv_vrsub(vx_weight, fixed_one, vl);
      vuint32m8_t vy_weight_1minus = __riscv_vrsub(vy_weight, fixed_one, vl);

      // Bilinear Interpolation Formula:
      // out = A*(1-wx)*(1-wy) + B*wx*(1-wy)
      //     + C*(1-wx)*wy + D*wx*wy
      // Each term is shifted back after the x weight so that only one set of
      // fractional bits remains in the accumulated sum.
      vuint32m8_t vo = __riscv_vmul(va, vx_weight_1minus, vl);
      vo = __riscv_vsrl(vo, precision, vl);
      vo = __riscv_vmul(vo, vy_weight_1minus, vl);

      vp = __riscv_vmul(vb, vx_weight, vl);
      vp = __riscv_vsrl(vp, precision, vl);
      vp = __riscv_vmul(vp, vy_weight_1minus, vl);
      vo = __riscv_vadd(vo, vp, vl);

      vp = __riscv_vmul(vc, vx_weight_1minus, vl);
      vp = __riscv_vsrl(vp, precision, vl);
      vp = __riscv_vmul(vp, vy_weight, vl);
      vo = __riscv_vadd(vo, vp, vl);

      vp = __riscv_vmul(vd, vx_weight, vl);
      vp = __riscv_vsrl(vp, precision, vl);
      vp = __riscv_vmul(vp, vy_weight, vl);
      vo = __riscv_vadd(vo, vp, vl);

      // bit shift from 32bits to 8bits: first drop the remaining fractional
      // bits (32 -> 16), then the extra pixel depth (16 -> 8).
      vuint16m4_t vo_16b = __riscv_vnsrl(vo, precision, vl);
      vuint8m2_t vo_8b =
          __riscv_vnsrl(vo_16b, kRawPipelineBpp - kPipeOutputBpp, vl);

      // save: strided store, one byte every num_channels bytes
      __riscv_vsse8(out + i * output->num_channels, output->num_channels,
                    vo_8b, vl);
    }
  }
}