| /* |
| * Copyright 2022 Google LLC |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <riscv_vector.h> |
| |
| #include "risp4ml/common/utils.h" |
| #include "risp4ml/isp_stages/downscale.h" |
| |
// Number of fractional bits used by the fixed-point interpolation weights.
static const uint16_t kScalePrecision = 10;
// Fixed-point representation of 1.0, i.e. 2^kScalePrecision.
static const uint32_t kScaleFixedOne = (1u << kScalePrecision);
| |
| static DownscaleParams params = { |
| .enable = true, |
| .scale_precision = kScalePrecision, |
| .scale_fixed_one = kScaleFixedOne, |
| }; |
| |
| void set_downscale_param(DownscaleParams* in_params) { params = *in_params; } |
| void set_downscale_factor(Image* input, ImageU8* output) { return; } |
| |
| // Basic bilinear downscale |
| // Implementation based on: |
| // https://chao-ji.github.io/jekyll/update/2018/07/19/BilinearResize.html |
| // Resamples image using bilinear interpolation. |
| // 'output' is modified by this function to store the output image. |
| void downscale_process(Image* input, ImageU8* output) { |
| if (!params.enable) { |
| return; |
| } |
| |
| uint32_t input_width = input->width; |
| uint32_t input_w_1 = input->width - 1; |
| uint32_t input_h_1 = input->height - 1; |
| uint32_t w_1 = output->width - 1; |
| uint32_t h_1 = output->height - 1; |
| |
| size_t vl; |
| size_t n = output->height * output->width; |
| // auxiliary variables |
| vuint32m8_t vx, vy, vz, vid, vp, vq; |
| // neighboring x & y coordinates |
| vuint32m8_t vx_l, vy_l, vx_h, vy_h; |
| // weights of neighbors |
| vuint32m8_t vx_weight, vy_weight, vx_weight_1minus, vy_weight_1minus; |
| // values of neighboring data points |
| vuint32m8_t va, vb, vc, vd; |
| vuint32m8_t vo; // 32bit output |
| vuint16m4_t vo_16b; // 16bit output |
| vuint8m2_t vo_8b; // 8bit output |
| |
| for (uint16_t c = 0; c < output->num_channels; ++c) { |
| pixel_type_t* in = image_row(input, c, 0); |
| uint8_t* out = output->data + c; |
| |
| for (size_t i = 0; i < n; i += vl) { |
| vl = __riscv_vsetvl_e16m4(n - i); |
| vid = __riscv_vid_v_u32m8(vl); |
| vid = __riscv_vadd(vid, i, vl); |
| |
| vy = __riscv_vdivu(vid, output->width, vl); |
| vx = __riscv_vremu(vid, output->width, vl); |
| |
| // find 4 neighboring points |
| // four neighboring coordinates are |
| // [y_l, x_l], [y_l, x_h], [y_h, x_l], [y_h, x_h] |
| vx_l = __riscv_vmul(vx, input_w_1, vl); |
| vx_l = __riscv_vdivu(vx_l, w_1, vl); |
| vx_h = __riscv_vadd(vx_l, 1, vl); |
| vx_h = __riscv_vminu(vx_h, input_w_1, vl); // clamp |
| |
| vy_l = __riscv_vmul(vy, input_h_1, vl); |
| vy_l = __riscv_vdivu(vy_l, h_1, vl); |
| vy_h = __riscv_vadd(vy_l, 1, vl); |
| vy_h = __riscv_vminu(vy_h, input_h_1, vl); // clamp |
| |
| // load values of four neighboring points: A, B C, D |
| vz = __riscv_vmul(vy_l, input_width, vl); |
| vz = __riscv_vadd(vz, vx_l, vl); |
| vz = __riscv_vsll(vz, 1, vl); // *2 |
| vo_16b = __riscv_vluxei32(in, vz, vl); |
| va = __riscv_vwaddu_vx(vo_16b, 0, vl); |
| |
| vz = __riscv_vmul(vy_l, input_width, vl); |
| vz = __riscv_vadd(vz, vx_h, vl); |
| vz = __riscv_vsll(vz, 1, vl); // *2 |
| vo_16b = __riscv_vluxei32(in, vz, vl); |
| vb = __riscv_vwaddu_vx(vo_16b, 0, vl); |
| |
| vz = __riscv_vmul(vy_h, input_width, vl); |
| vz = __riscv_vadd(vz, vx_l, vl); |
| vz = __riscv_vsll(vz, 1, vl); // *2 |
| vo_16b = __riscv_vluxei32(in, vz, vl); |
| vc = __riscv_vwaddu_vx(vo_16b, 0, vl); |
| |
| vz = __riscv_vmul(vy_h, input_width, vl); |
| vz = __riscv_vadd(vz, vx_h, vl); |
| vz = __riscv_vsll(vz, 1, vl); // *2 |
| vo_16b = __riscv_vluxei32(in, vz, vl); |
| vd = __riscv_vwaddu_vx(vo_16b, 0, vl); |
| |
| // compute weights of four neighboring points: wx, wy, 1-wx, 1-wy |
| vp = __riscv_vmul(vx, input_w_1, vl); |
| vq = __riscv_vmul(vx_l, w_1, vl); |
| vp = __riscv_vssubu(vp, vq, vl); |
| vp = __riscv_vsll(vp, params.scale_precision, vl); |
| vx_weight = __riscv_vdivu(vp, w_1, vl); |
| |
| vp = __riscv_vmul(vy, input_h_1, vl); |
| vq = __riscv_vmul(vy_l, h_1, vl); |
| vp = __riscv_vssubu(vp, vq, vl); |
| vp = __riscv_vsll(vp, params.scale_precision, vl); |
| vy_weight = __riscv_vdivu(vp, h_1, vl); |
| |
| vx_weight_1minus = __riscv_vrsub(vx_weight, params.scale_fixed_one, vl); |
| vy_weight_1minus = __riscv_vrsub(vy_weight, params.scale_fixed_one, vl); |
| |
| // Bilinear Interpolation Formular: |
| // out = A*(1-wx)*(1-wy) + B*wx*(1-wy) |
| // + C*(1-wx)*wy + D*wx*wy |
| vo = __riscv_vmul(va, vx_weight_1minus, vl); |
| vo = __riscv_vsrl(vo, params.scale_precision, vl); |
| vo = __riscv_vmul(vo, vy_weight_1minus, vl); |
| |
| vp = __riscv_vmul(vb, vx_weight, vl); |
| vp = __riscv_vsrl(vp, params.scale_precision, vl); |
| vp = __riscv_vmul(vp, vy_weight_1minus, vl); |
| vo = __riscv_vadd(vo, vp, vl); |
| |
| vp = __riscv_vmul(vc, vx_weight_1minus, vl); |
| vp = __riscv_vsrl(vp, params.scale_precision, vl); |
| vp = __riscv_vmul(vp, vy_weight, vl); |
| vo = __riscv_vadd(vo, vp, vl); |
| |
| vp = __riscv_vmul(vd, vx_weight, vl); |
| vp = __riscv_vsrl(vp, params.scale_precision, vl); |
| vp = __riscv_vmul(vp, vy_weight, vl); |
| vo = __riscv_vadd(vo, vp, vl); |
| |
| // bit shift from 32bits to 8bits |
| vo_16b = __riscv_vnsrl(vo, params.scale_precision, vl); |
| vo_8b = __riscv_vnsrl(vo_16b, kRawPipelineBpp - kPipeOutputBpp, vl); |
| |
| // save |
| __riscv_vsse8(out + i * output->num_channels, output->num_channels, vo_8b, |
| vl); |
| } |
| } |
| } |