// Copyright 2023 Google LLC
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#include <algorithm>
#include "crt/kelvin.h"
#include "tflm/opt/opt.h"
#include "tensorflow/lite/kernels/internal/common.h"
namespace kelvin::opt {
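// Depthwise convolution for int16 activations, int8 weights, and a 3x1 kernel
// (three taps along the frame axis), with per-channel requantization to int16
// outputs. Activations and outputs are assumed to be laid out frame-major as
// [frames x channels] and weights as [3 x channels], matching the indexing
// below. The vector loop handles 32 channels per iteration; a channels % 32
// remainder is not handled yet (see the TODO at the end of the function).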
void DepthwiseConv2DKelvinS16K3x1(const int16_t* activations,
const int8_t* weights,
const int64_t* biases,
int channels, int frames, int dilation,
const int32_t* output_mult,
const int32_t* output_shift,
int32_t output_activation_min,
int32_t output_activation_max,
int16_t* output) {
for (int c = 0; c + 32 <= channels; c += 32) {
    // Load the weights and interleave them into the correct order [v58-v63].
    // Because there are more activations than weights, it is cheaper to
    // interleave the weights once here than the activations on every frame.
const int8_t* local_weights0 = weights + c;
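    // Each vld_b below loads 32 int8 weights for one kernel tap; vaddw_h with
    // a zero scalar widens them to int16, and vzip_h interleaves the two
    // halves into the register pair consumed by the widening multiplies in
    // the inner loop.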
vld_b_p_xx(v0, local_weights0, channels);
vaddw_h_vx(v48, v0, 0);
vzip_h_vv(v58, v48, v49);
vld_b_p_xx(v1, local_weights0, channels);
vaddw_h_vx(v50, v1, 0);
vzip_h_vv(v60, v50, v51);
vld_b_x(v2, local_weights0);
vaddw_h_vx(v52, v2, 0);
vzip_h_vv(v62, v52, v53);
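    // v58-v59, v60-v61 and v62-v63 now hold the interleaved int16 weights for
    // taps 0, 1 and 2 respectively.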
    // Assume the biases fit in 32 bits. This assumption is verified offline.
// Load biases and swizzle [v52-v55].
int32_t local_biases[32];
for (int j = 0; j < 32; j++) {
local_biases[j] = static_cast<int32_t>(biases[c + j]);
}
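    // Load the 32 narrowed biases into the v4-v7 register group and zip them
    // into v52-v55 so their lane order matches the accumulators produced by
    // the widening multiplies in the inner loop.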
vld_w_x_m(v4, local_biases);
vzip_w_vv(v52, v4, v5);
vzip_w_vv(v54, v6, v7);
const int32_t step = dilation * channels;
const int32_t* local_output_mult = output_mult + c;
const int32_t* local_output_shift = output_shift + c;
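    // Each dilation phase d covers output frames d, d + dilation,
    // d + 2 * dilation, ..., so every pointer within a phase advances by
    // `step` elements per frame.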
for (int d = 0; d < dilation; d++) {
// Accumulators will be [v48 - v51].
const int16_t* local_activations0 = activations + (d * channels) + c;
const int16_t* local_activations1 = local_activations0 + 16;
int16_t* local_output = output + (d * channels) + c;
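      // local_activations0/1 cover channels [c, c + 16) and [c + 16, c + 32)
      // of the current block; local_output tracks the matching output frame.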
      // Registers [v0-v5] will be used for loading activations.
// Preload for valid padding:
vld_h_p_xx(v0, local_activations0, step);
vld_h_p_xx(v1, local_activations1, step);
vld_h_p_xx(v2, local_activations0, step);
vld_h_p_xx(v3, local_activations1, step);
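      // v0-v3 now hold the first two taps (one register pair per tap); the
      // loop below starts at the frame of the third tap.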
int frames_idx = (2 * dilation) + d;
int32_t accumulators[32];
for (; frames_idx < frames; frames_idx += dilation) {
vld_h_p_xx(v4, local_activations0, step);
vld_h_p_xx(v5, local_activations1, step);
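        // v4-v5 hold the third tap. Each vmulw_w_vv performs a widening
        // 16x16 -> 32-bit multiply of one tap against its interleaved
        // weights; the partial products and the bias are summed into the
        // v48-v51 accumulator group.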
vmulw_w_vv(v48, v58, v0); // Clobber accumulator
vmulw_w_vv(v50, v59, v1); // Clobber accumulator
vadd_w_vv_m(v48, v48, v52); // Add bias.
vmulw_w_vv(v40, v60, v2);
vmulw_w_vv(v42, v61, v3);
vadd_w_vv_m(v48, v48, v40);
vmulw_w_vv(v44, v62, v4);
vmulw_w_vv(v46, v63, v5);
vadd_w_vv_m(v48, v48, v44);
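        // Re-interleave the 32-bit accumulators into channel order and spill
        // them so requantization can run in scalar code below.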
vzip_w_vv(v48, v48, v49); // Swizzle accumulators
vzip_w_vv(v50, v50, v51);
vst_w_x_m(v48, accumulators); // Store accumulators
// Output pipeline in scalar, to preserve bit accuracy with the ARM CPU
// implementation.
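        // MultiplyByQuantizedMultiplier applies the per-channel fixed-point
        // multiplier and shift; the clamp keeps the result inside the int16
        // output range before the narrowing cast.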
for (int i = 0; i < 32; i++) {
int32_t result = tflite::MultiplyByQuantizedMultiplier(
static_cast<int64_t>(accumulators[i]), local_output_mult[i],
local_output_shift[i]);
local_output[i] = static_cast<int16_t>(
std::clamp(result, output_activation_min, output_activation_max));
}
// Slide registers
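        // vmvp_vv moves a register pair: v2-v3 become the new first tap and
        // v4-v5 the new second tap; the next iteration reloads v4-v5.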
vmvp_vv(v0, v2, v3);
vmvp_vv(v2, v4, v5);
local_output += step;
}
}
}
// TODO(derekjchow): Handle channels % 32 cases.
// Break it down into:
  // - one loop handling 16-byte stripes
// - one final loop handling remainder
}
} // namespace kelvin::opt