/*
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Convolution based on Kelvin ops
// Data types: input: s8, filter: s8, bias: s32
// Special case for a 48x3x1x48 filter
// (output channels x height x width x input channels)
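//
// For reference, each output value this kernel produces follows the standard
// per-channel-quantized convolution (a sketch of the semantics, not code from
// this file):
//
//   acc = bias[oc];
//   for (ky = 0; ky < 3; ++ky)
//     for (ic = 0; ic < 48; ++ic)
//       acc += filter[oc][ky][0][ic] * (input[y + ky][ic] + input_offset);
//   out[y][oc] = clamp(rescale(acc, output_multiplier[oc], output_shift[oc])
//                          + output_offset,
//                      output_activation_min, output_activation_max);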
#include "tflm/opt/conv_util.h"
namespace kelvin::opt {
void ConvS8K3x1D48(
    const tflite::ConvParams& params, const int32_t* output_multiplier,
    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
    int8_t* output_data) {
  const auto batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int32_t input_offset = params.input_offset;
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int filter_depth = filter_shape.Dims(3);
  const int output_height = output_shape.Dims(1);
  const int output_depth = output_shape.Dims(3);
  const int32_t output_offset = params.output_offset;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK(batches == 1);
  TFLITE_DCHECK(filter_depth == input_depth);
  TFLITE_DCHECK(filter_height == 3);
  TFLITE_DCHECK(filter_width == 1);
  TFLITE_DCHECK(input_width == 1);
  TFLITE_DCHECK(stride_width == 1);
  TFLITE_DCHECK(stride_height == 1);
  TFLITE_DCHECK(dilation_width_factor == 1);
  TFLITE_DCHECK(dilation_height_factor == 1);
  TFLITE_DCHECK(pad_width == 0);
  TFLITE_DCHECK(pad_height == 0);
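  // With batches == 1, stride 1, no padding or dilation, and a 3x1 filter
  // over a width-1 input, output row y reads input rows y..y + 2, so
  // output_height == input_height - 2.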
  int32_t bias[48 * 4];
  int32_t mult[48 * 4];
  int32_t shft[48 * 4];
  Swizzle<48>(bias_data, bias);
  Swizzle<48>(output_multiplier, mult);
  Swizzle<48, true>(output_shift, shft);
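  // Swizzle<48> (conv_util.h) is assumed here to expand each of the 48
  // per-channel values into four slots (48 * 4 entries total), so the
  // parameters for the 8-channel group starting at zo_hi begin at offset
  // zo_hi * 4 below. The extra template argument on the shift swizzle is
  // assumed to fold in the sign adjustment the output pipeline expects.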
  int8_t juggled_filter_data[48 / 8][3][1][48 / 4][8][4];
  Filter_N_H_W_M<48>(filter_data, juggled_filter_data[0][0][0][0][0], 3, 1,
                     48);
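  // juggled_filter_data groups the OHWI filter as
  // [oc / 8][ky][kx][ic / 4][8 oc][4 ic] so aconv can consume 8x4 tiles
  // directly. One output-channel group (8 oc x 3 taps x 48 ic = 1152 bytes)
  // is exactly the nine 128-byte chunks the MAC pipeline loads at
  // p_flt + 128 * 0 .. p_flt + 128 * 8.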
  union {
    vconv_u8_t conv;
    uint32_t raw;
  } cmds;
  cmds.conv.mode = 0;
  cmds.conv.start = 0;
  cmds.conv.stop = 7;
  cmds.conv.sbias1 = input_offset;
  cmds.conv.sdata1 = true;
  cmds.conv.sbias2 = 0;
  cmds.conv.sdata2 = true;
  union {
    vconv_u8_t conv;
    uint32_t raw;
  } cmds16;
  cmds16.conv.mode = 0;
  cmds16.conv.start = 0;
  cmds16.conv.stop = 3;
  cmds16.conv.sbias1 = input_offset;
  cmds16.conv.sdata1 = true;
  cmds16.conv.sbias2 = 0;
  cmds16.conv.sdata2 = true;
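  // As configured here, cmds (stop = 7) is assumed to drive an aconv against
  // all eight weight registers loaded into v56..v63, covering 32 input
  // channels per pass; cmds16 (stop = 3) drives only v56..v59 for the
  // 16-channel half pass. sbias1 = input_offset folds the activation zero
  // point into the MACs, and sdata1/sdata2 are assumed to mark both operands
  // as signed int8.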
  for (int zo_hi = 0; zo_hi < output_depth; zo_hi += 8) {
    // For each pixel, the general flow for this kernel looks like:
    // 1) Reset the accumulator and load activations into [v32, v46].
    // 2) For each group of 32 scalars in the pixel fan-in, run the MAC
    //    pipeline:
    //    2a) Load a subset of activations from [v32, v46] into [v0, v7].
    //    2b) Load a subset of weights.
    //    2c) Run aconv.
    // 3) Run the output pipeline and store.
    //
    // For step 1, we alias [v32, v46] to [L0, LE]. For most iterations, we
    // load all of these registers (10 pixels). For remainder iterations, we
    // load a subset and pad the rest with 0's. The data is stored as follows,
    // where each letter represents 16 bytes of a pixel stored into a register
    // (capitalization used to help distinguish channels within a pixel):
    // L0 L1 L2 L3 L4 L5 L6 L7 L8 L9 LA LB LC LD LE
    // Aa AB bB Cc CD dD Ee EF fF Gg GH hH Ii IJ jJ
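    // (10 pixels x 48 channels = 480 bytes, i.e. 15 32-byte registers.)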
#define L0 v32
#define L1 v33
#define L2 v34
#define L3 v35
#define L4 v36
#define L5 v37
#define L6 v38
#define L7 v39
#define L8 v40
#define L9 v41
#define LA v42
#define LB v43
#define LC v44
#define LD v45
#define LE v46
    // We run 5 iterations of step 2: 4 full iterations and one half
    // iteration. Because each pixel takes 1.5 registers, we have to
    // interleave vmv_v and vsliden_w_4_vv instructions to ensure the same
    // output channels are stored in each register per-pixel. As a refresher,
    // vsliden_w_4_vv takes two register arguments (X and Y) and returns the
    // concatenation of the last half of X and the first half of Y, i.e.:
    // L1 L2
    // AB bB
    // vsliden_w_4_vv(v1, L1, L2); -> v1 = Bb
#define CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt) \
  { \
    /* 1/5 */ \
    /* Ky = 0, IC:[0-31] */ \
    vmv_v(v0, L0); /* Aa */ \
    vsliden_w_4_vv(v1, L1, L2); /* Bb */ \
    vmv_v(v2, L3); /* Cc */ \
    vsliden_w_4_vv(v3, L4, L5); /* Dd */ \
    vmv_v(v4, L6); /* Ee */ \
    vsliden_w_4_vv(v5, L7, L8); /* Ff */ \
    vmv_v(v6, L9); /* Gg */ \
    vsliden_w_4_vv(v7, LA, LB); /* Hh */ \
    vld_b_x_m(v56, p_flt + 128 * 0); \
    vld_b_x_m(v60, p_flt + 128 * 1); \
    aconv_vxv(v48, v0, cmds, v56); \
    \
    /* 2/5 */ \
    /* Ky = 0, IC:[32-47]; Ky = 1, IC:[0-15] */ \
    vmv_v(v0, L1); /* AB */ \
    vsliden_w_4_vv(v1, L2, L3); /* BC */ \
    vmv_v(v2, L4); /* CD */ \
    vsliden_w_4_vv(v3, L5, L6); /* DE */ \
    vmv_v(v4, L7); /* EF */ \
    vsliden_w_4_vv(v5, L8, L9); /* FG */ \
    vmv_v(v6, LA); /* GH */ \
    vsliden_w_4_vv(v7, LB, LC); /* HI */ \
    vld_b_x_m(v56, p_flt + 128 * 2); \
    vld_b_x_m(v60, p_flt + 128 * 3); \
    aconv_vxv(v48, v0, cmds, v56); \
    \
    /* 3/5 */ \
    /* Ky = 1, IC:[16-47] */ \
    vmv_v(v0, L2); /* bB */ \
    vsliden_w_4_vv(v1, L3, L4); /* cC */ \
    vmv_v(v2, L5); /* dD */ \
    vsliden_w_4_vv(v3, L6, L7); /* eE */ \
    vmv_v(v4, L8); /* fF */ \
    vsliden_w_4_vv(v5, L9, LA); /* gG */ \
    vmv_v(v6, LB); /* hH */ \
    vsliden_w_4_vv(v7, LC, LD); /* iI */ \
    vld_b_x_m(v56, p_flt + 128 * 4); \
    vld_b_x_m(v60, p_flt + 128 * 5); \
    aconv_vxv(v48, v0, cmds, v56); \
    \
    /* 4/5 */ \
    /* Ky = 2, IC:[0-31] */ \
    vmv_v(v0, L3); /* Cc */ \
    vsliden_w_4_vv(v1, L4, L5); /* Dd */ \
    vmv_v(v2, L6); /* Ee */ \
    vsliden_w_4_vv(v3, L7, L8); /* Ff */ \
    vmv_v(v4, L9); /* Gg */ \
    vsliden_w_4_vv(v5, LA, LB); /* Hh */ \
    vmv_v(v6, LC); /* Ii */ \
    vsliden_w_4_vv(v7, LD, LE); /* Jj */ \
    vld_b_x_m(v56, p_flt + 128 * 6); \
    vld_b_x_m(v60, p_flt + 128 * 7); \
    aconv_vxv(v48, v0, cmds, v56); \
    \
    /* 5/5 */ \
    /* Ky = 2, IC:[32-47] half iteration */ \
    vmv_v(v0, L4); /* C(D - ignored) */ \
    vsliden_w_4_vv(v1, L5, L6); /* D(E - ignored) */ \
    vmv_v(v2, L7); /* E(F - ignored) */ \
    vsliden_w_4_vv(v3, L8, L9); /* F(G - ignored) */ \
    vmv_v(v4, LA); /* G(H - ignored) */ \
    vsliden_w_4_vv(v5, LB, LC); /* H(I - ignored) */ \
    vmv_v(v6, LD); /* I(J - ignored) */ \
    /* Pad the last slide with the first pixel; ignored under cmds16. */ \
    vsliden_w_4_vv(v7, LE, L0); /* J(A - ignored) */ \
    vld_b_x_m(v56, p_flt + 128 * 8); /* Load only once for half iteration */ \
    /* cmds16 runs a subset of the outer product */ \
    aconv_vxv(v48, v0, cmds16, v56); \
  }
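    // Across the five passes the macro accumulates 4 * 32 + 16 = 144 products
    // per output channel, i.e. the full 3 taps * 48 input channels.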
    // Iterate over outputs
    int out_y = 0;
    for (; out_y + 8 <= output_height; out_y += 8) {
      // Reset the accumulator
      vdup_w_x_m(v48, 0);
      vdup_w_x_m(v52, 0);
      acset_v(v48, v48);
      const int8_t* p_flt = juggled_filter_data[zo_hi / 8][0][0][0][0];
      const int8_t* p_in = input_data + (out_y * input_width * input_depth);
      // Load 10 * 48 = 480 activation bytes into 480 / 32 = 15 registers
      vld_b_x_m(L0, p_in);
      vld_b_x_m(L4, p_in + 32 * 4);
      vld_b_x_m(L8, p_in + 32 * 8);
      vld_b_x(LC, p_in + 32 * 12);
      vld_b_x(LD, p_in + 32 * 13);
      vld_b_x(LE, p_in + 32 * 14);
      // MAC pipeline
      CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt);
      // Output pipeline
      INT32_TO_INT8_OUTPUT_PIPELINE(bias + zo_hi * 4, mult + zo_hi * 4,
                                    shft + zo_hi * 4, output_activation_min,
                                    output_activation_max, output_offset, v36,
                                    v40, v44);
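      // The quantized int8 results are now ready to store; each vstq_b_sp_xx
      // below is assumed to write its quad register group with a stride of
      // output_depth bytes and post-increment p_out (the "sp" form), so each
      // of the 8 output rows receives its 8-channel slice at offset zo_hi.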
      int8_t* p_out =
          output_data + tflite::Offset(output_shape, 0, out_y, 0, zo_hi);
      vstq_b_sp_xx(v48, p_out, output_depth);
      vstq_b_sp_xx(v52, p_out, output_depth);
    }
    // Leftover output rows (a partial minibatch of fewer than 8)
    int remainder = output_height - out_y;
    if (remainder != 0) {
      // Reset the accumulator
      vdup_w_x_m(v48, 0);
      vdup_w_x_m(v52, 0);
      acset_v(v48, v48);
      const int8_t* p_flt = juggled_filter_data[zo_hi / 8][0][0][0][0];
      const int8_t* p_in = input_data + (out_y * input_width * input_depth);
      // Load (remainder + 2) * 48 activations; unused registers stay zeroed.
      // Shown for remainder == 7 (smaller remainders load a prefix):
      // L0 L1 L2 L3 L4 L5 L6 L7 L8 L9 LA LB LC LD
      // Aa AB bB Cc CD dD Ee EF fF Gg GH hH Ii I-
      vld_b_x_m(L0, p_in);
      vdup_w_x_m(L4, 0);
      vdup_w_x_m(L8, 0);
      vdup_w_x_m(LC, 0);
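      // Each case below intentionally falls through: case N tops up the
      // registers needed for N + 2 input pixels. L4 is loaded unconditionally
      // after the switch since even remainder == 1 needs it.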
      switch (remainder) {
        case 7:
          vld_b_x(LD, p_in + 32 * 13);
          vld_b_x(LC, p_in + 32 * 12);
        case 6:
          vld_b_x(LB, p_in + 32 * 11);
        case 5:
          vld_b_x(LA, p_in + 32 * 10);
          vld_b_x(L9, p_in + 32 * 9);
        case 4:
          vld_b_x(L8, p_in + 32 * 8);
        case 3:
          vld_b_x(L7, p_in + 32 * 7);
          vld_b_x(L6, p_in + 32 * 6);
        case 2:
          vld_b_x(L5, p_in + 32 * 5);
        default:
          break;
      }
      vld_b_x(L4, p_in + 32 * 4);
      // MAC pipeline
      CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt);
      // Output pipeline
      INT32_TO_INT8_OUTPUT_PIPELINE(bias + zo_hi * 4, mult + zo_hi * 4,
                                    shft + zo_hi * 4, output_activation_min,
                                    output_activation_max, output_offset, v36,
                                    v40, v44);
      int8_t* p_out =
          output_data + tflite::Offset(output_shape, 0, out_y, 0, zo_hi);
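      // Fewer than 8 rows remain, so spill the quantized results to a scratch
      // buffer (8 rows x 8 channels = 64 bytes) and copy out only the valid
      // 8-channel slice of each remaining row.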
      uint8_t local_data[64];
      vst_b_x(v0, local_data);
      vst_b_x(v1, local_data + 32);
      for (int i = 0; i < remainder; i++) {
        memcpy(p_out + (i * output_depth), local_data + (i * 8), 8);
      }
    }
#undef CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE
#undef L0
#undef L1
#undef L2
#undef L3
#undef L4
#undef L5
#undef L6
#undef L7
#undef L8
#undef L9
#undef LA
#undef LB
#undef LC
#undef LD
#undef LE
  }
}
}  // namespace kelvin::opt