sw/kelvin: Refactor TFLM Kelvin kernel optimization code This is the first version of the TFLM Kelvin kernel refactoring. Change-Id: I596fe47d6b9484f756b4d4df9bf657f22cdea54b
diff --git a/tflm/opt/BUILD b/tflm/opt/BUILD index 19eb017..277eef2 100644 --- a/tflm/opt/BUILD +++ b/tflm/opt/BUILD
@@ -17,8 +17,12 @@ cc_library( name = "opt", srcs = [ - "conv.cc", + "conv_s16_b32.cc", + "conv_s16_b64.cc", "conv_s8.cc", + "conv_s8_1x1.cc", + "conv_s8_3x1_d48.cc", + "conv_s8_d32.cc", "depthwise_conv_s16.cc", "depthwise_conv_s8.cc", "elementwise_add_s16.cc", @@ -26,10 +30,12 @@ "elementwise_add_s8.cc", "leaky_relu_s16.cc", "leaky_relu_s8.cc", + "max_pool_s8.cc", "memcpy.cc", - "max_pool_s8.cc" ], hdrs = [ + "conv_s8.h", + "conv_util.h", "opt.h", "util.h", ],
diff --git a/tflm/opt/conv_s16_b32.cc b/tflm/opt/conv_s16_b32.cc new file mode 100644 index 0000000..07625d0 --- /dev/null +++ b/tflm/opt/conv_s16_b32.cc
@@ -0,0 +1,144 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Convolution based on Kelvin ops +// Data types: input: s16, filter: s8, bias s32 + +#include "tflm/opt/conv_util.h" + +namespace kelvin::opt { +namespace { +void ConvS16B32Generic( + const tflite::ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const tflite::RuntimeShape& input_shape, + const int16_t* input_data, const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, + const int32_t* bias_data, const tflite::RuntimeShape& output_shape, + int16_t* output_data) { + const auto batches = MatchingDim(input_shape, 0, output_shape, 0); + const auto stride_width = params.stride_width; + const auto stride_height = params.stride_height; + const auto dilation_width_factor = params.dilation_width_factor; + const auto dilation_height_factor = params.dilation_height_factor; + const auto pad_width = params.padding_values.width; + const auto pad_height = params.padding_values.height; + const auto input_height = input_shape.Dims(1); + const auto input_width = input_shape.Dims(2); + const auto input_depth = input_shape.Dims(3); + const auto input_offset = params.input_offset; + const auto filter_height = filter_shape.Dims(1); + const auto filter_width = filter_shape.Dims(2); + const auto filter_depth = filter_shape.Dims(3); + const auto output_height = output_shape.Dims(1); + const auto output_width = output_shape.Dims(2); + const auto output_depth = output_shape.Dims(3); + const auto output_offset = params.output_offset; + const auto output_activation_min = params.quantized_activation_min; + const auto output_activation_max = params.quantized_activation_max; + const auto groups = input_depth / filter_depth; + const auto filters_per_group = output_depth / groups; + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = out_y * stride_height - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = out_x * stride_width - pad_width; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + auto group = out_channel / filters_per_group; + int32_t acc32 = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const bool inside = (in_x >= 0) && (in_x < input_width) && + (in_y >= 0) && (in_y < input_height); + if (!inside) { + continue; + } + int in_channel = 0; + do { + int load_count = std::min(filter_depth - in_channel, 16L); + int32_t input_swizzled[16]; + const int16_t* p_input = &input_data[tflite::Offset( + input_shape, batch, in_y, in_x, + in_channel + group * filter_depth)]; + for (int i = 0; i < 16; ++i) { + int swizzle_idx = swizzle[i]; + if (swizzle_idx < 
load_count) + input_swizzled[i] = *(p_input + swizzle_idx) + input_offset; + else + input_swizzled[i] = 0; + } + vld_w_l_xx(v0, input_swizzled, 4); + vld_w_l_xx(v1, input_swizzled + 4, 4); + vld_w_l_xx(v2, input_swizzled + 8, 4); + vld_w_l_xx(v3, input_swizzled + 12, 4); + vld_b_l_xx(v4, + &filter_data[tflite::Offset(filter_shape, + out_channel, filter_y, + filter_x, in_channel)], + load_count); + vaddw_h_vx(v4, v4, 0); + vaddw_w_vx(v6, v5, 0); + vaddw_w_vx(v4, v4, 0); + + vmul_w_vv_m(vm0, vm0, vm1); + vadd_w_vv(v0, v0, v1); + vadd_w_vv(v0, v0, v2); + vadd_w_vv(v0, v0, v3); + int32_t acc_spill[4]; + vst_w_l_xx(v0, acc_spill, 4); + for (int i = 0; i < 4; ++i) { + acc32 += acc_spill[i]; + } + in_channel += 16; + } while (in_channel + 16 <= filter_depth); + } + } + if (bias_data) { + acc32 = acc32 + bias_data[out_channel]; + } + int32_t acc = tflite::MultiplyByQuantizedMultiplier( + acc32, output_multiplier[out_channel], output_shift[out_channel]); + acc += output_offset; + acc = std::clamp(acc, output_activation_min, output_activation_max); + output_data[tflite::Offset(output_shape, batch, out_y, out_x, + out_channel)] = static_cast<int16_t>(acc); + } + } + } + } +} +} // namespace + +void ConvS16B32( + const tflite::ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const tflite::RuntimeShape& input_shape, + const int16_t* input_data, const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, + const int32_t* bias_data, const tflite::RuntimeShape& output_shape, + int16_t* output_data) { + // generic implementation by default + auto fn = ConvS16B32Generic; + + // can add special cases below + + fn(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data); +} + +} // namespace kelvin::opt
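For reference, a minimal scalar sketch of the swizzled input gather used by ConvS16B32Generic above; it assumes the swizzle table provided by conv_util.h matches the constant removed from conv.cc below ({0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15}).

// Scalar model of the per-16-channel gather in ConvS16B32Generic: values are
// reordered by the swizzle table, shifted by input_offset, and lanes past
// load_count are zeroed so they add nothing to the accumulator.
#include <cstdint>

constexpr int kSwizzle[16] = {0, 4, 8, 12, 2, 6, 10, 14,
                              1, 5, 9, 13, 3, 7, 11, 15};

inline void GatherSwizzledInput(const int16_t* p_input, int load_count,
                                int32_t input_offset,
                                int32_t input_swizzled[16]) {
  for (int i = 0; i < 16; ++i) {
    const int idx = kSwizzle[i];
    input_swizzled[i] = (idx < load_count) ? p_input[idx] + input_offset : 0;
  }
}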
diff --git a/tflm/opt/conv.cc b/tflm/opt/conv_s16_b64.cc similarity index 66% rename from tflm/opt/conv.cc rename to tflm/opt/conv_s16_b64.cc index 49d32d5..48823dd 100644 --- a/tflm/opt/conv.cc +++ b/tflm/opt/conv_s16_b64.cc
@@ -1,5 +1,5 @@ /* - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,144 +14,19 @@ * limitations under the License. */ -#include <cassert> -#include <memory> +// Convolution based on Kelvin ops +// Data types: input: s16, filter: s8, bias s64 -#include "crt/kelvin.h" -#include "tensorflow/lite/kernels/internal/common.h" -#include "tensorflow/lite/kernels/internal/runtime_shape.h" -#include "tensorflow/lite/kernels/internal/types.h" -#include "tflm/opt/opt.h" -#include "tflm/opt/util.h" +#include "tflm/opt/conv_util.h" namespace kelvin::opt { namespace { -/* clang-format off */ -constexpr const int swizzle[16] = { - 0, 4, 8, 12, - 2, 6, 10, 14, - 1, 5, 9, 13, - 3, 7, 11, 15, -}; -/* clang-format on */ - -constexpr int kFilterHeightIndex = 1; -constexpr int kFilterWidthIndex = 2; -constexpr int kFilterInputChannelIndex = 3; -constexpr int kInputChannelIndex = 3; -constexpr int kOutputChannelIndex = 3; -} // namespace - -void conv_per_channel_b32( - const tflite::ConvParams& params, const int32_t* output_multiplier, - const int32_t* output_shift, const tflite::RuntimeShape& input_shape, - const int16_t* input_data, const tflite::RuntimeShape& filter_shape, - const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, - const int32_t* bias_data, const tflite::RuntimeShape& output_shape, - int16_t* output_data) { - const auto batches = MatchingDim(input_shape, 0, output_shape, 0); - const auto stride_width = params.stride_width; - const auto stride_height = params.stride_height; - const auto dilation_width_factor = params.dilation_width_factor; - const auto dilation_height_factor = params.dilation_height_factor; - const auto pad_width = params.padding_values.width; - const auto pad_height = params.padding_values.height; - const auto input_height = input_shape.Dims(1); - const auto input_width = input_shape.Dims(2); - const auto input_depth = input_shape.Dims(3); - const auto input_offset = params.input_offset; - const auto filter_height = filter_shape.Dims(1); - const auto filter_width = filter_shape.Dims(2); - const auto filter_depth = filter_shape.Dims(3); - const auto output_height = output_shape.Dims(1); - const auto output_width = output_shape.Dims(2); - const auto output_depth = output_shape.Dims(3); - const auto output_offset = params.output_offset; - const auto output_activation_min = params.quantized_activation_min; - const auto output_activation_max = params.quantized_activation_max; - const auto groups = input_depth / filter_depth; - const auto filters_per_group = output_depth / groups; - - for (int batch = 0; batch < batches; ++batch) { - for (int out_y = 0; out_y < output_height; ++out_y) { - const int in_y_origin = out_y * stride_height - pad_height; - for (int out_x = 0; out_x < output_width; ++out_x) { - const int in_x_origin = out_x * stride_width - pad_width; - for (int out_channel = 0; out_channel < output_depth; ++out_channel) { - auto group = out_channel / filters_per_group; - int32_t acc32 = 0; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { - const int in_y = in_y_origin + dilation_height_factor * filter_y; - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - const bool inside = (in_x >= 0) && (in_x < input_width) && - (in_y >= 0) && (in_y < input_height); - if (!inside) { - continue; - } - int in_channel = 0; - do { - 
int load_count = std::min(filter_depth - in_channel, 16L); - int32_t input_swizzled[16]; - const int16_t* p_input = &input_data[tflite::Offset( - input_shape, batch, in_y, in_x, - in_channel + group * filter_depth)]; - for (int i = 0; i < 16; ++i) { - int swizzle_idx = swizzle[i]; - if (swizzle_idx < load_count) - input_swizzled[i] = *(p_input + swizzle_idx) + input_offset; - else - input_swizzled[i] = 0; - } - vld_w_l_xx(v0, input_swizzled, 4); - vld_w_l_xx(v1, input_swizzled + 4, 4); - vld_w_l_xx(v2, input_swizzled + 8, 4); - vld_w_l_xx(v3, input_swizzled + 12, 4); - vld_b_l_xx(v4, - &filter_data[tflite::Offset(filter_shape, - out_channel, filter_y, - filter_x, in_channel)], - load_count); - vaddw_h_vx(v4, v4, 0); - vaddw_w_vx(v6, v5, 0); - vaddw_w_vx(v4, v4, 0); - - vmul_w_vv_m(vm0, vm0, vm1); - vadd_w_vv(v0, v0, v1); - vadd_w_vv(v0, v0, v2); - vadd_w_vv(v0, v0, v3); - int32_t acc_spill[4]; - vst_w_l_xx(v0, acc_spill, 4); - for (int i = 0; i < 4; ++i) { - acc32 += acc_spill[i]; - } - in_channel += 16; - } while (in_channel + 16 <= filter_depth); - } - } - if (bias_data) { - acc32 = acc32 + bias_data[out_channel]; - } - int32_t acc = tflite::MultiplyByQuantizedMultiplier( - acc32, output_multiplier[out_channel], output_shift[out_channel]); - acc += output_offset; - acc = std::clamp(acc, output_activation_min, output_activation_max); - output_data[tflite::Offset(output_shape, batch, out_y, out_x, - out_channel)] = static_cast<int16_t>(acc); - } - } - } - } -} - // Accumulates in v0-v7. [v0-v3], [v4-v7] are sub accumulators for two outputs. // Load/swizzle filters use [v52-v63]. // Input activations use [v32-v33]. // No clobbers. -void ukernel_s8_s16(const int16_t* input_data0, - const int8_t* filter_data0, - const int8_t* filter_data1, - size_t n) { +void ConvUkernelS8S16(const int16_t* input_data0, const int8_t* filter_data0, + const int8_t* filter_data1, size_t n) { n = n >> 5; while (n > 0) { // Load filters 0 to v58, v59 @@ -185,7 +60,7 @@ } } -void conv_per_channel_b64_1x1( +void ConvS16B64K1x1( const tflite::ConvParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const tflite::RuntimeShape& input_shape, const int16_t* input_data, const tflite::RuntimeShape& filter_shape, @@ -217,8 +92,8 @@ vdup_w_x_m(v0, 0); vdup_w_x_m(v4, 0); - ukernel_s8_s16(group_input, local_filters0, local_filters1, - filter_input_depth); + ConvUkernelS8S16(group_input, local_filters0, local_filters1, + filter_input_depth); // sum accumulators vadd_w_vv(v0, v0, v1); vadd_w_vv(v2, v2, v3); @@ -234,7 +109,7 @@ acc64 += accumulators[i]; } int32_t acc = tflite::MultiplyByQuantizedMultiplier( - acc64, output_multiplier[oc], output_shift[oc]); + acc64, output_multiplier[oc], output_shift[oc]); acc += output_offset; acc = std::clamp(acc, output_activation_min, output_activation_max); local_output[oc] = static_cast<int16_t>(acc); @@ -247,7 +122,7 @@ acc64 += accumulators[i]; } int32_t acc = tflite::MultiplyByQuantizedMultiplier( - acc64, output_multiplier[oc + 1], output_shift[oc + 1]); + acc64, output_multiplier[oc + 1], output_shift[oc + 1]); acc += output_offset; acc = std::clamp(acc, output_activation_min, output_activation_max); local_output[oc + 1] = static_cast<int16_t>(acc); @@ -258,7 +133,7 @@ } // Optimized for grouped convolutions, no dilation, 1xn filter -void conv_per_channel_b64_filter1xn_group( +void ConvS16B64K1xnGroup( const tflite::ConvParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const tflite::RuntimeShape& input_shape, const int16_t* 
input_data, const tflite::RuntimeShape& filter_shape, @@ -292,13 +167,12 @@ filter_data + (oc * filter_width * filter_depth); const int8_t* local_filters1 = local_filters0 + (filter_width * filter_depth); - const int16_t* local_input = input_data + - (b * input_width * input_depth) + - (in_x_origin * input_depth) + - (g * filter_depth); + const int16_t* local_input = + input_data + (b * input_width * input_depth) + + (in_x_origin * input_depth) + (g * filter_depth); int16_t* local_output = output_data + - (b * output_width * output_depth) + - (out_x * output_depth); + (b * output_width * output_depth) + + (out_x * output_depth); int64_t acc64_0 = 0; int64_t acc64_1 = 0; @@ -312,8 +186,8 @@ const int16_t* local_inputx = local_input + (filter_x * input_depth); - ukernel_s8_s16(local_inputx, local_filters0x, local_filters1x, - filter_depth); + ConvUkernelS8S16(local_inputx, local_filters0x, local_filters1x, + filter_depth); } // sum accumulators @@ -331,7 +205,7 @@ } acc64_0 += bias_data[oc]; int32_t acc = tflite::MultiplyByQuantizedMultiplier( - acc64_0, output_multiplier[oc], output_shift[oc]); + acc64_0, output_multiplier[oc], output_shift[oc]); acc += output_offset; acc = std::clamp(acc, output_activation_min, output_activation_max); local_output[oc] = static_cast<int16_t>(acc); @@ -344,7 +218,7 @@ } acc64_1 += bias_data[oc + 1]; int32_t acc = tflite::MultiplyByQuantizedMultiplier( - acc64_1, output_multiplier[oc + 1], output_shift[oc + 1]); + acc64_1, output_multiplier[oc + 1], output_shift[oc + 1]); acc += output_offset; acc = std::clamp(acc, output_activation_min, output_activation_max); local_output[oc + 1] = static_cast<int16_t>(acc); @@ -356,7 +230,7 @@ } // Optimized for no group, no dilation, 1xn filter. -void conv_per_channel_b64_filter1xn_non_group( +void ConvS16B64K1xnNonGroup( const tflite::ConvParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const tflite::RuntimeShape& input_shape, const int16_t* input_data, const tflite::RuntimeShape& filter_shape, @@ -386,16 +260,16 @@ const int8_t* local_filters1 = local_filters0 + (filter_width * filter_depth); const int16_t* local_input = input_data + - (batch * input_width * input_depth) + - (in_x_origin * input_depth); + (batch * input_width * input_depth) + + (in_x_origin * input_depth); int16_t* local_output = output_data + - (batch * output_width * output_depth) + - (out_x * output_depth); + (batch * output_width * output_depth) + + (out_x * output_depth); vdup_w_x_m(v0, 0); vdup_w_x_m(v4, 0); - ukernel_s8_s16(local_input, local_filters0, local_filters1, - filter_width * filter_depth); + ConvUkernelS8S16(local_input, local_filters0, local_filters1, + filter_width * filter_depth); // sum accumulators vadd_w_vv(v0, v0, v1); vadd_w_vv(v2, v2, v3); @@ -410,7 +284,7 @@ acc64 += accumulators[i]; } int32_t acc = tflite::MultiplyByQuantizedMultiplier( - acc64, output_multiplier[oc], output_shift[oc]); + acc64, output_multiplier[oc], output_shift[oc]); acc += output_offset; acc = std::clamp(acc, output_activation_min, output_activation_max); local_output[oc] = static_cast<int16_t>(acc); @@ -423,7 +297,7 @@ acc64 += accumulators[i]; } int32_t acc = tflite::MultiplyByQuantizedMultiplier( - acc64, output_multiplier[oc + 1], output_shift[oc + 1]); + acc64, output_multiplier[oc + 1], output_shift[oc + 1]); acc += output_offset; acc = std::clamp(acc, output_activation_min, output_activation_max); local_output[oc + 1] = static_cast<int16_t>(acc); @@ -433,7 +307,7 @@ } } -void conv_per_channel_b64_generic( +void 
ConvS16B64Generic( const tflite::ConvParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const tflite::RuntimeShape& input_shape, const int16_t* input_data, const tflite::RuntimeShape& filter_shape, @@ -534,51 +408,47 @@ } } } +} // namespace -void conv_per_channel_b64( +void ConvS16B64( const tflite::ConvParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const tflite::RuntimeShape& input_shape, const int16_t* input_data, const tflite::RuntimeShape& filter_shape, const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, const int64_t* bias_data, const tflite::RuntimeShape& output_shape, int16_t* output_data) { - if (filter_shape.Dims(kFilterHeightIndex) == 1 && - output_shape.Dims(kOutputChannelIndex) % 2 == 0) { - if (filter_shape.Dims(kFilterWidthIndex) == 1 && - filter_shape.Dims(kFilterInputChannelIndex) % 32 == 0) { - kelvin::opt::conv_per_channel_b64_1x1( - params, output_multiplier, output_shift, input_shape, input_data, - filter_shape, filter_data, bias_shape, bias_data, output_shape, - output_data); - return; + const auto input_depth = input_shape.Dims(3); + const auto filter_height = filter_shape.Dims(1); + const auto filter_width = filter_shape.Dims(2); + const auto filter_depth = filter_shape.Dims(3); + const auto output_depth = output_shape.Dims(3); + + // generic implementation by default + auto fn = ConvS16B64Generic; + + // special cases + if (filter_height == 1 && output_depth % 2 == 0) { + // 1x1 filter, filter depth = 32n + if (filter_width == 1 && filter_depth % 32 == 0) { + fn = ConvS16B64K1x1; } - // TODO(derekjchow): Check for valid padding - bool group_conv = !(input_shape.Dims(kInputChannelIndex) == - filter_shape.Dims(kFilterInputChannelIndex)); - int32_t fan_in = filter_shape.Dims(kFilterWidthIndex) * - filter_shape.Dims(kFilterInputChannelIndex); + // 1xn non group filter + bool group_conv = !(input_depth == filter_depth); + int32_t fan_in = filter_width * filter_depth; if (!group_conv && fan_in % 32 == 0) { - kelvin::opt::conv_per_channel_b64_filter1xn_non_group( - params, output_multiplier, output_shift, input_shape, input_data, - filter_shape, filter_data, bias_shape, bias_data, output_shape, - output_data); - return; + fn = ConvS16B64K1xnNonGroup; } + // 1xn group filter if (fan_in % 32 == 0) { - kelvin::opt::conv_per_channel_b64_filter1xn_group( - params, output_multiplier, output_shift, input_shape, input_data, - filter_shape, filter_data, bias_shape, bias_data, output_shape, - output_data); - return; + fn = ConvS16B64K1xnGroup; } } - kelvin::opt::conv_per_channel_b64_generic( - params, output_multiplier, output_shift, input_shape, input_data, - filter_shape, filter_data, bias_shape, bias_data, output_shape, - output_data); + fn(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data); } } // namespace kelvin::opt
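A minimal sketch of the ConvS16B64 kernel selection above, restructured as an else-if chain so it mirrors the early returns of the original conv.cc (in particular, the 1xn non-group selection is not overwritten by the 1xn group case). The enum and helper names here are invented for illustration; the predicates match the refactored function.

// Sketch only: same predicates as ConvS16B64, written as an else-if chain.
enum class S16B64Kernel { kGeneric, kK1x1, kK1xnNonGroup, kK1xnGroup };

inline S16B64Kernel SelectS16B64Kernel(int filter_height, int filter_width,
                                       int filter_depth, int input_depth,
                                       int output_depth) {
  S16B64Kernel fn = S16B64Kernel::kGeneric;
  if (filter_height == 1 && output_depth % 2 == 0) {
    const bool group_conv = (input_depth != filter_depth);
    const int fan_in = filter_width * filter_depth;
    if (filter_width == 1 && filter_depth % 32 == 0) {
      fn = S16B64Kernel::kK1x1;            // 1x1 filter, filter depth = 32n
    } else if (!group_conv && fan_in % 32 == 0) {
      fn = S16B64Kernel::kK1xnNonGroup;    // 1xn, non-grouped
    } else if (fan_in % 32 == 0) {
      fn = S16B64Kernel::kK1xnGroup;       // 1xn, grouped
    }
  }
  return fn;
}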
diff --git a/tflm/opt/conv_s8.cc b/tflm/opt/conv_s8.cc index 2da0028..4582b9f 100644 --- a/tflm/opt/conv_s8.cc +++ b/tflm/opt/conv_s8.cc
@@ -14,407 +14,19 @@ * limitations under the License. */ -#include <cstdlib> -#include <memory> +// Convolution based on Kelvin ops +// Data types: input: s8, filter: s8, bias: s32 -#include "crt/kelvin.h" -#include "tensorflow/lite/kernels/internal/common.h" +#include "tflm/opt/conv_s8.h" + +#include <algorithm> + #include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h" -#include "tensorflow/lite/kernels/internal/runtime_shape.h" -#include "tensorflow/lite/kernels/internal/types.h" -#include "tflm/opt/opt.h" -#include "tflm/opt/util.h" +#include "tflm/opt/conv_util.h" namespace kelvin::opt { - namespace { -constexpr int kFilterInputChannelIndex = 3; -constexpr int kOutputWidthIndex = 2; -constexpr int kOutputChannelIndex = 3; - -// Convert: input [zo][ky][kx][zi] (N,4,4,M) -// output [ky][kx][zi_hi=M/4][zo=8][zi_lo=4] -// output [3][3][16][8][4] -void Filter_8_H_W_M(const int8_t* input, int8_t* output, int H, int W, int M) { - const int8_t(&in)[8][H][W][M] = *(int8_t(*)[8][H][W][M])input; - int8_t(&out)[H][W][M / 4][8][4] = *(int8_t(*)[H][W][M / 4][8][4]) output; - assert(M >= 4); - for (int zo = 0; zo < 8; ++zo) { - for (int ky = 0; ky < H; ++ky) { - for (int kx = 0; kx < W; ++kx) { - for (int zi = 0; zi < M; ++zi) { - const int zi_hi = zi >> 2; // div4 - const int zi_lo = zi & 3; // rem4 - out[ky][kx][zi_hi][zo][zi_lo] = in[zo][ky][kx][zi]; - } - } - } - } -} - -void Swizzle(const int32_t* input, int32_t* output, int N) { - const int32_t(&in)[N] = *(int32_t(*)[N])input; - int32_t(&out)[N * 4] = *(int32_t(*)[N * 4]) output; - // Convert to accumulator swizzle pattern. - for (int i = 0; i < N / 8; ++i) { - int32_t* out0 = out + i * 32 + 0; - int32_t* out1 = out + i * 32 + 16; - int32_t* out2 = out + i * 32 + 8; - int32_t* out3 = out + i * 32 + 24; - for (int j = 0; j < 4; ++j) { - const int32_t* p_in = in + i * 8; - for (int k = 0; k < 2; ++k) { - *out0++ = *p_in++; - *out1++ = *p_in++; - *out2++ = *p_in++; - *out3++ = *p_in++; - } - } - } -} - -} // namespace - -void conv_per_channel_pw1_ow8_id8_filterd32( - const tflite::ConvParams& params, const int32_t* output_multiplier, - const int32_t* output_shift, const tflite::RuntimeShape& input_shape, - const int8_t* input_data, const tflite::RuntimeShape& filter_shape, - const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, - const int32_t* bias_data, const tflite::RuntimeShape& output_shape, - int8_t* output_data) { - // Get parameters. - const int32_t input_offset = params.input_offset; // r = s(q - Z) - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const int32_t output_offset = params.output_offset; - - // Set min and max value of the output. - const int32_t output_activation_min = params.quantized_activation_min; - const int32_t output_activation_max = params.quantized_activation_max; - - // Consistency check. 
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); - TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); - TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = input_shape.Dims(3); - const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); - if (bias_data) { - TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); - } - - // Check dimensions of the tensors. - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); - const int filter_input_depth = filter_shape.Dims(3); - const int groups = input_depth / filter_input_depth; - TFLITE_DCHECK_NE(groups, 0); - TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); - const int filters_per_group = output_depth / groups; - TFLITE_DCHECK_NE(filters_per_group, 0); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - - union { - vconv_u8_t conv; - uint32_t raw; - } cmds; - cmds.conv.mode = 0; - cmds.conv.start = 0; - cmds.conv.stop = 7; - cmds.conv.sbias1 = input_offset; - cmds.conv.sdata1 = true; - cmds.conv.sbias2 = 0; - cmds.conv.sdata2 = true; - - const size_t swizzled_filter_data_size = - 8 * filter_height * filter_width * filter_input_depth; - std::unique_ptr<int8_t> swizzled_filter_data(reinterpret_cast<int8_t*>( - ::aligned_alloc(32, swizzled_filter_data_size))); - int8_t* p_swizzled_filter_data = swizzled_filter_data.get(); - int32_t swizzled_bias_data[32]; - int32_t swizzled_mult_data[32]; - int32_t swizzled_shift_data[32]; - - for (int out_channel = 0; out_channel + 8 <= output_depth; out_channel += 8) { - Filter_8_H_W_M(filter_data + (out_channel * filter_height * filter_width * - filter_input_depth), - p_swizzled_filter_data, filter_height, filter_width, - filter_input_depth); - Swizzle(bias_data + out_channel, swizzled_bias_data, 8); - Swizzle(output_multiplier + out_channel, swizzled_mult_data, 8); - Swizzle(output_shift + out_channel, swizzled_shift_data, 8); - vld_w_x_m(v16, swizzled_bias_data); - vld_w_x_m(v20, swizzled_mult_data); - vld_w_x_m(v24, swizzled_shift_data); - vrsub_w_vx_m(v24, v24, 0); - - for (int batch = 0; batch < batches; ++batch) { - for (int out_y = 0; out_y < output_height; ++out_y) { - const int in_y_origin = (out_y * stride_height) - pad_height; - for (int out_x = 0; out_x + 8 <= output_width; out_x += 8) { - // 8x accumulators - vdup_w_x_m(v48, 0); - vdup_w_x_m(v52, 0); - acset_v(v48, v48); - for (int in_channel = 0; in_channel + 32 <= filter_input_depth; - in_channel += 32) { - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { - const int in_y = in_y_origin + dilation_height_factor * filter_y; - const bool is_row_inside_input = - (in_y >= 0) && (in_y < input_height); - if (!is_row_inside_input) { - continue; - } - - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - int in_x[8]; - bool left_pad = false; - bool right_pad = false; - for (int i = 0; i < 8; ++i) { - const int in_x_origin = - ((out_x + i) * stride_width) - pad_width; - in_x[i] = in_x_origin + dilation_width_factor * filter_x; - if (in_x[i] < 0) { - left_pad = true; - } - if (in_x[i] >= input_width) { - right_pad = true; - } - } - - if (left_pad) { - vdup_b_x(v0, -input_offset); - vld_b_s_xx( - v1, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[1], 
in_channel)], - input_depth * stride_width); - vld_b_s_xx( - v2, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[2], in_channel)], - input_depth * stride_width); - vld_b_s_xx( - v3, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[3], in_channel)], - input_depth * stride_width); - vld_b_s_xx_m( - v4, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[4], in_channel)], - input_depth * stride_width); - } else if (right_pad) { - vld_b_s_xx_m( - v0, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[0], in_channel)], - input_depth * stride_width); - vld_b_s_xx( - v4, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[4], in_channel)], - input_depth * stride_width); - vld_b_s_xx( - v5, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[5], in_channel)], - input_depth * stride_width); - vld_b_s_xx( - v6, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[6], in_channel)], - input_depth * stride_width); - vdup_b_x(v7, -input_offset); - } else if (!left_pad && !right_pad) { - // Inputs - vld_b_s_xx_m( - v0, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[0], in_channel)], - input_depth * stride_width); - vld_b_s_xx_m( - v4, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[4], in_channel)], - input_depth * stride_width); - } else { - vdup_b_x(v0, -input_offset); - vdup_b_x(v7, -input_offset); - vld_b_s_xx_m( - v1, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[1], in_channel)], - input_depth * stride_width); - vld_b_s_xx( - v5, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[5], in_channel)], - input_depth * stride_width); - vld_b_s_xx( - v6, - &input_data[tflite::Offset(input_shape, batch, in_y, - in_x[6], in_channel)], - input_depth * stride_width); - } - size_t local_filter_offset = - (filter_y * filter_width * 8 * input_depth) + - (filter_x * 8 * input_depth) + (in_channel * 8); - int8_t* p_local_filter_start = - p_swizzled_filter_data + local_filter_offset; - vld_b_p_x_m(v8, p_local_filter_start); - vld_b_x_m(v12, p_local_filter_start); - - aconv_vxv(v48, v0, cmds, v8); - } - } - } - vcget(v48); - vadd_w_vv_m(v48, v48, v16); - vadd_w_vv_m(v52, v52, v16); - vdmulh_w_r_vv_m(v48, v48, v20); - vdmulh_w_r_vv_m(v52, v52, v20); - vsha_w_r_vv_m(v48, v48, v24); - vsha_w_r_vv_m(v52, v52, v24); - vadd_w_vx_m(v48, v48, output_offset); - vadd_w_vx_m(v52, v52, output_offset); - vmin_w_vx_m(v48, v48, output_activation_max); - vmin_w_vx_m(v52, v52, output_activation_max); - vmax_w_vx_m(v48, v48, output_activation_min); - vmax_w_vx_m(v52, v52, output_activation_min); - vsraqs_b_vx(v56, v48, 0); - vsraqs_b_vx(v57, v52, 0); - vstq_b_s_xx(v56, - &output_data[tflite::Offset(output_shape, batch, out_y, - out_x, out_channel)], - output_depth); - vstq_b_s_xx(v57, - &output_data[tflite::Offset(output_shape, batch, out_y, - out_x + 4, out_channel)], - output_depth); - } - } - } - } -} - -// Fixed-point per-channel-quantization convolution reference kernel. -void conv_per_channel_filterd32( - const tflite::ConvParams& params, const int32_t* output_multiplier, - const int32_t* output_shift, const tflite::RuntimeShape& input_shape, - const int8_t* input_data, const tflite::RuntimeShape& filter_shape, - const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, - const int32_t* bias_data, const tflite::RuntimeShape& output_shape, - int8_t* output_data) { - // Get parameters. 
- const int32_t input_offset = params.input_offset; // r = s(q - Z) - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const int32_t output_offset = params.output_offset; - - // Set min and max value of the output. - const int32_t output_activation_min = params.quantized_activation_min; - const int32_t output_activation_max = params.quantized_activation_max; - - // Consistency check. - TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); - TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); - TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = input_shape.Dims(3); - const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); - if (bias_data) { - TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); - } - - // Check dimensions of the tensors. - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); - const int filter_input_depth = filter_shape.Dims(3); - const int groups = input_depth / filter_input_depth; - TFLITE_DCHECK_NE(groups, 0); - TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); - const int filters_per_group = output_depth / groups; - TFLITE_DCHECK_NE(filters_per_group, 0); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - for (int out_channel = 0; out_channel < output_depth; ++out_channel) { - for (int batch = 0; batch < batches; ++batch) { - for (int out_y = 0; out_y < output_height; ++out_y) { - const int in_y_origin = (out_y * stride_height) - pad_height; - for (int out_x = 0; out_x < output_width; ++out_x) { - const int in_x_origin = (out_x * stride_width) - pad_width; - vdup_w_x_m(v60, 0); - int32_t acc = 0; - for (int in_channel = 0; in_channel + 32 <= filter_input_depth; - in_channel += 32) { - for (int filter_y = 0; filter_y < filter_height; ++filter_y) { - const int in_y = in_y_origin + dilation_height_factor * filter_y; - for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - - // Zero padding by omitting the areas outside the image. 
- const bool is_point_inside_image = - (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && - (in_y < input_height); - - if (!is_point_inside_image) { - continue; - } - - vld_b_x(v0, &input_data[tflite::Offset(input_shape, batch, in_y, - in_x, in_channel)]); - vaddw_h_vx(v0, v0, 0); - vadd_h_vx(v0, v0, static_cast<int16_t>(input_offset)); - vadd_h_vx(v1, v1, static_cast<int16_t>(input_offset)); - vld_b_x(v2, &filter_data[tflite::Offset(filter_shape, - out_channel, filter_y, - filter_x, in_channel)]); - vaddw_h_vx(v2, v2, 0); - vmulw_w_vv(v48, v0, v2); - vmulw_w_vv(v50, v1, v3); - vadd_w_vv_m(v60, v60, v48); - } - } - } - int32_t accumulators[32]; - vst_w_x_m(v60, accumulators); - for (int i = 0; i < 32; ++i) { - acc += accumulators[i]; - } - - if (bias_data) { - acc += bias_data[out_channel]; - } - acc = tflite::MultiplyByQuantizedMultiplier( - acc, output_multiplier[out_channel], output_shift[out_channel]); - acc += output_offset; - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_data[tflite::Offset(output_shape, batch, out_y, out_x, - out_channel)] = static_cast<int8_t>(acc); - } - } - } - } -} - -void conv_per_channel_generic( +void ConvS8Generic( const tflite::ConvParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const tflite::RuntimeShape& input_shape, const int8_t* input_data, const tflite::RuntimeShape& filter_shape, @@ -443,6 +55,17 @@ const auto output_activation_max = params.quantized_activation_max; const auto groups = input_depth / filter_depth; const auto filters_per_group = output_depth / groups; + + if (pad_width > 0 || pad_height > 0 || dilation_width_factor > 1 || + dilation_height_factor > 1) { + // use reference implementation + tflite::reference_integer_ops::ConvPerChannel( + params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data); + return; + } + union { vconv_u8_t conv; uint32_t raw; @@ -457,8 +80,8 @@ // Zero out accumulators. 
vdup_b_x(v0, 0); - acset_v(v48, v0); - vdup_b_x_m(v48, 0); + acset_v(ACC, v0); + vdup_b_x_m(ACC0, 0); for (int batch = 0; batch < batches; ++batch) { for (int out_y = 0; out_y < output_height; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; @@ -489,111 +112,114 @@ i; vdup_w_x_m(vm0, 0); vdup_w_x_m(vm1, 0); - vld_b_l_xx(v0, &input_data[input_offset], count); + vld_b_l_xx(INA0, &input_data[input_offset], count); int filter_offset = tflite::Offset(filter_shape, out_channel, filter_y, 0, 0) + i; - vdup_w_x_m(v8, 0); - vdup_w_x_m(v12, 0); + vdup_w_x_m(FLTA0, 0); + vdup_w_x_m(FLTA4, 0); if (count > 0) { - vld_b_l_xx(v8, &filter_data[filter_offset], std::min(count, 4)); + vld_b_l_xx(FLTA0, &filter_data[filter_offset], + std::min(count, 4)); } if (count > 4) { - vld_b_l_xx(v9, &filter_data[filter_offset + 4], + vld_b_l_xx(FLTA1, &filter_data[filter_offset + 4], std::min(count - 4, 4)); } if (count > 8) { - vld_b_l_xx(v10, &filter_data[filter_offset + 8], + vld_b_l_xx(FLTA2, &filter_data[filter_offset + 8], std::min(count - 8, 4)); } if (count > 12) { - vld_b_l_xx(v11, &filter_data[filter_offset + 12], + vld_b_l_xx(FLTA3, &filter_data[filter_offset + 12], std::min(count - 12, 4)); } if (count > 16) { - vld_b_l_xx(v12, &filter_data[filter_offset + 16], + vld_b_l_xx(FLTA4, &filter_data[filter_offset + 16], std::min(count - 16, 4)); } if (count > 20) { - vld_b_l_xx(v13, &filter_data[filter_offset + 20], + vld_b_l_xx(FLTA5, &filter_data[filter_offset + 20], std::min(count - 20, 4)); } if (count > 24) { - vld_b_l_xx(v14, &filter_data[filter_offset + 24], + vld_b_l_xx(FLTA6, &filter_data[filter_offset + 24], std::min(count - 24, 4)); } if (count > 28) { - vld_b_l_xx(v15, &filter_data[filter_offset + 28], + vld_b_l_xx(FLTA7, &filter_data[filter_offset + 28], std::min(count - 28, 4)); } - aconv_vxv(v48, v0, cmds, v8); + aconv_vxv(ACC, INA0, cmds, FLTA0); } } - vcget(v48); - vadd_w_vx_m(v48, v48, bias_data[out_channel]); - vsll_w_vx_m(v48, v48, LEFT_SHIFT(output_shift[out_channel])); - vdmulh_w_r_vx_m(v48, v48, output_multiplier[out_channel]); - vsha_w_r_vx_m(v48, v48, RIGHT_SHIFT(output_shift[out_channel])); - vadd_w_vx_m(v48, v48, output_offset); - vmin_w_vx_m(v48, v48, output_activation_max); - vmax_w_vx_m(v48, v48, output_activation_min); - vsraqs_b_vx(v56, v48, 0); + vcget(ACC); + vadd_w_vx_m(ACC0, ACC0, bias_data[out_channel]); + vsll_w_vx_m(ACC0, ACC0, LEFT_SHIFT(output_shift[out_channel])); + vdmulh_w_r_vx_m(ACC0, ACC0, output_multiplier[out_channel]); + vsha_w_r_vx_m(ACC0, ACC0, RIGHT_SHIFT(output_shift[out_channel])); + vadd_w_vx_m(ACC0, ACC0, output_offset); + vmin_w_vx_m(ACC0, ACC0, output_activation_max); + vmax_w_vx_m(ACC0, ACC0, output_activation_min); + vsraqs_b_vx(OUT0, ACC0, 0); size_t output_offset = tflite::Offset(output_shape, batch, out_y, out_x, out_channel); - vst_b_l_xx(v56, &output_data[output_offset], 1); + vst_b_l_xx(OUT0, &output_data[output_offset], 1); } } } } } +} // namespace -void conv_per_channel_b8( - const tflite::ConvParams& params, const int32_t* output_multiplier, - const int32_t* output_shift, const tflite::RuntimeShape& input_shape, - const int8_t* input_data, const tflite::RuntimeShape& filter_shape, - const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, - const int32_t* bias_data, const tflite::RuntimeShape& output_shape, - int8_t* output_data) { +void ConvS8(const tflite::ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, + const tflite::RuntimeShape& input_shape, const int8_t* input_data, + 
const tflite::RuntimeShape& filter_shape, const int8_t* filter_data, + const tflite::RuntimeShape& bias_shape, const int32_t* bias_data, + const tflite::RuntimeShape& output_shape, int8_t* output_data) { + const auto batches = MatchingDim(input_shape, 0, output_shape, 0); const auto stride_width = params.stride_width; const auto stride_height = params.stride_height; const auto dilation_width_factor = params.dilation_width_factor; const auto dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; + const auto pad_width = params.padding_values.width; + const auto pad_height = params.padding_values.height; + const auto input_width = input_shape.Dims(2); + const auto input_depth = input_shape.Dims(3); + const auto filter_height = filter_shape.Dims(1); + const auto filter_width = filter_shape.Dims(2); + const auto filter_depth = filter_shape.Dims(3); + const auto output_depth = output_shape.Dims(3); + // use generic implementation by default + auto fn = ConvS8Generic; + + // special case of filter depth = 32n if (dilation_width_factor == 1 && dilation_height_factor == 1 && - stride_width <= 2 && stride_height <= 2) { - if (filter_shape.Dims(kFilterInputChannelIndex) % 32 == 0 && - output_shape.Dims(kOutputChannelIndex) % 8 == 0 && - output_shape.Dims(kOutputWidthIndex) % 8 == 0 && pad_width <= 1) { - conv_per_channel_pw1_ow8_id8_filterd32( - params, output_multiplier, output_shift, input_shape, input_data, - filter_shape, filter_data, bias_shape, bias_data, output_shape, - output_data); - return; - } else if (filter_shape.Dims(kFilterInputChannelIndex) % 32 == 0) { - conv_per_channel_filterd32(params, output_multiplier, output_shift, - input_shape, input_data, filter_shape, - filter_data, bias_shape, bias_data, - output_shape, output_data); - return; - } + stride_width <= 2 && stride_height <= 2 && filter_depth % 32 == 0) { + fn = kelvin::opt::ConvS8D32; } - if (stride_width == 1 && stride_height == 1 && dilation_width_factor == 1 && - dilation_height_factor == 1) { - if (pad_width == 0 && pad_height == 0) { - conv_per_channel_generic(params, output_multiplier, output_shift, - input_shape, input_data, filter_shape, - filter_data, bias_shape, bias_data, output_shape, - output_data); - return; - } + // special case of filter size 1x1 + if (filter_height == 1 && filter_width == 1 && stride_height == 1 && + stride_width == 1 && dilation_height_factor == 1 && + dilation_width_factor == 1 && pad_height == 0 && pad_width == 0 && + (output_depth % 8) == 0 && (input_depth % 32) == 0) { + // TODO(ndodda): uncomment it when all tests are passed + // fn = kelvin::opt::ConvS8K1x1; } - tflite::reference_integer_ops::ConvPerChannel( - params, output_multiplier, output_shift, input_shape, input_data, - filter_shape, filter_data, bias_shape, bias_data, output_shape, - output_data); + // special case of filter size 48x3x1x48 + if (batches == 1 && filter_height == 3 && filter_width == 1 && + input_width == 1 && input_depth == 48 && output_depth == 48 && + stride_height == 1 && stride_width == 1 && dilation_height_factor == 1 && + dilation_width_factor == 1 && pad_height == 0 && pad_width == 0) { + fn = kelvin::opt::ConvS8K3x1D48; + } + + fn(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data); } } // namespace kelvin::opt
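The output pipeline in ConvS8Generic (vsll/vdmulh/vsha with LEFT_SHIFT/RIGHT_SHIFT, then offset and clamp) performs the usual per-channel requantization; below is a scalar sketch of the intended arithmetic, assuming it matches tflite::MultiplyByQuantizedMultiplier. Saturation of the INT32_MIN * INT32_MIN corner case is omitted.

// Scalar sketch of per-channel requantization: rounding-doubling high
// multiply followed by a rounding arithmetic right shift, then offset/clamp.
#include <algorithm>
#include <cstdint>

inline int32_t RequantizeToS8(int32_t acc, int32_t multiplier, int32_t shift,
                              int32_t output_offset, int32_t act_min,
                              int32_t act_max) {
  const int left_shift = shift > 0 ? shift : 0;    // LEFT_SHIFT(shift)
  const int right_shift = shift > 0 ? 0 : -shift;  // RIGHT_SHIFT(shift)
  // Rounding-doubling high multiply (vdmulh_w_r_vx_m).
  const int64_t ab = (static_cast<int64_t>(acc) << left_shift) * multiplier;
  const int64_t nudge = ab >= 0 ? (int64_t{1} << 30) : 1 - (int64_t{1} << 30);
  int32_t x = static_cast<int32_t>((ab + nudge) / (int64_t{1} << 31));
  // Rounding arithmetic right shift (vsha_w_r_vx_m with a negated shift).
  if (right_shift > 0) {
    const int32_t mask = (1 << right_shift) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    x = (x >> right_shift) + (remainder > threshold ? 1 : 0);
  }
  x += output_offset;
  return std::clamp(x, act_min, act_max);
}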
diff --git a/tflm/opt/conv_s8.h b/tflm/opt/conv_s8.h new file mode 100644 index 0000000..e1d88ef --- /dev/null +++ b/tflm/opt/conv_s8.h
@@ -0,0 +1,57 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TFLM_OPT_CONV_S8_H_ +#define TFLM_OPT_CONV_S8_H_ + +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/runtime_shape.h" + +namespace kelvin::opt { + +// filter 1x1 +void ConvS8K1x1(const tflite::ConvParams& params, + const int32_t* output_multiplier, const int32_t* output_shift, + const tflite::RuntimeShape& input_shape, + const int8_t* input_data, + const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, + const tflite::RuntimeShape& bias_shape, + const int32_t* bias_data, + const tflite::RuntimeShape& output_shape, int8_t* output_data); + +// filter depth 32n +void ConvS8D32(const tflite::ConvParams& params, + const int32_t* output_multiplier, const int32_t* output_shift, + const tflite::RuntimeShape& input_shape, + const int8_t* input_data, + const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, + const tflite::RuntimeShape& bias_shape, const int32_t* bias_data, + const tflite::RuntimeShape& output_shape, int8_t* output_data); + +// filter size 48x3x1x48 +void ConvS8K3x1D48( + const tflite::ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const tflite::RuntimeShape& input_shape, + const int8_t* input_data, const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, + const int32_t* bias_data, const tflite::RuntimeShape& output_shape, + int8_t* output_data); + +} // namespace kelvin::opt + +#endif // TFLM_OPT_CONV_S8_H_
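conv_s8.h is the intended extension point for further special cases. A hypothetical example of wiring up one more kernel (ConvS8K5x5 is an invented name, used only for illustration): declare it here, implement it in a new conv_s8_*.cc listed in the BUILD srcs, and select it from ConvS8 in conv_s8.cc. The snippet assumes the same includes as conv_s8.h above.

// Hypothetical only -- not part of this change.
// 1) Declaration added alongside the ones above:
void ConvS8K5x5(const tflite::ConvParams& params,
                const int32_t* output_multiplier, const int32_t* output_shift,
                const tflite::RuntimeShape& input_shape,
                const int8_t* input_data,
                const tflite::RuntimeShape& filter_shape,
                const int8_t* filter_data,
                const tflite::RuntimeShape& bias_shape,
                const int32_t* bias_data,
                const tflite::RuntimeShape& output_shape, int8_t* output_data);

// 2) Selection added to ConvS8 in conv_s8.cc:
//    if (filter_height == 5 && filter_width == 5 && /* other preconditions */) {
//      fn = kelvin::opt::ConvS8K5x5;
//    }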
diff --git a/tflm/opt/conv_s8_1x1.cc b/tflm/opt/conv_s8_1x1.cc new file mode 100644 index 0000000..9da99c3 --- /dev/null +++ b/tflm/opt/conv_s8_1x1.cc
@@ -0,0 +1,97 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Convolution based on Kelvin ops +// Data types: input: s8, filter: s8, bias: s32 +// Special case for 1x1 filter + +#include "tflm/opt/conv_util.h" + +namespace kelvin::opt { + +void ConvS8K1x1(const tflite::ConvParams& params, + const int32_t* output_multiplier, const int32_t* output_shift, + const tflite::RuntimeShape& input_shape, + const int8_t* input_data, + const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, + const tflite::RuntimeShape& bias_shape, + const int32_t* bias_data, + const tflite::RuntimeShape& output_shape, int8_t* output_data) { + const auto batches = MatchingDim(input_shape, 0, output_shape, 0); + const auto input_depth = input_shape.Dims(3); + const auto input_offset = params.input_offset; + const auto output_height = output_shape.Dims(1); + const auto output_width = output_shape.Dims(2); + const auto output_depth = output_shape.Dims(3); + const auto output_offset = params.output_offset; + const auto output_activation_min = params.quantized_activation_min; + const auto output_activation_max = params.quantized_activation_max; + // TODO: support group convolutions. + int32_t bias[8 * 4]; + int32_t mult[8 * 4]; + int32_t shft[8 * 4]; + union { + vconv_u8_t conv; + uint32_t raw; + } cmds; + cmds.conv.mode = 0; + cmds.conv.start = 0; + cmds.conv.stop = 7; + cmds.conv.sbias1 = input_offset; + cmds.conv.sdata1 = true; + cmds.conv.sbias2 = 0; + cmds.conv.sdata2 = true; + for (int zo_hi = 0; zo_hi < output_depth; zo_hi += 8) { + // transpose filter weights to support outer product multiplication + int8_t juggled_filter_data[1][1][1][input_depth / 4][8][4]; + Filter_N_H_W_M<8>(filter_data, juggled_filter_data[0][0][0][0][0], 1, 1, + 32); + + Swizzle(bias_data, bias, 8); + Swizzle(output_multiplier, mult, 8); + Swizzle(output_shift, shft, 8, true); + int out = 0; + for (; out + 8 <= output_height * output_width * batches; out += 8) { + // resetting accumulators to clean up old output + vdup_b_x_m(v48, 0); + vdup_b_x_m(v52, 0); + + int in = 0; + for (; in <= input_depth; in += 32) { + vld_b_s_xx_m(v0, input_data + out * input_depth + in, input_depth); + vld_b_s_xx_m(v4, input_data + out * input_depth + in + 4 * input_depth, + input_depth); + + vld_b_x_m(v8, juggled_filter_data[0][0][0][in / 32][0][0]); + vld_b_x_m(v12, juggled_filter_data[0][0][0][(in / 32) + 4][0][0]); + + aconv_vxv(v48, v0, cmds, v8); + } + + INT32_TO_INT8_OUTPUT_PIPELINE(bias, mult, shft, output_activation_min, + output_activation_max, output_offset, v16, + v20, v24); + + // store the results to output memory + int8_t* p_out = output_data + (out * output_depth) + zo_hi; + vstq_b_sp_xx(v48, p_out, output_depth); + vstq_b_sp_xx(v52, p_out, output_depth); + } + } +} + +} // namespace kelvin::opt
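ConvS8K1x1 (and ConvS8K3x1D48 below) rely on Filter_N_H_W_M from conv_util.h to re-lay out the weights for the aconv outer product; here is a scalar sketch of that transform, assuming it generalizes the Filter_8_H_W_M helper removed from conv_s8.cc in this change.

// Scalar sketch of the filter re-layout (assumed to match conv_util.h's
// Filter_N_H_W_M, generalizing the removed Filter_8_H_W_M).
// input:  [zo][ky][kx][zi]                  (N, H, W, M)
// output: [ky][kx][zi_hi = M/4][zo = N][zi_lo = 4]
#include <cassert>
#include <cstdint>

template <int N>
void FilterNHWM(const int8_t* input, int8_t* output, int H, int W, int M) {
  assert(M >= 4 && M % 4 == 0);
  for (int zo = 0; zo < N; ++zo) {
    for (int ky = 0; ky < H; ++ky) {
      for (int kx = 0; kx < W; ++kx) {
        for (int zi = 0; zi < M; ++zi) {
          const int zi_hi = zi >> 2;  // zi / 4
          const int zi_lo = zi & 3;   // zi % 4
          // out[ky][kx][zi_hi][zo][zi_lo] = in[zo][ky][kx][zi]
          output[(((ky * W + kx) * (M / 4) + zi_hi) * N + zo) * 4 + zi_lo] =
              input[((zo * H + ky) * W + kx) * M + zi];
        }
      }
    }
  }
}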
diff --git a/tflm/opt/conv_s8_3x1_d48.cc b/tflm/opt/conv_s8_3x1_d48.cc new file mode 100644 index 0000000..70a23b0 --- /dev/null +++ b/tflm/opt/conv_s8_3x1_d48.cc
@@ -0,0 +1,321 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Convolution based on Kelvin ops +// Data types: input: s8, filter: s8, bias: s32 +// Special case for 48x3x1x48 filter + +#include "tflm/opt/conv_util.h" + +namespace kelvin::opt { + +void ConvS8K3x1D48( + const tflite::ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const tflite::RuntimeShape& input_shape, + const int8_t* input_data, const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, + const int32_t* bias_data, const tflite::RuntimeShape& output_shape, + int8_t* output_data) { + const auto batches = MatchingDim(input_shape, 0, output_shape, 0); + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int32_t input_offset = params.input_offset; + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_depth = filter_shape.Dims(3); + const int output_height = output_shape.Dims(1); + const int output_depth = output_shape.Dims(3); + const int32_t output_offset = params.output_offset; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + TFLITE_DCHECK(batches == 1); + TFLITE_DCHECK(filter_depth == input_depth); + TFLITE_DCHECK(filter_height == 3); + TFLITE_DCHECK(filter_width == 1); + TFLITE_DCHECK(input_width == 1); + TFLITE_DCHECK(stride_width == 1); + TFLITE_DCHECK(stride_height == 1); + TFLITE_DCHECK(dilation_width_factor == 1); + TFLITE_DCHECK(dilation_height_factor == 1); + TFLITE_DCHECK(pad_width == 0); + TFLITE_DCHECK(pad_height == 0); + + int32_t bias[48 * 4]; + int32_t mult[48 * 4]; + int32_t shft[48 * 4]; + Swizzle(bias_data, bias, 48); + Swizzle(output_multiplier, mult, 48); + Swizzle(output_shift, shft, 48, true); + + int8_t juggled_filter_data[48 / 8][3][1][48 / 4][8][4]; + Filter_N_H_W_M<48>(filter_data, juggled_filter_data[0][0][0][0][0], 3, 1, 48); + union { + vconv_u8_t conv; + uint32_t raw; + } cmds; + cmds.conv.mode = 0; + cmds.conv.start = 0; + cmds.conv.stop = 7; + cmds.conv.sbias1 = input_offset; + cmds.conv.sdata1 = true; + cmds.conv.sbias2 = 0; + cmds.conv.sdata2 = true; + + union { + vconv_u8_t conv; + uint32_t raw; + } cmds16; + cmds16.conv.mode = 0; + cmds16.conv.start = 0; + cmds16.conv.stop = 3; + cmds16.conv.sbias1 = input_offset; + cmds16.conv.sdata1 = true; + cmds16.conv.sbias2 = 0; + cmds16.conv.sdata2 = true; + + for (int zo_hi = 0; zo_hi < output_depth; zo_hi += 8) { +// For each pixel, the general flow for this 
kernel looks like: +// 1) Reset accumulator and load activations into [v32, v46] +// 2) For each group of 32 scalars in the pixel fan-in, run MAC pipeline +// 2a) Load subset of activations from [v32, v46] to [v0, v7] +// 2b) Load subset of weights +// 2c) Run aconv +// 3) Run the output pipeline and store. +// +// For step 1, we'll alias [v32, v46] to [L0, LE]. For most iterations, +// we load all of these registers (10 pixels). For remainder iterations, +// we load a subset and pad the rest with 0's. The data will be stored as +// follows, where each letter represents 16 bytes of a pixel stored into +// a register (capitalization used to help distinguish channels in a pixel): +// L0 L1 L2 L3 L4 L5 L6 L7 L8 L9 LA LB LC LD LE +// Aa AB bB Cc CD dD Ee EF fF Gg GH hH Ii IJ jJ +#define L0 v32 +#define L1 v33 +#define L2 v34 +#define L3 v35 +#define L4 v36 +#define L5 v37 +#define L6 v38 +#define L7 v39 +#define L8 v40 +#define L9 v41 +#define LA v42 +#define LB v43 +#define LC v44 +#define LD v45 +#define LE v46 + +// We run 5 iterations of step 2, 4 full iterations and one half iteration. +// Because each pixel takes 1.5 registers, we have to interleave vmv_v and +// vsliden_w_4_vv instructions to ensure the same output channels are stored +// in each register per-pixel. As a refresher, vsliden_w_4_vv takes two +// register arguments (X and Y), and returns the concatenation of the last +// half of X and the first half of Y. ie: +// L1 L2 +// AB bB +// vsliden_w_4_vv(v1, L1, L2); -> v1 = Bb +#define CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt) \ + { \ + /* 1/5 */ \ + /* Ky = 0, IC:[0-31] */ \ + vmv_v(v0, L0); /* Aa */ \ + vsliden_w_4_vv(v1, L1, L2); /* Bb */ \ + vmv_v(v2, L3); /* Cc */ \ + vsliden_w_4_vv(v3, L4, L5); /* Dd */ \ + vmv_v(v4, L6); /* Ee */ \ + vsliden_w_4_vv(v5, L7, L8); /* Ff */ \ + vmv_v(v6, L9); /* Gg */ \ + vsliden_w_4_vv(v7, LA, LB); /* Hh */ \ + vld_b_x_m(v56, p_flt + 128 * 0); \ + vld_b_x_m(v60, p_flt + 128 * 1); \ + aconv_vxv(v48, v0, cmds, v56); \ + \ + /* 2/5 */ \ + /* Ky = 0, IC:[32-47]; Ky = 1, IC:[0-15] */ \ + vmv_v(v0, L1); /* AB */ \ + vsliden_w_4_vv(v1, L2, L3); /* BC */ \ + vmv_v(v2, L4); /* CD */ \ + vsliden_w_4_vv(v3, L5, L6); /* DE */ \ + vmv_v(v4, L7); /* EF */ \ + vsliden_w_4_vv(v5, L8, L9); /* FG */ \ + vmv_v(v6, LA); /* GH */ \ + vsliden_w_4_vv(v7, LB, LC); /* HI */ \ + vld_b_x_m(v56, p_flt + 128 * 2); \ + vld_b_x_m(v60, p_flt + 128 * 3); \ + aconv_vxv(v48, v0, cmds, v56); \ + \ + /* 3/5 */ \ + /* Ky = 1, IC:[16-47] */ \ + vmv_v(v0, L2); /* bB */ \ + vsliden_w_4_vv(v1, L3, L4); /* cC */ \ + vmv_v(v2, L5); /* dD */ \ + vsliden_w_4_vv(v3, L6, L7); /* eE */ \ + vmv_v(v4, L8); /* fF */ \ + vsliden_w_4_vv(v5, L9, LA); /* gG */ \ + vmv_v(v6, LB); /* hH */ \ + vsliden_w_4_vv(v7, LC, LD); /* iI */ \ + vld_b_x_m(v56, p_flt + 128 * 4); \ + vld_b_x_m(v60, p_flt + 128 * 5); \ + aconv_vxv(v48, v0, cmds, v56); \ + \ + /* 4/5 */ \ + /* Ky = 2, IC:[0-31] */ \ + vmv_v(v0, L3); /* Cc */ \ + vsliden_w_4_vv(v1, L4, L5); /* Dd */ \ + vmv_v(v2, L6); /* Ee */ \ + vsliden_w_4_vv(v3, L4, L5); /* Ff */ \ + vmv_v(v4, L9); /* Gg */ \ + vsliden_w_4_vv(v5, LA, LB); /* Hh */ \ + vmv_v(v6, LC); /* Ii */ \ + vsliden_w_4_vv(v7, LD, LE); /* Jj */ \ + vld_b_x_m(v56, p_flt + 128 * 6); \ + vld_b_x_m(v60, p_flt + 128 * 7); \ + aconv_vxv(v48, v0, cmds, v56); \ + \ + /* 5/5 */ \ + /* Ky = 2, IC:[32-47] half iteration */ \ + vmv_v(v0, L4); /* C(D- ignored) */ \ + vsliden_w_4_vv(v1, L5, L6); /* D(E- ignored) */ \ + vmv_v(v2, L7); /* E(F- ignored) */ \ + vsliden_w_4_vv(v3, L8, L9); /* F(G- 
ignored) */ \ + vmv_v(v4, LA); /* G(H- ignored) */ \ + vsliden_w_4_vv(v5, LB, LC); /* H(I- ignored) */ \ + vmv_v(v6, LD); /* I(J- ignored) */ \ + /* Pad last iteration with first pixel. Gets ignored by cmd16 */ \ + vsliden_w_4_vv(v7, LE, L0); /* J(A- ignored) */ \ + vld_b_x_m(v56, p_flt + 128 * 8); /*Load once half iteration*/ \ + /* cmds16 runs subset of outer product */ \ + aconv_vxv(v48, v0, cmds16, v56); \ + } + + // Iterate over outputs + int out_y = 0; + for (; out_y + 8 <= output_height; out_y += 8) { + // Reset accumulator + vdup_w_x_m(v48, 0); + vdup_w_x_m(v52, 0); + acset_v(v48, v48); + + const int8_t* p_flt = juggled_filter_data[zo_hi / 8][0][0][0][0]; + const int8_t* p_in = input_data + (out_y * input_width * input_depth); + + // Load 10*48 activations into 10*48*32 = 15 registers + vld_b_x_m(L0, p_in); + vld_b_x_m(L4, p_in + 32 * 4); + vld_b_x_m(L8, p_in + 32 * 8); + vld_b_x(LC, p_in + 32 * 12); + vld_b_x(LD, p_in + 32 * 13); + vld_b_x(LE, p_in + 32 * 14); + + // MAC pipeline + CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt); + + // Output pipeline + INT32_TO_INT8_OUTPUT_PIPELINE(bias + zo_hi * 4, mult + zo_hi * 4, + shft + zo_hi * 4, output_activation_min, + output_activation_max, output_offset, v36, + v40, v44); + int8_t* p_out = + output_data + tflite::Offset(output_shape, 0, out_y, 0, zo_hi); + vstq_b_sp_xx(v48, p_out, output_depth); + vstq_b_sp_xx(v52, p_out, output_depth); + } + + // Left over minibatch + int remainder = output_height - out_y; + if (remainder != 0) { + // Reset accumulator + vdup_w_x_m(v48, 0); + vdup_w_x_m(v52, 0); + acset_v(v48, v48); + + const int8_t* p_flt = juggled_filter_data[zo_hi / 8][0][0][0][0]; + const int8_t* p_in = input_data + (out_y * input_width * input_depth); + + // Load (remainder + 2) * 48 activations + // L0 L1 L2 L3 L4 L5 L6 L7 L8 L9 LA LB LC LD + // AA AB BB CC CD DD EE EF FF GG GH HH II I- + vld_b_x_m(L0, p_in); + vdup_w_x_m(L4, 0); + vdup_w_x_m(L8, 0); + vdup_w_x_m(LC, 0); + switch (remainder) { + case 7: + vld_b_x(LD, p_in + 32 * 13); + vld_b_x(LC, p_in + 32 * 12); + case 6: + vld_b_x(LB, p_in + 32 * 11); + case 5: + vld_b_x(LA, p_in + 32 * 10); + vld_b_x(L9, p_in + 32 * 9); + case 4: + vld_b_x(L8, p_in + 32 * 8); + case 3: + vld_b_x(L7, p_in + 32 * 7); + vld_b_x(L6, p_in + 32 * 6); + case 2: + vld_b_x(L5, p_in + 32 * 5); + default: + break; + } + vld_b_x(L4, p_in + 32 * 4); + + // MAC pipeline + CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE(p_flt); + + // Output pipeline + INT32_TO_INT8_OUTPUT_PIPELINE(bias + zo_hi * 4, mult + zo_hi * 4, + shft + zo_hi * 4, output_activation_min, + output_activation_max, output_offset, v36, + v40, v44); + + int8_t* p_out = + output_data + tflite::Offset(output_shape, 0, out_y, 0, zo_hi); + uint8_t local_data[64]; + vst_b_x(v0, local_data); + vst_b_x(v1, local_data + 32); + for (int i = 0; i < remainder; i++) { + memcpy(p_out + (i * output_depth), local_data + (i * 8), 8); + } + } + +#undef CONV_PER_CHANNEL_B8_3X1_48C_MAC_PIPELINE +#undef L0 +#undef L1 +#undef L2 +#undef L3 +#undef L4 +#undef L5 +#undef L6 +#undef L7 +#undef L8 +#undef L9 +#undef LA +#undef LB +#undef LC +#undef LD +#undef LE + } +} + +} // namespace kelvin::opt
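The MAC-pipeline comments above describe vsliden_w_4_vv as concatenating the last half of its first operand with the first half of its second; below is a scalar model of that behavior, with registers modeled as eight 32-bit lanes, matching the "L1, L2 -> Bb" example in the comment.

// Scalar model of the vsliden_w_4_vv behavior described in the comment:
// the result is the last 4 words of x followed by the first 4 words of y.
#include <array>
#include <cstdint>

using VReg = std::array<int32_t, 8>;

inline VReg SlidenW4(const VReg& x, const VReg& y) {
  VReg out{};
  for (int i = 0; i < 4; ++i) {
    out[i] = x[i + 4];   // last half of x
    out[i + 4] = y[i];   // first half of y
  }
  return out;
}
// Example from the comment: with L1 = "AB" and L2 = "bB",
// SlidenW4(L1, L2) yields "Bb".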
diff --git a/tflm/opt/conv_s8_d32.cc b/tflm/opt/conv_s8_d32.cc new file mode 100644 index 0000000..e3e7e10 --- /dev/null +++ b/tflm/opt/conv_s8_d32.cc
@@ -0,0 +1,376 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Convolution based on Kelvin ops +// Data types: input: s8, filter: s8, bias: s32 +// Special case for filter depth = 32n + +#include "tflm/opt/conv_util.h" + +namespace kelvin::opt { +namespace { +void ConvS8D32Pw1Ow8Id8( + const tflite::ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const tflite::RuntimeShape& input_shape, + const int8_t* input_data, const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, + const int32_t* bias_data, const tflite::RuntimeShape& output_shape, + int8_t* output_data) { + // Get parameters. + const int32_t input_offset = params.input_offset; // r = s(q - Z) + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32_t output_offset = params.output_offset; + + // Set min and max value of the output. + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Consistency check. + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + + // Check dimensions of the tensors. 
+ const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_NE(groups, 0); + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; + TFLITE_DCHECK_NE(filters_per_group, 0); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + union { + vconv_u8_t conv; + uint32_t raw; + } cmds; + cmds.conv.mode = 0; + cmds.conv.start = 0; + cmds.conv.stop = 7; + cmds.conv.sbias1 = input_offset; + cmds.conv.sdata1 = true; + cmds.conv.sbias2 = 0; + cmds.conv.sdata2 = true; + + const size_t swizzled_filter_data_size = + 8 * filter_height * filter_width * filter_input_depth; + std::unique_ptr<int8_t> swizzled_filter_data(reinterpret_cast<int8_t*>( + ::aligned_alloc(32, swizzled_filter_data_size))); + int8_t* p_swizzled_filter_data = swizzled_filter_data.get(); + int32_t swizzled_bias_data[32]; + int32_t swizzled_mult_data[32]; + int32_t swizzled_shift_data[32]; + + for (int out_channel = 0; out_channel + 8 <= output_depth; out_channel += 8) { + Filter_N_H_W_M<8>(filter_data + (out_channel * filter_height * + filter_width * filter_input_depth), + p_swizzled_filter_data, filter_height, filter_width, + filter_input_depth); + Swizzle(bias_data + out_channel, swizzled_bias_data, 8); + Swizzle(output_multiplier + out_channel, swizzled_mult_data, 8); + Swizzle(output_shift + out_channel, swizzled_shift_data, 8); + vld_w_x_m(v16, swizzled_bias_data); + vld_w_x_m(v20, swizzled_mult_data); + vld_w_x_m(v24, swizzled_shift_data); + vrsub_w_vx_m(v24, v24, 0); + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x + 8 <= output_width; out_x += 8) { + // 8x accumulators + vdup_w_x_m(v48, 0); + vdup_w_x_m(v52, 0); + acset_v(v48, v48); + for (int in_channel = 0; in_channel + 32 <= filter_input_depth; + in_channel += 32) { + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + const bool is_row_inside_input = + (in_y >= 0) && (in_y < input_height); + if (!is_row_inside_input) { + continue; + } + + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + int in_x[8]; + bool left_pad = false; + bool right_pad = false; + for (int i = 0; i < 8; ++i) { + const int in_x_origin = + ((out_x + i) * stride_width) - pad_width; + in_x[i] = in_x_origin + dilation_width_factor * filter_x; + if (in_x[i] < 0) { + left_pad = true; + } + if (in_x[i] >= input_width) { + right_pad = true; + } + } + + if (left_pad) { + vdup_b_x(v0, -input_offset); + vld_b_s_xx( + v1, + &input_data[tflite::Offset(input_shape, batch, in_y, + in_x[1], in_channel)], + input_depth * stride_width); + vld_b_s_xx( + v2, + &input_data[tflite::Offset(input_shape, batch, in_y, + in_x[2], in_channel)], + input_depth * stride_width); + vld_b_s_xx( + v3, + &input_data[tflite::Offset(input_shape, batch, in_y, + in_x[3], in_channel)], + input_depth * stride_width); + vld_b_s_xx_m( + v4, + &input_data[tflite::Offset(input_shape, batch, in_y, + in_x[4], in_channel)], + input_depth * stride_width); + } else if (right_pad) { + vld_b_s_xx_m( + v0, + &input_data[tflite::Offset(input_shape, batch, 
in_y, + in_x[0], in_channel)], + input_depth * stride_width); + vld_b_s_xx( + v4, + &input_data[tflite::Offset(input_shape, batch, in_y, + in_x[4], in_channel)], + input_depth * stride_width); + vld_b_s_xx( + v5, + &input_data[tflite::Offset(input_shape, batch, in_y, + in_x[5], in_channel)], + input_depth * stride_width); + vld_b_s_xx( + v6, + &input_data[tflite::Offset(input_shape, batch, in_y, + in_x[6], in_channel)], + input_depth * stride_width); + vdup_b_x(v7, -input_offset); + } else if (!left_pad && !right_pad) { + // Inputs + vld_b_s_xx_m( + v0, + &input_data[tflite::Offset(input_shape, batch, in_y, + in_x[0], in_channel)], + input_depth * stride_width); + vld_b_s_xx_m( + v4, + &input_data[tflite::Offset(input_shape, batch, in_y, + in_x[4], in_channel)], + input_depth * stride_width); + } else { + vdup_b_x(v0, -input_offset); + vdup_b_x(v7, -input_offset); + vld_b_s_xx_m( + v1, + &input_data[tflite::Offset(input_shape, batch, in_y, + in_x[1], in_channel)], + input_depth * stride_width); + vld_b_s_xx( + v5, + &input_data[tflite::Offset(input_shape, batch, in_y, + in_x[5], in_channel)], + input_depth * stride_width); + vld_b_s_xx( + v6, + &input_data[tflite::Offset(input_shape, batch, in_y, + in_x[6], in_channel)], + input_depth * stride_width); + } + size_t local_filter_offset = + (filter_y * filter_width * 8 * input_depth) + + (filter_x * 8 * input_depth) + (in_channel * 8); + int8_t* p_local_filter_start = + p_swizzled_filter_data + local_filter_offset; + vld_b_p_x_m(v8, p_local_filter_start); + vld_b_x_m(v12, p_local_filter_start); + + aconv_vxv(v48, v0, cmds, v8); + } + } + } + vcget(v48); + vadd_w_vv_m(v48, v48, v16); + vadd_w_vv_m(v52, v52, v16); + vdmulh_w_r_vv_m(v48, v48, v20); + vdmulh_w_r_vv_m(v52, v52, v20); + vsha_w_r_vv_m(v48, v48, v24); + vsha_w_r_vv_m(v52, v52, v24); + vadd_w_vx_m(v48, v48, output_offset); + vadd_w_vx_m(v52, v52, output_offset); + vmin_w_vx_m(v48, v48, output_activation_max); + vmin_w_vx_m(v52, v52, output_activation_max); + vmax_w_vx_m(v48, v48, output_activation_min); + vmax_w_vx_m(v52, v52, output_activation_min); + vsraqs_b_vx(v56, v48, 0); + vsraqs_b_vx(v57, v52, 0); + vstq_b_s_xx(v56, + &output_data[tflite::Offset(output_shape, batch, out_y, + out_x, out_channel)], + output_depth); + vstq_b_s_xx(v57, + &output_data[tflite::Offset(output_shape, batch, out_y, + out_x + 4, out_channel)], + output_depth); + } + } + } + } +} + +} // namespace + +// Fixed-point per-channel-quantization convolution reference kernel. +void ConvS8D32(const tflite::ConvParams& params, + const int32_t* output_multiplier, const int32_t* output_shift, + const tflite::RuntimeShape& input_shape, + const int8_t* input_data, + const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, + const tflite::RuntimeShape& bias_shape, const int32_t* bias_data, + const tflite::RuntimeShape& output_shape, int8_t* output_data) { + // Get parameters. + const int32_t input_offset = params.input_offset; // r = s(q - Z) + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32_t output_offset = params.output_offset; + + // Set min and max value of the output. 
+ const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Consistency check. + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + + // Check dimensions of the tensors. + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + TFLITE_DCHECK_NE(groups, 0); + TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0); + const int filters_per_group = output_depth / groups; + TFLITE_DCHECK_NE(filters_per_group, 0); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + // filter_depth = 32n && input_channels = 8n && output_width = 8n + if (output_depth % 8 == 0 && output_width % 8 == 0 && pad_width <= 1) { + ConvS8D32Pw1Ow8Id8(params, output_multiplier, output_shift, input_shape, + input_data, filter_shape, filter_data, bias_shape, + bias_data, output_shape, output_data); + return; + } + + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = (out_x * stride_width) - pad_width; + vdup_w_x_m(v60, 0); + int32_t acc = 0; + for (int in_channel = 0; in_channel + 32 <= filter_input_depth; + in_channel += 32) { + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. 
+ const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + + if (!is_point_inside_image) { + continue; + } + + vld_b_x(v0, &input_data[tflite::Offset(input_shape, batch, in_y, + in_x, in_channel)]); + vaddw_h_vx(v0, v0, 0); + vadd_h_vx(v0, v0, static_cast<int16_t>(input_offset)); + vadd_h_vx(v1, v1, static_cast<int16_t>(input_offset)); + vld_b_x(v2, &filter_data[tflite::Offset(filter_shape, + out_channel, filter_y, + filter_x, in_channel)]); + vaddw_h_vx(v2, v2, 0); + vmulw_w_vv(v48, v0, v2); + vmulw_w_vv(v50, v1, v3); + vadd_w_vv_m(v60, v60, v48); + } + } + } + int32_t accumulators[32]; + vst_w_x_m(v60, accumulators); + for (int i = 0; i < 32; ++i) { + acc += accumulators[i]; + } + + if (bias_data) { + acc += bias_data[out_channel]; + } + acc = tflite::MultiplyByQuantizedMultiplier( + acc, output_multiplier[out_channel], output_shift[out_channel]); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_data[tflite::Offset(output_shape, batch, out_y, out_x, + out_channel)] = static_cast<int8_t>(acc); + } + } + } + } +} + +} // namespace kelvin::opt
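A note on the padding paths in ConvS8D32Pw1Ow8Id8 above: out-of-bounds pixels are filled with -input_offset rather than 0 because the aconv command adds input_offset (cmds.conv.sbias1) to every activation before the multiply, so a padded lane contributes exactly zero to the accumulator regardless of the weight. A scalar sketch of that identity, with made-up example values:

#include <cstdint>
#include <cstdio>

// Scalar model of one MAC lane as performed by aconv with sbias1 = input_offset:
// acc += (stored_activation + input_offset) * weight.
int32_t MacLane(int8_t stored_activation, int32_t input_offset, int8_t weight) {
  return (static_cast<int32_t>(stored_activation) + input_offset) *
         static_cast<int32_t>(weight);
}

int main() {
  const int32_t input_offset = 128;  // example zero-point correction
  const int8_t weight = -37;         // arbitrary filter value
  // In-bounds pixel: contributes normally.
  printf("in-bounds: %ld\n", static_cast<long>(MacLane(5, input_offset, weight)));
  // Padded pixel: stored as -input_offset, so (stored + offset) == 0 and the
  // product vanishes no matter what the weight is.
  const int8_t padded = static_cast<int8_t>(-input_offset);
  printf("padded:    %ld\n", static_cast<long>(MacLane(padded, input_offset, weight)));
  return 0;
}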
diff --git a/tflm/opt/conv_util.h b/tflm/opt/conv_util.h new file mode 100644 index 0000000..b9470aa --- /dev/null +++ b/tflm/opt/conv_util.h
@@ -0,0 +1,136 @@ +/* + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TFLM_OPT_CONV_UTIL_H_ +#define TFLM_OPT_CONV_UTIL_H_ + +#include <cassert> +#include <memory> + +#include "crt/kelvin.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/runtime_shape.h" +#include "tensorflow/lite/kernels/internal/types.h" +#include "tflm/opt/util.h" + +namespace kelvin::opt { +/* clang-format off */ +constexpr const int swizzle[16] = { + 0, 4, 8, 12, + 2, 6, 10, 14, + 1, 5, 9, 13, + 3, 7, 11, 15, +}; +/* clang-format on */ + +constexpr int kFilterHeightIndex = 1; +constexpr int kFilterWidthIndex = 2; +constexpr int kFilterInputChannelIndex = 3; +constexpr int kInputChannelIndex = 3; +constexpr int kOutputChannelIndex = 3; + +#define INA0 v0 +#define FLTA0 v8 +#define FLTA1 v9 +#define FLTA2 v10 +#define FLTA3 v11 +#define FLTA4 v12 +#define FLTA5 v13 +#define FLTA6 v14 +#define FLTA7 v15 +#define ACC v48 +#define ACC0 v48 +#define OUT0 v56 + +// H,W ( height and width of filter) N -number of inputs, M -number of outputs +template <int N> +inline void Filter_N_H_W_M(const int8_t* input, int8_t* output, int H, int W, + int M) { + // Convert: input [zo][ky][kx][zi] (N,3,1,M) + // output [zo.hi=N/8][ky][kx][zi_hi=M/4][zo.lo=8][zi_lo=4] + const int8_t(&in)[N][H][W][M] = *(int8_t(*)[N][H][W][M])input; + int8_t(&out)[N / 8][H][W][M / 4][8][4] = + *(int8_t(*)[N / 8][H][W][M / 4][8][4]) output; + assert(N >= 4 && M >= 4); + for (int zo = 0; zo < N; ++zo) { + for (int ky = 0; ky < H; ++ky) { + for (int kx = 0; kx < W; ++kx) { + for (int zi = 0; zi < M; ++zi) { + const int zo_hi = zo >> 3; // div8 + const int zo_lo = zo & 7; // rem8 + const int zi_hi = zi >> 2; // div4 + const int zi_lo = zi & 3; // rem4 + out[zo_hi][ky][kx][zi_hi][zo_lo][zi_lo] = in[zo][ky][kx][zi]; + } + } + } + } +} + +// Swizzle values, and duplicate 4 times for stripmining. +inline void Swizzle(const int32_t* input, int32_t* output, int N, + bool negate = false) { + const int32_t(&in)[N] = *(int32_t(*)[N])input; + int32_t(&out)[N * 4] = *(int32_t(*)[N * 4]) output; + // Convert to accumulator swizzle pattern. + for (int i = 0; i < N / 8; ++i) { + int32_t* out0 = out + i * 32 + 0; + int32_t* out1 = out + i * 32 + 16; + int32_t* out2 = out + i * 32 + 8; + int32_t* out3 = out + i * 32 + 24; + for (int j = 0; j < 4; ++j) { + const int32_t* p_in = in + i * 8; + for (int k = 0; k < 2; ++k) { + *out0++ = *p_in++; + *out1++ = *p_in++; + *out2++ = *p_in++; + *out3++ = *p_in++; + } + } + } + if (negate) { + for (int i = 0; i < N * 4; ++i) { + out[i] = -out[i]; + } + } +} + +// Run output pipeline on int32 accumulators in [v48-v55] and store results +// in v48 and v52. Clobbers [v48-v55]. 
+#define INT32_TO_INT8_OUTPUT_PIPELINE(bias, mult, shft, output_min, \ + output_max, output_offset, bias_reg, \ + mult_reg, shift_reg) \ + { \ + vcget(v48); \ + vld_w_x_m(bias_reg, bias); \ + vld_w_x_m(mult_reg, mult); \ + vld_w_x_m(shift_reg, shft); \ + vadd_w_vv_m(v48, v48, bias_reg); \ + vadd_w_vv_m(v52, v52, bias_reg); \ + vdmulh_w_r_vv_m(v48, v48, mult_reg); \ + vdmulh_w_r_vv_m(v52, v52, mult_reg); \ + vsha_w_r_vv_m(v48, v48, shift_reg); \ + vsha_w_r_vv_m(v52, v52, shift_reg); \ + vadd_w_vx_m(v48, v48, output_offset); \ + vadd_w_vx_m(v52, v52, output_offset); \ + vmin_w_vx_m(v48, v48, output_max); \ + vmin_w_vx_m(v52, v52, output_max); \ + vmax_w_vx_m(v48, v48, output_min); \ + vmax_w_vx_m(v52, v52, output_min); \ + vsraqs_b_vx(v48, v48, 0); \ + vsraqs_b_vx(v52, v52, 0); \ + } +} // namespace kelvin::opt + +#endif  // TFLM_OPT_CONV_UTIL_H_
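For readers less familiar with the TFLM quantization scheme, the macro above is the vector form of the usual per-channel requantization (bias add, fixed-point multiply, shift, output offset, clamp, narrow). Below is a per-lane scalar sketch; it assumes a non-positive output_shift (a right shift), which is the common case, and RoundingRightShift only approximates the hardware's rounding mode.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>

// Saturating rounding doubling high multiply, as in gemmlowp/TFLM.
int32_t RoundingDoublingHighMul(int32_t a, int32_t b) {
  if (a == b && a == std::numeric_limits<int32_t>::min()) {
    return std::numeric_limits<int32_t>::max();
  }
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
  return static_cast<int32_t>((ab + nudge) >> 31);
}

// Rounding arithmetic right shift (approximate; rounds half up).
int32_t RoundingRightShift(int32_t x, int shift) {
  if (shift <= 0) return x;
  const int64_t rounding = 1ll << (shift - 1);
  return static_cast<int32_t>((static_cast<int64_t>(x) + rounding) >> shift);
}

// One lane of the int32 -> int8 output pipeline: bias, scale, shift, offset,
// clamp to the activation range, then narrow to int8.
int8_t OutputPipelineLane(int32_t acc, int32_t bias, int32_t mult, int32_t shift,
                          int32_t output_offset, int32_t out_min, int32_t out_max) {
  acc += bias;
  acc = RoundingDoublingHighMul(acc, mult);
  acc = RoundingRightShift(acc, -shift);  // TFLM right shifts carry shift <= 0
  acc += output_offset;
  acc = std::min(acc, out_max);
  acc = std::max(acc, out_min);
  return static_cast<int8_t>(acc);
}

int main() {
  // Arbitrary example values.
  printf("%d\n", OutputPipelineLane(/*acc=*/123456, /*bias=*/789,
                                    /*mult=*/1500000000, /*shift=*/-7,
                                    /*output_offset=*/-128, /*out_min=*/-128,
                                    /*out_max=*/127));
  return 0;
}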
diff --git a/tflm/opt/depthwise_conv_s16.cc b/tflm/opt/depthwise_conv_s16.cc index 13ae125..c7db407 100644 --- a/tflm/opt/depthwise_conv_s16.cc +++ b/tflm/opt/depthwise_conv_s16.cc
@@ -14,23 +14,32 @@ * limitations under the License. */ -#include <algorithm> +// Depthwise convolution based on Kelvin ops +// Data types: input: s16, filter: s8, bias s64 -#include "crt/kelvin.h" -#include "tflm/opt/opt.h" -#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h" +#include "tflm/opt/conv_util.h" namespace kelvin::opt { +namespace { +void DepthwiseConvS16K3x1( + const tflite::DepthwiseParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const tflite::RuntimeShape& input_shape, + const int16_t* input_data, const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, + const int64_t* bias_data, const tflite::RuntimeShape& output_shape, + int16_t* output_data) { + const int16_t* activations = input_data; + const int8_t* weights = filter_data; + const int64_t* biases = bias_data; + int channels = filter_shape.Dims(3); + int frames = input_shape.Dims(2); + int dilation = params.dilation_width_factor; + const int32_t* output_mult = output_multiplier; + int32_t output_activation_min = params.quantized_activation_min; + int32_t output_activation_max = params.quantized_activation_max; + int16_t* output = output_data; -void DepthwiseConv2DKelvinS16K3x1(const int16_t* activations, - const int8_t* weights, - const int64_t* biases, - int channels, int frames, int dilation, - const int32_t* output_mult, - const int32_t* output_shift, - int32_t output_activation_min, - int32_t output_activation_max, - int16_t* output) { for (int c = 0; c + 32 <= channels; c += 32) { // Load weights and interleave into correct order [v58-v63]. // Because there are more activations than weights, interleave weights. @@ -78,8 +87,8 @@ for (; frames_idx < frames; frames_idx += dilation) { vld_h_p_xx(v4, local_activations0, step); vld_h_p_xx(v5, local_activations1, step); - vmulw_w_vv(v48, v58, v0); // Clobber accumulator - vmulw_w_vv(v50, v59, v1); // Clobber accumulator + vmulw_w_vv(v48, v58, v0); // Clobber accumulator + vmulw_w_vv(v50, v59, v1); // Clobber accumulator vadd_w_vv_m(v48, v48, v52); // Add bias. vmulw_w_vv(v40, v60, v2); vmulw_w_vv(v42, v61, v3); @@ -118,4 +127,60 @@ // - one final loop handling remainder } +// generic implementation based on Kelvin ops +void DepthwiseConvS16Generic( + const tflite::DepthwiseParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const tflite::RuntimeShape& input_shape, + const int16_t* input_data, const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, + const int64_t* bias_data, const tflite::RuntimeShape& output_shape, + int16_t* output_data) { + // TBD: Use Kelvin implementation to replace the below + tflite::reference_integer_ops::DepthwiseConvPerChannel( + params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data); + return; +} +} // namespace + +void DepthwiseConvS16( + const tflite::DepthwiseParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const tflite::RuntimeShape& input_shape, + const int16_t* input_data, const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, + const int64_t* bias_data, const tflite::RuntimeShape& output_shape, + int16_t* output_data) { + // Get parameters. 
+ const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + + if (params.padding_type == tflite::PaddingType::kValid && stride_width == 1 && + stride_height == 1 && dilation_width_factor == 1 && + dilation_height_factor == 1) { + // generic implementation by default + auto fn = DepthwiseConvS16Generic; + + // special case of filter size 3x1 + if (filter_height == 1 && filter_width == 3) { + fn = DepthwiseConvS16K3x1; + } + + fn(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data); + return; + } + + // Use reference implementation + tflite::reference_integer_ops::DepthwiseConvPerChannel( + params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data); +} + } // namespace kelvin::opt
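The 3x1 specialization above vectorizes what is, per channel, a three-tap dilated dot product along the frame axis. The scalar sketch below (requantization omitted, valid padding assumed) is only meant to show the access pattern being vectorized; the shape conventions mirror the code, with channels taken from filter_shape.Dims(3) and frames from input_shape.Dims(2).

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar accumulation for a 1x3 depthwise conv over [frames x channels]
// activations with a dilated filter. Weights are laid out as [3][channels],
// biases as [channels]. Requantization/clamping is omitted.
std::vector<int64_t> DepthwiseK3x1Acc(const std::vector<int16_t>& activations,
                                      const std::vector<int8_t>& weights,
                                      const std::vector<int64_t>& biases,
                                      int channels, int frames, int dilation) {
  const int out_frames = frames - 2 * dilation;  // valid padding, 3 taps
  std::vector<int64_t> acc(static_cast<size_t>(out_frames) * channels, 0);
  for (int t = 0; t < out_frames; ++t) {
    for (int c = 0; c < channels; ++c) {
      int64_t sum = biases[c];
      for (int k = 0; k < 3; ++k) {
        sum += static_cast<int64_t>(weights[k * channels + c]) *
               activations[(t + k * dilation) * channels + c];
      }
      acc[t * channels + c] = sum;
    }
  }
  return acc;
}

int main() {
  const int channels = 2, frames = 6, dilation = 1;
  std::vector<int16_t> x(frames * channels, 1);
  std::vector<int8_t> w(3 * channels, 2);
  std::vector<int64_t> b(channels, 10);
  auto acc = DepthwiseK3x1Acc(x, w, b, channels, frames, dilation);
  printf("acc[0] = %lld\n", static_cast<long long>(acc[0]));  // 10 + 3*2 = 16
  return 0;
}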
diff --git a/tflm/opt/depthwise_conv_s8.cc b/tflm/opt/depthwise_conv_s8.cc index c4ee35a..c94b38a 100644 --- a/tflm/opt/depthwise_conv_s8.cc +++ b/tflm/opt/depthwise_conv_s8.cc
@@ -1,5 +1,5 @@ /* - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,39 +14,16 @@ * limitations under the License. */ -#include <algorithm> +// Depthwise convolution based on Kelvin ops +// Data types: input: s8, filter: s8, bias s32 -#include "crt/kelvin.h" -#include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h" -#include "tensorflow/lite/kernels/internal/runtime_shape.h" -#include "tensorflow/lite/kernels/internal/types.h" -#include "tflm/opt/opt.h" +#include "tflm/opt/conv_util.h" namespace kelvin::opt { - -void Swizzle(const int32_t* input, int32_t* output, int N) { - const int32_t(&in)[N] = *(int32_t(*)[N])input; - int32_t(&out)[N * 4] = *(int32_t(*)[N * 4]) output; - // Convert to accumulator swizzle pattern. - for (int i = 0; i < N / 8; ++i) { - int32_t* out0 = out + i * 32 + 0; - int32_t* out1 = out + i * 32 + 16; - int32_t* out2 = out + i * 32 + 8; - int32_t* out3 = out + i * 32 + 24; - for (int j = 0; j < 4; ++j) { - const int32_t* p_in = in + i * 8; - for (int k = 0; k < 2; ++k) { - *out0++ = *p_in++; - *out1++ = *p_in++; - *out2++ = *p_in++; - *out3++ = *p_in++; - } - } - } -} - -void DWConv2DKelvin_d32( +namespace { +// special case of input depth = 32n +void DepthwiseConvS8D32( const tflite::DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const tflite::RuntimeShape& input_shape, const int8_t* input_data, const tflite::RuntimeShape& filter_shape, @@ -57,8 +34,6 @@ ) { const int stride_width = params.stride_width; const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; const int pad_width = params.padding_values.width; const int pad_height = params.padding_values.height; const int32_t input_offset = params.input_offset; @@ -138,7 +113,24 @@ } } -void DepthwiseConv2DKelvin( +// generic implementation based on Kelvin ops +void DepthwiseConvS8Generic( + const tflite::DepthwiseParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const tflite::RuntimeShape& input_shape, + const int8_t* input_data, const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, + const int32_t* bias_data, const tflite::RuntimeShape& output_shape, + int8_t* output_data) { + // TBD: Use Kelvin implementation to replace the below + tflite::reference_integer_ops::DepthwiseConvPerChannel( + params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data); + return; +} +} // namespace + +void DepthwiseConvS8( const tflite::DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const tflite::RuntimeShape& input_shape, const int8_t* input_data, const tflite::RuntimeShape& filter_shape, @@ -154,8 +146,6 @@ const int pad_width = params.padding_values.width; const int pad_height = params.padding_values.height; const int depth_multiplier = params.depth_multiplier; - const int32_t input_offset = params.input_offset; - const int32_t output_offset = params.output_offset; const int32_t output_activation_min = params.quantized_activation_min; const int32_t output_activation_max = params.quantized_activation_max; @@ -165,30 +155,33 @@ 
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - const int batches = MatchingDim(input_shape, 0, output_shape, 0); const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); const int input_depth = input_shape.Dims(3); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); if (depth_multiplier == 1 && pad_height < 2 && pad_width < 2 && dilation_height_factor == 1 && dilation_width_factor == 1 && - stride_height == 1 && stride_width == 1 && output_depth % 32 == 0) { - DWConv2DKelvin_d32(params, output_multiplier, output_shift, input_shape, - input_data, filter_shape, filter_data, bias_shape, - bias_data, output_shape, output_data); + stride_height == 1 && stride_width == 1) { + // generic implementation by default + auto fn = DepthwiseConvS8Generic; + + // special case of output depth = 32n + if (output_depth % 32 == 0) { + fn = DepthwiseConvS8D32; + } + + fn(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data); return; } + + // Use reference implementation tflite::reference_integer_ops::DepthwiseConvPerChannel( params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data); - return; } -} // namespace kelvin::opt \ No newline at end of file + +} // namespace kelvin::opt
diff --git a/tflm/opt/elementwise_add_s16.cc b/tflm/opt/elementwise_add_s16.cc index e4220f0..001113e 100644 --- a/tflm/opt/elementwise_add_s16.cc +++ b/tflm/opt/elementwise_add_s16.cc
@@ -1,5 +1,5 @@ /* - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,16 +20,16 @@ namespace kelvin::opt { -void elementwise_add_s16(const int16_t* input1, const int16_t* input2, - const int32_t input1_offset, const int32_t input1_mult, - const int32_t input1_shift, - const int32_t input2_offset, const int32_t input2_mult, - const int32_t input2_shift, const int32_t left_shift, - int16_t* output, const int32_t output_offset, - const int32_t output_mult, const int32_t output_shift, - const int32_t output_activation_min, - const int32_t output_activation_max, - const int32_t block_size) { +void ElementwiseAddS16(const int16_t* input1, const int16_t* input2, + const int32_t input1_offset, const int32_t input1_mult, + const int32_t input1_shift, const int32_t input2_offset, + const int32_t input2_mult, const int32_t input2_shift, + const int32_t left_shift, int16_t* output, + const int32_t output_offset, const int32_t output_mult, + const int32_t output_shift, + const int32_t output_activation_min, + const int32_t output_activation_max, + const int32_t block_size) { int blocks = block_size; int vl; getmaxvl_h(vl);
diff --git a/tflm/opt/elementwise_add_s32.cc b/tflm/opt/elementwise_add_s32.cc index 483799a..ab2b3d1 100644 --- a/tflm/opt/elementwise_add_s32.cc +++ b/tflm/opt/elementwise_add_s32.cc
@@ -1,5 +1,5 @@ /* - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,10 +18,10 @@ #include "tflm/opt/opt.h" namespace kelvin::opt { -void elementwise_add_s32(const int32_t* input1, const int32_t* input2, - int32_t* output, const int32_t output_activation_min, - const int32_t output_activation_max, - const int32_t block_size) { +void ElementwiseAddS32(const int32_t* input1, const int32_t* input2, + int32_t* output, const int32_t output_activation_min, + const int32_t output_activation_max, + const int32_t block_size) { int blocks = block_size; int vl; getmaxvl_w_m(vl);
diff --git a/tflm/opt/elementwise_add_s8.cc b/tflm/opt/elementwise_add_s8.cc index ac83e1f..762d7af 100644 --- a/tflm/opt/elementwise_add_s8.cc +++ b/tflm/opt/elementwise_add_s8.cc
@@ -1,5 +1,5 @@ /* - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,16 +20,16 @@ namespace kelvin::opt { -void elementwise_add_s8(const int8_t* input1, const int8_t* input2, - const int32_t input1_offset, const int32_t input1_mult, - const int32_t input1_shift, const int32_t input2_offset, - const int32_t input2_mult, const int32_t input2_shift, - const int32_t left_shift, int8_t* output, - const int32_t output_offset, const int32_t output_mult, - const int32_t output_shift, - const int32_t output_activation_min, - const int32_t output_activation_max, - const int32_t block_size) { +void ElementwiseAddS8(const int8_t* input1, const int8_t* input2, + const int32_t input1_offset, const int32_t input1_mult, + const int32_t input1_shift, const int32_t input2_offset, + const int32_t input2_mult, const int32_t input2_shift, + const int32_t left_shift, int8_t* output, + const int32_t output_offset, const int32_t output_mult, + const int32_t output_shift, + const int32_t output_activation_min, + const int32_t output_activation_max, + const int32_t block_size) { int blocks = block_size; int vl; getmaxvl_b(vl);
diff --git a/tflm/opt/leaky_relu_s16.cc b/tflm/opt/leaky_relu_s16.cc index 5cd1128..7427a6c 100644 --- a/tflm/opt/leaky_relu_s16.cc +++ b/tflm/opt/leaky_relu_s16.cc
@@ -1,5 +1,5 @@ /* - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,13 +21,13 @@ #include "tflm/opt/util.h" namespace kelvin::opt { -void leaky_relu_s16(const int16_t* input, int16_t* output, - const int32_t block_size, const int32_t input_zero_point, - const int32_t output_zero_point, - const int32_t output_multiplier_alpha, - const int32_t output_shift_alpha, - const int32_t output_multiplier_identity, - const int32_t output_shift_identity) { +void LeakyReluS16(const int16_t* input, int16_t* output, + const int32_t block_size, const int32_t input_zero_point, + const int32_t output_zero_point, + const int32_t output_multiplier_alpha, + const int32_t output_shift_alpha, + const int32_t output_multiplier_identity, + const int32_t output_shift_identity) { constexpr int32_t quantized_output_min = std::numeric_limits<int16_t>::min(); constexpr int32_t quantized_output_max = std::numeric_limits<int16_t>::max(); int32_t right_shift_identity = std::min(output_shift_identity, 0L);
diff --git a/tflm/opt/leaky_relu_s8.cc b/tflm/opt/leaky_relu_s8.cc index b32d260..8b30d19 100644 --- a/tflm/opt/leaky_relu_s8.cc +++ b/tflm/opt/leaky_relu_s8.cc
@@ -1,5 +1,5 @@ /* - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,13 +22,13 @@ namespace kelvin::opt { -void leaky_relu_s8(const int8_t* input, int8_t* output, - const int32_t block_size, const int32_t input_zero_point, - const int32_t output_zero_point, - const int32_t output_multiplier_alpha, - const int32_t output_shift_alpha, - const int32_t output_multiplier_identity, - const int32_t output_shift_identity) { +void LeakyReluS8(const int8_t* input, int8_t* output, const int32_t block_size, + const int32_t input_zero_point, + const int32_t output_zero_point, + const int32_t output_multiplier_alpha, + const int32_t output_shift_alpha, + const int32_t output_multiplier_identity, + const int32_t output_shift_identity) { constexpr int32_t quantized_output_min = std::numeric_limits<int16_t>::min(); constexpr int32_t quantized_output_max = std::numeric_limits<int16_t>::max(); int32_t right_shift_identity = std::min(output_shift_identity, 0L);
diff --git a/tflm/opt/max_pool_s8.cc b/tflm/opt/max_pool_s8.cc index 5986746..544f85a 100644 --- a/tflm/opt/max_pool_s8.cc +++ b/tflm/opt/max_pool_s8.cc
@@ -1,5 +1,5 @@ /* - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,11 +20,10 @@ #include "tensorflow/lite/kernels/internal/types.h" namespace kelvin::opt { -void MaxPoolGeneric(const tflite::PoolParams ¶ms, - const tflite::RuntimeShape &input_shape, - const int8_t *input_data, - const tflite::RuntimeShape &output_shape, - int8_t *output_data) { +void MaxPoolS8(const tflite::PoolParams ¶ms, + const tflite::RuntimeShape &input_shape, + const int8_t *input_data, + const tflite::RuntimeShape &output_shape, int8_t *output_data) { const int batches = MatchingDim(input_shape, 0, output_shape, 0); const int depth = MatchingDim(input_shape, 3, output_shape, 3); const int input_height = input_shape.Dims(1); @@ -97,4 +96,4 @@ } } -} // namespace kelvin::opt +} // namespace kelvin::opt
diff --git a/tflm/opt/memcpy.cc b/tflm/opt/memcpy.cc index 4669a83..29e0434 100644 --- a/tflm/opt/memcpy.cc +++ b/tflm/opt/memcpy.cc
@@ -1,5 +1,5 @@ /* - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ namespace kelvin::opt { -void *memcpy(void *dst, const void *src, size_t n) { +void *Memcpy(void *dst, const void *src, size_t n) { const uint8_t *s = reinterpret_cast<const uint8_t *>(src); uint8_t *d = reinterpret_cast<uint8_t *>(dst); int vl;
diff --git a/tflm/opt/opt.h b/tflm/opt/opt.h index 277f338..76d5218 100644 --- a/tflm/opt/opt.h +++ b/tflm/opt/opt.h
@@ -1,5 +1,5 @@ /* - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,121 +24,87 @@ /* clang-format on */ namespace kelvin::opt { -void* memcpy(void* dst, const void* src, size_t n); -void elementwise_add_s8(const int8_t* input1, const int8_t* input2, - const int32_t input1_offset, const int32_t input1_mult, - const int32_t input1_shift, const int32_t input2_offset, - const int32_t input2_mult, const int32_t input2_shift, - const int32_t left_shift, int8_t* output, - const int32_t output_offset, const int32_t output_mult, - const int32_t output_shift, - const int32_t output_activation_min, - const int32_t output_activation_max, - const int32_t block_size); -void elementwise_add_s16(const int16_t* input1, const int16_t* input2, - const int32_t input1_offset, const int32_t input1_mult, - const int32_t input1_shift, - const int32_t input2_offset, const int32_t input2_mult, - const int32_t input2_shift, const int32_t left_shift, - int16_t* output, const int32_t output_offset, - const int32_t output_mult, const int32_t output_shift, - const int32_t output_activation_min, - const int32_t output_activation_max, - const int32_t block_size); -void elementwise_add_s32(const int32_t* input1, const int32_t* input2, - int32_t* output, const int32_t output_activation_min, - const int32_t output_activation_max, - const int32_t block_size); -void leaky_relu_s8(const int8_t* input, int8_t* output, - const int32_t block_size, const int32_t input_zero_point, - const int32_t output_zero_point, - const int32_t output_multiplier_alpha, - const int32_t output_shift_alpha, - const int32_t output_multiplier_identity, - const int32_t output_shift_identity); -void leaky_relu_s16(const int16_t* input, int16_t* output, - const int32_t block_size, const int32_t input_zero_point, - const int32_t output_zero_point, - const int32_t output_multiplier_alpha, - const int32_t output_shift_alpha, - const int32_t output_multiplier_identity, - const int32_t output_shift_identity); -void conv_per_channel_b32( - const tflite::ConvParams& params, const int32_t* output_multiplier, - const int32_t* output_shift, const tflite::RuntimeShape& input_shape, - const int16_t* input_data, const tflite::RuntimeShape& filter_shape, - const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, - const int32_t* bias_data, const tflite::RuntimeShape& output_shape, - int16_t* output_data); - -// Top level conv function, will invoke correct variant below. 
-void conv_per_channel_b64( - const tflite::ConvParams& params, const int32_t* output_multiplier, - const int32_t* output_shift, const tflite::RuntimeShape& input_shape, - const int16_t* input_data, const tflite::RuntimeShape& filter_shape, - const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, - const int64_t* bias_data, const tflite::RuntimeShape& output_shape, - int16_t* output_data); -void conv_per_channel_b64_1x1( - const tflite::ConvParams& params, const int32_t* output_multiplier, - const int32_t* output_shift, const tflite::RuntimeShape& input_shape, - const int16_t* input_data, const tflite::RuntimeShape& filter_shape, - const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, - const int64_t* bias_data, const tflite::RuntimeShape& output_shape, - int16_t* output_data); -void conv_per_channel_b64_filter1xn_non_group( - const tflite::ConvParams& params, const int32_t* output_multiplier, - const int32_t* output_shift, const tflite::RuntimeShape& input_shape, - const int16_t* input_data, const tflite::RuntimeShape& filter_shape, - const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, - const int64_t* bias_data, const tflite::RuntimeShape& output_shape, - int16_t* output_data); -void conv_per_channel_b64_filter1xn_group( - const tflite::ConvParams& params, const int32_t* output_multiplier, - const int32_t* output_shift, const tflite::RuntimeShape& input_shape, - const int16_t* input_data, const tflite::RuntimeShape& filter_shape, - const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, - const int64_t* bias_data, const tflite::RuntimeShape& output_shape, - int16_t* output_data); -void conv_per_channel_b64_generic( - const tflite::ConvParams& params, const int32_t* output_multiplier, - const int32_t* output_shift, const tflite::RuntimeShape& input_shape, - const int16_t* input_data, const tflite::RuntimeShape& filter_shape, - const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, - const int64_t* bias_data, const tflite::RuntimeShape& output_shape, - int16_t* output_data); - -void conv_per_channel_b8( - const tflite::ConvParams& params, const int32_t* output_multiplier, - const int32_t* output_shift, const tflite::RuntimeShape& input_shape, - const int8_t* input_data, const tflite::RuntimeShape& filter_shape, - const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, - const int32_t* bias_data, const tflite::RuntimeShape& output_shape, - int8_t* output_data); -void DepthwiseConv2DKelvin( +void* Memcpy(void* dst, const void* src, size_t n); +void ElementwiseAddS8(const int8_t* input1, const int8_t* input2, + const int32_t input1_offset, const int32_t input1_mult, + const int32_t input1_shift, const int32_t input2_offset, + const int32_t input2_mult, const int32_t input2_shift, + const int32_t left_shift, int8_t* output, + const int32_t output_offset, const int32_t output_mult, + const int32_t output_shift, + const int32_t output_activation_min, + const int32_t output_activation_max, + const int32_t block_size); +void ElementwiseAddS16(const int16_t* input1, const int16_t* input2, + const int32_t input1_offset, const int32_t input1_mult, + const int32_t input1_shift, const int32_t input2_offset, + const int32_t input2_mult, const int32_t input2_shift, + const int32_t left_shift, int16_t* output, + const int32_t output_offset, const int32_t output_mult, + const int32_t output_shift, + const int32_t output_activation_min, + const int32_t output_activation_max, + const int32_t block_size); +void ElementwiseAddS32(const 
int32_t* input1, const int32_t* input2, + int32_t* output, const int32_t output_activation_min, + const int32_t output_activation_max, + const int32_t block_size); +void LeakyReluS8(const int8_t* input, int8_t* output, const int32_t block_size, + const int32_t input_zero_point, + const int32_t output_zero_point, + const int32_t output_multiplier_alpha, + const int32_t output_shift_alpha, + const int32_t output_multiplier_identity, + const int32_t output_shift_identity); +void LeakyReluS16(const int16_t* input, int16_t* output, + const int32_t block_size, const int32_t input_zero_point, + const int32_t output_zero_point, + const int32_t output_multiplier_alpha, + const int32_t output_shift_alpha, + const int32_t output_multiplier_identity, + const int32_t output_shift_identity); +void ConvS16B32(const tflite::ConvParams& params, + const int32_t* output_multiplier, const int32_t* output_shift, + const tflite::RuntimeShape& input_shape, + const int16_t* input_data, + const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, + const tflite::RuntimeShape& bias_shape, + const int32_t* bias_data, + const tflite::RuntimeShape& output_shape, int16_t* output_data); +void ConvS16B64(const tflite::ConvParams& params, + const int32_t* output_multiplier, const int32_t* output_shift, + const tflite::RuntimeShape& input_shape, + const int16_t* input_data, + const tflite::RuntimeShape& filter_shape, + const int8_t* filter_data, + const tflite::RuntimeShape& bias_shape, + const int64_t* bias_data, + const tflite::RuntimeShape& output_shape, int16_t* output_data); +void ConvS8(const tflite::ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, + const tflite::RuntimeShape& input_shape, const int8_t* input_data, + const tflite::RuntimeShape& filter_shape, const int8_t* filter_data, + const tflite::RuntimeShape& bias_shape, const int32_t* bias_data, + const tflite::RuntimeShape& output_shape, int8_t* output_data); +void DepthwiseConvS8( const tflite::DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const tflite::RuntimeShape& input_shape, const int8_t* input_data, const tflite::RuntimeShape& filter_shape, const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, const int32_t* bias_data, const tflite::RuntimeShape& output_shape, int8_t* output_data); -void DWConv2DKelvin_d32( +void DepthwiseConvS16( const tflite::DepthwiseParams& params, const int32_t* output_multiplier, const int32_t* output_shift, const tflite::RuntimeShape& input_shape, - const int8_t* input_data, const tflite::RuntimeShape& filter_shape, + const int16_t* input_data, const tflite::RuntimeShape& filter_shape, const int8_t* filter_data, const tflite::RuntimeShape& bias_shape, - const int32_t* bias_data, const tflite::RuntimeShape& output_shape, - int8_t* output_data); -void DepthwiseConv2DKelvinS16K3x1( - const int16_t* activations, const int8_t* weights, const int64_t* biases, - int channels, int frames, int dilation, const int32_t* output_mult, - const int32_t* output_shift, int32_t output_activation_min, - int32_t output_activation_max, int16_t* output); -void MaxPoolGeneric(const tflite::PoolParams& params, - const tflite::RuntimeShape& input_shape, - const int8_t* input_data, - const tflite::RuntimeShape& output_shape, - int8_t* output_data); + const int64_t* bias_data, const tflite::RuntimeShape& output_shape, + int16_t* output_data); +void MaxPoolS8(const tflite::PoolParams& params, + const tflite::RuntimeShape& input_shape, + const int8_t* 
input_data, + const tflite::RuntimeShape& output_shape, int8_t* output_data); } // namespace kelvin::opt
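As a usage note for the renamed entry points in opt.h: a TFLM op wrapper is expected to forward the standard per-channel quantized conv arguments directly. The sketch below is a hypothetical call site (the surrounding Eval plumbing and the wrapper name are not part of this patch) and simply shows the new signature in use.

#include <cstdint>

#include "tensorflow/lite/kernels/internal/runtime_shape.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tflm/opt/opt.h"

// Hypothetical wrapper: forwards per-channel quantized conv arguments to the
// Kelvin-optimized kernel. All pointers and shapes are prepared by the caller.
void RunKelvinConvS8(const tflite::ConvParams& params,
                     const int32_t* per_channel_multiplier,
                     const int32_t* per_channel_shift,
                     const tflite::RuntimeShape& input_shape,
                     const int8_t* input_data,
                     const tflite::RuntimeShape& filter_shape,
                     const int8_t* filter_data,
                     const tflite::RuntimeShape& bias_shape,
                     const int32_t* bias_data,
                     const tflite::RuntimeShape& output_shape,
                     int8_t* output_data) {
  kelvin::opt::ConvS8(params, per_channel_multiplier, per_channel_shift,
                      input_shape, input_data, filter_shape, filter_data,
                      bias_shape, bias_data, output_shape, output_data);
}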
diff --git a/tflm/opt/util.h b/tflm/opt/util.h index d94ef3e..d0c16db 100644 --- a/tflm/opt/util.h +++ b/tflm/opt/util.h
@@ -1,5 +1,5 @@ /* - * Copyright 2023 Google LLC + * Copyright 2024 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.