// tflm/opt/depthwise_conv_s16.cc - sw/kelvin - Git at Google

 /*
  * Copyright 2023 Google LLC
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Depthwise convolution based on Kelvin ops
 // Data types: input: s16, filter: s8, bias s64

 #include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
 #include "tflm/opt/conv_util.h"

 namespace kelvin::opt {
 namespace {
 // Depthwise convolution for a filter that is 1 high and 3 wide. The
 // multiply-accumulate runs on Kelvin vector ops in stripes of 32 channels; the
 // requantization stage and any trailing channels run in scalar code.
 //
 // Only three shape/parameter values are read: the channel count
 // (filter_shape.Dims(3)), the frame count (input_shape.Dims(2)) and the width
 // dilation. Input and output are indexed as [frame][channel] with the filter
 // channel count as the channel stride, so no other batch/row is visited and
 // the input depth is taken to equal the filter channel count. bias_shape and
 // output_shape are unused.
 //
 // For every output frame o in [0, frames - 2 * dilation) and channel ch:
 //   acc           = bias[ch] + sum_k weights[k][ch] * input[o + k*dilation][ch]
 //   output[o][ch] = clamp(requantize(acc, mult[ch], shift[ch]), min, max)
 //
 // A null bias_data is treated as an all-zero bias.
 void DepthwiseConvS16K3x1(
     const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
     const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
     const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
     const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
     const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
     int16_t* output_data) {
   const int16_t* activations = input_data;
   const int8_t* weights = filter_data;
   const int64_t* biases = bias_data;
   int channels = filter_shape.Dims(3);
   int frames = input_shape.Dims(2);
   int dilation = params.dilation_width_factor;
   const int32_t* output_mult = output_multiplier;
   int32_t output_activation_min = params.quantized_activation_min;
   int32_t output_activation_max = params.quantized_activation_max;
   int16_t* output = output_data;

   // No output frame exists unless all three taps fit. Return before the
   // preloads below, which would otherwise read frames that are not there.
   if (frames <= 2 * dilation) {
     return;
   }

   for (int c = 0; c + 32 <= channels; c += 32) {
     // Load weights and interleave into correct order [v58-v63].
     // Because there are more activations than weights, interleave weights.
     // Each post-incrementing load advances by `channels`, i.e. to the next
     // filter tap for the same 32-channel stripe.
     const int8_t* local_weights0 = weights + c;
     vld_b_p_xx(v0, local_weights0, channels);
     vaddw_h_vx(v48, v0, 0);
     vzip_h_vv(v58, v48, v49);

     vld_b_p_xx(v1, local_weights0, channels);
     vaddw_h_vx(v50, v1, 0);
     vzip_h_vv(v60, v50, v51);

     vld_b_x(v2, local_weights0);
     vaddw_h_vx(v52, v2, 0);
     vzip_h_vv(v62, v52, v53);

     // Assume biases fit in 32-bit. This assumption is verified offline.
     // Load biases and swizzle [v52-v55].
     int32_t local_biases[32];
     for (int j = 0; j < 32; j++) {
       // Bias is optional; a missing bias contributes zero.
       local_biases[j] = biases ? static_cast<int32_t>(biases[c + j]) : 0;
     }
     vld_w_x_m(v4, local_biases);
     vzip_w_vv(v52, v4, v5);
     vzip_w_vv(v54, v6, v7);

     const int32_t step = dilation * channels;
     const int32_t* local_output_mult = output_mult + c;
     const int32_t* local_output_shift = output_shift + c;
     for (int d = 0; d < dilation; d++) {
       // Accumulators will be [v48 - v51].
       const int16_t* local_activations0 = activations + (d * channels) + c;
       const int16_t* local_activations1 = local_activations0 + 16;
       int16_t* local_output = output + (d * channels) + c;

       // Registers [v0-v5 will be for loading activations]
       // Preload for valid padding:
       vld_h_p_xx(v0, local_activations0, step);
       vld_h_p_xx(v1, local_activations1, step);
       vld_h_p_xx(v2, local_activations0, step);
       vld_h_p_xx(v3, local_activations1, step);

       int frames_idx = (2 * dilation) + d;
       int32_t accumulators[32];
       for (; frames_idx < frames; frames_idx += dilation) {
         vld_h_p_xx(v4, local_activations0, step);
         vld_h_p_xx(v5, local_activations1, step);
         vmulw_w_vv(v48, v58, v0);    // Clobber accumulator
         vmulw_w_vv(v50, v59, v1);    // Clobber accumulator
         vadd_w_vv_m(v48, v48, v52);  // Add bias.
         vmulw_w_vv(v40, v60, v2);
         vmulw_w_vv(v42, v61, v3);
         vadd_w_vv_m(v48, v48, v40);
         vmulw_w_vv(v44, v62, v4);
         vmulw_w_vv(v46, v63, v5);
         vadd_w_vv_m(v48, v48, v44);

         vzip_w_vv(v48, v48, v49);  // Swizzle accumulators
         vzip_w_vv(v50, v50, v51);

         vst_w_x_m(v48, accumulators);  // Store accumulators

         // Output pipeline in scalar, to preserve bit accuracy with the ARM CPU
         // implementation.
         for (int i = 0; i < 32; i++) {
           int32_t result = tflite::MultiplyByQuantizedMultiplier(
               static_cast<int64_t>(accumulators[i]), local_output_mult[i],
               local_output_shift[i]);

           local_output[i] = static_cast<int16_t>(
               std::clamp(result, output_activation_min, output_activation_max));
         }

         // Slide registers
         vmvp_vv(v0, v2, v3);
         vmvp_vv(v2, v4, v5);

         local_output += step;
       }
     }
   }

   // Scalar tail for the trailing (channels % 32) channels, which the vector
   // loop above never visits. Accumulates in 64 bits, so the bias is used at
   // full width here.
   // TODO(derekjchow): Vectorize the channels % 32 cases.
   // Break it down into:
   //   - one loop looking for 16 byte stripes
   //   - one final loop handling remainder
   const int tail_begin = channels - (channels % 32);
   const int tap_stride = dilation * channels;
   for (int ch = tail_begin; ch < channels; ch++) {
     const int64_t bias = biases ? biases[ch] : 0;
     const int32_t w0 = weights[ch];
     const int32_t w1 = weights[channels + ch];
     const int32_t w2 = weights[(2 * channels) + ch];
     for (int o = 0; o + (2 * dilation) < frames; o++) {
       const int16_t* taps = activations + (o * channels) + ch;
       const int64_t acc = bias + (w0 * taps[0]) + (w1 * taps[tap_stride]) +
                           (w2 * taps[2 * tap_stride]);
       const int32_t result = tflite::MultiplyByQuantizedMultiplier(
           acc, output_mult[ch], output_shift[ch]);
       output[(o * channels) + ch] = static_cast<int16_t>(
           std::clamp(result, output_activation_min, output_activation_max));
     }
   }
 }

 // Fallback for filter sizes without a dedicated Kelvin kernel. For now this
 // simply forwards every argument, unchanged, to the TFLite reference
 // per-channel depthwise convolution.
 void DepthwiseConvS16Generic(
     const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
     const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
     const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
     const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
     const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
     int16_t* output_data) {
   // TBD: replace this delegation with a Kelvin implementation.
   tflite::reference_integer_ops::DepthwiseConvPerChannel(
       params, output_multiplier, output_shift,
       input_shape, input_data,
       filter_shape, filter_data,
       bias_shape, bias_data,
       output_shape, output_data);
 }
 }  // namespace

 // Entry point for depthwise convolution with s16 input, s8 filter and s64
 // bias.
 //
 // Valid-padded problems with unit stride and unit dilation are routed to the
 // Kelvin paths in this file; every other configuration is handled by the
 // TFLite reference per-channel kernel. All arguments are forwarded unchanged
 // to whichever implementation is selected.
 void DepthwiseConvS16(
     const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
     const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
     const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
     const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
     const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
     int16_t* output_data) {
   // Get parameters.
   const int stride_width = params.stride_width;
   const int stride_height = params.stride_height;
   const int dilation_width_factor = params.dilation_width_factor;
   const int dilation_height_factor = params.dilation_height_factor;
   const int filter_height = filter_shape.Dims(1);
   const int filter_width = filter_shape.Dims(2);

   if (params.padding_type == tflite::PaddingType::kValid && stride_width == 1 &&
       stride_height == 1 && dilation_width_factor == 1 &&
       dilation_height_factor == 1) {
     // generic implementation by default
     auto fn = DepthwiseConvS16Generic;

     // The 1x3 kernel walks a single [frame][channel] plane: it reads only
     // input_shape.Dims(2) and filter_shape.Dims(3). Extra batches or rows
     // would be left unwritten, and an input depth that differs from the filter
     // channel count (depth multiplier != 1) would mis-stride the input, so the
     // kernel is only selected when none of those apply.
     const bool single_plane =
         input_shape.Dims(0) == 1 && input_shape.Dims(1) == 1;
     const bool unit_depth_multiplier =
         input_shape.Dims(3) == filter_shape.Dims(3);

     // special case of filter size 3x1
     if (filter_height == 1 && filter_width == 3 && single_plane &&
         unit_depth_multiplier) {
       fn = DepthwiseConvS16K3x1;
     }

     fn(params, output_multiplier, output_shift, input_shape, input_data,
        filter_shape, filter_data, bias_shape, bias_data, output_shape,
        output_data);
     return;
   }

   // Use reference implementation
   tflite::reference_integer_ops::DepthwiseConvPerChannel(
       params, output_multiplier, output_shift, input_shape, input_data,
       filter_shape, filter_data, bias_shape, bias_data, output_shape,
       output_data);
 }

 }  // namespace kelvin::opt
	/*
	* Copyright 2023 Google LLC
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Depthwise convolution based on Kelvin ops
	// Data types: input: s16, filter: s8, bias s64

	#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
	#include "tflm/opt/conv_util.h"

	namespace kelvin::opt {
	namespace {
	// Depthwise convolution for a filter that is 1 high and 3 wide. The
	// multiply-accumulate runs on Kelvin vector ops in stripes of 32 channels; the
	// requantization stage and any trailing channels run in scalar code.
	//
	// Only three shape/parameter values are read: the channel count
	// (filter_shape.Dims(3)), the frame count (input_shape.Dims(2)) and the width
	// dilation. Input and output are indexed as [frame][channel] with the filter
	// channel count as the channel stride, so no other batch/row is visited and
	// the input depth is taken to equal the filter channel count. bias_shape and
	// output_shape are unused.
	//
	// For every output frame o in [0, frames - 2 * dilation) and channel ch:
	//   acc           = bias[ch] + sum_k weights[k][ch] * input[o + k*dilation][ch]
	//   output[o][ch] = clamp(requantize(acc, mult[ch], shift[ch]), min, max)
	//
	// A null bias_data is treated as an all-zero bias.
	void DepthwiseConvS16K3x1(
	    const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
	    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
	    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
	    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
	    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
	    int16_t* output_data) {
	  const int16_t* activations = input_data;
	  const int8_t* weights = filter_data;
	  const int64_t* biases = bias_data;
	  int channels = filter_shape.Dims(3);
	  int frames = input_shape.Dims(2);
	  int dilation = params.dilation_width_factor;
	  const int32_t* output_mult = output_multiplier;
	  int32_t output_activation_min = params.quantized_activation_min;
	  int32_t output_activation_max = params.quantized_activation_max;
	  int16_t* output = output_data;

	  // No output frame exists unless all three taps fit. Return before the
	  // preloads below, which would otherwise read frames that are not there.
	  if (frames <= 2 * dilation) {
	    return;
	  }

	  for (int c = 0; c + 32 <= channels; c += 32) {
	    // Load weights and interleave into correct order [v58-v63].
	    // Because there are more activations than weights, interleave weights.
	    // Each post-incrementing load advances by `channels`, i.e. to the next
	    // filter tap for the same 32-channel stripe.
	    const int8_t* local_weights0 = weights + c;
	    vld_b_p_xx(v0, local_weights0, channels);
	    vaddw_h_vx(v48, v0, 0);
	    vzip_h_vv(v58, v48, v49);

	    vld_b_p_xx(v1, local_weights0, channels);
	    vaddw_h_vx(v50, v1, 0);
	    vzip_h_vv(v60, v50, v51);

	    vld_b_x(v2, local_weights0);
	    vaddw_h_vx(v52, v2, 0);
	    vzip_h_vv(v62, v52, v53);

	    // Assume biases fit in 32-bit. This assumption is verified offline.
	    // Load biases and swizzle [v52-v55].
	    int32_t local_biases[32];
	    for (int j = 0; j < 32; j++) {
	      // Bias is optional; a missing bias contributes zero.
	      local_biases[j] = biases ? static_cast<int32_t>(biases[c + j]) : 0;
	    }
	    vld_w_x_m(v4, local_biases);
	    vzip_w_vv(v52, v4, v5);
	    vzip_w_vv(v54, v6, v7);

	    const int32_t step = dilation * channels;
	    const int32_t* local_output_mult = output_mult + c;
	    const int32_t* local_output_shift = output_shift + c;
	    for (int d = 0; d < dilation; d++) {
	      // Accumulators will be [v48 - v51].
	      const int16_t* local_activations0 = activations + (d * channels) + c;
	      const int16_t* local_activations1 = local_activations0 + 16;
	      int16_t* local_output = output + (d * channels) + c;

	      // Registers [v0-v5 will be for loading activations]
	      // Preload for valid padding:
	      vld_h_p_xx(v0, local_activations0, step);
	      vld_h_p_xx(v1, local_activations1, step);
	      vld_h_p_xx(v2, local_activations0, step);
	      vld_h_p_xx(v3, local_activations1, step);

	      int frames_idx = (2 * dilation) + d;
	      int32_t accumulators[32];
	      for (; frames_idx < frames; frames_idx += dilation) {
	        vld_h_p_xx(v4, local_activations0, step);
	        vld_h_p_xx(v5, local_activations1, step);
	        vmulw_w_vv(v48, v58, v0);    // Clobber accumulator
	        vmulw_w_vv(v50, v59, v1);    // Clobber accumulator
	        vadd_w_vv_m(v48, v48, v52);  // Add bias.
	        vmulw_w_vv(v40, v60, v2);
	        vmulw_w_vv(v42, v61, v3);
	        vadd_w_vv_m(v48, v48, v40);
	        vmulw_w_vv(v44, v62, v4);
	        vmulw_w_vv(v46, v63, v5);
	        vadd_w_vv_m(v48, v48, v44);

	        vzip_w_vv(v48, v48, v49);  // Swizzle accumulators
	        vzip_w_vv(v50, v50, v51);

	        vst_w_x_m(v48, accumulators);  // Store accumulators

	        // Output pipeline in scalar, to preserve bit accuracy with the ARM CPU
	        // implementation.
	        for (int i = 0; i < 32; i++) {
	          int32_t result = tflite::MultiplyByQuantizedMultiplier(
	              static_cast<int64_t>(accumulators[i]), local_output_mult[i],
	              local_output_shift[i]);

	          local_output[i] = static_cast<int16_t>(
	              std::clamp(result, output_activation_min, output_activation_max));
	        }

	        // Slide registers
	        vmvp_vv(v0, v2, v3);
	        vmvp_vv(v2, v4, v5);

	        local_output += step;
	      }
	    }
	  }

	  // Scalar tail for the trailing (channels % 32) channels, which the vector
	  // loop above never visits. Accumulates in 64 bits, so the bias is used at
	  // full width here.
	  // TODO(derekjchow): Vectorize the channels % 32 cases.
	  // Break it down into:
	  //   - one loop looking for 16 byte stripes
	  //   - one final loop handling remainder
	  const int tail_begin = channels - (channels % 32);
	  const int tap_stride = dilation * channels;
	  for (int ch = tail_begin; ch < channels; ch++) {
	    const int64_t bias = biases ? biases[ch] : 0;
	    const int32_t w0 = weights[ch];
	    const int32_t w1 = weights[channels + ch];
	    const int32_t w2 = weights[(2 * channels) + ch];
	    for (int o = 0; o + (2 * dilation) < frames; o++) {
	      const int16_t* taps = activations + (o * channels) + ch;
	      const int64_t acc = bias + (w0 * taps[0]) + (w1 * taps[tap_stride]) +
	                          (w2 * taps[2 * tap_stride]);
	      const int32_t result = tflite::MultiplyByQuantizedMultiplier(
	          acc, output_mult[ch], output_shift[ch]);
	      output[(o * channels) + ch] = static_cast<int16_t>(
	          std::clamp(result, output_activation_min, output_activation_max));
	    }
	  }
	}

	// Fallback for filter sizes without a dedicated Kelvin kernel. For now this
	// simply forwards every argument, unchanged, to the TFLite reference
	// per-channel depthwise convolution.
	void DepthwiseConvS16Generic(
	    const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
	    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
	    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
	    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
	    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
	    int16_t* output_data) {
	  // TBD: replace this delegation with a Kelvin implementation.
	  tflite::reference_integer_ops::DepthwiseConvPerChannel(
	      params, output_multiplier, output_shift,
	      input_shape, input_data,
	      filter_shape, filter_data,
	      bias_shape, bias_data,
	      output_shape, output_data);
	}
	} // namespace

	// Entry point for depthwise convolution with s16 input, s8 filter and s64
	// bias.
	//
	// Valid-padded problems with unit stride and unit dilation are routed to the
	// Kelvin paths in this file; every other configuration is handled by the
	// TFLite reference per-channel kernel. All arguments are forwarded unchanged
	// to whichever implementation is selected.
	void DepthwiseConvS16(
	    const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
	    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
	    const int16_t* input_data, const tflite::RuntimeShape& filter_shape,
	    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
	    const int64_t* bias_data, const tflite::RuntimeShape& output_shape,
	    int16_t* output_data) {
	  // Get parameters.
	  const int stride_width = params.stride_width;
	  const int stride_height = params.stride_height;
	  const int dilation_width_factor = params.dilation_width_factor;
	  const int dilation_height_factor = params.dilation_height_factor;
	  const int filter_height = filter_shape.Dims(1);
	  const int filter_width = filter_shape.Dims(2);

	  if (params.padding_type == tflite::PaddingType::kValid && stride_width == 1 &&
	      stride_height == 1 && dilation_width_factor == 1 &&
	      dilation_height_factor == 1) {
	    // generic implementation by default
	    auto fn = DepthwiseConvS16Generic;

	    // The 1x3 kernel walks a single [frame][channel] plane: it reads only
	    // input_shape.Dims(2) and filter_shape.Dims(3). Extra batches or rows
	    // would be left unwritten, and an input depth that differs from the filter
	    // channel count (depth multiplier != 1) would mis-stride the input, so the
	    // kernel is only selected when none of those apply.
	    const bool single_plane =
	        input_shape.Dims(0) == 1 && input_shape.Dims(1) == 1;
	    const bool unit_depth_multiplier =
	        input_shape.Dims(3) == filter_shape.Dims(3);

	    // special case of filter size 3x1
	    if (filter_height == 1 && filter_width == 3 && single_plane &&
	        unit_depth_multiplier) {
	      fn = DepthwiseConvS16K3x1;
	    }

	    fn(params, output_multiplier, output_shift, input_shape, input_data,
	       filter_shape, filter_data, bias_shape, bias_data, output_shape,
	       output_data);
	    return;
	  }

	  // Use reference implementation
	  tflite::reference_integer_ops::DepthwiseConvPerChannel(
	      params, output_multiplier, output_shift, input_shape, input_data,
	      filter_shape, filter_data, bias_shape, bias_data, output_shape,
	      output_data);
	}

	} // namespace kelvin::opt