Merge "Depthwise conv for Depth % == 16"
diff --git a/docs/kelvin_isa.md b/docs/kelvin_isa.md
index 4b9b06c..5aaac0c 100644
--- a/docs/kelvin_isa.md
+++ b/docs/kelvin_isa.md
@@ -85,6 +85,10 @@
where a stripmine register must use a mod4 base aligned register (eg. v0, v4,
v8, ...). Normal instruction and stripmine variants may be mixed together.
+Currently, neither the assembler nor kelvin_sim checks for invalid stripmine
+registers. Code using invalid registers (like v1) will compile and sim, but
+will cause the FPGA to hang.
+
When stripmining is used in conjunction with instructions which use a register
index as a base to several registers, the offset of +4 (instead of +1) shall be
used. e.g., {vm0,vm1} becomes {{v0,v1,v2,v3},{v4,v5,v6,v7}}.
@@ -753,7 +757,7 @@
### ACONV
-Convolution ALU operation.
+Performs matmul vs1*vs3, accumulating into the accumulator.
**Encodings**
@@ -787,29 +791,32 @@
(signed(SData2,Data2{7:0}) + signed(Bias2{8:0})){9:0}){18:0}
```
-Length (stop - start + 1) is in 32bit accumulator lane count, as all inputs will
-horizontally reduce to this size.
+vs1 goes to the *narrow* port of the matmul. 8 vectors are always used.
-The Start and Stop definition allows for a partial window of input values to be
-transpose broadcast into the convolution unit.
+vs3 goes to the *wide* port of the matmul, up to 8 vectors are used.
+
+vx2 specifies control params used in the operation and has the following
+format:
Mode | Mode | Usage
:----: | :--: | :-----------------------------------------------:
Common | | Mode[1:0] Start[6:2] Stop[11:7]
s8 | 0 | SData2[31] SBias2[30:22] SData1[21] SBias1[20:12]
-```
-# SIMD256
-acc.out = {v48..55}
-narrow0 = {v0..7}
-narrow1 = {v16..23}
-narrow2 = {v32..39}
-narrow3 = {v48..55}
-wide0 = {v8..15}
-wide1 = {v24..31}
-wide2 = {v40..47}
-wide3 = {v56..63}
-```
+Start and Stop control the window of input values to participate in the
+matmul:
+- On vs1 this is in 4-byte words on all 8 vectors at the same time.
+- On vs3 this is the register number to use (vs3+0 to vs3+7).
+- The operation takes (stop - start + 1) ticks to complete.
+
+When using SIMD256, the following operands are valid:
+- vd: v48
+- vs1: v0, v16, v32, v48
+- vs3: v8, v24, v40, v56
+
+Notes:
+- v48 is used as vd but never written to.
+- v48-v55 will always be overwritten upon VCGET.
### VCGET
@@ -830,6 +837,8 @@
```
+v48 is the only valid vd in this instruction.
+
### ACSET
Copy general registers into convolution accumulators.
@@ -847,6 +856,8 @@
Accum{Y} = vd{Y}
```
+Note that v48 is used as vd but never written to.
+
--------------------------------------------------------------------------------
### ACTR
@@ -860,13 +871,15 @@
**Operation**
```
-assert(vd in {v48})
+assert(vd == 48)
assert(vs1 in {v0, v16, v32, v48}
for I in i32.typelen
for J in i32.typelen
ACCUM[J][I] = vs1[I][J]
```
+Note that v48 is used as vd but never written to.
+
--------------------------------------------------------------------------------
### VCLB
@@ -1813,7 +1826,7 @@
vslidep.[b,h,w].[1,2,3,4].vv vd, vs1, vs2 \
vslidevp.[b,h,w].[1,2,3,4].vv.m vd, vs1, vs2 \
-vslidevp.[b,h,w].[1,2,3,4].vv.m vd, vs1, xs2
+vslidevp.[b,h,w].[1,2,3,4].vx.m vd, vs1, xs2
**Operation**
diff --git a/tflm/opt/BUILD b/tflm/opt/BUILD
index b9c0d76..9957e7a 100644
--- a/tflm/opt/BUILD
+++ b/tflm/opt/BUILD
@@ -22,17 +22,20 @@
"conv_s8.cc",
"conv_s8_1x1.cc",
"conv_s8_3x1_d48.cc",
- "conv_s8_d4.cc",
- "conv_s8_d32.cc",
"conv_s8_d1.cc",
+ "conv_s8_d32.cc",
+ "conv_s8_d4.cc",
"depthwise_conv_s16.cc",
"depthwise_conv_s8.cc",
"elementwise_add_s16.cc",
"elementwise_add_s32.cc",
"elementwise_add_s8.cc",
+ "elementwise_mul_s16.cc",
+ "elementwise_mul_s8.cc",
"leaky_relu_s16.cc",
"leaky_relu_s8.cc",
"logistic_s8.cc",
+ "max_pool_s16.cc",
"max_pool_s8.cc",
"memcpy.cc",
"resize_nearest_neighbor_s8.cc",
diff --git a/tflm/opt/elementwise_add_s16.cc b/tflm/opt/elementwise_add_s16.cc
index 106742b..bb445c3 100644
--- a/tflm/opt/elementwise_add_s16.cc
+++ b/tflm/opt/elementwise_add_s16.cc
@@ -20,16 +20,28 @@
namespace kelvin::opt {
-void ElementwiseAddS16(const int16_t* input1, const int16_t* input2,
- const int32_t input1_offset, const int32_t input1_mult,
- const int32_t input1_shift, const int32_t input2_offset,
- const int32_t input2_mult, const int32_t input2_shift,
- const int32_t left_shift, int16_t* output,
- const int32_t output_offset, const int32_t output_mult,
- const int32_t output_shift,
- const int32_t output_activation_min,
- const int32_t output_activation_max,
- const int32_t block_size) {
+void ElementwiseAddS16(const tflite::ArithmeticParams& params,
+ const tflite::RuntimeShape& input1_shape,
+ const int16_t* input1,
+ const tflite::RuntimeShape& input2_shape,
+ const int16_t* input2,
+ const tflite::RuntimeShape& output_shape,
+ int16_t* output) {
+ const int32_t input1_offset = params.input1_offset;
+ const int32_t input1_mult = params.input1_multiplier;
+ const int32_t input1_shift = params.input1_shift;
+ const int32_t input2_offset = params.input2_offset;
+ const int32_t input2_mult = params.input2_multiplier;
+ const int32_t input2_shift = params.input2_shift;
+ const int32_t left_shift = params.left_shift;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_mult = params.output_multiplier;
+ const int32_t output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ const int block_size =
+ MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
int blocks = block_size;
int vl;
getmaxvl_h(vl);
diff --git a/tflm/opt/elementwise_add_s32.cc b/tflm/opt/elementwise_add_s32.cc
index ab2b3d1..e83d1eb 100644
--- a/tflm/opt/elementwise_add_s32.cc
+++ b/tflm/opt/elementwise_add_s32.cc
@@ -18,10 +18,18 @@
#include "tflm/opt/opt.h"
namespace kelvin::opt {
-void ElementwiseAddS32(const int32_t* input1, const int32_t* input2,
- int32_t* output, const int32_t output_activation_min,
- const int32_t output_activation_max,
- const int32_t block_size) {
+void ElementwiseAddS32(const tflite::ArithmeticParams& params,
+ const tflite::RuntimeShape& input1_shape,
+ const int32_t* input1,
+ const tflite::RuntimeShape& input2_shape,
+ const int32_t* input2,
+ const tflite::RuntimeShape& output_shape,
+ int32_t* output) {
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ const int block_size =
+ MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
int blocks = block_size;
int vl;
getmaxvl_w_m(vl);
diff --git a/tflm/opt/elementwise_add_s8.cc b/tflm/opt/elementwise_add_s8.cc
index e664769..9d24449 100644
--- a/tflm/opt/elementwise_add_s8.cc
+++ b/tflm/opt/elementwise_add_s8.cc
@@ -20,16 +20,28 @@
namespace kelvin::opt {
-void ElementwiseAddS8(const int8_t* input1, const int8_t* input2,
- const int32_t input1_offset, const int32_t input1_mult,
- const int32_t input1_shift, const int32_t input2_offset,
- const int32_t input2_mult, const int32_t input2_shift,
- const int32_t left_shift, int8_t* output,
- const int32_t output_offset, const int32_t output_mult,
- const int32_t output_shift,
- const int32_t output_activation_min,
- const int32_t output_activation_max,
- const int32_t block_size) {
+void ElementwiseAddS8(const tflite::ArithmeticParams& params,
+ const tflite::RuntimeShape& input1_shape,
+ const int8_t* input1,
+ const tflite::RuntimeShape& input2_shape,
+ const int8_t* input2,
+ const tflite::RuntimeShape& output_shape,
+ int8_t* output) {
+ const int32_t input1_offset = params.input1_offset;
+ const int32_t input1_mult = params.input1_multiplier;
+ const int32_t input1_shift = params.input1_shift;
+ const int32_t input2_offset = params.input2_offset;
+ const int32_t input2_mult = params.input2_multiplier;
+ const int32_t input2_shift = params.input2_shift;
+ const int32_t left_shift = params.left_shift;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_mult = params.output_multiplier;
+ const int32_t output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ const int block_size =
+ MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
int blocks = block_size;
const int32_t input1_shift_mul = 1 << LEFT_SHIFT(input1_shift);
diff --git a/tflm/opt/elementwise_mul_s16.cc b/tflm/opt/elementwise_mul_s16.cc
new file mode 100644
index 0000000..478201c
--- /dev/null
+++ b/tflm/opt/elementwise_mul_s16.cc
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "crt/kelvin.h"
+#include "tflm/opt/opt.h"
+#include "tflm/opt/util.h"
+
+namespace kelvin::opt {
+void MulS16(const tflite::ArithmeticParams& params,
+ const tflite::RuntimeShape& input1_shape,
+ const int16_t* input1_data,
+ const tflite::RuntimeShape& input2_shape,
+ const int16_t* input2_data,
+ const tflite::RuntimeShape& output_shape, int16_t* output_data) {
+ const int32_t input1_offset = params.input1_offset;
+ const int32_t input2_offset = params.input2_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_mult = params.output_multiplier;
+ const int32_t output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+
+ const int block_size =
+ MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+ int blocks = block_size;
+ int vl;
+ getmaxvl_h(vl);
+ while (blocks) {
+ int count = std::min(blocks, vl);
+
+ // Widen input1 to 32-bit wide values (in vm0, vm1).
+ vld_h_lp_xx_m(vm0, input1_data, count);
+ vaddw_w_vx_m(vm0, vm0, input1_offset);
+
+ // Widen input2 to 32-bit wide values (in vm2, vm3).
+ vld_h_lp_xx_m(vm2, input2_data, count);
+ vaddw_w_vx_m(vm2, vm2, input2_offset);
+
+ // Multiply the rescaled inputs.
+ vmul_w_vv_m(vm0, vm0, vm2);
+ vmul_w_vv_m(vm1, vm1, vm3);
+
+    // Rescale the product.
+ rescale_m(vm0, vm0, output_mult, output_shift, output_offset);
+ rescale_m(vm1, vm1, output_mult, output_shift, output_offset);
+
+ // Clamp to the provided range.
+ vmin_w_vx_m(vm0, vm0, output_activation_max);
+ vmin_w_vx_m(vm1, vm1, output_activation_max);
+ vmax_w_vx_m(vm0, vm0, output_activation_min);
+ vmax_w_vx_m(vm1, vm1, output_activation_min);
+
+    // Swizzle and narrow back to halfwords.
+ vand_w_vx_m(vm0, vm0, 0xFFFF);
+ vand_w_vx_m(vm1, vm1, 0xFFFF);
+ vsll_w_vx_m(vm1, vm1, 16);
+ vor_vv_m(vm0, vm0, vm1);
+
+ // Store to memory.
+ vst_h_lp_xx_m(vm0, output_data, count);
+
+ blocks -= count;
+ }
+}
+
+} // namespace kelvin::opt
diff --git a/tflm/opt/elementwise_mul_s8.cc b/tflm/opt/elementwise_mul_s8.cc
new file mode 100644
index 0000000..6e4ef77
--- /dev/null
+++ b/tflm/opt/elementwise_mul_s8.cc
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "crt/kelvin.h"
+#include "tflm/opt/opt.h"
+#include "tflm/opt/util.h"
+
+namespace kelvin::opt {
+void MulS8(const tflite::ArithmeticParams& params,
+ const tflite::RuntimeShape& input1_shape, const int8_t* input1_data,
+ const tflite::RuntimeShape& input2_shape, const int8_t* input2_data,
+ const tflite::RuntimeShape& output_shape, int8_t* output_data) {
+ const int32_t input1_offset = params.input1_offset;
+ const int32_t input2_offset = params.input2_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_mult = params.output_multiplier;
+ const int32_t output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+
+ const int block_size =
+ MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+ int blocks = block_size;
+
+ while (blocks >= 96) {
+ vld_b_lp_xx(v0, input1_data, 32);
+ vld_b_lp_xx(v8, input2_data, 32);
+
+ vaddw_h_vx(v2, v0, 0);
+ vaddw_h_vx(v10, v8, 0);
+ vaddw_w_vx(v4, v2, input1_offset);
+ vaddw_w_vx(v6, v3, input1_offset);
+ vaddw_w_vx(v12, v10, input2_offset);
+ vaddw_w_vx(v14, v11, input2_offset);
+
+ vld_b_lp_xx(v16, input1_data, 32);
+ vld_b_lp_xx(v24, input2_data, 32);
+
+ vaddw_h_vx(v18, v16, 0);
+ vaddw_h_vx(v26, v24, 0);
+ vaddw_w_vx(v20, v18, input1_offset);
+ vaddw_w_vx(v22, v19, input1_offset);
+ vaddw_w_vx(v28, v26, input2_offset);
+ vaddw_w_vx(v30, v27, input2_offset);
+
+ vld_b_lp_xx(v32, input1_data, 32);
+ vld_b_lp_xx(v40, input2_data, 32);
+
+ vaddw_h_vx(v34, v32, 0);
+ vaddw_h_vx(v42, v40, 0);
+ vaddw_w_vx(v36, v34, input1_offset);
+ vaddw_w_vx(v38, v35, input1_offset);
+ vaddw_w_vx(v44, v42, input2_offset);
+ vaddw_w_vx(v46, v43, input2_offset);
+
+ vmul_w_vv_m(v12, v4, v12);
+ vmul_w_vv_m(v28, v20, v28);
+ vmul_w_vv_m(v44, v36, v44);
+
+ vdmulh_w_r_vx_m(v12, v12, output_mult);
+ vdmulh_w_r_vx_m(v28, v28, output_mult);
+ vdmulh_w_r_vx_m(v44, v44, output_mult);
+ vsha_w_r_vx_m(v12, v12, -output_shift);
+ vsha_w_r_vx_m(v28, v28, -output_shift);
+ vsha_w_r_vx_m(v44, v44, -output_shift);
+ vadd_w_vx_m(v12, v12, output_offset);
+ vadd_w_vx_m(v28, v28, output_offset);
+ vadd_w_vx_m(v44, v44, output_offset);
+
+ vmin_w_vx_m(v12, v12, output_activation_max);
+ vmin_w_vx_m(v28, v28, output_activation_max);
+ vmin_w_vx_m(v44, v44, output_activation_max);
+ vmax_w_vx_m(v12, v12, output_activation_min);
+ vmax_w_vx_m(v28, v28, output_activation_min);
+ vmax_w_vx_m(v44, v44, output_activation_min);
+
+ vsraqs_b_vx(v12, v12, 0);
+ vst_b_lp_xx(v12, output_data, 32);
+ vsraqs_b_vx(v28, v28, 0);
+ vst_b_lp_xx(v28, output_data, 32);
+ vsraqs_b_vx(v44, v44, 0);
+ vst_b_lp_xx(v44, output_data, 32);
+
+ blocks -= 96;
+ }
+
+ while (blocks) {
+ int count = std::min(blocks, 32);
+ vld_b_lp_xx(v0, input1_data, count);
+ vld_b_lp_xx(v8, input2_data, count);
+
+ vaddw_h_vx(v2, v0, 0);
+ vaddw_h_vx(v10, v8, 0);
+ vaddw_w_vx(v4, v2, input1_offset);
+ vaddw_w_vx(v6, v3, input1_offset);
+ vaddw_w_vx(v12, v10, input2_offset);
+ vaddw_w_vx(v14, v11, input2_offset);
+
+ vmul_w_vv_m(v16, v4, v12);
+
+ rescale_m(v16, v16, output_mult, output_shift, output_offset);
+
+ vmin_w_vx_m(v16, v16, output_activation_max);
+ vmax_w_vx_m(v16, v16, output_activation_min);
+
+ vsraqs_b_vx(v16, v16, 0);
+ vst_b_lp_xx(v16, output_data, count);
+
+ blocks -= count;
+ }
+}
+
+} // namespace kelvin::opt
diff --git a/tflm/opt/leaky_relu_s16.cc b/tflm/opt/leaky_relu_s16.cc
index 7427a6c..c750f84 100644
--- a/tflm/opt/leaky_relu_s16.cc
+++ b/tflm/opt/leaky_relu_s16.cc
@@ -21,13 +21,17 @@
#include "tflm/opt/util.h"
namespace kelvin::opt {
-void LeakyReluS16(const int16_t* input, int16_t* output,
- const int32_t block_size, const int32_t input_zero_point,
- const int32_t output_zero_point,
- const int32_t output_multiplier_alpha,
- const int32_t output_shift_alpha,
- const int32_t output_multiplier_identity,
- const int32_t output_shift_identity) {
+void LeakyReluS16(const tflite::LeakyReluParams ¶ms,
+ const tflite::RuntimeShape &input_shape, const int16_t *input,
+ const tflite::RuntimeShape &output_shape, int16_t *output) {
+ const int32_t input_zero_point = params.input_offset;
+ const int32_t output_zero_point = params.output_offset;
+ const int32_t output_multiplier_alpha = params.output_multiplier_alpha;
+ const int32_t output_shift_alpha = params.output_shift_alpha;
+ const int32_t output_multiplier_identity = params.output_multiplier_identity;
+ const int32_t output_shift_identity = params.output_shift_identity;
+ const int block_size = MatchingFlatSize(input_shape, output_shape);
+
constexpr int32_t quantized_output_min = std::numeric_limits<int16_t>::min();
constexpr int32_t quantized_output_max = std::numeric_limits<int16_t>::max();
int32_t right_shift_identity = std::min(output_shift_identity, 0L);
diff --git a/tflm/opt/leaky_relu_s8.cc b/tflm/opt/leaky_relu_s8.cc
index 8b30d19..8e43100 100644
--- a/tflm/opt/leaky_relu_s8.cc
+++ b/tflm/opt/leaky_relu_s8.cc
@@ -22,19 +22,24 @@
namespace kelvin::opt {
-void LeakyReluS8(const int8_t* input, int8_t* output, const int32_t block_size,
- const int32_t input_zero_point,
- const int32_t output_zero_point,
- const int32_t output_multiplier_alpha,
- const int32_t output_shift_alpha,
- const int32_t output_multiplier_identity,
- const int32_t output_shift_identity) {
+void LeakyReluS8(const tflite::LeakyReluParams& params,
+ const tflite::RuntimeShape& input_shape, const int8_t* input,
+ const tflite::RuntimeShape& output_shape, int8_t* output) {
+ const int32_t input_zero_point = params.input_offset;
+ const int32_t output_zero_point = params.output_offset;
+ const int32_t output_multiplier_alpha = params.output_multiplier_alpha;
+ const int32_t output_shift_alpha = params.output_shift_alpha;
+ const int32_t output_multiplier_identity = params.output_multiplier_identity;
+ const int32_t output_shift_identity = params.output_shift_identity;
+ const int block_size = MatchingFlatSize(input_shape, output_shape);
+
constexpr int32_t quantized_output_min = std::numeric_limits<int16_t>::min();
constexpr int32_t quantized_output_max = std::numeric_limits<int16_t>::max();
int32_t right_shift_identity = std::min(output_shift_identity, 0L);
int32_t left_shift_identity = std::max(output_shift_identity, 0L);
int32_t right_shift_alpha = std::min(output_shift_alpha, 0L);
int32_t left_shift_alpha = std::max(output_shift_alpha, 0L);
+
int blocks = block_size;
int vl;
getmaxvl_b(vl);
diff --git a/tflm/opt/max_pool_s16.cc b/tflm/opt/max_pool_s16.cc
new file mode 100644
index 0000000..4e3aa46
--- /dev/null
+++ b/tflm/opt/max_pool_s16.cc
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "crt/kelvin.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace kelvin::opt {
+void MaxPoolS16(const tflite::PoolParams ¶ms,
+ const tflite::RuntimeShape &input_shape,
+ const int16_t *input_data,
+ const tflite::RuntimeShape &output_shape,
+ int16_t *output_data) {
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+ for (int batch = 0; batch < batches; ++batch) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ const int in_x_origin =
+ (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin =
+ (out_y * stride_height) - params.padding_values.height;
+
+ // Compute the boundaries of the filter region clamped so as to
+ // ensure that the filter window fits in the input array.
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end =
+ std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end =
+ std::min(params.filter_height, input_height - in_y_origin);
+
+ int channel = 0;
+ for (; channel + 16 <= depth; channel += 16) {
+ vdup_h_x(v0, params.quantized_activation_min);
+ for (int filter_y = filter_y_start; filter_y < filter_y_end;
+ ++filter_y) {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end;
+ ++filter_x) {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ const int16_t *local_input =
+ input_data + Offset(input_shape, batch, in_y, in_x, channel);
+ vld_h_x(v1, local_input);
+ vmax_h_vv(v0, v0, v1);
+ }
+ }
+ vmin_h_vx(v0, v0, params.quantized_activation_max);
+ int16_t *local_output =
+ output_data + Offset(output_shape, batch, out_y, out_x, channel);
+ vst_h_x(v0, local_output);
+ }
+
+ if (channel == depth) {
+ continue;
+ }
+ int remaining_channels = depth - channel;
+ vdup_h_x(v0, params.quantized_activation_min);
+ for (int filter_y = filter_y_start; filter_y < filter_y_end;
+ ++filter_y) {
+ for (int filter_x = filter_x_start; filter_x < filter_x_end;
+ ++filter_x) {
+ const int in_x = in_x_origin + filter_x;
+ const int in_y = in_y_origin + filter_y;
+ const int16_t *local_input =
+ input_data + Offset(input_shape, batch, in_y, in_x, depth - 1);
+ vld_h_l_xx(v1, local_input, remaining_channels);
+ vmax_h_vv(v0, v0, v1);
+ }
+ }
+ vmin_h_vx(v0, v0, params.quantized_activation_max);
+ int16_t *local_output =
+ output_data + Offset(output_shape, batch, out_y, out_x, depth - 1);
+ vst_h_l_xx(v0, local_output, remaining_channels);
+ }
+ }
+ }
+}
+
+} // namespace kelvin::opt
diff --git a/tflm/opt/opt.h b/tflm/opt/opt.h
index e5169b2..053fbd6 100644
--- a/tflm/opt/opt.h
+++ b/tflm/opt/opt.h
@@ -25,44 +25,36 @@
namespace kelvin::opt {
void* Memcpy(void* dst, const void* src, size_t n);
-void ElementwiseAddS8(const int8_t* input1, const int8_t* input2,
- const int32_t input1_offset, const int32_t input1_mult,
- const int32_t input1_shift, const int32_t input2_offset,
- const int32_t input2_mult, const int32_t input2_shift,
- const int32_t left_shift, int8_t* output,
- const int32_t output_offset, const int32_t output_mult,
- const int32_t output_shift,
- const int32_t output_activation_min,
- const int32_t output_activation_max,
- const int32_t block_size);
-void ElementwiseAddS16(const int16_t* input1, const int16_t* input2,
- const int32_t input1_offset, const int32_t input1_mult,
- const int32_t input1_shift, const int32_t input2_offset,
- const int32_t input2_mult, const int32_t input2_shift,
- const int32_t left_shift, int16_t* output,
- const int32_t output_offset, const int32_t output_mult,
- const int32_t output_shift,
- const int32_t output_activation_min,
- const int32_t output_activation_max,
- const int32_t block_size);
-void ElementwiseAddS32(const int32_t* input1, const int32_t* input2,
- int32_t* output, const int32_t output_activation_min,
- const int32_t output_activation_max,
- const int32_t block_size);
-void LeakyReluS8(const int8_t* input, int8_t* output, const int32_t block_size,
- const int32_t input_zero_point,
- const int32_t output_zero_point,
- const int32_t output_multiplier_alpha,
- const int32_t output_shift_alpha,
- const int32_t output_multiplier_identity,
- const int32_t output_shift_identity);
-void LeakyReluS16(const int16_t* input, int16_t* output,
- const int32_t block_size, const int32_t input_zero_point,
- const int32_t output_zero_point,
- const int32_t output_multiplier_alpha,
- const int32_t output_shift_alpha,
- const int32_t output_multiplier_identity,
- const int32_t output_shift_identity);
+void ElementwiseAddS8(const tflite::ArithmeticParams& params,
+ const tflite::RuntimeShape& input1_shape,
+ const int8_t* input1_data,
+ const tflite::RuntimeShape& input2_shape,
+ const int8_t* input2_data,
+ const tflite::RuntimeShape& output_shape,
+ int8_t* output_data);
+void ElementwiseAddS16(const tflite::ArithmeticParams& params,
+ const tflite::RuntimeShape& input1_shape,
+ const int16_t* input1_data,
+ const tflite::RuntimeShape& input2_shape,
+ const int16_t* input2_data,
+ const tflite::RuntimeShape& output_shape,
+ int16_t* output_data);
+void ElementwiseAddS32(const tflite::ArithmeticParams& params,
+ const tflite::RuntimeShape& input1_shape,
+ const int32_t* input1_data,
+ const tflite::RuntimeShape& input2_shape,
+ const int32_t* input2_data,
+ const tflite::RuntimeShape& output_shape,
+ int32_t* output_data);
+void LeakyReluS8(const tflite::LeakyReluParams& params,
+ const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data,
+ const tflite::RuntimeShape& output_shape, int8_t* output_data);
+void LeakyReluS16(const tflite::LeakyReluParams& params,
+ const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data,
+ const tflite::RuntimeShape& output_shape,
+ int16_t* output_data);
void ConvS16B32(const tflite::ConvParams& params,
const int32_t* output_multiplier, const int32_t* output_shift,
const tflite::RuntimeShape& input_shape,
@@ -105,30 +97,30 @@
const tflite::RuntimeShape& input_shape,
const int8_t* input_data,
const tflite::RuntimeShape& output_shape, int8_t* output_data);
+void MaxPoolS16(const tflite::PoolParams& params,
+ const tflite::RuntimeShape& input_shape,
+ const int16_t* input_data,
+ const tflite::RuntimeShape& output_shape, int16_t* output_data);
+void MulS8(const tflite::ArithmeticParams& params,
+ const tflite::RuntimeShape& input1_shape, const int8_t* input1_data,
+ const tflite::RuntimeShape& input2_shape, const int8_t* input2_data,
+ const tflite::RuntimeShape& output_shape, int8_t* output_data);
+void MulS16(const tflite::ArithmeticParams& params,
+ const tflite::RuntimeShape& input1_shape,
+ const int16_t* input1_data,
+ const tflite::RuntimeShape& input2_shape,
+ const int16_t* input2_data,
+ const tflite::RuntimeShape& output_shape, int16_t* output_data);
void LogisticS8(int32_t input_zero_point, int32_t input_range_radius,
int32_t input_multiplier, int32_t input_left_shift,
int32_t input_size, const int8_t* input_data,
int8_t* output_data);
-void KelvinResizeNearestNeighbor(
+void ResizeNearestNeighborS8(
const tflite::ResizeNearestNeighborParams& op_params,
const tflite::RuntimeShape& unextended_input_shape,
const int8_t* input_data, const tflite::RuntimeShape& output_size_shape,
const int32_t* output_size_data,
const tflite::RuntimeShape& unextended_output_shape, int8_t* output_data);
-void KelvinResizeNN2x(const tflite::ResizeNearestNeighborParams& op_params,
- const tflite::RuntimeShape& input_shape,
- const tflite::RuntimeShape& output_shape,
- const int32_t input_height, const int32_t input_width,
- const int32_t output_height, const int32_t output_width,
- const int8_t* input_data, int8_t* output_data);
-void KelvinResizeNNGeneric(const tflite::ResizeNearestNeighborParams& op_params,
- const tflite::RuntimeShape& input_shape,
- const tflite::RuntimeShape& output_shape,
- const int32_t input_height,
- const int32_t input_width,
- const int32_t output_height,
- const int32_t output_width, const int8_t* input_data,
- int8_t* output_data);
} // namespace kelvin::opt
diff --git a/tflm/opt/resize_nearest_neighbor_s8.cc b/tflm/opt/resize_nearest_neighbor_s8.cc
index 1f5cafb..8da7ee9 100644
--- a/tflm/opt/resize_nearest_neighbor_s8.cc
+++ b/tflm/opt/resize_nearest_neighbor_s8.cc
@@ -21,12 +21,12 @@
#include "tensorflow/lite/kernels/internal/types.h"
#include "tflm/opt/opt.h"
-inline int32_t KelvinGetNearestNeighbor(const int input_value,
- const int32_t input_size,
- const int32_t output_size,
- const bool align_corners,
- const bool half_pixel_centers,
- const float scale, const float offset) {
+namespace kelvin::opt {
+namespace {
+int32_t GetNearestNeighbor(const int input_value, const int32_t input_size,
+ const int32_t output_size, const bool align_corners,
+ const bool half_pixel_centers, const float scale,
+ const float offset) {
int32_t output_value = std::min(
align_corners
? static_cast<int32_t>(
@@ -39,14 +39,12 @@
return output_value;
}
-namespace kelvin::opt {
-
-void KelvinResizeNN2x(const tflite::ResizeNearestNeighborParams& op_params,
- const tflite::RuntimeShape& input_shape,
- const tflite::RuntimeShape& output_shape,
- const int32_t input_height, const int32_t input_width,
- const int32_t output_height, const int32_t output_width,
- const int8_t* input_data, int8_t* output_data) {
+void ResizeNN2x(const tflite::ResizeNearestNeighborParams& op_params,
+ const tflite::RuntimeShape& input_shape,
+ const tflite::RuntimeShape& output_shape,
+ const int32_t input_height, const int32_t input_width,
+ const int32_t output_height, const int32_t output_width,
+ const int8_t* input_data, int8_t* output_data) {
int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
const int col_offset = input_shape.Dims(3);
@@ -54,7 +52,6 @@
const int batch_offset = input_shape.Dims(1) * row_offset;
const int8_t* input_ptr = input_data;
- const int8_t* input_tmp_ptr = input_data;
int8_t* output_ptr = output_data;
for (int b = 0; b < batches; ++b) {
@@ -83,14 +80,12 @@
}
}
-void KelvinResizeNNGeneric(const tflite::ResizeNearestNeighborParams& op_params,
- const tflite::RuntimeShape& input_shape,
- const tflite::RuntimeShape& output_shape,
- const int32_t input_height,
- const int32_t input_width,
- const int32_t output_height,
- const int32_t output_width, const int8_t* input_data,
- int8_t* output_data) {
+void ResizeNNGeneric(const tflite::ResizeNearestNeighborParams& op_params,
+ const tflite::RuntimeShape& input_shape,
+ const tflite::RuntimeShape& output_shape,
+ const int32_t input_height, const int32_t input_width,
+ const int32_t output_height, const int32_t output_width,
+ const int8_t* input_data, int8_t* output_data) {
int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
const int col_offset = input_shape.Dims(3);
@@ -113,12 +108,12 @@
for (int b = 0; b < batches; ++b) {
for (int y = 0; y < output_height; ++y) {
- int32_t in_y = KelvinGetNearestNeighbor(
+ int32_t in_y = GetNearestNeighbor(
y, input_height, output_height, op_params.align_corners,
op_params.half_pixel_centers, y_scale, offset);
const int8_t* y_input_ptr = input_ptr + in_y * row_offset;
for (int x = 0; x < output_width; ++x) {
- int32_t in_x = KelvinGetNearestNeighbor(
+ int32_t in_x = GetNearestNeighbor(
x, input_width, output_width, op_params.align_corners,
op_params.half_pixel_centers, x_scale, offset);
const int8_t* x_input_ptr = y_input_ptr + in_x * col_offset;
@@ -130,8 +125,9 @@
input_ptr += batch_offset;
}
}
+} // namespace
-void KelvinResizeNearestNeighbor(
+void ResizeNearestNeighborS8(
const tflite::ResizeNearestNeighborParams& op_params,
const tflite::RuntimeShape& unextended_input_shape,
const int8_t* input_data, const tflite::RuntimeShape& output_size_shape,
@@ -153,14 +149,13 @@
int32_t output_width = output_size_data[1];
if (output_height == 2 * input_height && output_width == 2 * input_width) {
- KelvinResizeNN2x(op_params, input_shape, output_shape, input_height,
- input_width, output_height, output_width, input_data,
- output_data);
+ ResizeNN2x(op_params, input_shape, output_shape, input_height, input_width,
+ output_height, output_width, input_data, output_data);
} else {
- KelvinResizeNNGeneric(op_params, input_shape, output_shape, input_height,
- input_width, output_height, output_width, input_data,
- output_data);
+ ResizeNNGeneric(op_params, input_shape, output_shape, input_height,
+ input_width, output_height, output_width, input_data,
+ output_data);
}
}