Merge "sw/kelvin: add test target for newly added mul op for Kelvin"
diff --git a/benchmarks/benchmark.h b/benchmarks/benchmark.h
index 59383fb..16c1c85 100644
--- a/benchmarks/benchmark.h
+++ b/benchmarks/benchmark.h
@@ -17,11 +17,15 @@
#ifndef BENCHMARKS_BENCHMARK_H_
#define BENCHMARKS_BENCHMARK_H_
+#define ML_RUN_INDICATOR_IO 16
+#define ML_TOGGLE_PER_INF_IO 17
+
typedef struct {
uint32_t return_code;
uint32_t iterations;
uint64_t cycles;
uint32_t mismatch_count;
+ uint32_t gpio_toggle_per_inference;
} BenchmarkOutputHeader;
-#endif // #ifndef BENCHMARKS_BENCHMARK_H_
+#endif // BENCHMARKS_BENCHMARK_H_
diff --git a/benchmarks/benchmark_kelvin.cc b/benchmarks/benchmark_kelvin.cc
index 079f567..7e72835 100644
--- a/benchmarks/benchmark_kelvin.cc
+++ b/benchmarks/benchmark_kelvin.cc
@@ -68,6 +68,7 @@
.iterations = 0,
.cycles = 0,
.mismatch_count = 0,
+ .gpio_toggle_per_inference = 0,
};
// This includes all ops currently used in the Kelvin model suite. More can be
@@ -167,6 +168,8 @@
// TODO(michaelbrooks): Possibly set/verify test data?
for (int i = 0; i < iterations; ++i) {
+ output_header.gpio_toggle_per_inference =
+ !output_header.gpio_toggle_per_inference;
#if (TEST_DATA_INPUT == 1)
memcpy(tflite::GetTensorData<uint8_t>(input), g_benchmark_input,
input->bytes);
diff --git a/benchmarks/benchmark_sec.c b/benchmarks/benchmark_sec.c
index 251bd08..af5c8dd 100644
--- a/benchmarks/benchmark_sec.c
+++ b/benchmarks/benchmark_sec.c
@@ -20,6 +20,7 @@
#include "sw/device/lib/dif/dif_pinmux.h"
#include "sw/device/lib/dif/dif_rv_plic.h"
#include "sw/device/lib/dif/dif_smc_ctrl.h"
+#include "sw/device/lib/dif/dif_tlul_mailbox.h"
#include "sw/device/lib/dif/dif_uart.h"
#include "sw/device/lib/runtime/hart.h"
#include "sw/device/lib/runtime/irq.h"
@@ -27,6 +28,9 @@
#include "sw/device/lib/testing/test_framework/check.h"
#include "sw/device/lib/testing/test_framework/ottf_test_config.h"
#include "sw/device/lib/testing/test_framework/test_util.h"
+/* clang-format off */
+#include "benchmarks/benchmark.h"
+/* clang-format on */
#define STRINGIZE(x) #x
#define STR(x) STRINGIZE(x)
@@ -39,12 +43,43 @@
#define SMC_BINARY STR(SMC_BINARY_DIRECTORY/BENCHMARK_NAME-SMC_BINARY_TYPE)
#include SMC_BINARY
+static dif_gpio_t gpio;
+static dif_rv_plic_t plic_sec;
+static dif_tlul_mailbox_t tlul_mailbox;
static dif_pinmux_t pinmux;
static dif_smc_ctrl_t smc_ctrl;
static dif_uart_t uart;
OTTF_DEFINE_TEST_CONFIG();
+void ottf_external_isr(void) {
+ uint32_t rx;
+ dif_rv_plic_irq_id_t plic_irq_id;
+
+ CHECK_DIF_OK(dif_rv_plic_irq_claim(&plic_sec, kTopMatchaPlicTargetIbex0,
+ &plic_irq_id));
+ top_matcha_plic_peripheral_t peripheral_id =
+ top_matcha_plic_interrupt_for_peripheral[plic_irq_id];
+
+ switch (peripheral_id) {
+ case kTopMatchaPlicPeripheralTlulMailboxSec: {
+ CHECK_DIF_OK(dif_tlul_mailbox_irq_acknowledge(&tlul_mailbox,
+ kDifTlulMailboxIrqRtirq));
+ CHECK_DIF_OK(dif_tlul_mailbox_read_message(&tlul_mailbox, &rx));
+ uint32_t pin = rx >> 16;
+ uint32_t value = rx & 0xFFFF;
+ CHECK_DIF_OK(dif_gpio_write(&gpio, pin, value));
+ break;
+ }
+ default:
+ LOG_FATAL("Unhandled interrupt");
+ break;
+ }
+
+ CHECK_DIF_OK(dif_rv_plic_irq_complete(&plic_sec, kTopMatchaPlicTargetIbex0,
+ plic_irq_id));
+}
+
void _ottf_main(void) {
// Initialize the UART to enable logging for non-DV simulation platforms.
if (kDeviceType != kDeviceSimDV) {
@@ -56,6 +91,28 @@
CHECK_DIF_OK(dif_smc_ctrl_init(
mmio_region_from_addr(TOP_MATCHA_SMC_CTRL_BASE_ADDR), &smc_ctrl));
+// PinMux: Total inference GPIO J52.5/CS Sparrow (IOR7) :: PMOD3.7 on Nexus
+// (IOD4)
+// Per inference GPIO J52.7/SCK Sparrow (IOA1) :: PMOD3.8 on Nexus
+// (IOD5)
+#if defined(MATCHA_SPARROW)
+ CHECK_DIF_OK(dif_pinmux_output_select(&pinmux, kTopMatchaPinmuxMioOutIor7,
+ kTopMatchaPinmuxOutselGpioGpio16));
+ CHECK_DIF_OK(dif_pinmux_output_select(&pinmux, kTopMatchaPinmuxMioOutIoa1,
+ kTopMatchaPinmuxOutselGpioGpio17));
+#else
+ CHECK_DIF_OK(dif_pinmux_output_select(&pinmux, kTopMatchaPinmuxMioOutIod4,
+ kTopMatchaPinmuxOutselGpioGpio16));
+ CHECK_DIF_OK(dif_pinmux_output_select(&pinmux, kTopMatchaPinmuxMioOutIod5,
+ kTopMatchaPinmuxOutselGpioGpio17));
+#endif
+ CHECK_DIF_OK(
+ dif_gpio_init(mmio_region_from_addr(TOP_MATCHA_GPIO_BASE_ADDR), &gpio));
+ CHECK_DIF_OK(dif_gpio_output_set_enabled(&gpio, ML_RUN_INDICATOR_IO,
+ kDifToggleEnabled));
+ CHECK_DIF_OK(dif_gpio_output_set_enabled(&gpio, ML_TOGGLE_PER_INF_IO,
+ kDifToggleEnabled));
+
LOG_INFO("Loading Kelvin binary");
spi_flash_init();
CHECK_DIF_OK(load_file_from_tar(
@@ -66,6 +123,24 @@
LOG_INFO("Loading SMC binary");
memcpy((void*)TOP_MATCHA_RAM_SMC_BASE_ADDR, smc_bin, smc_bin_len);
}
+
+ // Enable Mailbox Interrupt
+ CHECK_DIF_OK(dif_tlul_mailbox_init(
+ mmio_region_from_addr(TOP_MATCHA_TLUL_MAILBOX_SEC_BASE_ADDR),
+ &tlul_mailbox));
+ CHECK_DIF_OK(dif_tlul_mailbox_irq_set_enabled(
+ &tlul_mailbox, kDifTlulMailboxIrqRtirq, kDifToggleEnabled));
+
+ CHECK_DIF_OK(dif_rv_plic_init(
+ mmio_region_from_addr(TOP_MATCHA_RV_PLIC_BASE_ADDR), &plic_sec));
+ CHECK_DIF_OK(dif_rv_plic_irq_set_enabled(
+ &plic_sec, kTopMatchaPlicIrqIdTlulMailboxSecRtirq,
+ kTopMatchaPlicTargetIbex0, kDifToggleEnabled));
+ CHECK_DIF_OK(dif_rv_plic_irq_set_priority(
+ &plic_sec, kTopMatchaPlicIrqIdTlulMailboxSecRtirq, 1));
+ irq_global_ctrl(true);
+ irq_external_ctrl(true);
+
CHECK_DIF_OK(dif_smc_ctrl_set_en(&smc_ctrl));
irq_global_ctrl(true);
irq_external_ctrl(true);
diff --git a/benchmarks/benchmark_smc.c b/benchmarks/benchmark_smc.c
index ca0c1f6..f290350 100644
--- a/benchmarks/benchmark_smc.c
+++ b/benchmarks/benchmark_smc.c
@@ -23,6 +23,7 @@
#include "sw/device/lib/dif/dif_ml_top.h"
#include "sw/device/lib/dif/dif_rv_plic.h"
#include "sw/device/lib/dif/dif_rv_timer.h"
+#include "sw/device/lib/dif/dif_tlul_mailbox.h"
#include "sw/device/lib/runtime/hart.h"
#include "sw/device/lib/runtime/irq.h"
#include "sw/device/lib/runtime/log.h"
@@ -41,6 +42,7 @@
static dif_rv_timer_t rv_timer;
static dif_uart_t smc_uart;
static dif_ml_top_t ml_top;
+static dif_tlul_mailbox_t tlul_mailbox;
volatile bool ml_top_finish_done = false;
@@ -113,26 +115,45 @@
irq_global_ctrl(true);
irq_external_ctrl(true);
- LOG_INFO("========== Begin Benchmark (%s) ==========", STR(BENCHMARK_NAME));
+ // Configure Mailbox.
+ CHECK_DIF_OK(dif_tlul_mailbox_init(
+ mmio_region_from_addr(TOP_MATCHA_TLUL_MAILBOX_SMC_BASE_ADDR),
+ &tlul_mailbox));
- // start kelvin
+ BenchmarkOutputHeader* output_header_ptr =
+ (BenchmarkOutputHeader*)((TOP_MATCHA_ML_TOP_DMEM_BASE_ADDR +
+ TOP_MATCHA_RAM_ML_DMEM_SIZE_BYTES) -
+ 0x40);
+
+ LOG_INFO("========== Begin Benchmark (%s) ==========", STR(BENCHMARK_NAME));
+ {
+ uint32_t value = 1;
+ uint32_t tx = ML_RUN_INDICATOR_IO << 16 | value;
+ CHECK_DIF_OK(dif_tlul_mailbox_send_message(&tlul_mailbox, &tx));
+ }
+
+ // start kelvin and pulse GPIO for data logger (Kibble)
ml_top_finish_done = false;
uint64_t timer_start;
CHECK_DIF_OK(dif_rv_timer_counter_read(&rv_timer, 0, &timer_start));
CHECK_DIF_OK(dif_ml_top_release_ctrl_en(&ml_top));
// wfi
+ uint32_t gpio_toggle_per_inference_prev = 0;
+ uint32_t tx;
while (!ml_top_finish_done) {
- wait_for_interrupt();
+ uint32_t gpio_toggle_per_inference =
+ output_header_ptr->gpio_toggle_per_inference;
+ if (gpio_toggle_per_inference != gpio_toggle_per_inference_prev) {
+ tx = ML_TOGGLE_PER_INF_IO << 16 | gpio_toggle_per_inference;
+ gpio_toggle_per_inference_prev = gpio_toggle_per_inference;
+ CHECK_DIF_OK(dif_tlul_mailbox_send_message(&tlul_mailbox, &tx));
+ }
+ // wait_for_interrupt();
}
uint64_t timer_finish;
CHECK_DIF_OK(dif_rv_timer_counter_read(&rv_timer, 0, &timer_finish));
- BenchmarkOutputHeader* output_header_ptr =
- (BenchmarkOutputHeader*)((TOP_MATCHA_ML_TOP_DMEM_BASE_ADDR +
- TOP_MATCHA_RAM_ML_DMEM_SIZE_BYTES) -
- 0x40);
-
if (output_header_ptr->return_code) {
LOG_FATAL("Kelvin returned an error: %d", output_header_ptr->return_code);
test_status_set(kTestStatusFailed);
@@ -142,6 +163,14 @@
uint64_t average_cycles = udiv64_slow(cycles, iterations, NULL);
uint64_t wall_time_us = timer_finish - timer_start;
uint64_t average_wall_time_us = udiv64_slow(wall_time_us, iterations, NULL);
+
+ // End Test Pulse
+ {
+ uint32_t value = 0;
+ uint32_t tx = ML_RUN_INDICATOR_IO << 16 | value;
+ CHECK_DIF_OK(dif_tlul_mailbox_send_message(&tlul_mailbox, &tx));
+ }
+
LOG_INFO("Iterations: %d", iterations);
_print64("Total Cycles", cycles);
_print64("Average Cycles per Iteration", average_cycles);
diff --git a/docs/kelvin_isa.md b/docs/kelvin_isa.md
index 4b9b06c..5aaac0c 100644
--- a/docs/kelvin_isa.md
+++ b/docs/kelvin_isa.md
@@ -85,6 +85,10 @@
where a stripmine register must use a mod4 base aligned register (eg. v0, v4,
v8, ...). Normal instruction and stripmine variants may be mixed together.
Currently, neither the assembler nor kelvin_sim checks for invalid stripmine
registers. Code using invalid registers (like v1) will compile and simulate,
but will cause the FPGA to hang.
+
When stripmining is used in conjunction with instructions which use a register
index as a base to several registers, the offset of +4 (instead of +1) shall be
used. e.g., {vm0,vm1} becomes {{v0,v1,v2,v3},{v4,v5,v6,v7}}.
@@ -753,7 +757,7 @@
### ACONV
-Convolution ALU operation.
+Performs matmul vs1*vs3, accumulating into the accumulator.
**Encodings**
@@ -787,29 +791,32 @@
(signed(SData2,Data2{7:0}) + signed(Bias2{8:0})){9:0}){18:0}
```
-Length (stop - start + 1) is in 32bit accumulator lane count, as all inputs will
-horizontally reduce to this size.
+vs1 goes to the *narrow* port of the matmul. 8 vectors are always used.
-The Start and Stop definition allows for a partial window of input values to be
-transpose broadcast into the convolution unit.
+vs3 goes to the *wide* port of the matmul, up to 8 vectors are used.
+
+vx2 specifies control params used in the operation and has the following
+format:
Mode | Mode | Usage
:----: | :--: | :-----------------------------------------------:
Common | | Mode[1:0] Start[6:2] Stop[11:7]
s8 | 0 | SData2[31] SBias2[30:22] SData1[21] SBias1[20:12]
-```
-# SIMD256
-acc.out = {v48..55}
-narrow0 = {v0..7}
-narrow1 = {v16..23}
-narrow2 = {v32..39}
-narrow3 = {v48..55}
-wide0 = {v8..15}
-wide1 = {v24..31}
-wide2 = {v40..47}
-wide3 = {v56..63}
-```
+Start and Stop control the window of input values that participate in the
+matmul:
+- On vs1 this is in 4-byte words on all 8 vectors at the same time.
+- On vs3 this is the register number to use (vs3+0 to vs3+7).
+- The operation takes (stop - start + 1) ticks to complete.
+
+When using SIMD256, the following operands are valid:
+- vd: v48
+- vs1: v0, v16, v32, v48
+- vs3: v8, v24, v40, v56
+
+Notes:
+- v48 is used as vd but never written to.
+- v48-v55 will always be overwritten upon VCGET.
### VCGET
@@ -830,6 +837,8 @@
```
+v48 is the only valid vd in this instruction.
+
### ACSET
Copy general registers into convolution accumulators.
@@ -847,6 +856,8 @@
Accum{Y} = vd{Y}
```
+Note that v48 is used as vd but never written to.
+
--------------------------------------------------------------------------------
### ACTR
@@ -860,13 +871,15 @@
**Operation**
```
-assert(vd in {v48})
+assert(vd == 48)
assert(vs1 in {v0, v16, v32, v48}
for I in i32.typelen
for J in i32.typelen
ACCUM[J][I] = vs1[I][J]
```
+Note that v48 is used as vd but never written to.
+
--------------------------------------------------------------------------------
### VCLB
@@ -1813,7 +1826,7 @@
vslidep.[b,h,w].[1,2,3,4].vv vd, vs1, vs2 \
vslidevp.[b,h,w].[1,2,3,4].vv.m vd, vs1, vs2 \
-vslidevp.[b,h,w].[1,2,3,4].vv.m vd, vs1, xs2
+vslidevp.[b,h,w].[1,2,3,4].vx.m vd, vs1, xs2
**Operation**
diff --git a/tflm/opt/depthwise_conv_s8.cc b/tflm/opt/depthwise_conv_s8.cc
index a11c3d2..a58d960 100644
--- a/tflm/opt/depthwise_conv_s8.cc
+++ b/tflm/opt/depthwise_conv_s8.cc
@@ -25,6 +25,7 @@
// Reorders a vector to match the pattern after double-widening.
// N must be a multiple of 4.
+// Works only for multiples of 32.
void VectorSwizzle(const int32_t* input, int32_t* output, int N) {
assert(N >= 4 && N % 4 == 0);
const int32_t(&in)[N] = *(int32_t(*)[N])input;
@@ -2613,11 +2614,14 @@
for (int in_channel = 0; in_channel + 32 <= input_depth; in_channel += 32) {
const int output_channel = in_channel;
- VectorSwizzle(bias_data + output_channel, swizzled_bias_data, 32);
+
+ if (bias_data) {
+ VectorSwizzle(bias_data + output_channel, swizzled_bias_data, 32);
+ }
+
VectorSwizzle(output_multiplier + output_channel, swizzled_output_multi, 32);
VectorSwizzle(output_shift + output_channel, swizzled_shift_multi, 32);
- vld_w_x_m(v20, swizzled_bias_data);
vld_w_x_m(v24, swizzled_output_multi);
vld_w_x_m(v28, swizzled_shift_multi);
vrsub_w_vx_m(v28, v28, 0);
@@ -2628,7 +2632,12 @@
const int in_x_origin = (out_x * stride_width) - pad_width;
const int in_y_origin = (out_y * stride_height) - pad_height;
- vdup_w_x_m(v48, 0);
+ if (bias_data) {
+ vld_w_x_m(v48, swizzled_bias_data);
+ } else {
+ vdup_w_x_m(v48, 0);
+ }
+
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
const int in_y = in_y_origin + filter_y;
if ((in_y < 0) || (in_y >= input_height)) {
@@ -2639,12 +2648,18 @@
if ((in_x < 0) || (in_x >= input_width)) {
continue;
}
+ const int8_t* in_p =
+ input_data +
+ (batch * input_height * input_width * input_depth) +
+ (in_y * input_width * input_depth) + (in_x * input_depth) +
+ in_channel;
- vld_b_x(v0, &input_data[tflite::Offset(input_shape, batch, in_y,
- in_x, in_channel)]); // xp
- vld_b_x(v4, &filter_data[tflite::Offset(filter_shape, 0, filter_y,
- filter_x, in_channel)]);
+ const int8_t* fl_p = filter_data +
+ (filter_y * filter_width * input_depth) +
+ (filter_x * input_depth) + in_channel;
+ vld_b_x(v0, in_p);
+ vld_b_x(v4, fl_p);
vaddw_h_vx(v0, v0, 0);
vadd_h_vx(v0, v0, static_cast<int16_t>(input_offset));
vadd_h_vx(v1, v1,
@@ -2658,12 +2673,10 @@
}
}
- vadd_w_vv_m(v48, v48, v20); // add bias
- vdmulh_w_rn_vv_m(v48, v48, v24);
- vsha_w_r_vv_m(v48, v48, v28);
- vadd_w_vx_m(v48, v48, output_offset);
- vmax_w_vx_m(v48, v48, output_activation_min);
- vmin_w_vx_m(v48, v48, output_activation_max);
+ INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+ v48, v24, v28, output_activation_min, output_activation_max,
+ output_offset);
+
vsraqs_b_vx(v48, v48, 0);
vst_b_x(v48, &output_data[tflite::Offset(output_shape, batch, out_y,
out_x, output_channel)]);
@@ -2673,6 +2686,118 @@
}
}
+void DepthwiseConvS8D16(
+ const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
+ const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+ const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+ const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+ const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+ int8_t* output_data
+
+) {
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int32_t input_offset = params.input_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int output_depth = output_shape.Dims(3);
+ for (int in_channel = 0; in_channel + 16 <= input_depth; in_channel += 16) {
+ const int output_channel = in_channel;
+
+ vld_w_x(v24, output_multiplier);
+ vld_w_x(v25, output_multiplier + 8);
+ vld_w_x(v28, output_shift);
+ vld_w_x(v29, output_shift + 8);
+ vrsub_w_vx(v28, v28, 0);
+ vrsub_w_vx(v29, v29, 0);
+
+ for (int batch = 0; batch < batches; ++batch) {
+ const int8_t* p_output =
+ output_data + (batch * output_width * output_height * output_depth) +
+ output_channel;
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int y_offset = (output_depth * output_width * out_y);
+
+ if (bias_data) {
+ vld_w_x(v48, bias_data);
+ vld_w_x(v49, bias_data + 8);
+ } else {
+ vdup_w_x(v48, 0);
+ vdup_w_x(v49, 0);
+ }
+
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ const int in_y = in_y_origin + filter_y;
+ if ((in_y < 0) || (in_y >= input_height)) {
+ continue;
+ }
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ const int in_x = in_x_origin + filter_x;
+ if ((in_x < 0) || (in_x >= input_width)) {
+ continue;
+ }
+
+ const int8_t* in_p =
+ input_data +
+ (batch * input_height * input_width * input_depth) +
+ (in_y * input_width * input_depth) + (in_x * input_depth) +
+ in_channel;
+
+ const int8_t* fl_p = filter_data +
+ (filter_y * filter_width * input_depth) +
+ (filter_x * input_depth) + in_channel;
+
+ vld_b_l_xx(v0, in_p, 16);
+ vld_b_l_xx(v4, fl_p, 16);
+
+ vaddw_h_vx(v0, v0, 0);
+ vadd_h_vx(v0, v0, static_cast<int16_t>(input_offset));
+ vadd_h_vx(v1, v1, static_cast<int16_t>(input_offset));
+ vzip_h_vv(v0, v0, v1);
+
+ vaddw_h_vx(v4, v4, static_cast<int16_t>(0));
+ vzip_h_vv(v4, v4, v5);
+ vmulw_w_vv(v8, v0, v4);
+
+ vadd_w_vv(v48, v48, v8);
+ vadd_w_vv(v49, v49, v9);
+ }
+ }
+
+ vdmulh_w_rn_vv(v48, v48, v24);
+ vdmulh_w_rn_vv(v49, v49, v25);
+ vsha_w_r_vv(v48, v48, v28);
+ vsha_w_r_vv(v49, v49, v29);
+
+ vadd_w_vx(v48, v48, output_offset);
+ vadd_w_vx(v49, v49, output_offset);
+ vmax_w_vx(v48, v48, output_activation_min);
+ vmax_w_vx(v49, v49, output_activation_min);
+ vmin_w_vx(v48, v48, output_activation_max);
+ vmin_w_vx(v49, v49, output_activation_max);
+ vsraqs_b_vx_m(v48, v48, 0);
+ vsraqs_b_vx(v49, v49, 0);
+ vst_b_l_xx(v48, p_output + (out_x * output_depth) + y_offset, 16);
+ }
+ }
+ }
+ }
+}
+
// generic implementation based on Kelvin ops
void DepthwiseConvS8Generic(
const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
@@ -2746,8 +2871,9 @@
RUN_KERNEL(DepthwiseConvS83x3D32);
}
RUN_KERNEL(DepthwiseConvS8D32);
+ } else if (output_depth % 16 == 0) {
+ RUN_KERNEL(DepthwiseConvS8D16);
}
-
RUN_KERNEL(DepthwiseConvS8Generic);
}