Merge "sw/kelvin: add test target for newly added mul op for Kelvin"
diff --git a/benchmarks/benchmark.h b/benchmarks/benchmark.h
index 59383fb..16c1c85 100644
--- a/benchmarks/benchmark.h
+++ b/benchmarks/benchmark.h
@@ -17,11 +17,15 @@
 #ifndef BENCHMARKS_BENCHMARK_H_
 #define BENCHMARKS_BENCHMARK_H_
 
+#define ML_RUN_INDICATOR_IO 16
+#define ML_TOGGLE_PER_INF_IO 17
+
 typedef struct {
   uint32_t return_code;
   uint32_t iterations;
   uint64_t cycles;
   uint32_t mismatch_count;
+  uint32_t gpio_toggle_per_inference;
 } BenchmarkOutputHeader;
 
-#endif // #ifndef BENCHMARKS_BENCHMARK_H_
+#endif  // BENCHMARKS_BENCHMARK_H_
diff --git a/benchmarks/benchmark_kelvin.cc b/benchmarks/benchmark_kelvin.cc
index 079f567..7e72835 100644
--- a/benchmarks/benchmark_kelvin.cc
+++ b/benchmarks/benchmark_kelvin.cc
@@ -68,6 +68,7 @@
     .iterations = 0,
     .cycles = 0,
     .mismatch_count = 0,
+    .gpio_toggle_per_inference = 0,
 };
 
 // This includes all ops currently used in the Kelvin model suite. More can be
@@ -167,6 +168,8 @@
 
   // TODO(michaelbrooks): Possibly set/verify test data?
   for (int i = 0; i < iterations; ++i) {
+    output_header.gpio_toggle_per_inference =
+        !output_header.gpio_toggle_per_inference;
 #if (TEST_DATA_INPUT == 1)
     memcpy(tflite::GetTensorData<uint8_t>(input), g_benchmark_input,
            input->bytes);
diff --git a/benchmarks/benchmark_sec.c b/benchmarks/benchmark_sec.c
index 251bd08..af5c8dd 100644
--- a/benchmarks/benchmark_sec.c
+++ b/benchmarks/benchmark_sec.c
@@ -20,6 +20,7 @@
 #include "sw/device/lib/dif/dif_pinmux.h"
 #include "sw/device/lib/dif/dif_rv_plic.h"
 #include "sw/device/lib/dif/dif_smc_ctrl.h"
+#include "sw/device/lib/dif/dif_tlul_mailbox.h"
 #include "sw/device/lib/dif/dif_uart.h"
 #include "sw/device/lib/runtime/hart.h"
 #include "sw/device/lib/runtime/irq.h"
@@ -27,6 +28,9 @@
 #include "sw/device/lib/testing/test_framework/check.h"
 #include "sw/device/lib/testing/test_framework/ottf_test_config.h"
 #include "sw/device/lib/testing/test_framework/test_util.h"
+/* clang-format off */
+#include "benchmarks/benchmark.h"
+/* clang-format on */
 
 #define STRINGIZE(x) #x
 #define STR(x) STRINGIZE(x)
@@ -39,12 +43,43 @@
 #define SMC_BINARY STR(SMC_BINARY_DIRECTORY/BENCHMARK_NAME-SMC_BINARY_TYPE)
 #include SMC_BINARY
 
+static dif_gpio_t gpio;
+static dif_rv_plic_t plic_sec;
+static dif_tlul_mailbox_t tlul_mailbox;
 static dif_pinmux_t pinmux;
 static dif_smc_ctrl_t smc_ctrl;
 static dif_uart_t uart;
 
 OTTF_DEFINE_TEST_CONFIG();
 
+// SEC-core external interrupt handler (OTTF hook). Claims the pending PLIC
+// interrupt; for the SEC TL-UL mailbox RTIRQ it acknowledges the IRQ, pops one
+// message from the mailbox, and drives a GPIO pin from it, then completes the
+// interrupt at the PLIC. Any other interrupt source is fatal.
+//
+// Mailbox message format (written by the SMC side, see benchmark_smc.c):
+// pin = rx[31:16], value = rx[15:0]; the value is forwarded to
+// dif_gpio_write().
+void ottf_external_isr(void) {
+  uint32_t rx;
+  dif_rv_plic_irq_id_t plic_irq_id;
+
+  CHECK_DIF_OK(dif_rv_plic_irq_claim(&plic_sec, kTopMatchaPlicTargetIbex0,
+                                     &plic_irq_id));
+  top_matcha_plic_peripheral_t peripheral_id =
+      top_matcha_plic_interrupt_for_peripheral[plic_irq_id];
+
+  switch (peripheral_id) {
+    case kTopMatchaPlicPeripheralTlulMailboxSec: {
+      // Acknowledge before draining so a message arriving after the read
+      // re-raises the interrupt rather than being lost.
+      CHECK_DIF_OK(dif_tlul_mailbox_irq_acknowledge(&tlul_mailbox,
+                                                    kDifTlulMailboxIrqRtirq));
+      CHECK_DIF_OK(dif_tlul_mailbox_read_message(&tlul_mailbox, &rx));
+      uint32_t pin = rx >> 16;
+      uint32_t value = rx & 0xFFFF;
+      CHECK_DIF_OK(dif_gpio_write(&gpio, pin, value));
+      break;
+    }
+    default:
+      LOG_FATAL("Unhandled interrupt");
+      break;
+  }
+
+  CHECK_DIF_OK(dif_rv_plic_irq_complete(&plic_sec, kTopMatchaPlicTargetIbex0,
+                                        plic_irq_id));
+}
+
 void _ottf_main(void) {
   // Initialize the UART to enable logging for non-DV simulation platforms.
   if (kDeviceType != kDeviceSimDV) {
@@ -56,6 +91,28 @@
   CHECK_DIF_OK(dif_smc_ctrl_init(
       mmio_region_from_addr(TOP_MATCHA_SMC_CTRL_BASE_ADDR), &smc_ctrl));
 
+// PinMux: Total inference GPIO J52.5/CS  Sparrow (IOR7) :: PMOD3.7 on Nexus
+// (IOD4)
+//           Per inference GPIO J52.7/SCK Sparrow (IOA1) :: PMOD3.8 on Nexus
+//           (IOD5)
+#if defined(MATCHA_SPARROW)
+  CHECK_DIF_OK(dif_pinmux_output_select(&pinmux, kTopMatchaPinmuxMioOutIor7,
+                                        kTopMatchaPinmuxOutselGpioGpio16));
+  CHECK_DIF_OK(dif_pinmux_output_select(&pinmux, kTopMatchaPinmuxMioOutIoa1,
+                                        kTopMatchaPinmuxOutselGpioGpio17));
+#else
+  CHECK_DIF_OK(dif_pinmux_output_select(&pinmux, kTopMatchaPinmuxMioOutIod4,
+                                        kTopMatchaPinmuxOutselGpioGpio16));
+  CHECK_DIF_OK(dif_pinmux_output_select(&pinmux, kTopMatchaPinmuxMioOutIod5,
+                                        kTopMatchaPinmuxOutselGpioGpio17));
+#endif
+  CHECK_DIF_OK(
+      dif_gpio_init(mmio_region_from_addr(TOP_MATCHA_GPIO_BASE_ADDR), &gpio));
+  CHECK_DIF_OK(dif_gpio_output_set_enabled(&gpio, ML_RUN_INDICATOR_IO,
+                                           kDifToggleEnabled));
+  CHECK_DIF_OK(dif_gpio_output_set_enabled(&gpio, ML_TOGGLE_PER_INF_IO,
+                                           kDifToggleEnabled));
+
   LOG_INFO("Loading Kelvin binary");
   spi_flash_init();
   CHECK_DIF_OK(load_file_from_tar(
@@ -66,6 +123,24 @@
     LOG_INFO("Loading SMC binary");
     memcpy((void*)TOP_MATCHA_RAM_SMC_BASE_ADDR, smc_bin, smc_bin_len);
   }
+
+  // Enable Mailbox Interrupt
+  CHECK_DIF_OK(dif_tlul_mailbox_init(
+      mmio_region_from_addr(TOP_MATCHA_TLUL_MAILBOX_SEC_BASE_ADDR),
+      &tlul_mailbox));
+  CHECK_DIF_OK(dif_tlul_mailbox_irq_set_enabled(
+      &tlul_mailbox, kDifTlulMailboxIrqRtirq, kDifToggleEnabled));
+
+  CHECK_DIF_OK(dif_rv_plic_init(
+      mmio_region_from_addr(TOP_MATCHA_RV_PLIC_BASE_ADDR), &plic_sec));
+  CHECK_DIF_OK(dif_rv_plic_irq_set_enabled(
+      &plic_sec, kTopMatchaPlicIrqIdTlulMailboxSecRtirq,
+      kTopMatchaPlicTargetIbex0, kDifToggleEnabled));
+  CHECK_DIF_OK(dif_rv_plic_irq_set_priority(
+      &plic_sec, kTopMatchaPlicIrqIdTlulMailboxSecRtirq, 1));
+  irq_global_ctrl(true);
+  irq_external_ctrl(true);
+
   CHECK_DIF_OK(dif_smc_ctrl_set_en(&smc_ctrl));
   irq_global_ctrl(true);
   irq_external_ctrl(true);
diff --git a/benchmarks/benchmark_smc.c b/benchmarks/benchmark_smc.c
index ca0c1f6..f290350 100644
--- a/benchmarks/benchmark_smc.c
+++ b/benchmarks/benchmark_smc.c
@@ -23,6 +23,7 @@
 #include "sw/device/lib/dif/dif_ml_top.h"
 #include "sw/device/lib/dif/dif_rv_plic.h"
 #include "sw/device/lib/dif/dif_rv_timer.h"
+#include "sw/device/lib/dif/dif_tlul_mailbox.h"
 #include "sw/device/lib/runtime/hart.h"
 #include "sw/device/lib/runtime/irq.h"
 #include "sw/device/lib/runtime/log.h"
@@ -41,6 +42,7 @@
 static dif_rv_timer_t rv_timer;
 static dif_uart_t smc_uart;
 static dif_ml_top_t ml_top;
+static dif_tlul_mailbox_t tlul_mailbox;
 
 volatile bool ml_top_finish_done = false;
 
@@ -113,26 +115,45 @@
   irq_global_ctrl(true);
   irq_external_ctrl(true);
 
-  LOG_INFO("========== Begin Benchmark (%s) ==========", STR(BENCHMARK_NAME));
+  // Configure Mailbox.
+  CHECK_DIF_OK(dif_tlul_mailbox_init(
+      mmio_region_from_addr(TOP_MATCHA_TLUL_MAILBOX_SMC_BASE_ADDR),
+      &tlul_mailbox));
 
-  // start kelvin
+  BenchmarkOutputHeader* output_header_ptr =
+      (BenchmarkOutputHeader*)((TOP_MATCHA_ML_TOP_DMEM_BASE_ADDR +
+                                TOP_MATCHA_RAM_ML_DMEM_SIZE_BYTES) -
+                               0x40);
+
+  LOG_INFO("========== Begin Benchmark (%s) ==========", STR(BENCHMARK_NAME));
+  {
+    uint32_t value = 1;
+    uint32_t tx = ML_RUN_INDICATOR_IO << 16 | value;
+    CHECK_DIF_OK(dif_tlul_mailbox_send_message(&tlul_mailbox, &tx));
+  }
+
+  // start kelvin and pulse GPIO for data logger (Kibble)
   ml_top_finish_done = false;
   uint64_t timer_start;
   CHECK_DIF_OK(dif_rv_timer_counter_read(&rv_timer, 0, &timer_start));
   CHECK_DIF_OK(dif_ml_top_release_ctrl_en(&ml_top));
 
   // wfi
+  uint32_t gpio_toggle_per_inference_prev = 0;
+  uint32_t tx;
   while (!ml_top_finish_done) {
-    wait_for_interrupt();
+    uint32_t gpio_toggle_per_inference =
+        output_header_ptr->gpio_toggle_per_inference;
+    if (gpio_toggle_per_inference != gpio_toggle_per_inference_prev) {
+      tx = ML_TOGGLE_PER_INF_IO << 16 | gpio_toggle_per_inference;
+      gpio_toggle_per_inference_prev = gpio_toggle_per_inference;
+      CHECK_DIF_OK(dif_tlul_mailbox_send_message(&tlul_mailbox, &tx));
+    }
+    // Busy-poll instead of wait_for_interrupt(): the per-inference toggle is
+    // written by Kelvin into shared DMEM without raising an interrupt, so the
+    // SMC must spin to observe and forward each edge.
   }
   uint64_t timer_finish;
   CHECK_DIF_OK(dif_rv_timer_counter_read(&rv_timer, 0, &timer_finish));
 
-  BenchmarkOutputHeader* output_header_ptr =
-      (BenchmarkOutputHeader*)((TOP_MATCHA_ML_TOP_DMEM_BASE_ADDR +
-                                TOP_MATCHA_RAM_ML_DMEM_SIZE_BYTES) -
-                              0x40);
-
   if (output_header_ptr->return_code) {
     LOG_FATAL("Kelvin returned an error: %d", output_header_ptr->return_code);
     test_status_set(kTestStatusFailed);
@@ -142,6 +163,14 @@
   uint64_t average_cycles = udiv64_slow(cycles, iterations, NULL);
   uint64_t wall_time_us = timer_finish - timer_start;
   uint64_t average_wall_time_us = udiv64_slow(wall_time_us, iterations, NULL);
+
+  // End Test Pulse
+  {
+    uint32_t value = 0;
+    uint32_t tx = ML_RUN_INDICATOR_IO << 16 | value;
+    CHECK_DIF_OK(dif_tlul_mailbox_send_message(&tlul_mailbox, &tx));
+  }
+
   LOG_INFO("Iterations: %d", iterations);
   _print64("Total Cycles", cycles);
   _print64("Average Cycles per Iteration", average_cycles);
diff --git a/docs/kelvin_isa.md b/docs/kelvin_isa.md
index 4b9b06c..5aaac0c 100644
--- a/docs/kelvin_isa.md
+++ b/docs/kelvin_isa.md
@@ -85,6 +85,10 @@
 where a stripmine register must use a mod4 base aligned register (eg. v0, v4,
 v8, ...). Normal instruction and stripmine variants may be mixed together.
 
+Currently, neither the assembler nor kelvin_sim checks for invalid stripmine
+registers. Code using an invalid register (such as v1) will compile and
+simulate, but will cause the FPGA to hang.
+
 When stripmining is used in conjunction with instructions which use a register
 index as a base to several registers, the offset of +4 (instead of +1) shall be
 used. e.g., {vm0,vm1} becomes {{v0,v1,v2,v3},{v4,v5,v6,v7}}.
@@ -753,7 +757,7 @@
 
 ### ACONV
 
-Convolution ALU operation.
+Performs matmul vs1*vs3, accumulating into the accumulator.
 
 **Encodings**
 
@@ -787,29 +791,32 @@
          (signed(SData2,Data2{7:0}) + signed(Bias2{8:0})){9:0}){18:0}
 ```
 
-Length (stop - start + 1) is in 32bit accumulator lane count, as all inputs will
-horizontally reduce to this size.
+vs1 goes to the *narrow* port of the matmul. 8 vectors are always used.
 
-The Start and Stop definition allows for a partial window of input values to be
-transpose broadcast into the convolution unit.
+vs3 goes to the *wide* port of the matmul, up to 8 vectors are used.
+
+vx2 specifies control params used in the operation and has the following
+format:
 
 Mode   | Mode | Usage
 :----: | :--: | :-----------------------------------------------:
 Common |      | Mode[1:0] Start[6:2] Stop[11:7]
 s8     | 0    | SData2[31] SBias2[30:22] SData1[21] SBias1[20:12]
 
-```
-# SIMD256
-acc.out = {v48..55}
-narrow0 = {v0..7}
-narrow1 = {v16..23}
-narrow2 = {v32..39}
-narrow3 = {v48..55}
-wide0   = {v8..15}
-wide1   = {v24..31}
-wide2   = {v40..47}
-wide3   = {v56..63}
-```
+Start and Stop control the window of input values that participate in the
+matmul:
+- On vs1 this is in 4-byte words on all 8 vectors at the same time.
+- On vs3 this is the register number to use (vs3+0 to vs3+7).
+- The operation takes (stop - start + 1) ticks to complete.
+
+When using SIMD256, the following operands are valid:
+- vd: v48
+- vs1: v0, v16, v32, v48
+- vs3: v8, v24, v40, v56
+
+Notes:
+- v48 is used as vd but never written to.
+- v48-v55 will always be overwritten upon VCGET.
 
 ### VCGET
 
@@ -830,6 +837,8 @@
 
 ```
 
+v48 is the only valid vd in this instruction.
+
 ### ACSET
 
 Copy general registers into convolution accumulators.
@@ -847,6 +856,8 @@
   Accum{Y} = vd{Y}
 ```
 
+Note that v48 is used as vd but never written to.
+
 --------------------------------------------------------------------------------
 
 ### ACTR
@@ -860,13 +871,15 @@
 **Operation**
 
 ```
-assert(vd in {v48})
+assert(vd == 48)
 assert(vs1 in {v0, v16, v32, v48}
 for I in i32.typelen
   for J in i32.typelen
     ACCUM[J][I] = vs1[I][J]
 ```
 
+Note that v48 is used as vd but never written to.
+
 --------------------------------------------------------------------------------
 
 ### VCLB
@@ -1813,7 +1826,7 @@
 
 vslidep.[b,h,w].[1,2,3,4].vv vd, vs1, vs2 \
 vslidevp.[b,h,w].[1,2,3,4].vv.m vd, vs1, vs2 \
-vslidevp.[b,h,w].[1,2,3,4].vv.m vd, vs1, xs2
+vslidevp.[b,h,w].[1,2,3,4].vx.m vd, vs1, xs2
 
 **Operation**
 
diff --git a/tflm/opt/depthwise_conv_s8.cc b/tflm/opt/depthwise_conv_s8.cc
index a11c3d2..a58d960 100644
--- a/tflm/opt/depthwise_conv_s8.cc
+++ b/tflm/opt/depthwise_conv_s8.cc
@@ -25,6 +25,7 @@
 
 // Reorders a vector to match the pattern after double-widening.
 // N must be a multiple of 4.
+// Works only for multiples of 32.
 void VectorSwizzle(const int32_t* input, int32_t* output, int N) {
   assert(N >= 4 && N % 4 == 0);
   const int32_t(&in)[N] = *(int32_t(*)[N])input;
@@ -2613,11 +2614,14 @@
 
   for (int in_channel = 0; in_channel + 32 <= input_depth; in_channel += 32) {
     const int output_channel = in_channel;
-    VectorSwizzle(bias_data + output_channel, swizzled_bias_data, 32);
+
+    if (bias_data) {
+      VectorSwizzle(bias_data + output_channel, swizzled_bias_data, 32);
+    }
+
     VectorSwizzle(output_multiplier + output_channel, swizzled_output_multi, 32);
     VectorSwizzle(output_shift + output_channel, swizzled_shift_multi, 32);
 
-    vld_w_x_m(v20, swizzled_bias_data);
     vld_w_x_m(v24, swizzled_output_multi);
     vld_w_x_m(v28, swizzled_shift_multi);
     vrsub_w_vx_m(v28, v28, 0);
@@ -2628,7 +2632,12 @@
           const int in_x_origin = (out_x * stride_width) - pad_width;
           const int in_y_origin = (out_y * stride_height) - pad_height;
 
-          vdup_w_x_m(v48, 0);
+          if (bias_data) {
+            vld_w_x_m(v48, swizzled_bias_data);
+          } else {
+            vdup_w_x_m(v48, 0);
+          }
+
           for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
             const int in_y = in_y_origin + filter_y;
             if ((in_y < 0) || (in_y >= input_height)) {
@@ -2639,12 +2648,18 @@
               if ((in_x < 0) || (in_x >= input_width)) {
                 continue;
               }
+              const int8_t* in_p =
+                  input_data +
+                  (batch * input_height * input_width * input_depth) +
+                  (in_y * input_width * input_depth) + (in_x * input_depth) +
+                  in_channel;
 
-              vld_b_x(v0, &input_data[tflite::Offset(input_shape, batch, in_y,
-                                                     in_x, in_channel)]);  // xp
-              vld_b_x(v4, &filter_data[tflite::Offset(filter_shape, 0, filter_y,
-                                                      filter_x, in_channel)]);
+              const int8_t* fl_p = filter_data +
+                                   (filter_y * filter_width * input_depth) +
+                                   (filter_x * input_depth) + in_channel;
 
+              vld_b_x(v0, in_p);
+              vld_b_x(v4, fl_p);
               vaddw_h_vx(v0, v0, 0);
               vadd_h_vx(v0, v0, static_cast<int16_t>(input_offset));
               vadd_h_vx(v1, v1,
@@ -2658,12 +2673,10 @@
             }
           }
 
-          vadd_w_vv_m(v48, v48, v20);  // add bias
-          vdmulh_w_rn_vv_m(v48, v48, v24);
-          vsha_w_r_vv_m(v48, v48, v28);
-          vadd_w_vx_m(v48, v48, output_offset);
-          vmax_w_vx_m(v48, v48, output_activation_min);
-          vmin_w_vx_m(v48, v48, output_activation_max);
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+              v48, v24, v28, output_activation_min, output_activation_max,
+              output_offset);
+
           vsraqs_b_vx(v48, v48, 0);
           vst_b_x(v48, &output_data[tflite::Offset(output_shape, batch, out_y,
                                                    out_x, output_channel)]);
@@ -2673,6 +2686,118 @@
   }
 }
 
+// Depthwise convolution for int8 activations with a channel depth that is a
+// multiple of 16. Processes 16 channels per block using paired 8-lane word
+// registers (v48/v49 accumulators, v24/v25 multipliers, v28/v29 shifts).
+//
+// Fix vs. the original: per-channel quantization parameters
+// (output_multiplier, output_shift, bias_data) are now indexed by the current
+// channel block (+ output_channel). The original loaded them from the array
+// base, so for input_depth > 16 every block incorrectly reused the parameters
+// of channels 0-15 (the D32 kernel indexes them per block via VectorSwizzle).
+void DepthwiseConvS8D16(
+    const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
+    const int32_t* output_shift, const tflite::RuntimeShape& input_shape,
+    const int8_t* input_data, const tflite::RuntimeShape& filter_shape,
+    const int8_t* filter_data, const tflite::RuntimeShape& bias_shape,
+    const int32_t* bias_data, const tflite::RuntimeShape& output_shape,
+    int8_t* output_data) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32_t input_offset = params.input_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int output_depth = output_shape.Dims(3);
+  for (int in_channel = 0; in_channel + 16 <= input_depth; in_channel += 16) {
+    const int output_channel = in_channel;
+
+    // Per-channel requantization params for this 16-channel block.
+    vld_w_x(v24, output_multiplier + output_channel);
+    vld_w_x(v25, output_multiplier + output_channel + 8);
+    vld_w_x(v28, output_shift + output_channel);
+    vld_w_x(v29, output_shift + output_channel + 8);
+    // Negate the shifts: vsha applies a right shift of the (positive) amount.
+    vrsub_w_vx(v28, v28, 0);
+    vrsub_w_vx(v29, v29, 0);
+
+    for (int batch = 0; batch < batches; ++batch) {
+      const int8_t* p_output =
+          output_data + (batch * output_width * output_height * output_depth) +
+          output_channel;
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          const int y_offset = (output_depth * output_width * out_y);
+
+          // Seed accumulators with this block's bias (or zero when absent).
+          if (bias_data) {
+            vld_w_x(v48, bias_data + output_channel);
+            vld_w_x(v49, bias_data + output_channel + 8);
+          } else {
+            vdup_w_x(v48, 0);
+            vdup_w_x(v49, 0);
+          }
+
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            const int in_y = in_y_origin + filter_y;
+            if ((in_y < 0) || (in_y >= input_height)) {
+              continue;
+            }
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              if ((in_x < 0) || (in_x >= input_width)) {
+                continue;
+              }
+
+              const int8_t* in_p =
+                  input_data +
+                  (batch * input_height * input_width * input_depth) +
+                  (in_y * input_width * input_depth) + (in_x * input_depth) +
+                  in_channel;
+
+              const int8_t* fl_p = filter_data +
+                                   (filter_y * filter_width * input_depth) +
+                                   (filter_x * input_depth) + in_channel;
+
+              vld_b_l_xx(v0, in_p, 16);
+              vld_b_l_xx(v4, fl_p, 16);
+
+              // Widen inputs to s16, apply input offset, interleave, multiply
+              // to s32 and accumulate.
+              vaddw_h_vx(v0, v0, 0);
+              vadd_h_vx(v0, v0, static_cast<int16_t>(input_offset));
+              vadd_h_vx(v1, v1, static_cast<int16_t>(input_offset));
+              vzip_h_vv(v0, v0, v1);
+
+              vaddw_h_vx(v4, v4, static_cast<int16_t>(0));
+              vzip_h_vv(v4, v4, v5);
+              vmulw_w_vv(v8, v0, v4);
+
+              vadd_w_vv(v48, v48, v8);
+              vadd_w_vv(v49, v49, v9);
+            }
+          }
+
+          // Requantize: doubling-multiply-high with rounding, then rounding
+          // arithmetic right shift, output offset, and activation clamp.
+          vdmulh_w_rn_vv(v48, v48, v24);
+          vdmulh_w_rn_vv(v49, v49, v25);
+          vsha_w_r_vv(v48, v48, v28);
+          vsha_w_r_vv(v49, v49, v29);
+
+          vadd_w_vx(v48, v48, output_offset);
+          vadd_w_vx(v49, v49, output_offset);
+          vmax_w_vx(v48, v48, output_activation_min);
+          vmax_w_vx(v49, v49, output_activation_min);
+          vmin_w_vx(v48, v48, output_activation_max);
+          vmin_w_vx(v49, v49, output_activation_max);
+          // NOTE(review): the `_m` form already narrows the v48..v51 quad; the
+          // following scalar narrow of v49 looks redundant since only 16 bytes
+          // of v48 are stored — confirm against vsraqs stripmine semantics.
+          vsraqs_b_vx_m(v48, v48, 0);
+          vsraqs_b_vx(v49, v49, 0);
+          vst_b_l_xx(v48, p_output + (out_x * output_depth) + y_offset, 16);
+        }
+      }
+    }
+  }
+}
+
 // generic implementation based on Kelvin ops
 void DepthwiseConvS8Generic(
     const tflite::DepthwiseParams& params, const int32_t* output_multiplier,
@@ -2746,8 +2871,9 @@
         RUN_KERNEL(DepthwiseConvS83x3D32);
       }
       RUN_KERNEL(DepthwiseConvS8D32);
+    } else if (output_depth % 16 == 0) {
+      RUN_KERNEL(DepthwiseConvS8D16);
     }
-
     RUN_KERNEL(DepthwiseConvS8Generic);
   }