Merge "Create templated version of Swizzle."
diff --git a/tflm/opt/conv_s8.cc b/tflm/opt/conv_s8.cc
index a92433f..0dc432b 100644
--- a/tflm/opt/conv_s8.cc
+++ b/tflm/opt/conv_s8.cc
@@ -220,6 +220,7 @@
   }
 
   if (input_depth == 1 && filter_width == 5 && filter_height == 5 &&
+      stride_width == 2 &&
       output_depth == 24) {
     RUN_KERNEL(kelvin::opt::ConvPerChannelD1OD24_5x5);
   }
diff --git a/tflm/opt/conv_s8_d1.cc b/tflm/opt/conv_s8_d1.cc
index 09f4110..c03a7f5 100644
--- a/tflm/opt/conv_s8_d1.cc
+++ b/tflm/opt/conv_s8_d1.cc
@@ -67,6 +67,67 @@
 
 }  // namespace
 
+#define FLT_0_0 v0  // FLT_<row>_<col>: register alias for one 5x5 filter tap.
+#define FLT_0_1 v3  // Rows 0-2 interleave per column c: v(3c), v(3c+1), v(3c+2).
+#define FLT_0_2 v6
+#define FLT_0_3 v9
+#define FLT_0_4 v12
+// Filter row 1: always the register right after the matching row-0 alias.
+#define FLT_1_0 v1
+#define FLT_1_1 v4
+#define FLT_1_2 v7
+#define FLT_1_3 v10
+#define FLT_1_4 v13
+// Filter row 2: two registers after the matching row-0 alias.
+#define FLT_2_0 v2
+#define FLT_2_1 v5
+#define FLT_2_2 v8
+#define FLT_2_3 v11
+#define FLT_2_4 v14
+// Filter rows 3 and 4 are laid out row by row instead: v15-v19 and v21-v25.
+#define FLT_3_0 v15
+#define FLT_3_1 v16
+#define FLT_3_2 v17
+#define FLT_3_3 v18
+#define FLT_3_4 v19
+// FLT_HOLE (v20) separates rows 3 and 4; it is zeroed with vdup_b_x(FLT_HOLE, 0).
+#define FLT_HOLE v20  // presumably pads the (FLT_3_3, FLT_3_4, hole) triple - confirm
+#define FLT_4_0 v21
+#define FLT_4_1 v22
+#define FLT_4_2 v23
+#define FLT_4_3 v24
+#define FLT_4_4 v25
+// INPUT_<row>_<col>: same scheme for the input window, starting at v26, no hole.
+#define INPUT_0_0 v26
+#define INPUT_0_1 v29
+#define INPUT_0_2 v32
+#define INPUT_0_3 v35
+#define INPUT_0_4 v38
+
+#define INPUT_1_0 v27
+#define INPUT_1_1 v30
+#define INPUT_1_2 v33
+#define INPUT_1_3 v36
+#define INPUT_1_4 v39
+
+#define INPUT_2_0 v28
+#define INPUT_2_1 v31
+#define INPUT_2_2 v34
+#define INPUT_2_3 v37
+#define INPUT_2_4 v40
+// Input rows 3 and 4 are contiguous: v41-v45 and v46-v50.
+#define INPUT_3_0 v41
+#define INPUT_3_1 v42
+#define INPUT_3_2 v43
+#define INPUT_3_3 v44
+#define INPUT_3_4 v45
+
+#define INPUT_4_0 v46
+#define INPUT_4_1 v47
+#define INPUT_4_2 v48
+#define INPUT_4_3 v49
+#define INPUT_4_4 v50
+
 #define CALCULATE_IN_X(in_x_origin)                        \
   {                                                        \
     _Pragma("GCC unroll 5") for (int i = 0; i < 5; ++i) {  \
@@ -83,93 +144,93 @@
 
 #define PAD_ROW_0(input_offset)   \
   {                               \
-    vdup_b_x(v27, -input_offset); \
-    vdup_b_x(v28, -input_offset); \
-    vdup_b_x(v29, -input_offset); \
-    vdup_b_x(v30, -input_offset); \
-    vdup_b_x(v31, -input_offset); \
+    vdup_b_x(INPUT_0_0, -input_offset); \
+    vdup_b_x(INPUT_0_1, -input_offset); \
+    vdup_b_x(INPUT_0_2, -input_offset); \
+    vdup_b_x(INPUT_0_3, -input_offset); \
+    vdup_b_x(INPUT_0_4, -input_offset); \
   }
 #define PAD_ROW_1(input_offset)   \
   {                               \
-    vdup_b_x(v32, -input_offset); \
-    vdup_b_x(v33, -input_offset); \
-    vdup_b_x(v34, -input_offset); \
-    vdup_b_x(v35, -input_offset); \
-    vdup_b_x(v36, -input_offset); \
+    vdup_b_x(INPUT_1_0, -input_offset); \
+    vdup_b_x(INPUT_1_1, -input_offset); \
+    vdup_b_x(INPUT_1_2, -input_offset); \
+    vdup_b_x(INPUT_1_3, -input_offset); \
+    vdup_b_x(INPUT_1_4, -input_offset); \
   }
 #define PAD_ROW_2(input_offset)   \
-  {                               \
-    vdup_b_x(v37, -input_offset); \
-    vdup_b_x(v38, -input_offset); \
-    vdup_b_x(v39, -input_offset); \
-    vdup_b_x(v40, -input_offset); \
-    vdup_b_x(v41, -input_offset); \
+  {                            \
+    vdup_b_x(INPUT_2_0, -input_offset); \
+    vdup_b_x(INPUT_2_1, -input_offset); \
+    vdup_b_x(INPUT_2_2, -input_offset); \
+    vdup_b_x(INPUT_2_3, -input_offset); \
+    vdup_b_x(INPUT_2_4, -input_offset); \
   }
 #define PAD_ROW_3(input_offset)   \
   {                               \
-    vdup_b_x(v42, -input_offset); \
-    vdup_b_x(v43, -input_offset); \
-    vdup_b_x(v44, -input_offset); \
-    vdup_b_x(v45, -input_offset); \
-    vdup_b_x(v46, -input_offset); \
+    vdup_b_x(INPUT_3_0, -input_offset); \
+    vdup_b_x(INPUT_3_1, -input_offset); \
+    vdup_b_x(INPUT_3_2, -input_offset); \
+    vdup_b_x(INPUT_3_3, -input_offset); \
+    vdup_b_x(INPUT_3_4, -input_offset); \
   }
 #define PAD_ROW_4(input_offset)   \
   {                               \
-    vdup_b_x(v47, -input_offset); \
-    vdup_b_x(v48, -input_offset); \
-    vdup_b_x(v49, -input_offset); \
-    vdup_b_x(v50, -input_offset); \
-    vdup_b_x(v51, -input_offset); \
+    vdup_b_x(INPUT_4_0, -input_offset); \
+    vdup_b_x(INPUT_4_1, -input_offset); \
+    vdup_b_x(INPUT_4_2, -input_offset); \
+    vdup_b_x(INPUT_4_3, -input_offset); \
+    vdup_b_x(INPUT_4_4, -input_offset); \
   }
 
 #define LOAD_ROW_0(p_input, input_width, in_y, in_x)         \
   {                                                          \
     const int8_t* p_row = p_input + (in_y[0] * input_width); \
-    vdup_b_x(v27, *(p_row + in_x[0]));                       \
-    vdup_b_x(v28, *(p_row + in_x[1]));                       \
-    vdup_b_x(v29, *(p_row + in_x[2]));                       \
-    vdup_b_x(v30, *(p_row + in_x[3]));                       \
-    vdup_b_x(v31, *(p_row + in_x[4]));                       \
+    vdup_b_x(INPUT_0_0, *(p_row + in_x[0]));                       \
+    vdup_b_x(INPUT_0_1, *(p_row + in_x[1]));                       \
+    vdup_b_x(INPUT_0_2, *(p_row + in_x[2]));                       \
+    vdup_b_x(INPUT_0_3, *(p_row + in_x[3]));                       \
+    vdup_b_x(INPUT_0_4, *(p_row + in_x[4]));                       \
   }
 
 #define LOAD_ROW_1(p_input, input_width, in_y, in_x)         \
   {                                                          \
     const int8_t* p_row = p_input + (in_y[1] * input_width); \
-    vdup_b_x(v32, *(p_row + in_x[0]));                       \
-    vdup_b_x(v33, *(p_row + in_x[1]));                       \
-    vdup_b_x(v34, *(p_row + in_x[2]));                       \
-    vdup_b_x(v35, *(p_row + in_x[3]));                       \
-    vdup_b_x(v36, *(p_row + in_x[4]));                       \
+    vdup_b_x(INPUT_1_0, *(p_row + in_x[0]));                       \
+    vdup_b_x(INPUT_1_1, *(p_row + in_x[1]));                       \
+    vdup_b_x(INPUT_1_2, *(p_row + in_x[2]));                       \
+    vdup_b_x(INPUT_1_3, *(p_row + in_x[3]));                       \
+    vdup_b_x(INPUT_1_4, *(p_row + in_x[4]));                       \
   }
 
 #define LOAD_ROW_2(p_input, input_width, in_y, in_x)         \
   {                                                          \
     const int8_t* p_row = p_input + (in_y[2] * input_width); \
-    vdup_b_x(v37, *(p_row + in_x[0]));                       \
-    vdup_b_x(v38, *(p_row + in_x[1]));                       \
-    vdup_b_x(v39, *(p_row + in_x[2]));                       \
-    vdup_b_x(v40, *(p_row + in_x[3]));                       \
-    vdup_b_x(v41, *(p_row + in_x[4]));                       \
+    vdup_b_x(INPUT_2_0, *(p_row + in_x[0]));                       \
+    vdup_b_x(INPUT_2_1, *(p_row + in_x[1]));                       \
+    vdup_b_x(INPUT_2_2, *(p_row + in_x[2]));                       \
+    vdup_b_x(INPUT_2_3, *(p_row + in_x[3]));                       \
+    vdup_b_x(INPUT_2_4, *(p_row + in_x[4]));                       \
   }
 
 #define LOAD_ROW_3(p_input, input_width, in_y, in_x)         \
   {                                                          \
     const int8_t* p_row = p_input + (in_y[3] * input_width); \
-    vdup_b_x(v42, *(p_row + in_x[0]));                       \
-    vdup_b_x(v43, *(p_row + in_x[1]));                       \
-    vdup_b_x(v44, *(p_row + in_x[2]));                       \
-    vdup_b_x(v45, *(p_row + in_x[3]));                       \
-    vdup_b_x(v46, *(p_row + in_x[4]));                       \
+    vdup_b_x(INPUT_3_0, *(p_row + in_x[0]));                       \
+    vdup_b_x(INPUT_3_1, *(p_row + in_x[1]));                       \
+    vdup_b_x(INPUT_3_2, *(p_row + in_x[2]));                       \
+    vdup_b_x(INPUT_3_3, *(p_row + in_x[3]));                       \
+    vdup_b_x(INPUT_3_4, *(p_row + in_x[4]));                       \
   }
 
 #define LOAD_ROW_4(p_input, input_width, in_y, in_x)         \
   {                                                          \
     const int8_t* p_row = p_input + (in_y[4] * input_width); \
-    vdup_b_x(v47, *(p_row + in_x[0]));                       \
-    vdup_b_x(v48, *(p_row + in_x[1]));                       \
-    vdup_b_x(v49, *(p_row + in_x[2]));                       \
-    vdup_b_x(v50, *(p_row + in_x[3]));                       \
-    vdup_b_x(v51, *(p_row + in_x[4]));                       \
+    vdup_b_x(INPUT_4_0, *(p_row + in_x[0]));                       \
+    vdup_b_x(INPUT_4_1, *(p_row + in_x[1]));                       \
+    vdup_b_x(INPUT_4_2, *(p_row + in_x[2]));                       \
+    vdup_b_x(INPUT_4_3, *(p_row + in_x[3]));                       \
+    vdup_b_x(INPUT_4_4, *(p_row + in_x[4]));                       \
   }
 
 #define H_PAD_OR_LOAD_ROW_0(p_input, input_width, input_offset, in_y, in_x) \
@@ -178,29 +239,29 @@
   } else {                                                                  \
     const int8_t* p_row = p_input + (in_y[0] * input_width);                \
     if (in_x[0] < 0 || in_x[0] >= input_width) {                            \
-      vdup_b_x(v27, -input_offset);                                         \
+      vdup_b_x(INPUT_0_0, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v27, *(p_row + in_x[0]));                                    \
+      vdup_b_x(INPUT_0_0, *(p_row + in_x[0]));                                    \
     }                                                                       \
     if (in_x[1] < 0 || in_x[1] >= input_width) {                            \
-      vdup_b_x(v28, -input_offset);                                         \
+      vdup_b_x(INPUT_0_1, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v28, *(p_row + in_x[1]));                                    \
+      vdup_b_x(INPUT_0_1, *(p_row + in_x[1]));                                    \
     }                                                                       \
     if (in_x[2] < 0 || in_x[2] >= input_width) {                            \
-      vdup_b_x(v29, -input_offset);                                         \
+      vdup_b_x(INPUT_0_2, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v29, *(p_row + in_x[2]));                                    \
+      vdup_b_x(INPUT_0_2, *(p_row + in_x[2]));                                    \
     }                                                                       \
     if (in_x[3] < 0 || in_x[3] >= input_width) {                            \
-      vdup_b_x(v30, -input_offset);                                         \
+      vdup_b_x(INPUT_0_3, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v30, *(p_row + in_x[3]));                                    \
+      vdup_b_x(INPUT_0_3, *(p_row + in_x[3]));                                    \
     }                                                                       \
     if (in_x[4] < 0 || in_x[4] >= input_width) {                            \
-      vdup_b_x(v31, -input_offset);                                         \
+      vdup_b_x(INPUT_0_4, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v31, *(p_row + in_x[4]));                                    \
+      vdup_b_x(INPUT_0_4, *(p_row + in_x[4]));                                    \
     }                                                                       \
   }
 
@@ -210,29 +271,29 @@
   } else {                                                                  \
     const int8_t* p_row = p_input + (in_y[1] * input_width);                \
     if (in_x[0] < 0 || in_x[0] >= input_width) {                            \
-      vdup_b_x(v32, -input_offset);                                         \
+      vdup_b_x(INPUT_1_0, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v32, *(p_row + in_x[0]));                                    \
+      vdup_b_x(INPUT_1_0, *(p_row + in_x[0]));                                    \
     }                                                                       \
     if (in_x[1] < 0 || in_x[1] >= input_width) {                            \
-      vdup_b_x(v33, -input_offset);                                         \
+      vdup_b_x(INPUT_1_1, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v33, *(p_row + in_x[1]));                                    \
+      vdup_b_x(INPUT_1_1, *(p_row + in_x[1]));                                    \
     }                                                                       \
     if (in_x[2] < 0 || in_x[2] >= input_width) {                            \
-      vdup_b_x(v34, -input_offset);                                         \
+      vdup_b_x(INPUT_1_2, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v34, *(p_row + in_x[2]));                                    \
+      vdup_b_x(INPUT_1_2, *(p_row + in_x[2]));                                    \
     }                                                                       \
     if (in_x[3] < 0 || in_x[3] >= input_width) {                            \
-      vdup_b_x(v35, -input_offset);                                         \
+      vdup_b_x(INPUT_1_3, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v35, *(p_row + in_x[3]));                                    \
+      vdup_b_x(INPUT_1_3, *(p_row + in_x[3]));                                    \
     }                                                                       \
     if (in_x[4] < 0 || in_x[4] >= input_width) {                            \
-      vdup_b_x(v36, -input_offset);                                         \
+      vdup_b_x(INPUT_1_4, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v36, *(p_row + in_x[4]));                                    \
+      vdup_b_x(INPUT_1_4, *(p_row + in_x[4]));                                    \
     }                                                                       \
   }
 
@@ -242,29 +303,29 @@
   } else {                                                                  \
     const int8_t* p_row = p_input + (in_y[2] * input_width);                \
     if (in_x[0] < 0 || in_x[0] >= input_width) {                            \
-      vdup_b_x(v37, -input_offset);                                         \
+      vdup_b_x(INPUT_2_0, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v37, *(p_row + in_x[0]));                                    \
+      vdup_b_x(INPUT_2_0, *(p_row + in_x[0]));                                    \
     }                                                                       \
     if (in_x[1] < 0 || in_x[1] >= input_width) {                            \
-      vdup_b_x(v38, -input_offset);                                         \
+      vdup_b_x(INPUT_2_1, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v38, *(p_row + in_x[1]));                                    \
+      vdup_b_x(INPUT_2_1, *(p_row + in_x[1]));                                    \
     }                                                                       \
     if (in_x[2] < 0 || in_x[2] >= input_width) {                            \
-      vdup_b_x(v39, -input_offset);                                         \
+      vdup_b_x(INPUT_2_2, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v39, *(p_row + in_x[2]));                                    \
+      vdup_b_x(INPUT_2_2, *(p_row + in_x[2]));                                    \
     }                                                                       \
     if (in_x[3] < 0 || in_x[3] >= input_width) {                            \
-      vdup_b_x(v40, -input_offset);                                         \
+      vdup_b_x(INPUT_2_3, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v40, *(p_row + in_x[3]));                                    \
+      vdup_b_x(INPUT_2_3, *(p_row + in_x[3]));                                    \
     }                                                                       \
     if (in_x[4] < 0 || in_x[4] >= input_width) {                            \
-      vdup_b_x(v41, -input_offset);                                         \
+      vdup_b_x(INPUT_2_4, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v41, *(p_row + in_x[4]));                                    \
+      vdup_b_x(INPUT_2_4, *(p_row + in_x[4]));                                    \
     }                                                                       \
   }
 
@@ -274,29 +335,29 @@
   } else {                                                                  \
     const int8_t* p_row = p_input + (in_y[3] * input_width);                \
     if (in_x[0] < 0 || in_x[0] >= input_width) {                            \
-      vdup_b_x(v42, -input_offset);                                         \
+      vdup_b_x(INPUT_3_0, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v42, *(p_row + in_x[0]));                                    \
+      vdup_b_x(INPUT_3_0, *(p_row + in_x[0]));                                    \
     }                                                                       \
     if (in_x[1] < 0 || in_x[1] >= input_width) {                            \
-      vdup_b_x(v43, -input_offset);                                         \
+      vdup_b_x(INPUT_3_1, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v43, *(p_row + in_x[1]));                                    \
+      vdup_b_x(INPUT_3_1, *(p_row + in_x[1]));                                    \
     }                                                                       \
     if (in_x[2] < 0 || in_x[2] >= input_width) {                            \
-      vdup_b_x(v44, -input_offset);                                         \
+      vdup_b_x(INPUT_3_2, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v44, *(p_row + in_x[2]));                                    \
+      vdup_b_x(INPUT_3_2, *(p_row + in_x[2]));                                    \
     }                                                                       \
     if (in_x[3] < 0 || in_x[3] >= input_width) {                            \
-      vdup_b_x(v45, -input_offset);                                         \
+      vdup_b_x(INPUT_3_3, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v45, *(p_row + in_x[3]));                                    \
+      vdup_b_x(INPUT_3_3, *(p_row + in_x[3]));                                    \
     }                                                                       \
     if (in_x[4] < 0 || in_x[4] >= input_width) {                            \
-      vdup_b_x(v46, -input_offset);                                         \
+      vdup_b_x(INPUT_3_4, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v46, *(p_row + in_x[4]));                                    \
+      vdup_b_x(INPUT_3_4, *(p_row + in_x[4]));                                    \
     }                                                                       \
   }
 
@@ -306,29 +367,29 @@
   } else {                                                                  \
     const int8_t* p_row = p_input + (in_y[4] * input_width);                \
     if (in_x[0] < 0 || in_x[0] >= input_width) {                            \
-      vdup_b_x(v47, -input_offset);                                         \
+      vdup_b_x(INPUT_4_0, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v47, *(p_row + in_x[0]));                                    \
+      vdup_b_x(INPUT_4_0, *(p_row + in_x[0]));                                    \
     }                                                                       \
     if (in_x[1] < 0 || in_x[1] >= input_width) {                            \
-      vdup_b_x(v48, -input_offset);                                         \
+      vdup_b_x(INPUT_4_1, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v48, *(p_row + in_x[1]));                                    \
+      vdup_b_x(INPUT_4_1, *(p_row + in_x[1]));                                    \
     }                                                                       \
     if (in_x[2] < 0 || in_x[2] >= input_width) {                            \
-      vdup_b_x(v49, -input_offset);                                         \
+      vdup_b_x(INPUT_4_2, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v49, *(p_row + in_x[2]));                                    \
+      vdup_b_x(INPUT_4_2, *(p_row + in_x[2]));                                    \
     }                                                                       \
     if (in_x[3] < 0 || in_x[3] >= input_width) {                            \
-      vdup_b_x(v50, -input_offset);                                         \
+      vdup_b_x(INPUT_4_3, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v50, *(p_row + in_x[3]));                                    \
+      vdup_b_x(INPUT_4_3, *(p_row + in_x[3]));                                    \
     }                                                                       \
     if (in_x[4] < 0 || in_x[4] >= input_width) {                            \
-      vdup_b_x(v51, -input_offset);                                         \
+      vdup_b_x(INPUT_4_4, -input_offset);                                         \
     } else {                                                                \
-      vdup_b_x(v51, *(p_row + in_x[4]));                                    \
+      vdup_b_x(INPUT_4_4, *(p_row + in_x[4]));                                    \
     }                                                                       \
   }
 
@@ -371,15 +432,15 @@
   {                                       \
     vld_w_x_m(v60, swizzled_bias_data);   \
     adwinit_v(v60, v60);                  \
-    adwconv_vxv(v60, v27, cmds, v0);      \
-    adwconv_vxv(v60, v30, cmds, v3);      \
-    adwconv_vxv(v60, v33, cmds, v6);      \
-    adwconv_vxv(v60, v36, cmds, v9);      \
-    adwconv_vxv(v60, v39, cmds, v12);     \
-    adwconv_vxv(v60, v42, cmds, v15);     \
-    adwconv_vxv(v60, v45, cmds, v18);     \
-    adwconv_vxv(v60, v48, cmds, v21);     \
-    vdwconv_vxv(v60, v51, cmds, v24);     \
+    adwconv_vxv(v60, INPUT_0_0, cmds, FLT_0_0);      \
+    adwconv_vxv(v60, INPUT_0_1, cmds, FLT_0_1);      \
+    adwconv_vxv(v60, INPUT_0_2, cmds, FLT_0_2);      \
+    adwconv_vxv(v60, INPUT_0_3, cmds, FLT_0_3);      \
+    adwconv_vxv(v60, INPUT_0_4, cmds, FLT_0_4);     \
+    adwconv_vxv(v60, INPUT_3_0, cmds, FLT_3_0);     \
+    adwconv_vxv(v60, INPUT_3_3, cmds, FLT_3_3);     \
+    adwconv_vxv(v60, INPUT_3_4, cmds, FLT_HOLE);     \
+    vdwconv_vxv(v60, INPUT_4_2, cmds, FLT_4_2);     \
   }
 
 #define OUTPUT(output_activation_min, output_activation_max, output_offset, \
@@ -459,42 +520,42 @@
       }
     }
   }
+
   const int8_t* p_flt_0 = swizzled_filter_data.get() + (0 * filter_width * 24);
   const int8_t* p_flt_1 = swizzled_filter_data.get() + (1 * filter_width * 24);
   const int8_t* p_flt_2 = swizzled_filter_data.get() + (2 * filter_width * 24);
   const int8_t* p_flt_3 = swizzled_filter_data.get() + (3 * filter_width * 24);
   const int8_t* p_flt_4 = swizzled_filter_data.get() + (4 * filter_width * 24);
-  vld_b_l_xx(v0, p_flt_0 + (0 * 24), 24);
-  vld_b_l_xx(v1, p_flt_0 + (1 * 24), 24);
-  vld_b_l_xx(v2, p_flt_0 + (2 * 24), 24);
-  vld_b_l_xx(v3, p_flt_0 + (3 * 24), 24);
-  vld_b_l_xx(v4, p_flt_0 + (4 * 24), 24);
+  vld_b_lp_xx(FLT_0_0, p_flt_0, 24);
+  vld_b_lp_xx(FLT_0_1, p_flt_0, 24);
+  vld_b_lp_xx(FLT_0_2, p_flt_0, 24);
+  vld_b_lp_xx(FLT_0_3, p_flt_0, 24);
+  vld_b_lp_xx(FLT_0_4, p_flt_0, 24);
 
-  vld_b_l_xx(v5, p_flt_1 + (0 * 24), 24);
-  vld_b_l_xx(v6, p_flt_1 + (1 * 24), 24);
-  vld_b_l_xx(v7, p_flt_1 + (2 * 24), 24);
-  vld_b_l_xx(v8, p_flt_1 + (3 * 24), 24);
-  vld_b_l_xx(v9, p_flt_1 + (4 * 24), 24);
+  vld_b_lp_xx(FLT_1_0, p_flt_1, 24);
+  vld_b_lp_xx(FLT_1_1, p_flt_1, 24);
+  vld_b_lp_xx(FLT_1_2, p_flt_1, 24);
+  vld_b_lp_xx(FLT_1_3, p_flt_1, 24);
+  vld_b_lp_xx(FLT_1_4, p_flt_1, 24);
 
-  vld_b_l_xx(v10, p_flt_2 + (0 * 24), 24);
-  vld_b_l_xx(v11, p_flt_2 + (1 * 24), 24);
-  vld_b_l_xx(v12, p_flt_2 + (2 * 24), 24);
-  vld_b_l_xx(v13, p_flt_2 + (3 * 24), 24);
-  vld_b_l_xx(v14, p_flt_2 + (4 * 24), 24);
+  vld_b_lp_xx(FLT_2_0, p_flt_2, 24);
+  vld_b_lp_xx(FLT_2_1, p_flt_2, 24);
+  vld_b_lp_xx(FLT_2_2, p_flt_2, 24);
+  vld_b_lp_xx(FLT_2_3, p_flt_2, 24);
+  vld_b_lp_xx(FLT_2_4, p_flt_2, 24);
 
-  vld_b_l_xx(v15, p_flt_3 + (0 * 24), 24);
-  vld_b_l_xx(v16, p_flt_3 + (1 * 24), 24);
-  vld_b_l_xx(v17, p_flt_3 + (2 * 24), 24);
-  vld_b_l_xx(v18, p_flt_3 + (3 * 24), 24);
-  vld_b_l_xx(v19, p_flt_3 + (4 * 24), 24);
+  vld_b_lp_xx(FLT_3_0, p_flt_3, 24);
+  vld_b_lp_xx(FLT_3_1, p_flt_3, 24);
+  vld_b_lp_xx(FLT_3_2, p_flt_3, 24);
+  vld_b_lp_xx(FLT_3_3, p_flt_3, 24);
+  vld_b_lp_xx(FLT_3_4, p_flt_3, 24);
 
-  vld_b_l_xx(v20, p_flt_4 + (0 * 24), 24);
-  vld_b_l_xx(v21, p_flt_4 + (1 * 24), 24);
-  vld_b_l_xx(v22, p_flt_4 + (2 * 24), 24);
-  vld_b_l_xx(v23, p_flt_4 + (3 * 24), 24);
-  vld_b_l_xx(v24, p_flt_4 + (4 * 24), 24);
-  vdup_b_x(v25, 0);
-  vdup_b_x(v26, 0);
+  vdup_b_x(FLT_HOLE, 0);
+  vld_b_lp_xx(FLT_4_0, p_flt_4, 24);
+  vld_b_lp_xx(FLT_4_1, p_flt_4, 24);
+  vld_b_lp_xx(FLT_4_2, p_flt_4, 24);
+  vld_b_lp_xx(FLT_4_3, p_flt_4, 24);
+  vld_b_lp_xx(FLT_4_4, p_flt_4, 24);
 
   union {
     vdwconv_u8_t dwconv;
@@ -524,7 +585,7 @@
 
   int8_t* local_output_data = output_data + out_channel;
   int in_y[5];
-  int in_x[5];
+  int in_x[7];
   int out_y = 0;
   const int8_t* p_input = input_data;
   // Handle top row padding
@@ -615,6 +676,109 @@
              local_output_data, n_channels);
       local_output_data += output_depth;
     }
+    for (; out_x + 2 <= (output_width - pad_width); out_x += 2) {  // two outputs per pass
+      const int in_x_origin = (out_x * stride_width) - pad_width;
+      // in_x[0..4] are the taps of output out_x; in_x[2..6] are reused for out_x + 1.
+      #pragma GCC unroll 7
+      for (int i = 0; i < 7; ++i) {
+        in_x[i] = in_x_origin + (dilation_width_factor * i);
+      }
+      const int8_t* p_rows[5];
+      #pragma GCC unroll 5
+      for (int i = 0; i < 5; ++i) {
+        p_rows[i] = p_input + (in_y[i] * input_width);
+      }
+      // NOTE(review): that reuse needs stride_width == 2 * dilation_width_factor - confirm.
+      vdup_b_x(INPUT_0_0, *(p_rows[0] + in_x[0]));  // 5x5 window of output out_x
+      vdup_b_x(INPUT_0_1, *(p_rows[0] + in_x[1]));
+      vdup_b_x(INPUT_0_2, *(p_rows[0] + in_x[2]));
+      vdup_b_x(INPUT_0_3, *(p_rows[0] + in_x[3]));
+      vdup_b_x(INPUT_0_4, *(p_rows[0] + in_x[4]));
+
+      vdup_b_x(INPUT_1_0, *(p_rows[1] + in_x[0]));
+      vdup_b_x(INPUT_1_1, *(p_rows[1] + in_x[1]));
+      vdup_b_x(INPUT_1_2, *(p_rows[1] + in_x[2]));
+      vdup_b_x(INPUT_1_3, *(p_rows[1] + in_x[3]));
+      vdup_b_x(INPUT_1_4, *(p_rows[1] + in_x[4]));
+
+      vdup_b_x(INPUT_2_0, *(p_rows[2] + in_x[0]));
+      vdup_b_x(INPUT_2_1, *(p_rows[2] + in_x[1]));
+      vdup_b_x(INPUT_2_2, *(p_rows[2] + in_x[2]));
+      vdup_b_x(INPUT_2_3, *(p_rows[2] + in_x[3]));
+      vdup_b_x(INPUT_2_4, *(p_rows[2] + in_x[4]));
+
+      vdup_b_x(INPUT_3_0, *(p_rows[3] + in_x[0]));
+      vdup_b_x(INPUT_3_1, *(p_rows[3] + in_x[1]));
+      vdup_b_x(INPUT_3_2, *(p_rows[3] + in_x[2]));
+      vdup_b_x(INPUT_3_3, *(p_rows[3] + in_x[3]));
+      vdup_b_x(INPUT_3_4, *(p_rows[3] + in_x[4]));
+
+      vdup_b_x(INPUT_4_0, *(p_rows[4] + in_x[0]));
+      vdup_b_x(INPUT_4_1, *(p_rows[4] + in_x[1]));
+      vdup_b_x(INPUT_4_2, *(p_rows[4] + in_x[2]));
+      vdup_b_x(INPUT_4_3, *(p_rows[4] + in_x[3]));
+      vdup_b_x(INPUT_4_4, *(p_rows[4] + in_x[4]));
+      // First output: load swizzled_bias_data into v60, then accumulate the window.
+      vld_w_x_m(v60, swizzled_bias_data);
+      adwinit_v(v60, v60);
+      adwconv_vxv(v60, INPUT_0_0, cmds, FLT_0_0);
+      adwconv_vxv(v60, INPUT_0_1, cmds, FLT_0_1);
+      adwconv_vxv(v60, INPUT_0_2, cmds, FLT_0_2);
+      adwconv_vxv(v60, INPUT_0_3, cmds, FLT_0_3);
+      adwconv_vxv(v60, INPUT_0_4, cmds, FLT_0_4);
+      adwconv_vxv(v60, INPUT_3_0, cmds, FLT_3_0);
+      adwconv_vxv(v60, INPUT_3_3, cmds, FLT_3_3);
+      adwconv_vxv(v60, INPUT_3_4, cmds, FLT_HOLE);
+      vdwconv_vxv(v60, INPUT_4_2, cmds, FLT_4_2);
+      vmv_v(INPUT_0_0, v60);  // park the first result (v60-v63) in v26-v29
+      vmv_v(INPUT_1_0, v61);
+      vmv_v(INPUT_2_0, v62);
+      vmv_v(INPUT_0_1, v63);
+      // Row 3: load columns 5 and 6 over the slots that held columns 0 and 1.
+      vdup_b_x(INPUT_3_0, *(p_rows[3] + in_x[5]));
+      vdup_b_x(INPUT_3_1, *(p_rows[3] + in_x[6]));
+      // Row 4: move columns 2-4 into slots 0-2, then load columns 5 and 6 behind them.
+      vmv_v(INPUT_4_0, INPUT_4_2);
+      vmv_v(INPUT_4_1, INPUT_4_3);
+      vmv_v(INPUT_4_2, INPUT_4_4);
+      vdup_b_x(INPUT_4_3, *(p_rows[4] + in_x[5]));
+      vdup_b_x(INPUT_4_4, *(p_rows[4] + in_x[6]));
+      // Second output: reload the bias, pair input columns 2, 3 with filter columns 0, 1.
+      vld_w_x_m(v60, swizzled_bias_data);
+      adwinit_v(v60, v60);
+      adwconv_vxv(v60, INPUT_0_2, cmds, FLT_0_0);
+      adwconv_vxv(v60, INPUT_0_3, cmds, FLT_0_1);
+      // Move the parked first result on to v32-v35 (the column-2/3 slots just used).
+      vmv_v(INPUT_0_2, INPUT_0_0);
+      vmv_v(INPUT_1_2, INPUT_1_0);
+      vmv_v(INPUT_2_2, INPUT_2_0);
+      vmv_v(INPUT_0_3, INPUT_0_1);
+      // Refill slots 0 and 1 of rows 0-2 with input columns 5 and 6.
+      vdup_b_x(INPUT_0_0, *(p_rows[0] + in_x[5]));
+      vdup_b_x(INPUT_0_1, *(p_rows[0] + in_x[6]));
+      vdup_b_x(INPUT_1_0, *(p_rows[1] + in_x[5]));
+      vdup_b_x(INPUT_1_1, *(p_rows[1] + in_x[6]));
+      vdup_b_x(INPUT_2_0, *(p_rows[2] + in_x[5]));
+      vdup_b_x(INPUT_2_1, *(p_rows[2] + in_x[6]));
+      // Finish the second window: columns 4-6 of rows 0-2, then rows 3 and 4.
+      adwconv_vxv(v60, INPUT_0_4, cmds, FLT_0_2);
+      adwconv_vxv(v60, INPUT_0_0, cmds, FLT_0_3);
+      adwconv_vxv(v60, INPUT_0_1, cmds, FLT_0_4);
+      adwconv_vxv(v60, INPUT_3_2, cmds, FLT_3_0);
+      adwconv_vxv(v60, INPUT_3_0, cmds, FLT_3_3);
+      adwconv_vxv(v60, INPUT_3_4, cmds, FLT_HOLE);
+      vdwconv_vxv(v60, INPUT_4_2, cmds, FLT_4_2);
+      INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE2(
+        v60, INPUT_0_2, v52, v56, output_activation_min,
+        output_activation_max, output_offset
+      );
+      vsraqs_b_vx(INPUT_0_2, INPUT_0_2, 0);
+      vst_b_l_xx(INPUT_0_2, local_output_data, n_channels);  // output out_x
+      local_output_data += output_depth;
+      vsraqs_b_vx(v60, v60, 0);
+      vst_b_l_xx(v60, local_output_data, n_channels);  // output out_x + 1
+      local_output_data += output_depth;
+    }
     for (; out_x < (output_width - pad_width); ++out_x) {
       const int in_x_origin = (out_x * stride_width) - pad_width;
 
@@ -736,6 +900,62 @@
 #undef PAD_ROW_4
 #undef CALCULATE_IN_X
 #undef CALCULATE_IN_Y
+#undef INPUT_0_0
+#undef INPUT_0_1
+#undef INPUT_0_2
+#undef INPUT_0_3
+#undef INPUT_0_4
+#undef INPUT_1_0
+#undef INPUT_1_1
+#undef INPUT_1_2
+#undef INPUT_1_3
+#undef INPUT_1_4
+#undef INPUT_2_0
+#undef INPUT_2_1
+#undef INPUT_2_2
+#undef INPUT_2_3
+#undef INPUT_2_4
+#undef INPUT_3_0
+#undef INPUT_3_1
+#undef INPUT_3_2
+#undef INPUT_3_3
+#undef INPUT_3_4
+#undef INPUT_4_0
+#undef INPUT_4_1
+#undef INPUT_4_2
+#undef INPUT_4_3
+#undef INPUT_4_4
+#undef INPUT_0_5
+#undef INPUT_1_5
+#undef INPUT_2_5
+#undef INPUT_3_5
+#undef INPUT_4_5
+#undef FLT_0_0
+#undef FLT_0_1
+#undef FLT_0_2
+#undef FLT_0_3
+#undef FLT_0_4
+#undef FLT_1_0
+#undef FLT_1_1
+#undef FLT_1_2
+#undef FLT_1_3
+#undef FLT_1_4
+#undef FLT_2_0
+#undef FLT_2_1
+#undef FLT_2_2
+#undef FLT_2_3
+#undef FLT_2_4
+#undef FLT_3_0
+#undef FLT_3_1
+#undef FLT_3_2
+#undef FLT_3_3
+#undef FLT_3_4
+#undef FLT_HOLE
+#undef FLT_4_0
+#undef FLT_4_1
+#undef FLT_4_2
+#undef FLT_4_3
+#undef FLT_4_4
 
 void ConvPerChannelD1(
     const tflite::ConvParams& params, const int32_t* output_multiplier,
diff --git a/tflm/opt/depthwise_conv_s8.cc b/tflm/opt/depthwise_conv_s8.cc
index 4f9440c..a11c3d2 100644
--- a/tflm/opt/depthwise_conv_s8.cc
+++ b/tflm/opt/depthwise_conv_s8.cc
@@ -1835,8 +1835,6 @@
   const int input_height = input_shape.Dims(1);
   const int input_width = input_shape.Dims(2);
   const int input_depth = input_shape.Dims(3);
-  const int filter_height = filter_shape.Dims(1);
-  const int filter_width = filter_shape.Dims(2);
   const int output_height = output_shape.Dims(1);
   const int output_width = output_shape.Dims(2);
   const int output_depth = output_shape.Dims(3);
@@ -1844,134 +1842,742 @@
   int32_t swizzled_shift_multi[32];
   int32_t swizzled_output_multi[32];
 
+#define FLT_0_0 v0
+#define FLT_0_1 v3
+#define FLT_0_2 v6
+#define FLT_0_3 v9
+#define FLT_0_4 v12
+#define FLT_1_0 v1
+#define FLT_1_1 v4
+#define FLT_1_2 v7
+#define FLT_1_3 v10
+#define FLT_1_4 v13
+#define FLT_2_0 v2
+#define FLT_2_1 v5
+#define FLT_2_2 v8
+#define FLT_2_3 v11
+#define FLT_2_4 v14
+#define FLT_3_0 v15
+#define FLT_3_1 v16
+#define FLT_3_2 v17
+#define FLT_3_3 v18
+#define FLT_3_4 v19
+#define FLT_HOLE v20
+#define FLT_4_0 v21
+#define FLT_4_1 v22
+#define FLT_4_2 v23
+#define FLT_4_3 v24
+#define FLT_4_4 v25
+
+#define INPUT_0_0 v26
+#define INPUT_0_1 v29
+#define INPUT_0_2 v32
+#define INPUT_0_3 v35
+#define INPUT_0_4 v38
+#define INPUT_1_0 v27
+#define INPUT_1_1 v30
+#define INPUT_1_2 v33
+#define INPUT_1_3 v36
+#define INPUT_1_4 v39
+#define INPUT_2_0 v28
+#define INPUT_2_1 v31
+#define INPUT_2_2 v34
+#define INPUT_2_3 v37
+#define INPUT_2_4 v40
+#define INPUT_3_0 v41
+#define INPUT_3_1 v42
+#define INPUT_3_2 v43
+#define INPUT_3_3 v44
+#define INPUT_3_4 v45
+#define INPUT_4_0 v46
+#define INPUT_4_1 v47
+#define INPUT_4_2 v48
+#define INPUT_4_3 v49
+#define INPUT_4_4 v50
+
   for (int in_channel = 0; in_channel + 32 <= input_depth; in_channel += 32) {
     const int output_channel = in_channel;
     VectorSwizzle(bias_data + output_channel, swizzled_bias_data, 32);
     VectorSwizzle(output_multiplier + output_channel, swizzled_output_multi, 32);
     VectorSwizzle(output_shift + output_channel, swizzled_shift_multi, 32);
 
-    vld_w_x_m(v52, swizzled_bias_data);
+    union {
+      vdwconv_u8_t dwconv;
+      uint32_t raw;
+    } cmds;
+    cmds.raw = 0;
+    cmds.dwconv.sdata1 = true;
+    cmds.dwconv.sbias1 = input_offset;
+    cmds.dwconv.sdata2 = true;
+    cmds.dwconv.sbias2 = 0;
+    cmds.dwconv.mode = 0;
+    cmds.dwconv.sparsity = 0;
+    cmds.dwconv.regbase = 0;
+
     vld_w_x_m(v56, swizzled_output_multi);
     vld_w_x_m(v60, swizzled_shift_multi);
     vrsub_w_vx_m(v60, v60, 0);
 
-    // Don't reorder me!
     const int8_t* p_flt = filter_data + in_channel;
-    vld_b_sp_xx(v6, p_flt, input_depth);
-    vld_b_sp_xx(v7, p_flt, input_depth);
-    vld_b_sp_xx_m(v8, p_flt, input_depth);
-    vld_b_sp_xx_m(v12, p_flt, input_depth);
-    vld_b_sp_xx_m(v16, p_flt, input_depth);
-    vld_b_sp_xx_m(v20, p_flt, input_depth);
-    vld_b_sp_xx_m(v24, p_flt, input_depth);
-    vld_b_sp_xx(v28, p_flt, input_depth);
-    vld_b_sp_xx(v29, p_flt, input_depth);
-    vld_b_sp_xx(v30, p_flt, input_depth);
+    vld_b_sp_xx(FLT_0_0, p_flt, input_depth);
+    vld_b_sp_xx(FLT_0_1, p_flt, input_depth);
+    vld_b_sp_xx(FLT_0_2, p_flt, input_depth);
+    vld_b_sp_xx(FLT_0_3, p_flt, input_depth);
+    vld_b_sp_xx(FLT_0_4, p_flt, input_depth);
 
+    vld_b_sp_xx(FLT_1_0, p_flt, input_depth);
+    vld_b_sp_xx(FLT_1_1, p_flt, input_depth);
+    vld_b_sp_xx(FLT_1_2, p_flt, input_depth);
+    vld_b_sp_xx(FLT_1_3, p_flt, input_depth);
+    vld_b_sp_xx(FLT_1_4, p_flt, input_depth);
+
+    vld_b_sp_xx(FLT_2_0, p_flt, input_depth);
+    vld_b_sp_xx(FLT_2_1, p_flt, input_depth);
+    vld_b_sp_xx(FLT_2_2, p_flt, input_depth);
+    vld_b_sp_xx(FLT_2_3, p_flt, input_depth);
+    vld_b_sp_xx(FLT_2_4, p_flt, input_depth);
+
+    vld_b_sp_xx(FLT_3_0, p_flt, input_depth);
+    vld_b_sp_xx(FLT_3_1, p_flt, input_depth);
+    vld_b_sp_xx(FLT_3_2, p_flt, input_depth);
+    vld_b_sp_xx(FLT_3_3, p_flt, input_depth);
+    vld_b_sp_xx(FLT_3_4, p_flt, input_depth);
+
+    vld_b_sp_xx(FLT_4_0, p_flt, input_depth);
+    vld_b_sp_xx(FLT_4_1, p_flt, input_depth);
+    vld_b_sp_xx(FLT_4_2, p_flt, input_depth);
+    vld_b_sp_xx(FLT_4_3, p_flt, input_depth);
+    vld_b_sp_xx(FLT_4_4, p_flt, input_depth);
+    vdup_b_x(FLT_HOLE, 0);
+
+#define COMPUTE() /* Loads the bias into v52 and accumulates the convolution into it; call sites requantize and store v52 afterwards. */ \
+  vld_w_x_m(v52, swizzled_bias_data); /* reloaded on every expansion */ \
+  adwinit_v(v52, v52);                         \
+  adwconv_vxv(v52, INPUT_0_0, cmds, FLT_0_0); /* NOTE(review): each call presumably consumes 3 consecutive vector registers per operand -- confirm against the ISA */ \
+  adwconv_vxv(v52, INPUT_0_1, cmds, FLT_0_1);  \
+  adwconv_vxv(v52, INPUT_0_2, cmds, FLT_0_2);  \
+  adwconv_vxv(v52, INPUT_0_3, cmds, FLT_0_3);  \
+  adwconv_vxv(v52, INPUT_0_4, cmds, FLT_0_4);  \
+  adwconv_vxv(v52, INPUT_3_0, cmds, FLT_3_0);  \
+  adwconv_vxv(v52, INPUT_3_3, cmds, FLT_3_3);  \
+  adwconv_vxv(v52, INPUT_3_4, cmds, FLT_HOLE); /* FLT_HOLE was filled with 0 by vdup_b_x(FLT_HOLE, 0) */ \
+  vdwconv_vxv(v52, INPUT_4_2, cmds, FLT_4_2);
+
+#define INPUT_PTRS() /* Declares p_input_0..p_input_4, one pointer per consecutive input row starting at (batch, in_y_origin, in_x_origin, in_channel); those four names must be in scope at the expansion site. */ \
+  const int8_t* p_input_0 =                                             \
+      input_data + (batch * input_height * input_width * input_depth) + \
+      (in_y_origin * input_width * input_depth) +                       \
+      (in_x_origin * input_depth) + in_channel;                         \
+  const int8_t* p_input_1 = p_input_0 + (input_width * input_depth);    \
+  const int8_t* p_input_2 = p_input_1 + (input_width * input_depth);    \
+  const int8_t* p_input_3 = p_input_2 + (input_width * input_depth);    \
+  const int8_t* p_input_4 = p_input_3 + (input_width * input_depth);    \
+  (void)p_input_4; /* not every expansion site reads p_input_4 */
 
     for (int batch = 0; batch < batches; ++batch) {
-      const int8_t* p_input = input_data + (batch * input_width * input_height * input_depth) + in_channel;
-      const int8_t* p_output = output_data + (batch * output_width * output_height * output_depth) + output_channel;
-      for (int out_y = 0; out_y < output_height; ++out_y) {
-        const int out_y_offset = (out_y * output_width * output_depth);
-        for (int out_x = 0; out_x < output_width; ++out_x) {
+      int out_y = 0;
+      int8_t* p_output = output_data +
+                         (batch * output_height * output_width * output_depth) +
+                         output_channel;
+      do {
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        if (in_y_origin >= 0) {
+          break;
+        }
+        int out_x = 0;
+        do {
           const int in_x_origin = (out_x * stride_width) - pad_width;
-          const int in_y_origin = (out_y * stride_height) - pad_height;
+          if (in_x_origin >= 0) {
+            break;
+          }
+          INPUT_PTRS();
+#define LOAD_INPUT(y, x)                                       \
+  if (in_y_origin + y < 0) {                                   \
+    vdup_b_x(INPUT_##y##_##x, -input_offset);                  \
+  } else if (in_x_origin + x < 0) {                            \
+    vdup_b_x(INPUT_##y##_##x, -input_offset);                  \
+  } else {                                                     \
+    vld_b_x(INPUT_##y##_##x, p_input_##y + (x * input_depth)); \
+  }
 
-          // Initialize accumulators w/ bias_data
-          vmv_v_m(v48, v52);
+          LOAD_INPUT(0, 0);
+          LOAD_INPUT(0, 1);
+          LOAD_INPUT(0, 2);
+          LOAD_INPUT(0, 3);
+          LOAD_INPUT(0, 4);
+          LOAD_INPUT(1, 0);
+          LOAD_INPUT(1, 1);
+          LOAD_INPUT(1, 2);
+          LOAD_INPUT(1, 3);
+          LOAD_INPUT(1, 4);
+          LOAD_INPUT(2, 0);
+          LOAD_INPUT(2, 1);
+          LOAD_INPUT(2, 2);
+          LOAD_INPUT(2, 3);
+          LOAD_INPUT(2, 4);
+          LOAD_INPUT(3, 0);
+          LOAD_INPUT(3, 1);
+          LOAD_INPUT(3, 2);
+          LOAD_INPUT(3, 3);
+          LOAD_INPUT(3, 4);
+          LOAD_INPUT(4, 0);
+          LOAD_INPUT(4, 1);
+          LOAD_INPUT(4, 2);
+          LOAD_INPUT(4, 3);
+          LOAD_INPUT(4, 4);
+#undef LOAD_INPUT
+          COMPUTE();
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+              v52, v56, v60, output_activation_min, output_activation_max,
+              output_offset);
+          vsraqs_b_vx(v52, v52, 0);
+          vst_b_x(v52, p_output);
+          p_output += output_depth;
+          ++out_x;
+        } while (out_x < output_width);
+        do {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          if (in_x_origin + 4 >= input_width) {
+            break;
+          }
+          INPUT_PTRS();
+          vdup_b_x(INPUT_0_0, -input_offset);
+          vdup_b_x(INPUT_0_1, -input_offset);
+          vdup_b_x(INPUT_0_2, -input_offset);
+          vdup_b_x(INPUT_0_3, -input_offset);
+          vdup_b_x(INPUT_0_4, -input_offset);
+          if (in_y_origin + 1 < 0) {
+            vdup_b_x(INPUT_1_0, -input_offset);
+            vdup_b_x(INPUT_1_1, -input_offset);
+            vdup_b_x(INPUT_1_2, -input_offset);
+            vdup_b_x(INPUT_1_3, -input_offset);
+            vdup_b_x(INPUT_1_4, -input_offset);
+          } else {
+            vld_b_sp_xx(INPUT_1_0, p_input_1, input_depth);
+            vld_b_sp_xx(INPUT_1_1, p_input_1, input_depth);
+            vld_b_sp_xx(INPUT_1_2, p_input_1, input_depth);
+            vld_b_sp_xx(INPUT_1_3, p_input_1, input_depth);
+            vld_b_sp_xx(INPUT_1_4, p_input_1, input_depth);
+          }
+          if (in_y_origin + 2 < 0) {
+            vdup_b_x(INPUT_2_0, -input_offset);
+            vdup_b_x(INPUT_2_1, -input_offset);
+            vdup_b_x(INPUT_2_2, -input_offset);
+            vdup_b_x(INPUT_2_3, -input_offset);
+            vdup_b_x(INPUT_2_4, -input_offset);
+          } else {
+            vld_b_sp_xx(INPUT_2_0, p_input_2, input_depth);
+            vld_b_sp_xx(INPUT_2_1, p_input_2, input_depth);
+            vld_b_sp_xx(INPUT_2_2, p_input_2, input_depth);
+            vld_b_sp_xx(INPUT_2_3, p_input_2, input_depth);
+            vld_b_sp_xx(INPUT_2_4, p_input_2, input_depth);
+          }
+          if (in_y_origin + 3 < 0) {
+            vdup_b_x(INPUT_3_0, -input_offset);
+            vdup_b_x(INPUT_3_1, -input_offset);
+            vdup_b_x(INPUT_3_2, -input_offset);
+            vdup_b_x(INPUT_3_3, -input_offset);
+            vdup_b_x(INPUT_3_4, -input_offset);
+          } else {
+            vld_b_sp_xx(INPUT_3_0, p_input_3, input_depth);
+            vld_b_sp_xx(INPUT_3_1, p_input_3, input_depth);
+            vld_b_sp_xx(INPUT_3_2, p_input_3, input_depth);
+            vld_b_sp_xx(INPUT_3_3, p_input_3, input_depth);
+            vld_b_sp_xx(INPUT_3_4, p_input_3, input_depth);
+          }
+          if (in_y_origin + 4 < 0) {
+            vdup_b_x(INPUT_4_0, -input_offset);
+            vdup_b_x(INPUT_4_1, -input_offset);
+            vdup_b_x(INPUT_4_2, -input_offset);
+            vdup_b_x(INPUT_4_3, -input_offset);
+            vdup_b_x(INPUT_4_4, -input_offset);
+          } else {
+            vld_b_sp_xx(INPUT_4_0, p_input_4, input_depth);
+            vld_b_sp_xx(INPUT_4_1, p_input_4, input_depth);
+            vld_b_sp_xx(INPUT_4_2, p_input_4, input_depth);
+            vld_b_sp_xx(INPUT_4_3, p_input_4, input_depth);
+            vld_b_sp_xx(INPUT_4_4, p_input_4, input_depth);
+          }
+          COMPUTE();
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+              v52, v56, v60, output_activation_min, output_activation_max,
+              output_offset);
+          vsraqs_b_vx(v52, v52, 0);
+          vst_b_x(v52, p_output);
+          p_output += output_depth;
+          ++out_x;
+        } while (out_x < output_width);
+        while (out_x < output_width) {  // Right-edge columns; pre-tested so a row that has none emits nothing.
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          INPUT_PTRS();
+#define LOAD_INPUT(y, x) /* Fills one tap register: padding value or a load from the input row. */ \
+  if (in_y_origin + y < 0) { /* tap row lies above the image */ \
+    vdup_b_x(INPUT_##y##_##x, -input_offset);                  \
+  } else if (in_x_origin + x >= input_width) { /* tap column lies right of the image */ \
+    vdup_b_x(INPUT_##y##_##x, -input_offset);                  \
+  } else {                                                     \
+    vld_b_x(INPUT_##y##_##x, p_input_##y + (x * input_depth)); \
+  }
 
-          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-            const int in_y = in_y_origin + filter_y;
-            if ((in_y < 0) || (in_y >= input_height)) {
-              continue;
-            }
-            switch (filter_y) {
-              case 0:
-                vaddw_h_vx(v31, v6, 0);
-                vaddw_h_vx(v33, v7, 0);
-                vaddw_h_vx(v35, v8, 0);
-                vaddw_h_vx(v37, v9, 0);
-                vaddw_h_vx(v39, v10, 0);
-                break;
-              case 1:
-                vaddw_h_vx(v31, v11, 0);
-                vaddw_h_vx(v33, v12, 0);
-                vaddw_h_vx(v35, v13, 0);
-                vaddw_h_vx(v37, v14, 0);
-                vaddw_h_vx(v39, v15, 0);
-                break;
-              case 2:
-                vaddw_h_vx(v31, v16, 0);
-                vaddw_h_vx(v33, v17, 0);
-                vaddw_h_vx(v35, v18, 0);
-                vaddw_h_vx(v37, v19, 0);
-                vaddw_h_vx(v39, v20, 0);
-                break;
-              case 3:
-                vaddw_h_vx(v31, v21, 0);
-                vaddw_h_vx(v33, v22, 0);
-                vaddw_h_vx(v35, v23, 0);
-                vaddw_h_vx(v37, v24, 0);
-                vaddw_h_vx(v39, v25, 0);
-                break;
-              case 4:
-                vaddw_h_vx(v31, v26, 0);
-                vaddw_h_vx(v33, v27, 0);
-                vaddw_h_vx(v35, v28, 0);
-                vaddw_h_vx(v37, v29, 0);
-                vaddw_h_vx(v39, v30, 0);
-                break;
-            }
-            const int in_y_offset = in_y  * input_width * input_depth;
-            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-              const int in_x = in_x_origin + filter_x;
-              if ((in_x < 0) || (in_x >= input_width)) {
-                continue;
-              }
-
-              vld_b_x(v0, p_input + (in_x * input_depth) + in_y_offset);
-
-              vaddw_h_vx(v0, v0, 0);
-              vadd_h_vx(v0, v0, static_cast<int16_t>(input_offset));
-              vadd_h_vx(v1, v1,
-                        static_cast<int16_t>(input_offset));  // v0 v1 input
-              switch (filter_x) {
-                case 0:
-                  vmulw_w_vv(v2, v1, v32);
-                  vmulw_w_vv(v0, v0, v31);
-                  break;
-                case 1:
-                  vmulw_w_vv(v2, v1, v34);
-                  vmulw_w_vv(v0, v0, v33);
-                  break;
-                case 2:
-                  vmulw_w_vv(v2, v1, v36);
-                  vmulw_w_vv(v0, v0, v35);
-                  break;
-                case 3:
-                  vmulw_w_vv(v2, v1, v38);
-                  vmulw_w_vv(v0, v0, v37);
-                  break;
-                case 4:
-                  vmulw_w_vv(v2, v1, v40);
-                  vmulw_w_vv(v0, v0, v39);
-                  break;
-              }
-              vadd_w_vv_m(v48, v48, v0);
-            }
+          LOAD_INPUT(0, 0);
+          LOAD_INPUT(0, 1);
+          LOAD_INPUT(0, 2);
+          LOAD_INPUT(0, 3);
+          LOAD_INPUT(0, 4);
+          LOAD_INPUT(1, 0);
+          LOAD_INPUT(1, 1);
+          LOAD_INPUT(1, 2);
+          LOAD_INPUT(1, 3);
+          LOAD_INPUT(1, 4);
+          LOAD_INPUT(2, 0);
+          LOAD_INPUT(2, 1);
+          LOAD_INPUT(2, 2);
+          LOAD_INPUT(2, 3);
+          LOAD_INPUT(2, 4);
+          LOAD_INPUT(3, 0);
+          LOAD_INPUT(3, 1);
+          LOAD_INPUT(3, 2);
+          LOAD_INPUT(3, 3);
+          LOAD_INPUT(3, 4);
+          LOAD_INPUT(4, 0);
+          LOAD_INPUT(4, 1);
+          LOAD_INPUT(4, 2);
+          LOAD_INPUT(4, 3);
+          LOAD_INPUT(4, 4);
+#undef LOAD_INPUT
+          COMPUTE();
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+              v52, v56, v60, output_activation_min, output_activation_max,
+              output_offset);
+          vsraqs_b_vx(v52, v52, 0);
+          vst_b_x(v52, p_output);  // One output vector per column.
+          p_output += output_depth;
+          ++out_x;
+        }
+        ++out_y;
+      } while (out_y < output_height);
+      do {
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        if (in_y_origin + 4 >= input_height) {
+          break;
+        }
+        int out_x = 0;
+        do {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          if (in_x_origin >= 0) {
+            break;
+          }
+          INPUT_PTRS();
+          vdup_b_x(INPUT_0_0, -input_offset);
+          vdup_b_x(INPUT_1_0, -input_offset);
+          vdup_b_x(INPUT_2_0, -input_offset);
+          vdup_b_x(INPUT_3_0, -input_offset);
+          vdup_b_x(INPUT_4_0, -input_offset);
+          if (in_x_origin + 1 < 0) {
+            vdup_b_x(INPUT_0_1, -input_offset);
+            vdup_b_x(INPUT_1_1, -input_offset);
+            vdup_b_x(INPUT_2_1, -input_offset);
+            vdup_b_x(INPUT_3_1, -input_offset);
+            vdup_b_x(INPUT_4_1, -input_offset);
+          } else {
+            vld_b_x(INPUT_0_1, p_input_0 + (1 * input_depth));
+            vld_b_x(INPUT_1_1, p_input_1 + (1 * input_depth));
+            vld_b_x(INPUT_2_1, p_input_2 + (1 * input_depth));
+            vld_b_x(INPUT_3_1, p_input_3 + (1 * input_depth));
+            vld_b_x(INPUT_4_1, p_input_4 + (1 * input_depth));
+          }
+          if (in_x_origin + 2 < 0) {
+            vdup_b_x(INPUT_0_2, -input_offset);
+            vdup_b_x(INPUT_1_2, -input_offset);
+            vdup_b_x(INPUT_2_2, -input_offset);
+            vdup_b_x(INPUT_3_2, -input_offset);
+            vdup_b_x(INPUT_4_2, -input_offset);
+          } else {
+            vld_b_x(INPUT_0_2, p_input_0 + (2 * input_depth));
+            vld_b_x(INPUT_1_2, p_input_1 + (2 * input_depth));
+            vld_b_x(INPUT_2_2, p_input_2 + (2 * input_depth));
+            vld_b_x(INPUT_3_2, p_input_3 + (2 * input_depth));
+            vld_b_x(INPUT_4_2, p_input_4 + (2 * input_depth));
+          }
+          if (in_x_origin + 3 < 0) {
+            vdup_b_x(INPUT_0_3, -input_offset);
+            vdup_b_x(INPUT_1_3, -input_offset);
+            vdup_b_x(INPUT_2_3, -input_offset);
+            vdup_b_x(INPUT_3_3, -input_offset);
+            vdup_b_x(INPUT_4_3, -input_offset);
+          } else {
+            vld_b_x(INPUT_0_3, p_input_0 + (3 * input_depth));
+            vld_b_x(INPUT_1_3, p_input_1 + (3 * input_depth));
+            vld_b_x(INPUT_2_3, p_input_2 + (3 * input_depth));
+            vld_b_x(INPUT_3_3, p_input_3 + (3 * input_depth));
+            vld_b_x(INPUT_4_3, p_input_4 + (3 * input_depth));
+          }
+          if (in_x_origin + 4 < 0) {
+            vdup_b_x(INPUT_0_4, -input_offset);
+            vdup_b_x(INPUT_1_4, -input_offset);
+            vdup_b_x(INPUT_2_4, -input_offset);
+            vdup_b_x(INPUT_3_4, -input_offset);
+            vdup_b_x(INPUT_4_4, -input_offset);
+          } else {
+            vld_b_x(INPUT_0_4, p_input_0 + (4 * input_depth));
+            vld_b_x(INPUT_1_4, p_input_1 + (4 * input_depth));
+            vld_b_x(INPUT_2_4, p_input_2 + (4 * input_depth));
+            vld_b_x(INPUT_3_4, p_input_3 + (4 * input_depth));
+            vld_b_x(INPUT_4_4, p_input_4 + (4 * input_depth));
           }
 
-          vdmulh_w_rn_vv_m(v48, v48, v56);
-          vsha_w_r_vv_m(v48, v48, v60);
-          vadd_w_vx_m(v48, v48, output_offset);
-          vmax_w_vx_m(v48, v48, output_activation_min);
-          vmin_w_vx_m(v48, v48, output_activation_max);
-          vsraqs_b_vx(v48, v48, 0);
-          vst_b_x(v48, p_output + out_y_offset + (out_x * output_depth));
-        }
-      }
+          COMPUTE();
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+              v52, v56, v60, output_activation_min, output_activation_max,
+              output_offset);
+          vsraqs_b_vx(v52, v52, 0);
+          vst_b_x(v52, p_output);
+          p_output += output_depth;
+          ++out_x;
+        } while (out_x < output_width);
+        do {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          if (in_x_origin + 4 >= input_width) {
+            break;
+          }
+          INPUT_PTRS();
+          vld_b_sp_xx(INPUT_0_0, p_input_0, input_depth);
+          vld_b_sp_xx(INPUT_0_1, p_input_0, input_depth);
+          vld_b_sp_xx(INPUT_0_2, p_input_0, input_depth);
+          vld_b_sp_xx(INPUT_0_3, p_input_0, input_depth);
+          vld_b_sp_xx(INPUT_0_4, p_input_0, input_depth);
+          vld_b_sp_xx(INPUT_1_0, p_input_1, input_depth);
+          vld_b_sp_xx(INPUT_1_1, p_input_1, input_depth);
+          vld_b_sp_xx(INPUT_1_2, p_input_1, input_depth);
+          vld_b_sp_xx(INPUT_1_3, p_input_1, input_depth);
+          vld_b_sp_xx(INPUT_1_4, p_input_1, input_depth);
+          vld_b_sp_xx(INPUT_2_0, p_input_2, input_depth);
+          vld_b_sp_xx(INPUT_2_1, p_input_2, input_depth);
+          vld_b_sp_xx(INPUT_2_2, p_input_2, input_depth);
+          vld_b_sp_xx(INPUT_2_3, p_input_2, input_depth);
+          vld_b_sp_xx(INPUT_2_4, p_input_2, input_depth);
+          vld_b_sp_xx(INPUT_3_0, p_input_3, input_depth);
+          vld_b_sp_xx(INPUT_3_1, p_input_3, input_depth);
+          vld_b_sp_xx(INPUT_3_2, p_input_3, input_depth);
+          vld_b_sp_xx(INPUT_3_3, p_input_3, input_depth);
+          vld_b_sp_xx(INPUT_3_4, p_input_3, input_depth);
+          vld_b_sp_xx(INPUT_4_0, p_input_4, input_depth);
+          vld_b_sp_xx(INPUT_4_1, p_input_4, input_depth);
+          vld_b_sp_xx(INPUT_4_2, p_input_4, input_depth);
+          vld_b_sp_xx(INPUT_4_3, p_input_4, input_depth);
+          vld_b_sp_xx(INPUT_4_4, p_input_4, input_depth);
+
+          COMPUTE();
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+              v52, v56, v60, output_activation_min, output_activation_max,
+              output_offset);
+          vsraqs_b_vx(v52, v52, 0);
+          vst_b_x(v52, p_output);
+          p_output += output_depth;
+          ++out_x;
+        } while (out_x < output_width);
+        while (out_x < output_width) {  // Right-edge columns; pre-tested so a row that has none emits nothing.
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          INPUT_PTRS();
+          if (in_x_origin >= input_width) {  // Tap column 0 is already right of the image.
+            vdup_b_x(INPUT_0_0, -input_offset);
+            vdup_b_x(INPUT_1_0, -input_offset);
+            vdup_b_x(INPUT_2_0, -input_offset);
+            vdup_b_x(INPUT_3_0, -input_offset);
+            vdup_b_x(INPUT_4_0, -input_offset);
+          } else {
+            vld_b_x(INPUT_0_0, p_input_0);
+            vld_b_x(INPUT_1_0, p_input_1);
+            vld_b_x(INPUT_2_0, p_input_2);
+            vld_b_x(INPUT_3_0, p_input_3);
+            vld_b_x(INPUT_4_0, p_input_4);
+          }
+          if (in_x_origin + 1 >= input_width) {
+            vdup_b_x(INPUT_0_1, -input_offset);
+            vdup_b_x(INPUT_1_1, -input_offset);
+            vdup_b_x(INPUT_2_1, -input_offset);
+            vdup_b_x(INPUT_3_1, -input_offset);
+            vdup_b_x(INPUT_4_1, -input_offset);
+          } else {
+            vld_b_x(INPUT_0_1, p_input_0 + (1 * input_depth));
+            vld_b_x(INPUT_1_1, p_input_1 + (1 * input_depth));
+            vld_b_x(INPUT_2_1, p_input_2 + (1 * input_depth));
+            vld_b_x(INPUT_3_1, p_input_3 + (1 * input_depth));
+            vld_b_x(INPUT_4_1, p_input_4 + (1 * input_depth));
+          }
+          if (in_x_origin + 2 >= input_width) {
+            vdup_b_x(INPUT_0_2, -input_offset);
+            vdup_b_x(INPUT_1_2, -input_offset);
+            vdup_b_x(INPUT_2_2, -input_offset);
+            vdup_b_x(INPUT_3_2, -input_offset);
+            vdup_b_x(INPUT_4_2, -input_offset);
+          } else {
+            vld_b_x(INPUT_0_2, p_input_0 + (2 * input_depth));
+            vld_b_x(INPUT_1_2, p_input_1 + (2 * input_depth));
+            vld_b_x(INPUT_2_2, p_input_2 + (2 * input_depth));
+            vld_b_x(INPUT_3_2, p_input_3 + (2 * input_depth));
+            vld_b_x(INPUT_4_2, p_input_4 + (2 * input_depth));
+          }
+          if (in_x_origin + 3 >= input_width) {
+            vdup_b_x(INPUT_0_3, -input_offset);
+            vdup_b_x(INPUT_1_3, -input_offset);
+            vdup_b_x(INPUT_2_3, -input_offset);
+            vdup_b_x(INPUT_3_3, -input_offset);
+            vdup_b_x(INPUT_4_3, -input_offset);
+          } else {
+            vld_b_x(INPUT_0_3, p_input_0 + (3 * input_depth));
+            vld_b_x(INPUT_1_3, p_input_1 + (3 * input_depth));
+            vld_b_x(INPUT_2_3, p_input_2 + (3 * input_depth));
+            vld_b_x(INPUT_3_3, p_input_3 + (3 * input_depth));
+            vld_b_x(INPUT_4_3, p_input_4 + (3 * input_depth));
+          }
+          vdup_b_x(INPUT_0_4, -input_offset);  // Column 4 is never loaded here; the preceding loop breaks only once in_x_origin + 4 >= input_width.
+          vdup_b_x(INPUT_1_4, -input_offset);
+          vdup_b_x(INPUT_2_4, -input_offset);
+          vdup_b_x(INPUT_3_4, -input_offset);
+          vdup_b_x(INPUT_4_4, -input_offset);
+
+          COMPUTE();
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+              v52, v56, v60, output_activation_min, output_activation_max,
+              output_offset);
+          vsraqs_b_vx(v52, v52, 0);
+          vst_b_x(v52, p_output);  // One output vector per column.
+          p_output += output_depth;
+          ++out_x;
+        }
+        ++out_y;
+      } while (out_y < output_height);
+      while (out_y < output_height) {  // Bottom-padded rows; pre-tested so an image that needs none emits no extra row.
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        int out_x = 0;
+        do {  // Left-edge columns: stops at the first column whose window starts inside the image.
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          if (in_x_origin >= 0) {
+            break;
+          }
+          INPUT_PTRS();
+#define LOAD_INPUT(y, x)                                       \
+  if (in_y_origin + y >= input_height) {                       \
+    vdup_b_x(INPUT_##y##_##x, -input_offset);                  \
+  } else if (in_x_origin + x < 0) {                            \
+    vdup_b_x(INPUT_##y##_##x, -input_offset);                  \
+  } else {                                                     \
+    vld_b_x(INPUT_##y##_##x, p_input_##y + (x * input_depth)); \
+  }
+
+          LOAD_INPUT(0, 0);
+          LOAD_INPUT(0, 1);
+          LOAD_INPUT(0, 2);
+          LOAD_INPUT(0, 3);
+          LOAD_INPUT(0, 4);
+          LOAD_INPUT(1, 0);
+          LOAD_INPUT(1, 1);
+          LOAD_INPUT(1, 2);
+          LOAD_INPUT(1, 3);
+          LOAD_INPUT(1, 4);
+          LOAD_INPUT(2, 0);
+          LOAD_INPUT(2, 1);
+          LOAD_INPUT(2, 2);
+          LOAD_INPUT(2, 3);
+          LOAD_INPUT(2, 4);
+          LOAD_INPUT(3, 0);
+          LOAD_INPUT(3, 1);
+          LOAD_INPUT(3, 2);
+          LOAD_INPUT(3, 3);
+          LOAD_INPUT(3, 4);
+          LOAD_INPUT(4, 0);
+          LOAD_INPUT(4, 1);
+          LOAD_INPUT(4, 2);
+          LOAD_INPUT(4, 3);
+          LOAD_INPUT(4, 4);
+#undef LOAD_INPUT
+          COMPUTE();
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+              v52, v56, v60, output_activation_min, output_activation_max,
+              output_offset);
+          vsraqs_b_vx(v52, v52, 0);
+          vst_b_x(v52, p_output);
+          p_output += output_depth;
+          ++out_x;
+        } while (out_x < output_width);
+        do {  // Interior columns: stops once the window would cross the right edge.
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          if (in_x_origin + 4 >= input_width) {
+            break;
+          }
+          INPUT_PTRS();
+          if (in_y_origin >= input_height) {
+            vdup_b_x(INPUT_0_0, -input_offset);
+            vdup_b_x(INPUT_0_1, -input_offset);
+            vdup_b_x(INPUT_0_2, -input_offset);
+            vdup_b_x(INPUT_0_3, -input_offset);
+            vdup_b_x(INPUT_0_4, -input_offset);
+          } else {
+            vld_b_sp_xx(INPUT_0_0, p_input_0, input_depth);
+            vld_b_sp_xx(INPUT_0_1, p_input_0, input_depth);
+            vld_b_sp_xx(INPUT_0_2, p_input_0, input_depth);
+            vld_b_sp_xx(INPUT_0_3, p_input_0, input_depth);
+            vld_b_sp_xx(INPUT_0_4, p_input_0, input_depth);
+          }
+          if (in_y_origin + 1 >= input_height) {
+            vdup_b_x(INPUT_1_0, -input_offset);
+            vdup_b_x(INPUT_1_1, -input_offset);
+            vdup_b_x(INPUT_1_2, -input_offset);
+            vdup_b_x(INPUT_1_3, -input_offset);
+            vdup_b_x(INPUT_1_4, -input_offset);
+          } else {
+            vld_b_sp_xx(INPUT_1_0, p_input_1, input_depth);
+            vld_b_sp_xx(INPUT_1_1, p_input_1, input_depth);
+            vld_b_sp_xx(INPUT_1_2, p_input_1, input_depth);
+            vld_b_sp_xx(INPUT_1_3, p_input_1, input_depth);
+            vld_b_sp_xx(INPUT_1_4, p_input_1, input_depth);
+          }
+          if (in_y_origin + 2 >= input_height) {
+            vdup_b_x(INPUT_2_0, -input_offset);
+            vdup_b_x(INPUT_2_1, -input_offset);
+            vdup_b_x(INPUT_2_2, -input_offset);
+            vdup_b_x(INPUT_2_3, -input_offset);
+            vdup_b_x(INPUT_2_4, -input_offset);
+          } else {
+            vld_b_sp_xx(INPUT_2_0, p_input_2, input_depth);
+            vld_b_sp_xx(INPUT_2_1, p_input_2, input_depth);
+            vld_b_sp_xx(INPUT_2_2, p_input_2, input_depth);
+            vld_b_sp_xx(INPUT_2_3, p_input_2, input_depth);
+            vld_b_sp_xx(INPUT_2_4, p_input_2, input_depth);
+          }
+          if (in_y_origin + 3 >= input_height) {
+            vdup_b_x(INPUT_3_0, -input_offset);
+            vdup_b_x(INPUT_3_1, -input_offset);
+            vdup_b_x(INPUT_3_2, -input_offset);
+            vdup_b_x(INPUT_3_3, -input_offset);
+            vdup_b_x(INPUT_3_4, -input_offset);
+          } else {
+            vld_b_sp_xx(INPUT_3_0, p_input_3, input_depth);
+            vld_b_sp_xx(INPUT_3_1, p_input_3, input_depth);
+            vld_b_sp_xx(INPUT_3_2, p_input_3, input_depth);
+            vld_b_sp_xx(INPUT_3_3, p_input_3, input_depth);
+            vld_b_sp_xx(INPUT_3_4, p_input_3, input_depth);
+          }
+          vdup_b_x(INPUT_4_0, -input_offset);  // Row 4 is never loaded here; the preceding row loop breaks only once in_y_origin + 4 >= input_height.
+          vdup_b_x(INPUT_4_1, -input_offset);
+          vdup_b_x(INPUT_4_2, -input_offset);
+          vdup_b_x(INPUT_4_3, -input_offset);
+          vdup_b_x(INPUT_4_4, -input_offset);
+
+          COMPUTE();
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+              v52, v56, v60, output_activation_min, output_activation_max,
+              output_offset);
+          vsraqs_b_vx(v52, v52, 0);
+          vst_b_x(v52, p_output);
+          p_output += output_depth;
+          ++out_x;
+        } while (out_x < output_width);
+        while (out_x < output_width) {  // Right-edge columns; pre-tested so a row that has none emits nothing.
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          INPUT_PTRS();
+#define LOAD_INPUT(y, x)                                       \
+  if (in_y_origin + y >= input_height) {                       \
+    vdup_b_x(INPUT_##y##_##x, -input_offset);                  \
+  } else if (in_x_origin + x >= input_width) {                 \
+    vdup_b_x(INPUT_##y##_##x, -input_offset);                  \
+  } else {                                                     \
+    vld_b_x(INPUT_##y##_##x, p_input_##y + (x * input_depth)); \
+  }
+
+          LOAD_INPUT(0, 0);
+          LOAD_INPUT(0, 1);
+          LOAD_INPUT(0, 2);
+          LOAD_INPUT(0, 3);
+          LOAD_INPUT(0, 4);
+          LOAD_INPUT(1, 0);
+          LOAD_INPUT(1, 1);
+          LOAD_INPUT(1, 2);
+          LOAD_INPUT(1, 3);
+          LOAD_INPUT(1, 4);
+          LOAD_INPUT(2, 0);
+          LOAD_INPUT(2, 1);
+          LOAD_INPUT(2, 2);
+          LOAD_INPUT(2, 3);
+          LOAD_INPUT(2, 4);
+          LOAD_INPUT(3, 0);
+          LOAD_INPUT(3, 1);
+          LOAD_INPUT(3, 2);
+          LOAD_INPUT(3, 3);
+          LOAD_INPUT(3, 4);
+          LOAD_INPUT(4, 0);
+          LOAD_INPUT(4, 1);
+          LOAD_INPUT(4, 2);
+          LOAD_INPUT(4, 3);
+          LOAD_INPUT(4, 4);
+#undef LOAD_INPUT
+          COMPUTE();
+          INT32_TO_INT8_OUTPUT_PIPELINE_INPLACE(
+              v52, v56, v60, output_activation_min, output_activation_max,
+              output_offset);
+          vsraqs_b_vx(v52, v52, 0);
+          vst_b_x(v52, p_output);
+          p_output += output_depth;
+          ++out_x;
+        }
+        ++out_y;
+      }
     }
+
+#undef COMPUTE
+#undef INPUT_PTRS
+#undef FLT_0_0
+#undef FLT_0_1
+#undef FLT_0_2
+#undef FLT_0_3
+#undef FLT_0_4
+#undef FLT_1_0
+#undef FLT_1_1
+#undef FLT_1_2
+#undef FLT_1_3
+#undef FLT_1_4
+#undef FLT_2_0
+#undef FLT_2_1
+#undef FLT_2_2
+#undef FLT_2_3
+#undef FLT_2_4
+#undef FLT_3_0
+#undef FLT_3_1
+#undef FLT_3_2
+#undef FLT_3_3
+#undef FLT_3_4
+#undef FLT_HOLE
+#undef FLT_4_0
+#undef FLT_4_1
+#undef FLT_4_2
+#undef FLT_4_3
+#undef FLT_4_4
+#undef INPUT_0_0
+#undef INPUT_0_1
+#undef INPUT_0_2
+#undef INPUT_0_3
+#undef INPUT_0_4
+#undef INPUT_1_0
+#undef INPUT_1_1
+#undef INPUT_1_2
+#undef INPUT_1_3
+#undef INPUT_1_4
+#undef INPUT_2_0
+#undef INPUT_2_1
+#undef INPUT_2_2
+#undef INPUT_2_3
+#undef INPUT_2_4
+#undef INPUT_3_0
+#undef INPUT_3_1
+#undef INPUT_3_2
+#undef INPUT_3_3
+#undef INPUT_3_4
+#undef INPUT_4_0
+#undef INPUT_4_1
+#undef INPUT_4_2
+#undef INPUT_4_3
+#undef INPUT_4_4
   }
 }