pack ukernel: optimized ARM64 code, benchmark,+refactorings (#11134)

Brings a decently optimized ARM64 implementation of the `iree_uk_pack`
microkernel.

It's not super optimized --- not even any assembly code. Just some C
with intrinsics, aiming at a trade-off with simplicity,
generalizability, code size. We even have a naive for loop
`iree_uk_memcpy`, which thanks to `restrict` the compiler is able to
lift to a true `memcpy` while preserving compile-time memcpy size, and
in some cases we just use that.

Another thought tilting us towards C is instrumentability: pack is the
kind of place that's potentially relevant to sanitize (asan, tsan...) so
it's kind of interesting to keep it in C.

The PR is made bigger because this was another instance where
generalizing from having 1 microkernel (mmt4d) to more than 1, required
some groundwork.

[Performance
charts](https://docs.google.com/spreadsheets/d/1hK39a9snA_P_e0nXKFyUw-hTtOmp3t2qdEUcK_kFZ64/edit?usp=sharing&resourcekey=0-hQznpmTatcIsl80mONdfmQ)
. Some cases are fairly close to memcpy, other cases are 1 order of
magnitude slower. Still probably good enough to not dominate profiles
(by contrast, naive for loops before optimization could be 2 to 3 orders
of magnitude slower than memcpy, and that was more clearly a problem).

Code size on ARM64, release: total 3104 bytes.

```
0000000000000924 t iree_uk_pack
0000000000000004 t iree_uk_pack_select_tile_func_arch
0000000000000220 t iree_uk_pack_select_tile_func_arm_64
0000000000000032 t iree_uk_pack_select_tile_func_generic
0000000000000260 t iree_uk_pack_tile_8x1_x32_arm_64_direct
0000000000000036 t iree_uk_pack_tile_8x1_x32_arm_64_transpose
0000000000000372 t iree_uk_pack_tile_8x1_x8_arm_64_direct
0000000000000136 t iree_uk_pack_tile_8x1_x8_arm_64_transpose
0000000000000248 t iree_uk_pack_tile_8x4_x8_arm_64_direct
0000000000000080 t iree_uk_pack_tile_8x4_x8_arm_64_transpose
0000000000000104 t iree_uk_pack_tile_8x8_x8_arm_64_direct
0000000000000196 t iree_uk_pack_tile_8x8_x8_arm_64_transpose
0000000000000212 t iree_uk_pack_tile_generic_direct
0000000000000280 t iree_uk_pack_tile_generic_transpose
```
diff --git a/runtime/src/iree/builtins/ukernel/BUILD b/runtime/src/iree/builtins/ukernel/BUILD
index 6ceb403..42dbb01 100644
--- a/runtime/src/iree/builtins/ukernel/BUILD
+++ b/runtime/src/iree/builtins/ukernel/BUILD
@@ -45,10 +45,12 @@
 iree_runtime_cc_library(
     name = "generic",
     srcs = [
-        "mmt4d_select_tile_generic.c",
+        "mmt4d_generic.c",
+        "pack_generic.c",
     ],
     hdrs = [
-        "mmt4d_select_tile_generic.h",
+        "mmt4d_generic.h",
+        "pack_generic.h",
     ],
     deps = [
         ":common",
diff --git a/runtime/src/iree/builtins/ukernel/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/CMakeLists.txt
index c28c6f5..bbb4b96 100644
--- a/runtime/src/iree/builtins/ukernel/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/CMakeLists.txt
@@ -51,9 +51,11 @@
   NAME
     generic
   HDRS
-    "mmt4d_select_tile_generic.h"
+    "mmt4d_generic.h"
+    "pack_generic.h"
   SRCS
-    "mmt4d_select_tile_generic.c"
+    "mmt4d_generic.c"
+    "pack_generic.c"
   DEPS
     ::common
   PUBLIC
diff --git a/runtime/src/iree/builtins/ukernel/arch/BUILD b/runtime/src/iree/builtins/ukernel/arch/BUILD
index b5814ce..d6aa4d8 100644
--- a/runtime/src/iree/builtins/ukernel/arch/BUILD
+++ b/runtime/src/iree/builtins/ukernel/arch/BUILD
@@ -29,10 +29,12 @@
 iree_runtime_cc_library(
     name = "ukernel_arch",
     srcs = [
-        "mmt4d_select_tile_arch.c",
+        "mmt4d_arch.c",
+        "pack_arch.c",
     ],
     hdrs = [
-        "mmt4d_select_tile_arch.h",
+        "mmt4d_arch.h",
+        "pack_arch.h",
     ],
     deps = [
         "//runtime/src/iree/builtins/ukernel:common",
diff --git a/runtime/src/iree/builtins/ukernel/arch/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/arch/CMakeLists.txt
index 7e042f6..d39b480 100644
--- a/runtime/src/iree/builtins/ukernel/arch/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/arch/CMakeLists.txt
@@ -27,12 +27,17 @@
   if((CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) OR (CMAKE_SYSTEM_PROCESSOR STREQUAL arm64))
     set(IREE_UK_ARCH_ARM_64 TRUE)
     add_subdirectory(arm_64)
-    list(APPEND IREE_UK_ARCH_DEPS "iree::builtins::ukernel::arch::arm_64::mmt4d_select_tile_arm_64")
+    list(APPEND IREE_UK_ARCH_DEPS
+      "iree::builtins::ukernel::arch::arm_64::mmt4d_arm_64"
+      "iree::builtins::ukernel::arch::arm_64::pack_arm_64"
+    )
   endif()
 endif()  # IREE_UK_ENABLE_ARCH_SPECIFIC_CODE
 
 set(IREE_UK_POINTER_SIZE "${CMAKE_SIZEOF_VOID_P}")
 
+configure_file(config.h.in config.h)
+
 iree_cc_library(
   NAME
     config
@@ -44,13 +49,13 @@
   NAME
     ukernel_arch
   HDRS
-    "mmt4d_select_tile_arch.h"
+    "mmt4d_arch.h"
+    "pack_arch.h"
   SRCS
-    "mmt4d_select_tile_arch.c"
+    "mmt4d_arch.c"
+    "pack_arch.c"
   DEPS
     iree::builtins::ukernel::common
     ${IREE_UK_ARCH_DEPS}
   PUBLIC
 )
-
-configure_file(config.h.in config.h)
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/BUILD b/runtime/src/iree/builtins/ukernel/arch/arm_64/BUILD
index cf55b8d..142f414 100644
--- a/runtime/src/iree/builtins/ukernel/arch/arm_64/BUILD
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/BUILD
@@ -13,8 +13,15 @@
 )
 
 iree_runtime_cc_library(
-    name = "mmt4d_select_tile_arm_64",
+    name = "mmt4d_arm_64",
     hdrs = [
-        "mmt4d_select_tile_arm_64.h",
+        "mmt4d_arm_64.h",
+    ],
+)
+
+iree_runtime_cc_library(
+    name = "pack_arm_64",
+    hdrs = [
+        "pack_arm_64.h",
     ],
 )
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt
index f10e4e8..7bb6ad9 100644
--- a/runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt
@@ -1,3 +1,17 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+###############################################################################
+# configuration
+###############################################################################
+
+check_cxx_compiler_flag("-march=armv8.2-a+dotprod" IREE_UK_BUILD_ARM_64_DOTPROD)
+check_cxx_compiler_flag("-march=armv8.2-a+i8mm" IREE_UK_BUILD_ARM_64_I8MM)
+configure_file(config.h.in config.h)
+
 iree_cc_library(
   NAME
     assembly
@@ -5,9 +19,15 @@
     "assembly.h"
 )
 
+###############################################################################
+# mmt4d tile funcs
+###############################################################################
+
 iree_cc_library(
   NAME
     mmt4d_tile_arm_64
+  HDRS
+    "mmt4d_tile_arm_64.h"
   SRCS
     "mmt4d_tile_arm_64.S"
   DEPS
@@ -16,11 +36,12 @@
 )
 list(APPEND IREE_UK_MMT4D_TILE_ARM_64_DEPS "iree::builtins::ukernel::arch::arm_64::mmt4d_tile_arm_64")
 
-check_cxx_compiler_flag("-march=armv8.2-a+dotprod" IREE_UK_BUILD_ARM_64_DOTPROD)
 if(IREE_UK_BUILD_ARM_64_DOTPROD)
   iree_cc_library(
     NAME
       mmt4d_tile_arm_64_dotprod
+    HDRS
+      "mmt4d_tile_arm_64.h"
     SRCS
       "mmt4d_tile_arm_64_dotprod.S"
     COPTS
@@ -33,11 +54,12 @@
   list(APPEND IREE_UK_MMT4D_TILE_ARM_64_DEPS "iree::builtins::ukernel::arch::arm_64::mmt4d_tile_arm_64_dotprod")
 endif()
 
-check_cxx_compiler_flag("-march=armv8.2-a+i8mm" IREE_UK_BUILD_ARM_64_I8MM)
 if(IREE_UK_BUILD_ARM_64_I8MM)
   iree_cc_library(
     NAME
       mmt4d_tile_arm_64_i8mm
+    HDRS
+      "mmt4d_tile_arm_64.h"
     SRCS
       "mmt4d_tile_arm_64_i8mm.S"
     COPTS
@@ -49,15 +71,17 @@
   list(APPEND IREE_UK_MMT4D_TILE_ARM_64_DEPS "iree::builtins::ukernel::arch::arm_64::mmt4d_tile_arm_64_i8mm")
 endif()
 
-configure_file(config.h.in config.h)
+###############################################################################
+# mmt4d entry point
+###############################################################################
 
 iree_cc_library(
   NAME
-    mmt4d_select_tile_arm_64
+    mmt4d_arm_64
   HDRS
-    "mmt4d_select_tile_arm_64.h"
+    "mmt4d_arm_64.h"
   SRCS
-    "mmt4d_select_tile_arm_64.c"
+    "mmt4d_arm_64.c"
   DEPS
     iree::base::core_headers
     iree::schemas::cpu_data
@@ -65,3 +89,37 @@
     ${IREE_UK_MMT4D_TILE_ARM_64_DEPS}
   PUBLIC
 )
+
+###############################################################################
+# pack tile funcs
+###############################################################################
+
+iree_cc_library(
+  NAME
+    pack_tile_arm_64
+  HDRS
+    "pack_tile_arm_64.h"
+  SRCS
+    "pack_tile_arm_64.c"
+  DEPS
+    iree::builtins::ukernel::exported_flag_bits
+)
+
+###############################################################################
+# pack entry point
+###############################################################################
+
+iree_cc_library(
+  NAME
+    pack_arm_64
+  HDRS
+    "pack_arm_64.h"
+  SRCS
+    "pack_arm_64.c"
+  DEPS
+    iree::base::core_headers
+    iree::schemas::cpu_data
+    iree::builtins::ukernel::common
+    ::pack_tile_arm_64
+  PUBLIC
+)
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_select_tile_arm_64.c b/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.c
similarity index 87%
rename from runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_select_tile_arm_64.c
rename to runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.c
index f861f19..d3a6777 100644
--- a/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_select_tile_arm_64.c
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.c
@@ -4,16 +4,12 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/builtins/ukernel/arch/arm_64/mmt4d_select_tile_arm_64.h"
+#include "iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.h"
 
 #include "iree/builtins/ukernel/arch/arm_64/config.h"
+#include "iree/builtins/ukernel/arch/arm_64/mmt4d_tile_arm_64.h"
 #include "iree/schemas/cpu_data.h"
 
-IREE_UK_MMT4D_TILE_FUNC_DECL(iree_uk_mmt4d_tile_f32f32f32_8x8x1_arm_64)
-IREE_UK_MMT4D_TILE_FUNC_DECL(iree_uk_mmt4d_tile_i8i8i32_8x8x1_arm_64)
-IREE_UK_MMT4D_TILE_FUNC_DECL(iree_uk_mmt4d_tile_i8i8i32_8x8x4_arm_64_dotprod)
-IREE_UK_MMT4D_TILE_FUNC_DECL(iree_uk_mmt4d_tile_i8i8i32_8x8x8_arm_64_i8mm)
-
 static iree_uk_mmt4d_tile_func_t
 iree_uk_mmt4d_select_tile_func_arm_64_f32f32f32_8x8x1(
     const iree_uk_mmt4d_params_t* params) {
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_select_tile_arm_64.h b/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.h
similarity index 75%
rename from runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_select_tile_arm_64.h
rename to runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.h
index c2e1abb..39e331c 100644
--- a/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_select_tile_arm_64.h
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.h
@@ -4,8 +4,8 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#ifndef IREE_BUILTINS_UKERNEL_ARM_64_MMT4D_SELECT_TILE_ARM_64_H_
-#define IREE_BUILTINS_UKERNEL_ARM_64_MMT4D_SELECT_TILE_ARM_64_H_
+#ifndef IREE_BUILTINS_UKERNEL_ARCH_ARM_64_MMT4D_ARM_64_H_
+#define IREE_BUILTINS_UKERNEL_ARCH_ARM_64_MMT4D_ARM_64_H_
 
 #include "iree/builtins/ukernel/mmt4d_types.h"
 
@@ -15,4 +15,4 @@
 iree_uk_mmt4d_tile_func_t iree_uk_mmt4d_select_tile_func_arm_64(
     const iree_uk_mmt4d_params_t* params);
 
-#endif  // IREE_BUILTINS_UKERNEL_ARM_64_MMT4D_SELECT_TILE_ARM_64_H_
+#endif  // IREE_BUILTINS_UKERNEL_ARCH_ARM_64_MMT4D_ARM_64_H_
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_tile_arm_64.h b/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_tile_arm_64.h
new file mode 100644
index 0000000..5da3cb5
--- /dev/null
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_tile_arm_64.h
@@ -0,0 +1,17 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BUILTINS_UKERNEL_ARCH_ARM_64_MMT4D_TILE_ARM_64_H_
+#define IREE_BUILTINS_UKERNEL_ARCH_ARM_64_MMT4D_TILE_ARM_64_H_
+
+#include "iree/builtins/ukernel/mmt4d_types.h"
+
+IREE_UK_MMT4D_TILE_FUNC_DECL(iree_uk_mmt4d_tile_f32f32f32_8x8x1_arm_64)
+IREE_UK_MMT4D_TILE_FUNC_DECL(iree_uk_mmt4d_tile_i8i8i32_8x8x1_arm_64)
+IREE_UK_MMT4D_TILE_FUNC_DECL(iree_uk_mmt4d_tile_i8i8i32_8x8x4_arm_64_dotprod)
+IREE_UK_MMT4D_TILE_FUNC_DECL(iree_uk_mmt4d_tile_i8i8i32_8x8x8_arm_64_i8mm)
+
+#endif  // IREE_BUILTINS_UKERNEL_ARCH_ARM_64_MMT4D_TILE_ARM_64_H_
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64.c b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64.c
new file mode 100644
index 0000000..9c3e3f4
--- /dev/null
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64.c
@@ -0,0 +1,32 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/builtins/ukernel/arch/arm_64/pack_arm_64.h"
+
+#include "iree/builtins/ukernel/arch/arm_64/pack_tile_arm_64.h"
+
+iree_uk_pack_tile_func_t iree_uk_pack_select_tile_func_arm_64(
+    const iree_uk_pack_params_t* params) {
+  // At the moment, as sum-reductions are not yet part of pack ops,
+  // no arithmetic whatsoever is being done here, so only the element type
+  // size matters, not the type itself.
+  int esize = iree_uk_type_size(iree_uk_pack_out_type(params->type));
+  bool transpose = params->flags & IREE_UK_FLAG_PACK_TRANSPOSE_INNER;
+  if (esize == 4 && params->out_size2 == 8 && params->out_size3 == 1) {
+    return transpose ? iree_uk_pack_tile_8x1_x32_arm_64_transpose
+                     : iree_uk_pack_tile_8x1_x32_arm_64_direct;
+  } else if (esize == 1 && params->out_size2 == 8 && params->out_size3 == 1) {
+    return transpose ? iree_uk_pack_tile_8x1_x8_arm_64_transpose
+                     : iree_uk_pack_tile_8x1_x8_arm_64_direct;
+  } else if (esize == 1 && params->out_size2 == 8 && params->out_size3 == 4) {
+    return transpose ? iree_uk_pack_tile_8x4_x8_arm_64_transpose
+                     : iree_uk_pack_tile_8x4_x8_arm_64_direct;
+  } else if (esize == 1 && params->out_size2 == 8 && params->out_size3 == 8) {
+    return transpose ? iree_uk_pack_tile_8x8_x8_arm_64_transpose
+                     : iree_uk_pack_tile_8x8_x8_arm_64_direct;
+  }
+  return 0;
+}
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64.h b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64.h
new file mode 100644
index 0000000..d61cc99
--- /dev/null
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64.h
@@ -0,0 +1,18 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BUILTINS_UKERNEL_ARCH_ARM_64_PACK_ARM_64_H_
+#define IREE_BUILTINS_UKERNEL_ARCH_ARM_64_PACK_ARM_64_H_
+
+#include "iree/builtins/ukernel/pack_types.h"
+
+// Returns the arm64 tile function to use for the pack op with given params, or
+// NULL if no suitable arm64 tile function exists for these params, in which
+// case the caller may fall back to a generic tile function.
+iree_uk_pack_tile_func_t iree_uk_pack_select_tile_func_arm_64(
+    const iree_uk_pack_params_t* params);
+
+#endif  // IREE_BUILTINS_UKERNEL_ARCH_ARM_64_PACK_ARM_64_H_
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_tile_arm_64.c b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_tile_arm_64.c
new file mode 100644
index 0000000..f3db0b0
--- /dev/null
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_tile_arm_64.c
@@ -0,0 +1,324 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/builtins/ukernel/arch/arm_64/pack_tile_arm_64.h"
+
+#include <arm_neon.h>
+
+void* iree_uk_pack_tile_8x1_x32_arm_64_direct(
+    void* restrict out_tile_ptr, const void* restrict in_tile_ptr,
+    iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1,
+    iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size_unused,
+    iree_uk_ssize_t tile_size0_unused, iree_uk_ssize_t tile_size1_unused) {
+  return iree_uk_pack_tile_8x4_x8_arm_64_direct(out_tile_ptr, in_tile_ptr,
+                                                outer_size1, out_stride_l1 * 4,
+                                                in_stride0 * 4, 1, 8, 4);
+}
+
+static inline int8x8_t iree_uk_neon_load_8xi8_strided(const iree_uk_int8_t* src,
+                                                      iree_uk_ssize_t stride) {
+  int8x8_t v = vdup_n_s8(0);
+  v = vld1_lane_s8(src + 0 * stride, v, 0);
+  v = vld1_lane_s8(src + 1 * stride, v, 1);
+  v = vld1_lane_s8(src + 2 * stride, v, 2);
+  v = vld1_lane_s8(src + 3 * stride, v, 3);
+  v = vld1_lane_s8(src + 4 * stride, v, 4);
+  v = vld1_lane_s8(src + 5 * stride, v, 5);
+  v = vld1_lane_s8(src + 6 * stride, v, 6);
+  v = vld1_lane_s8(src + 7 * stride, v, 7);
+  return v;
+}
+
+static inline int8x16x2_t iree_uk_neon_load_8x4xi8_rowmajor_strided(
+    const iree_uk_int8_t* src, iree_uk_ssize_t stride) {
+  int32x4_t v0_i32 = vdupq_n_s32(0);
+  int32x4_t v1_i32 = vdupq_n_s32(0);
+  v0_i32 =
+      vld1q_lane_s32((const iree_uk_int32_t*)(src + 0 * stride), v0_i32, 0);
+  v0_i32 =
+      vld1q_lane_s32((const iree_uk_int32_t*)(src + 1 * stride), v0_i32, 1);
+  v0_i32 =
+      vld1q_lane_s32((const iree_uk_int32_t*)(src + 2 * stride), v0_i32, 2);
+  v0_i32 =
+      vld1q_lane_s32((const iree_uk_int32_t*)(src + 3 * stride), v0_i32, 3);
+  v1_i32 =
+      vld1q_lane_s32((const iree_uk_int32_t*)(src + 4 * stride), v1_i32, 0);
+  v1_i32 =
+      vld1q_lane_s32((const iree_uk_int32_t*)(src + 5 * stride), v1_i32, 1);
+  v1_i32 =
+      vld1q_lane_s32((const iree_uk_int32_t*)(src + 6 * stride), v1_i32, 2);
+  v1_i32 =
+      vld1q_lane_s32((const iree_uk_int32_t*)(src + 7 * stride), v1_i32, 3);
+  int8x16x2_t v;
+  v.val[0] = vreinterpretq_s8_s32(v0_i32);
+  v.val[1] = vreinterpretq_s8_s32(v1_i32);
+  return v;
+}
+
+static inline int8x16x4_t
+iree_uk_neon_load_8x8xi8_rowmajor_strided_permute_rows(
+    const iree_uk_int8_t* src, iree_uk_ssize_t stride, int p0, int p1, int p2,
+    int p3, int p4, int p5, int p6, int p7) {
+  int8x8_t row0 = vld1_s8(src + p0 * stride);
+  int8x8_t row1 = vld1_s8(src + p1 * stride);
+  int8x8_t row2 = vld1_s8(src + p2 * stride);
+  int8x8_t row3 = vld1_s8(src + p3 * stride);
+  int8x8_t row4 = vld1_s8(src + p4 * stride);
+  int8x8_t row5 = vld1_s8(src + p5 * stride);
+  int8x8_t row6 = vld1_s8(src + p6 * stride);
+  int8x8_t row7 = vld1_s8(src + p7 * stride);
+  int8x16x4_t v;
+  v.val[0] = vcombine_s8(row0, row1);
+  v.val[1] = vcombine_s8(row2, row3);
+  v.val[2] = vcombine_s8(row4, row5);
+  v.val[3] = vcombine_s8(row6, row7);
+  return v;
+}
+
+static inline int8x16x4_t iree_uk_neon_load_8x8xi8_rowmajor_strided(
+    const iree_uk_int8_t* src, iree_uk_ssize_t stride) {
+  return iree_uk_neon_load_8x8xi8_rowmajor_strided_permute_rows(
+      src, stride, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static inline void iree_uk_neon_copy_8x1xi8_strided_to_unstrided(
+    iree_uk_int8_t* restrict out_ptr, const iree_uk_int8_t* restrict in_ptr,
+    iree_uk_ssize_t in_stride) {
+  int8x8_t in = iree_uk_neon_load_8xi8_strided(in_ptr, in_stride);
+  vst1_s8(out_ptr, in);
+}
+
+static inline void iree_uk_neon_copy_8x4xi8_rowmajor_strided_to_unstrided(
+    iree_uk_int8_t* restrict out_ptr, const iree_uk_int8_t* restrict in_ptr,
+    iree_uk_ssize_t in_stride) {
+  int8x16x2_t in = iree_uk_neon_load_8x4xi8_rowmajor_strided(in_ptr, in_stride);
+  vst1q_s8(out_ptr + 0, in.val[0]);
+  vst1q_s8(out_ptr + 16, in.val[1]);
+}
+
+static inline void iree_uk_neon_copy_8x8xi8_rowmajor_strided_to_unstrided(
+    iree_uk_int8_t* restrict out_ptr, const iree_uk_int8_t* restrict in_ptr,
+    iree_uk_ssize_t in_stride) {
+  int8x16x4_t in = iree_uk_neon_load_8x8xi8_rowmajor_strided(in_ptr, in_stride);
+  vst1q_s8(out_ptr + 0, in.val[0]);
+  vst1q_s8(out_ptr + 16, in.val[1]);
+  vst1q_s8(out_ptr + 32, in.val[2]);
+  vst1q_s8(out_ptr + 48, in.val[3]);
+}
+
+static inline int16x8x2_t iree_uk_neon_zip_16xi8_as_8xi16(int8x16_t a,
+                                                          int8x16_t b) {
+  int8x16x2_t z = vzipq_s8(a, b);
+  int16x8x2_t r;
+  r.val[0] = vreinterpretq_s16_s8(z.val[0]);
+  r.val[1] = vreinterpretq_s16_s8(z.val[1]);
+  return r;
+}
+
+static inline int32x4x2_t iree_uk_neon_zip_8xi16_as_4xi32(int16x8_t a,
+                                                          int16x8_t b) {
+  int16x8x2_t z = vzipq_s16(a, b);
+  int32x4x2_t r;
+  r.val[0] = vreinterpretq_s32_s16(z.val[0]);
+  r.val[1] = vreinterpretq_s32_s16(z.val[1]);
+  return r;
+}
+
+static inline int64x2x2_t iree_uk_neon_zip_4xi32_as_2xi64(int32x4_t a,
+                                                          int32x4_t b) {
+  int32x4x2_t z = vzipq_s32(a, b);
+  int64x2x2_t r;
+  r.val[0] = vreinterpretq_s64_s32(z.val[0]);
+  r.val[1] = vreinterpretq_s64_s32(z.val[1]);
+  return r;
+}
+
+static inline void iree_uk_neon_copy_8x8xi8_rowmajor_to_colmajor(
+    iree_uk_int8_t* restrict out_ptr, const iree_uk_int8_t* restrict in_ptr,
+    iree_uk_ssize_t out_stride, iree_uk_ssize_t in_stride) {
+  int8x16x4_t in = iree_uk_neon_load_8x8xi8_rowmajor_strided_permute_rows(
+      in_ptr, in_stride, 0, 4, 1, 5, 2, 6, 3, 7);
+  int16x8x2_t zip_i16_0 = iree_uk_neon_zip_16xi8_as_8xi16(in.val[0], in.val[1]);
+  int16x8x2_t zip_i16_1 = iree_uk_neon_zip_16xi8_as_8xi16(in.val[2], in.val[3]);
+  int32x4x2_t zip_i32_0 =
+      iree_uk_neon_zip_8xi16_as_4xi32(zip_i16_0.val[0], zip_i16_1.val[0]);
+  int32x4x2_t zip_i32_1 =
+      iree_uk_neon_zip_8xi16_as_4xi32(zip_i16_0.val[1], zip_i16_1.val[1]);
+  int64x2x2_t zip_i64_0 =
+      iree_uk_neon_zip_4xi32_as_2xi64(zip_i32_0.val[0], zip_i32_1.val[0]);
+  int64x2x2_t zip_i64_1 =
+      iree_uk_neon_zip_4xi32_as_2xi64(zip_i32_0.val[1], zip_i32_1.val[1]);
+  int8x16x4_t out;
+  out.val[0] = vreinterpretq_s8_s64(zip_i64_0.val[0]);
+  out.val[1] = vreinterpretq_s8_s64(zip_i64_0.val[1]);
+  out.val[2] = vreinterpretq_s8_s64(zip_i64_1.val[0]);
+  out.val[3] = vreinterpretq_s8_s64(zip_i64_1.val[1]);
+  vst1_s8(out_ptr + 0 * out_stride, vget_low_s8(out.val[0]));
+  vst1_s8(out_ptr + 1 * out_stride, vget_high_s8(out.val[0]));
+  vst1_s8(out_ptr + 2 * out_stride, vget_low_s8(out.val[1]));
+  vst1_s8(out_ptr + 3 * out_stride, vget_high_s8(out.val[1]));
+  vst1_s8(out_ptr + 4 * out_stride, vget_low_s8(out.val[2]));
+  vst1_s8(out_ptr + 5 * out_stride, vget_high_s8(out.val[2]));
+  vst1_s8(out_ptr + 6 * out_stride, vget_low_s8(out.val[3]));
+  vst1_s8(out_ptr + 7 * out_stride, vget_high_s8(out.val[3]));
+}
+
+static inline void iree_uk_neon_copy_8x8xi8_rowmajor_to_colmajor_tiled_1x4(
+    iree_uk_int8_t* restrict out_ptr, const iree_uk_int8_t* restrict in_ptr,
+    iree_uk_ssize_t out_stride, iree_uk_ssize_t in_stride) {
+  int8x16x4_t in = iree_uk_neon_load_8x8xi8_rowmajor_strided_permute_rows(
+      in_ptr, in_stride, 0, 2, 1, 3, 4, 6, 5, 7);
+  int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s8(in.val[0]),
+                             vreinterpretq_s32_s8(in.val[1]));
+  int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s8(in.val[2]),
+                             vreinterpretq_s32_s8(in.val[3]));
+  vst1q_s8(out_ptr + 0 + 0 * out_stride, vreinterpretq_s8_s32(c0.val[0]));
+  vst1q_s8(out_ptr + 16 + 0 * out_stride, vreinterpretq_s8_s32(c1.val[0]));
+  vst1q_s8(out_ptr + 0 + 1 * out_stride, vreinterpretq_s8_s32(c0.val[1]));
+  vst1q_s8(out_ptr + 16 + 1 * out_stride, vreinterpretq_s8_s32(c1.val[1]));
+}
+
+void* iree_uk_pack_tile_8x1_x8_arm_64_direct(
+    void* restrict out_tile_ptr, const void* restrict in_tile_ptr,
+    iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1,
+    iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size_unused,
+    iree_uk_ssize_t tile_size0_unused, iree_uk_ssize_t tile_size1_unused) {
+  iree_uk_ssize_t outer_i1 = 0;
+  iree_uk_int8_t* restrict out_ptr = out_tile_ptr;
+  const iree_uk_int8_t* restrict in_ptr = in_tile_ptr;
+  // A further 2x unrolling (outer_i1+=16) yields another 1.2x speedup on A710
+  // thanks to using 16-byte loads. Is it worth the code size? This 8x1 tile is
+  // used on baseline aarch64 where the matmul kernel is slow anyway.
+  for (; outer_i1 <= outer_size1 - 8; outer_i1 += 8) {
+    iree_uk_neon_copy_8x8xi8_rowmajor_to_colmajor(out_ptr, in_ptr,
+                                                  out_stride_l1, in_stride0);
+    out_ptr += 8 * out_stride_l1;
+    in_ptr += 8;
+  }
+  for (; outer_i1 < outer_size1; ++outer_i1) {
+    iree_uk_neon_copy_8x1xi8_strided_to_unstrided(out_ptr, in_ptr, in_stride0);
+    out_ptr += out_stride_l1;
+    in_ptr += 1;
+  }
+  return out_ptr;
+}
+
+void* iree_uk_pack_tile_8x4_x8_arm_64_direct(
+    void* restrict out_tile_ptr, const void* restrict in_tile_ptr,
+    iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1,
+    iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size_unused,
+    iree_uk_ssize_t tile_size0_unused, iree_uk_ssize_t tile_size1_unused) {
+  iree_uk_ssize_t outer_i1 = 0;
+  iree_uk_int8_t* restrict out_ptr = out_tile_ptr;
+  const iree_uk_int8_t* restrict in_ptr = in_tile_ptr;
+  for (; outer_i1 <= outer_size1 - 2; outer_i1 += 2) {
+    iree_uk_neon_copy_8x8xi8_rowmajor_to_colmajor_tiled_1x4(
+        out_ptr, in_ptr, out_stride_l1, in_stride0);
+    out_ptr += 2 * out_stride_l1;
+    in_ptr += 8;
+  }
+  for (; outer_i1 < outer_size1; outer_i1++) {
+    iree_uk_neon_copy_8x4xi8_rowmajor_strided_to_unstrided(out_ptr, in_ptr,
+                                                           in_stride0);
+    out_ptr += out_stride_l1;
+    in_ptr += 4;
+  }
+  return out_ptr;
+}
+void* iree_uk_pack_tile_8x8_x8_arm_64_direct(
+    void* restrict out_tile_ptr, const void* restrict in_tile_ptr,
+    iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1,
+    iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size_unused,
+    iree_uk_ssize_t tile_size0_unused, iree_uk_ssize_t tile_size1_unused) {
+  iree_uk_int8_t* restrict out_ptr = out_tile_ptr;
+  const iree_uk_int8_t* restrict in_ptr = in_tile_ptr;
+  for (iree_uk_ssize_t outer_i1 = 0; outer_i1 < outer_size1; ++outer_i1) {
+    iree_uk_neon_copy_8x8xi8_rowmajor_strided_to_unstrided(out_ptr, in_ptr,
+                                                           in_stride0);
+    out_ptr += out_stride_l1;
+    in_ptr += 8;
+  }
+  return out_ptr;
+}
+
+void* iree_uk_pack_tile_8x1_x32_arm_64_transpose(
+    void* restrict out_tile_ptr, const void* restrict in_tile_ptr,
+    iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1,
+    iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size_unused,
+    iree_uk_ssize_t tile_size0_unused, iree_uk_ssize_t tile_size1_unused) {
+  const iree_uk_int32_t* restrict in_tile_ptr_i32 = in_tile_ptr;
+  iree_uk_int32_t* restrict out_tile_i32_ptr = out_tile_ptr;
+  for (iree_uk_ssize_t outer_i1 = 0; outer_i1 < outer_size1; ++outer_i1) {
+    iree_uk_memcpy(out_tile_i32_ptr, in_tile_ptr_i32, 32);
+    out_tile_i32_ptr += out_stride_l1;
+    in_tile_ptr_i32 += 8;
+  }
+  return out_tile_i32_ptr;
+}
+
+void* iree_uk_pack_tile_8x1_x8_arm_64_transpose(
+    void* restrict out_tile_ptr, const void* restrict in_tile_ptr,
+    iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1,
+    iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size_unused,
+    iree_uk_ssize_t tile_size0_unused, iree_uk_ssize_t tile_size1_unused) {
+  const iree_uk_int8_t* restrict in_ptr = in_tile_ptr;
+  iree_uk_int8_t* restrict out_ptr = out_tile_ptr;
+  iree_uk_ssize_t outer_i1 = 0;
+  for (; outer_i1 <= outer_size1 - 4; outer_i1 += 4) {
+    iree_uk_memcpy(out_ptr + 0 * out_stride_l1, in_ptr + 0, 8);
+    iree_uk_memcpy(out_ptr + 1 * out_stride_l1, in_ptr + 8, 8);
+    iree_uk_memcpy(out_ptr + 2 * out_stride_l1, in_ptr + 16, 8);
+    iree_uk_memcpy(out_ptr + 3 * out_stride_l1, in_ptr + 24, 8);
+    out_ptr += 4 * out_stride_l1;
+    in_ptr += 32;
+  }
+  for (; outer_i1 < outer_size1; ++outer_i1) {
+    iree_uk_memcpy(out_ptr, in_ptr, 8);
+    out_ptr += out_stride_l1;
+    in_ptr += 8;
+  }
+  return out_ptr;
+}
+
+void* iree_uk_pack_tile_8x4_x8_arm_64_transpose(
+    void* restrict out_tile_ptr, const void* restrict in_tile_ptr,
+    iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1,
+    iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size_unused,
+    iree_uk_ssize_t tile_size0_unused, iree_uk_ssize_t tile_size1_unused) {
+  const iree_uk_int8_t* restrict in_ptr = in_tile_ptr;
+  iree_uk_int8_t* restrict out_ptr = out_tile_ptr;
+  for (iree_uk_ssize_t outer_i1 = 0; outer_i1 < outer_size1; ++outer_i1) {
+    int8x16x2_t in;
+    in.val[0] = vcombine_s8(vld1_s8(in_ptr + 0 * in_stride0),
+                            vld1_s8(in_ptr + 2 * in_stride0));
+    in.val[1] = vcombine_s8(vld1_s8(in_ptr + 1 * in_stride0),
+                            vld1_s8(in_ptr + 3 * in_stride0));
+    int16x8x2_t zip_i16 = iree_uk_neon_zip_16xi8_as_8xi16(in.val[0], in.val[1]);
+    int32x4x2_t zip_i32 =
+        iree_uk_neon_zip_8xi16_as_4xi32(zip_i16.val[0], zip_i16.val[1]);
+    vst1q_s8(out_ptr, vreinterpretq_s8_s32(zip_i32.val[0]));
+    vst1q_s8(out_ptr + 16, vreinterpretq_s8_s32(zip_i32.val[1]));
+    out_ptr += out_stride_l1;
+    in_ptr += 8;
+  }
+  return out_ptr;
+}
+
+void* iree_uk_pack_tile_8x8_x8_arm_64_transpose(
+    void* restrict out_tile_ptr, const void* restrict in_tile_ptr,
+    iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1,
+    iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size_unused,
+    iree_uk_ssize_t tile_size0_unused, iree_uk_ssize_t tile_size1_unused) {
+  const iree_uk_int8_t* restrict in_ptr = in_tile_ptr;
+  iree_uk_int8_t* restrict out_ptr = out_tile_ptr;
+  for (iree_uk_ssize_t outer_i1 = 0; outer_i1 < outer_size1; ++outer_i1) {
+    iree_uk_neon_copy_8x8xi8_rowmajor_to_colmajor(out_ptr, in_ptr, 8,
+                                                  in_stride0);
+    out_ptr += out_stride_l1;
+    in_ptr += 8;
+  }
+  return out_ptr;
+}
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_tile_arm_64.h b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_tile_arm_64.h
new file mode 100644
index 0000000..ac6e385
--- /dev/null
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_tile_arm_64.h
@@ -0,0 +1,21 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BUILTINS_UKERNEL_ARCH_ARM_64_PACK_TILE_ARM_64_H_
+#define IREE_BUILTINS_UKERNEL_ARCH_ARM_64_PACK_TILE_ARM_64_H_
+
+#include "iree/builtins/ukernel/pack_types.h"
+
+IREE_UK_PACK_TILE_FUNC_DECL(iree_uk_pack_tile_8x1_x32_arm_64_direct)
+IREE_UK_PACK_TILE_FUNC_DECL(iree_uk_pack_tile_8x1_x32_arm_64_transpose)
+IREE_UK_PACK_TILE_FUNC_DECL(iree_uk_pack_tile_8x1_x8_arm_64_direct)
+IREE_UK_PACK_TILE_FUNC_DECL(iree_uk_pack_tile_8x1_x8_arm_64_transpose)
+IREE_UK_PACK_TILE_FUNC_DECL(iree_uk_pack_tile_8x4_x8_arm_64_direct)
+IREE_UK_PACK_TILE_FUNC_DECL(iree_uk_pack_tile_8x4_x8_arm_64_transpose)
+IREE_UK_PACK_TILE_FUNC_DECL(iree_uk_pack_tile_8x8_x8_arm_64_direct)
+IREE_UK_PACK_TILE_FUNC_DECL(iree_uk_pack_tile_8x8_x8_arm_64_transpose)
+
+#endif  // IREE_BUILTINS_UKERNEL_ARCH_ARM_64_PACK_TILE_ARM_64_H_
diff --git a/runtime/src/iree/builtins/ukernel/arch/mmt4d_select_tile_arch.c b/runtime/src/iree/builtins/ukernel/arch/mmt4d_arch.c
similarity index 77%
rename from runtime/src/iree/builtins/ukernel/arch/mmt4d_select_tile_arch.c
rename to runtime/src/iree/builtins/ukernel/arch/mmt4d_arch.c
index b1757c7..fee4cec 100644
--- a/runtime/src/iree/builtins/ukernel/arch/mmt4d_select_tile_arch.c
+++ b/runtime/src/iree/builtins/ukernel/arch/mmt4d_arch.c
@@ -4,10 +4,10 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/builtins/ukernel/arch/mmt4d_select_tile_arch.h"
+#include "iree/builtins/ukernel/arch/mmt4d_arch.h"
 
 #if defined(IREE_UK_ARCH_ARM_64)
-#include "iree/builtins/ukernel/arch/arm_64/mmt4d_select_tile_arm_64.h"
+#include "iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.h"
 #endif
 
 iree_uk_mmt4d_tile_func_t iree_uk_mmt4d_select_tile_func_arch(
diff --git a/runtime/src/iree/builtins/ukernel/arch/mmt4d_select_tile_arch.h b/runtime/src/iree/builtins/ukernel/arch/mmt4d_arch.h
similarity index 77%
rename from runtime/src/iree/builtins/ukernel/arch/mmt4d_select_tile_arch.h
rename to runtime/src/iree/builtins/ukernel/arch/mmt4d_arch.h
index 5502fcb..996b4d4 100644
--- a/runtime/src/iree/builtins/ukernel/arch/mmt4d_select_tile_arch.h
+++ b/runtime/src/iree/builtins/ukernel/arch/mmt4d_arch.h
@@ -4,8 +4,8 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#ifndef IREE_BUILTINS_UKERNEL_ARCH_MMT4D_SELECT_TILE_ARCH_H_
-#define IREE_BUILTINS_UKERNEL_ARCH_MMT4D_SELECT_TILE_ARCH_H_
+#ifndef IREE_BUILTINS_UKERNEL_ARCH_MMT4D_ARCH_H_
+#define IREE_BUILTINS_UKERNEL_ARCH_MMT4D_ARCH_H_
 
 #include "iree/builtins/ukernel/mmt4d_types.h"
 
@@ -16,4 +16,4 @@
 iree_uk_mmt4d_tile_func_t iree_uk_mmt4d_select_tile_func_arch(
     const iree_uk_mmt4d_params_t* params);
 
-#endif  // IREE_BUILTINS_UKERNEL_ARCH_MMT4D_SELECT_TILE_ARCH_H_
+#endif  // IREE_BUILTINS_UKERNEL_ARCH_MMT4D_ARCH_H_
diff --git a/runtime/src/iree/builtins/ukernel/arch/pack_arch.c b/runtime/src/iree/builtins/ukernel/arch/pack_arch.c
new file mode 100644
index 0000000..265e9ad
--- /dev/null
+++ b/runtime/src/iree/builtins/ukernel/arch/pack_arch.c
@@ -0,0 +1,19 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/builtins/ukernel/arch/pack_arch.h"
+
+#if defined(IREE_UK_ARCH_ARM_64)
+#include "iree/builtins/ukernel/arch/arm_64/pack_arm_64.h"
+#endif
+
+iree_uk_pack_tile_func_t iree_uk_pack_select_tile_func_arch(
+    const iree_uk_pack_params_t* params) {
+#if defined(IREE_UK_ARCH_ARM_64)
+  return iree_uk_pack_select_tile_func_arm_64(params);
+#endif
+  return 0;
+}
diff --git a/runtime/src/iree/builtins/ukernel/arch/pack_arch.h b/runtime/src/iree/builtins/ukernel/arch/pack_arch.h
new file mode 100644
index 0000000..16119b0
--- /dev/null
+++ b/runtime/src/iree/builtins/ukernel/arch/pack_arch.h
@@ -0,0 +1,19 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BUILTINS_UKERNEL_ARCH_PACK_ARCH_H_
+#define IREE_BUILTINS_UKERNEL_ARCH_PACK_ARCH_H_
+
+#include "iree/builtins/ukernel/pack_types.h"
+
+// Returns the architecture-specific tile function to use for the pack op with
+// given params, or NULL if no suitable architecture-specific tile function
+// exists for these params, in which case the caller may fall back to a generic
+// tile function.
+iree_uk_pack_tile_func_t iree_uk_pack_select_tile_func_arch(
+    const iree_uk_pack_params_t* params);
+
+#endif  // IREE_BUILTINS_UKERNEL_ARCH_PACK_ARCH_H_
diff --git a/runtime/src/iree/builtins/ukernel/common.c b/runtime/src/iree/builtins/ukernel/common.c
index 307b91e..940f363 100644
--- a/runtime/src/iree/builtins/ukernel/common.c
+++ b/runtime/src/iree/builtins/ukernel/common.c
@@ -16,6 +16,8 @@
       return "unsupported huge or negative size in mmt4d";
     case iree_uk_status_unsupported_generic_tile_size:
       return "tile size too large for the generic tile implementation";
+    case iree_uk_status_shapes_mismatch:
+      return "shapes mismatch";
     default:
       return "unknown";
   }
diff --git a/runtime/src/iree/builtins/ukernel/common.h b/runtime/src/iree/builtins/ukernel/common.h
index 90a437f..5a04e3b 100644
--- a/runtime/src/iree/builtins/ukernel/common.h
+++ b/runtime/src/iree/builtins/ukernel/common.h
@@ -192,6 +192,12 @@
 #error Unexpected pointer size
 #endif
 
+static inline void iree_uk_ssize_swap(iree_uk_ssize_t* a, iree_uk_ssize_t* b) {
+  iree_uk_ssize_t t = *a;
+  *a = *b;
+  *b = t;
+}
+
 //===----------------------------------------------------------------------===//
 // Local replacement for stdbool.h
 //===----------------------------------------------------------------------===//
@@ -216,6 +222,7 @@
   iree_uk_status_bad_flags,
   iree_uk_status_unsupported_huge_or_negative_dimension,
   iree_uk_status_unsupported_generic_tile_size,
+  iree_uk_status_shapes_mismatch,
 } iree_uk_status_t;
 
 // Convert a status code to a human-readable string.
@@ -442,6 +449,16 @@
 #define IREE_UK_ATTRIBUTE_NOINLINE
 #endif  // IREE_UK_HAVE_ATTRIBUTE(noinline)
 
+// The `restrict` here have the effect of enabling the compiler to rewrite this
+// as a memcpy call, shrinking code size of the (slow anyway) generic code paths
+// that would use this.
+static inline void iree_uk_memcpy(void* IREE_UK_RESTRICT dst,
+                                  const void* IREE_UK_RESTRICT src,
+                                  iree_uk_ssize_t size) {
+  for (iree_uk_ssize_t i = 0; i < size; ++i)
+    ((char*)dst)[i] = ((const char*)src)[i];
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/runtime/src/iree/builtins/ukernel/mmt4d.c b/runtime/src/iree/builtins/ukernel/mmt4d.c
index 08e75f2..1e5cb9d 100644
--- a/runtime/src/iree/builtins/ukernel/mmt4d.c
+++ b/runtime/src/iree/builtins/ukernel/mmt4d.c
@@ -6,10 +6,8 @@
 
 #include "iree/builtins/ukernel/mmt4d.h"
 
-#include <stdbool.h>
-
-#include "iree/builtins/ukernel/arch/mmt4d_select_tile_arch.h"
-#include "iree/builtins/ukernel/mmt4d_select_tile_generic.h"
+#include "iree/builtins/ukernel/arch/mmt4d_arch.h"
+#include "iree/builtins/ukernel/mmt4d_generic.h"
 
 #define OUTSIDE_UINT_RANGE(value, bits) (((value) < 0) || ((value) >> (bits)))
 
diff --git a/runtime/src/iree/builtins/ukernel/mmt4d_select_tile_generic.c b/runtime/src/iree/builtins/ukernel/mmt4d_generic.c
similarity index 97%
rename from runtime/src/iree/builtins/ukernel/mmt4d_select_tile_generic.c
rename to runtime/src/iree/builtins/ukernel/mmt4d_generic.c
index 41f13b6..3c4c38e 100644
--- a/runtime/src/iree/builtins/ukernel/mmt4d_select_tile_generic.c
+++ b/runtime/src/iree/builtins/ukernel/mmt4d_generic.c
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/builtins/ukernel/mmt4d_select_tile_generic.h"
+#include "iree/builtins/ukernel/mmt4d_generic.h"
 
 // Generic implementation of matmul tile, i8*i8->i32 case.
 static void iree_uk_mmt4d_tile_i8i8i32_generic(
diff --git a/runtime/src/iree/builtins/ukernel/mmt4d_select_tile_generic.h b/runtime/src/iree/builtins/ukernel/mmt4d_generic.h
similarity index 100%
rename from runtime/src/iree/builtins/ukernel/mmt4d_select_tile_generic.h
rename to runtime/src/iree/builtins/ukernel/mmt4d_generic.h
diff --git a/runtime/src/iree/builtins/ukernel/mmt4d_types.h b/runtime/src/iree/builtins/ukernel/mmt4d_types.h
index 821dece..2c2397b 100644
--- a/runtime/src/iree/builtins/ukernel/mmt4d_types.h
+++ b/runtime/src/iree/builtins/ukernel/mmt4d_types.h
@@ -32,9 +32,6 @@
 typedef struct iree_uk_mmt4d_params_t {
   iree_uk_mmt4d_type_t type;
   iree_uk_uint32_t flags;
-  const void* lhs_buffer;
-  const void* rhs_buffer;
-  void* out_buffer;
   iree_uk_ssize_t lhs_stride;
   iree_uk_ssize_t rhs_stride;
   iree_uk_ssize_t out_stride;
@@ -44,6 +41,9 @@
   iree_uk_int32_t M0;
   iree_uk_int32_t N0;
   iree_uk_int32_t K0;
+  const void* lhs_buffer;
+  const void* rhs_buffer;
+  void* out_buffer;
   const iree_uk_uint64_t* cpu_data;
 } iree_uk_mmt4d_params_t;
 
diff --git a/runtime/src/iree/builtins/ukernel/pack.c b/runtime/src/iree/builtins/ukernel/pack.c
index 96c623a..101020a 100644
--- a/runtime/src/iree/builtins/ukernel/pack.c
+++ b/runtime/src/iree/builtins/ukernel/pack.c
@@ -6,10 +6,21 @@
 
 #include "iree/builtins/ukernel/pack.h"
 
+#include "iree/builtins/ukernel/arch/pack_arch.h"
+#include "iree/builtins/ukernel/pack_generic.h"
+
 static iree_uk_status_t iree_uk_pack_validate(
     const iree_uk_pack_params_t* params) {
   const iree_uk_uint32_t allflags =
       IREE_UK_FLAG_PACK_TRANSPOSE_INNER | IREE_UK_FLAG_PACK_TRANSPOSE_OUTER;
+#ifdef NDEBUG
+  // Avoid validation code overhead (code size and latency) in release builds.
+  // This actually enables more thorough validation as it removes optimization
+  // concerns from the validation code.
+  // Microkernels take raw pointers/sizes/strides anyway, so if params are
+  // incorrect, UB will happen no matter how much we try to validate.
+  return iree_uk_status_ok;
+#endif
   if (params->flags & ~allflags) {
     return iree_uk_status_bad_flags;
   }
@@ -26,54 +37,90 @@
       params->out_size1 < 0 || params->out_size2 < 0 || params->out_size3 < 0) {
     return iree_uk_status_unsupported_huge_or_negative_dimension;
   }
+  // Check that the input and output shapes match, give or take padding that
+  // must not exceed the inner tile size.s
+  iree_uk_ssize_t outer_size0 = params->out_size0;
+  iree_uk_ssize_t outer_size1 = params->out_size1;
+  iree_uk_ssize_t tile_size0 = params->out_size2;
+  iree_uk_ssize_t tile_size1 = params->out_size3;
+  if (params->flags & IREE_UK_FLAG_PACK_TRANSPOSE_OUTER) {
+    iree_uk_ssize_swap(&outer_size0, &outer_size1);
+  }
+  if (params->flags & IREE_UK_FLAG_PACK_TRANSPOSE_INNER) {
+    iree_uk_ssize_swap(&tile_size0, &tile_size1);
+  }
+  if (outer_size0 * tile_size0 < params->in_size0 ||
+      outer_size1 * tile_size1 < params->in_size1 ||
+      (outer_size0 - 1) * tile_size0 >= params->in_size0 ||
+      (outer_size1 - 1) * tile_size1 >= params->in_size1) {
+    return iree_uk_status_shapes_mismatch;
+  }
   return iree_uk_status_ok;
 }
 
-static inline void iree_uk_ssize_swap(iree_uk_ssize_t* a, iree_uk_ssize_t* b) {
-  iree_uk_ssize_t t = *a;
-  *a = *b;
-  *b = t;
+static bool iree_uk_pack_early(const iree_uk_pack_params_t* params) {
+  return (params->out_size0 == 0 || params->out_size1 == 0 ||
+          params->out_size2 == 0 || params->out_size3 == 0);
 }
 
-static inline void iree_uk_memcpy(char* dst, const char* src,
-                                  iree_uk_ssize_t size) {
-  for (iree_uk_ssize_t i = 0; i < size; ++i) dst[i] = src[i];
-}
-
-iree_uk_status_t iree_uk_pack(const iree_uk_pack_params_t* params) {
-  IREE_UK_RETURN_IF_ERROR(iree_uk_pack_validate(params));
-  if (params->out_size0 == 0 || params->out_size1 == 0 ||
-      params->out_size2 == 0 || params->out_size3 == 0) {
-    return iree_uk_status_ok;
+static iree_uk_pack_tile_func_t iree_uk_pack_select_tile_func(
+    const iree_uk_pack_params_t* params) {
+  iree_uk_pack_tile_func_t arch_tile_func =
+      iree_uk_pack_select_tile_func_arch(params);
+  if (arch_tile_func) {
+    return arch_tile_func;
   }
+  return iree_uk_pack_select_tile_func_generic(params);
+}
+
+static void iree_uk_pack_using_tile_func(const iree_uk_pack_params_t* params,
+                                         iree_uk_pack_tile_func_t tile_func) {
   // For now, the input and output element types are always the same.
   iree_uk_type_t elem_type = iree_uk_pack_in_type(params->type);
   iree_uk_ssize_t elem_size = iree_uk_type_size(elem_type);
-  iree_uk_ssize_t lsize0 = params->out_size0;
-  iree_uk_ssize_t lsize1 = params->out_size1;
-  iree_uk_ssize_t lsize2 = params->out_size2;
-  iree_uk_ssize_t lsize3 = params->out_size3;
+  iree_uk_ssize_t outer_size0 = params->out_size0;
+  iree_uk_ssize_t outer_size1 = params->out_size1;
+  iree_uk_ssize_t tile_size0 = params->out_size2;
+  iree_uk_ssize_t tile_size1 = params->out_size3;
   iree_uk_ssize_t out_stride_l0 = params->out_stride0;
   iree_uk_ssize_t out_stride_l1 = params->out_size3 * params->out_size2;
   iree_uk_ssize_t out_stride_l2 = params->out_size3;
   iree_uk_ssize_t out_stride_l3 = 1;
   if (params->flags & IREE_UK_FLAG_PACK_TRANSPOSE_OUTER) {
-    iree_uk_ssize_swap(&lsize0, &lsize1);
+    iree_uk_ssize_swap(&outer_size0, &outer_size1);
     iree_uk_ssize_swap(&out_stride_l0, &out_stride_l1);
   }
   if (params->flags & IREE_UK_FLAG_PACK_TRANSPOSE_INNER) {
-    iree_uk_ssize_swap(&lsize2, &lsize3);
+    iree_uk_ssize_swap(&tile_size0, &tile_size1);
     iree_uk_ssize_swap(&out_stride_l2, &out_stride_l3);
   }
-  for (iree_uk_ssize_t l0 = 0; l0 < lsize0; ++l0) {
-    for (iree_uk_ssize_t l2 = 0; l2 < lsize2; ++l2) {
-      for (iree_uk_ssize_t l1 = 0; l1 < lsize1; ++l1) {
-        for (iree_uk_ssize_t l3 = 0; l3 < lsize3; ++l3) {
-          iree_uk_ssize_t out_offset = l0 * out_stride_l0 + l2 * out_stride_l2 +
-                                       l1 * out_stride_l1 + l3 * out_stride_l3;
-          iree_uk_ssize_t i0 = l0 * lsize2 + l2;
-          iree_uk_ssize_t i1 = l1 * lsize3 + l3;
-          char* out_ptr = ((char*)params->out_buffer) + out_offset * elem_size;
+  const char* in_row_ptr = params->in_buffer;
+  char* out_row_ptr = params->out_buffer;
+  bool l0_has_padding = outer_size0 * tile_size0 != params->in_size0;
+  bool l1_has_padding = outer_size1 * tile_size1 != params->in_size1;
+  iree_uk_ssize_t l0_full_tile_end = outer_size0 - (l0_has_padding ? 1 : 0);
+  iree_uk_ssize_t l1_full_tile_end = outer_size1 - (l1_has_padding ? 1 : 0);
+  for (iree_uk_ssize_t outer_i0 = 0; outer_i0 < outer_size0; ++outer_i0) {
+    // If we're on the final iteration of outer loop 0 and there is padding,
+    // set l1_full_tile_end to 0, so henceforth it is sufficient to check
+    // against l1_full_tile_end to tell if we are padding.
+    if (outer_i0 == l0_full_tile_end) {
+      l1_full_tile_end = 0;
+    }
+    // Handle full tiles, using the (fast) tile_func.
+    char* out_tile_ptr =
+        tile_func(out_row_ptr, in_row_ptr, l1_full_tile_end, out_stride_l1,
+                  params->in_stride0, elem_size, tile_size0, tile_size1);
+    // Handle incomplete tiles, with padding, using slow code here.
+    for (iree_uk_ssize_t outer_i1 = l1_full_tile_end; outer_i1 < outer_size1;
+         ++outer_i1) {
+      for (iree_uk_ssize_t tile_i0 = 0; tile_i0 < tile_size0; ++tile_i0) {
+        for (iree_uk_ssize_t tile_i1 = 0; tile_i1 < tile_size1; ++tile_i1) {
+          iree_uk_ssize_t i0 = outer_i0 * tile_size0 + tile_i0;
+          iree_uk_ssize_t i1 = outer_i1 * tile_size1 + tile_i1;
+          char* out_ptr =
+              out_tile_ptr +
+              (tile_i0 * out_stride_l2 + tile_i1 * out_stride_l3) * elem_size;
           if (i0 >= params->in_size0 || i1 >= params->in_size1) {
             iree_uk_memcpy(out_ptr, params->padding_value, elem_size);
           } else {
@@ -84,7 +131,21 @@
           }
         }
       }
+      out_tile_ptr += out_stride_l1 * elem_size;
     }
+    out_row_ptr += out_stride_l0 * elem_size;
+    in_row_ptr += tile_size0 * params->in_stride0 * elem_size;
   }
+}
+
+iree_uk_status_t iree_uk_pack(const iree_uk_pack_params_t* params) {
+  IREE_UK_RETURN_IF_ERROR(iree_uk_pack_validate(params));
+
+  if (iree_uk_pack_early(params)) return iree_uk_status_ok;
+
+  // Select a target-specific tile_func (inner loop on K, computing one M0xN0
+  // tile) and use that with generic outer loops.
+  iree_uk_pack_tile_func_t row_func = iree_uk_pack_select_tile_func(params);
+  iree_uk_pack_using_tile_func(params, row_func);
   return iree_uk_status_ok;
 }
diff --git a/runtime/src/iree/builtins/ukernel/pack_generic.c b/runtime/src/iree/builtins/ukernel/pack_generic.c
new file mode 100644
index 0000000..5e40ae7
--- /dev/null
+++ b/runtime/src/iree/builtins/ukernel/pack_generic.c
@@ -0,0 +1,66 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/builtins/ukernel/pack_generic.h"
+
+static void* iree_uk_pack_tile_generic_direct(
+    void* IREE_UK_RESTRICT out_tile_ptr,
+    const void* IREE_UK_RESTRICT in_tile_ptr, iree_uk_ssize_t outer_size1,
+    iree_uk_ssize_t out_stride_l1, iree_uk_ssize_t in_stride0,
+    iree_uk_ssize_t elem_size, iree_uk_ssize_t tile_size0,
+    iree_uk_ssize_t tile_size1) {
+  const char* IREE_UK_RESTRICT in_ptr_l1 = in_tile_ptr;
+  char* IREE_UK_RESTRICT out_ptr_l1 = out_tile_ptr;
+  for (iree_uk_ssize_t outer_i1 = 0; outer_i1 < outer_size1; ++outer_i1) {
+    const char* IREE_UK_RESTRICT in_ptr = in_ptr_l1;
+    char* IREE_UK_RESTRICT out_ptr = out_ptr_l1;
+    for (iree_uk_ssize_t tile_i0 = 0; tile_i0 < tile_size0; ++tile_i0) {
+      iree_uk_memcpy(out_ptr, in_ptr, tile_size1 * elem_size);
+      out_ptr += tile_size1 * elem_size;
+      in_ptr += in_stride0 * elem_size;
+    }
+    out_ptr_l1 += out_stride_l1 * elem_size;
+    in_ptr_l1 += tile_size1 * elem_size;
+  }
+  return out_ptr_l1;
+}
+
+static void* iree_uk_pack_tile_generic_transpose(
+    void* IREE_UK_RESTRICT out_tile_ptr,
+    const void* IREE_UK_RESTRICT in_tile_ptr, iree_uk_ssize_t outer_size1,
+    iree_uk_ssize_t out_stride_l1, iree_uk_ssize_t in_stride0,
+    iree_uk_ssize_t elem_size, iree_uk_ssize_t tile_size0,
+    iree_uk_ssize_t tile_size1) {
+  const char* IREE_UK_RESTRICT in_ptr_l1 = in_tile_ptr;
+  char* IREE_UK_RESTRICT out_ptr_l1 = out_tile_ptr;
+  for (iree_uk_ssize_t outer_i1 = 0; outer_i1 < outer_size1; ++outer_i1) {
+    const char* IREE_UK_RESTRICT in_ptr_l2 = in_ptr_l1;
+    char* IREE_UK_RESTRICT out_ptr_l2 = out_ptr_l1;
+    for (iree_uk_ssize_t tile_i0 = 0; tile_i0 < tile_size0; ++tile_i0) {
+      const char* IREE_UK_RESTRICT in_ptr = in_ptr_l2;
+      char* IREE_UK_RESTRICT out_ptr = out_ptr_l2;
+      for (iree_uk_ssize_t tile_i1 = 0; tile_i1 < tile_size1; ++tile_i1) {
+        iree_uk_memcpy(out_ptr, in_ptr, elem_size);
+        out_ptr += tile_size0 * elem_size;
+        in_ptr += elem_size;
+      }
+      out_ptr_l2 += elem_size;
+      in_ptr_l2 += in_stride0 * elem_size;
+    }
+    out_ptr_l1 += out_stride_l1 * elem_size;
+    in_ptr_l1 += tile_size1 * elem_size;
+  }
+  return out_ptr_l1;
+}
+
+iree_uk_pack_tile_func_t iree_uk_pack_select_tile_func_generic(
+    const iree_uk_pack_params_t* params) {
+  if (params->flags & IREE_UK_FLAG_PACK_TRANSPOSE_INNER) {
+    return iree_uk_pack_tile_generic_transpose;
+  } else {
+    return iree_uk_pack_tile_generic_direct;
+  }
+}
diff --git a/runtime/src/iree/builtins/ukernel/pack_generic.h b/runtime/src/iree/builtins/ukernel/pack_generic.h
new file mode 100644
index 0000000..50bee3e
--- /dev/null
+++ b/runtime/src/iree/builtins/ukernel/pack_generic.h
@@ -0,0 +1,18 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BUILTINS_UKERNEL_PACK_GENERIC_H_
+#define IREE_BUILTINS_UKERNEL_PACK_GENERIC_H_
+
+#include "iree/builtins/ukernel/pack_types.h"
+
+// Returns the generic tile function to use to perform the mmt4d with the given
+// *params. The caller may want to first try to get an optimized
+// architecture-specific tile function before falling back on this.
+iree_uk_pack_tile_func_t iree_uk_pack_select_tile_func_generic(
+    const iree_uk_pack_params_t* params);
+
+#endif  // IREE_BUILTINS_UKERNEL_PACK_GENERIC_H_
diff --git a/runtime/src/iree/builtins/ukernel/pack_types.h b/runtime/src/iree/builtins/ukernel/pack_types.h
index dac5353..fdc7790 100644
--- a/runtime/src/iree/builtins/ukernel/pack_types.h
+++ b/runtime/src/iree/builtins/ukernel/pack_types.h
@@ -28,8 +28,7 @@
 // Parameters for a pack operation.
 typedef struct iree_uk_pack_params_t {
   iree_uk_pack_type_t type;
-  const void* in_buffer;
-  void* out_buffer;
+  iree_uk_uint32_t flags;
   iree_uk_ssize_t in_stride0;
   iree_uk_ssize_t out_stride0;
   iree_uk_ssize_t in_size0;
@@ -38,8 +37,25 @@
   iree_uk_ssize_t out_size1;
   iree_uk_ssize_t out_size2;
   iree_uk_ssize_t out_size3;
+  const void* in_buffer;
+  void* out_buffer;
   const void* padding_value;
-  iree_uk_uint32_t flags;
+  const iree_uk_uint64_t* cpu_data;
 } iree_uk_pack_params_t;
 
+typedef void* (*iree_uk_pack_tile_func_t)(
+    void* IREE_UK_RESTRICT /*out_tile_ptr*/,
+    const void* IREE_UK_RESTRICT /*in_tile_ptr*/,
+    iree_uk_ssize_t /*outer_size1*/, iree_uk_ssize_t /*out_stride_l1*/,
+    iree_uk_ssize_t /*in_stride0*/, iree_uk_ssize_t /*elem_size*/,
+    iree_uk_ssize_t /*tile_size0*/, iree_uk_ssize_t /*tile_size1*/);
+
+// Tile kernel declarations. Prototype matches iree_uk_pack_tile_func_t.
+#define IREE_UK_PACK_TILE_FUNC_DECL(NAME)                                \
+  void* NAME(void* IREE_UK_RESTRICT out_tile_ptr,                        \
+             const void* IREE_UK_RESTRICT in_tile_ptr,                   \
+             iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1, \
+             iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size,      \
+             iree_uk_ssize_t tile_size0, iree_uk_ssize_t tile_size1);
+
 #endif  // IREE_BUILTINS_UKERNEL_PACK_TYPES_H_
diff --git a/runtime/src/iree/builtins/ukernel/tools/BUILD b/runtime/src/iree/builtins/ukernel/tools/BUILD
index 4de5fb0..1f7da3e 100644
--- a/runtime/src/iree/builtins/ukernel/tools/BUILD
+++ b/runtime/src/iree/builtins/ukernel/tools/BUILD
@@ -50,6 +50,19 @@
     ],
 )
 
+cc_binary_benchmark(
+    name = "pack_benchmark",
+    srcs = ["pack_benchmark.c"],
+    deps = [
+        ":ukernel_test_utils",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base/internal:cpu",
+        "//runtime/src/iree/base/internal:flags",
+        "//runtime/src/iree/builtins/ukernel",
+        "//runtime/src/iree/testing:benchmark",
+    ],
+)
+
 iree_runtime_cc_test(
     name = "pack_test",
     srcs = ["pack_test.cc"],
@@ -60,6 +73,5 @@
         "//runtime/src/iree/base/internal:flags",
         "//runtime/src/iree/builtins/ukernel",
         "//runtime/src/iree/testing:gtest",
-        "//runtime/src/iree/testing:gtest_main",
     ],
 )
diff --git a/runtime/src/iree/builtins/ukernel/tools/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/tools/CMakeLists.txt
index 5273087..fe16eab 100644
--- a/runtime/src/iree/builtins/ukernel/tools/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/tools/CMakeLists.txt
@@ -53,6 +53,21 @@
     iree::testing::gtest
 )
 
+iree_cc_binary_benchmark(
+  NAME
+    pack_benchmark
+  SRCS
+    "pack_benchmark.c"
+  DEPS
+    ::ukernel_test_utils
+    iree::base
+    iree::base::internal::cpu
+    iree::base::internal::flags
+    iree::builtins::ukernel
+    iree::testing::benchmark
+  TESTONLY
+)
+
 iree_cc_test(
   NAME
     pack_test
@@ -65,7 +80,6 @@
     iree::base::internal::flags
     iree::builtins::ukernel
     iree::testing::gtest
-    iree::testing::gtest_main
 )
 
 ### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/builtins/ukernel/tools/mmt4d_benchmark.c b/runtime/src/iree/builtins/ukernel/tools/mmt4d_benchmark.c
index 7c8aea4..208d527 100644
--- a/runtime/src/iree/builtins/ukernel/tools/mmt4d_benchmark.c
+++ b/runtime/src/iree/builtins/ukernel/tools/mmt4d_benchmark.c
@@ -148,7 +148,7 @@
   } while (0)
 
 #define MMT4D_BENCHMARK_REGISTER_GENERIC(_type, _m0, _n0, _k0) \
-  MMT4D_BENCHMARK_REGISTER(_type, _m0, _n0, _k0, 0, GENERIC)
+  MMT4D_BENCHMARK_REGISTER(_type, _m0, _n0, _k0, 0, generic)
 
 #define MMT4D_BENCHMARK_REGISTER_ARM_64(_type, _m0, _n0, _k0) \
   MMT4D_BENCHMARK_REGISTER(_type, _m0, _n0, _k0, 0, arm_64)
@@ -160,10 +160,9 @@
                            arm_64_##_cpu_feature)
 
 int main(int argc, char** argv) {
-  iree_flags_set_usage(
-      "mmt4d_benchmark",
-      "Benchmarks the libmmt4d implementation of the target machine.\n"
-      "\n");
+  iree_flags_set_usage("mmt4d_benchmark",
+                       "Benchmarks the mmt4d microkernel.\n"
+                       "\n");
 
   iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK, &argc, &argv);
   iree_benchmark_initialize(&argc, argv);
diff --git a/runtime/src/iree/builtins/ukernel/tools/mmt4d_test.cc b/runtime/src/iree/builtins/ukernel/tools/mmt4d_test.cc
index 569c995..4193340 100644
--- a/runtime/src/iree/builtins/ukernel/tools/mmt4d_test.cc
+++ b/runtime/src/iree/builtins/ukernel/tools/mmt4d_test.cc
@@ -186,11 +186,11 @@
   // Populate strides first - we need them below to compute buffer lengths.
   // Randomly make strides either tight or not to exercise all cases.
   params.lhs_stride = params.K * params.M0 * params.K0 +
-                      iree_uk_test_random_engine_get_0_or_1(engine);
+                      iree_uk_test_random_engine_get_0_1(engine);
   params.rhs_stride = params.K * params.N0 * params.K0 +
-                      iree_uk_test_random_engine_get_0_or_1(engine);
+                      iree_uk_test_random_engine_get_0_1(engine);
   params.out_stride = params.N * params.M0 * params.N0 +
-                      iree_uk_test_random_engine_get_0_or_1(engine);
+                      iree_uk_test_random_engine_get_0_1(engine);
   iree_uk_type_t lhs_type = iree_uk_mmt4d_lhs_type(params.type);
   iree_uk_type_t rhs_type = iree_uk_mmt4d_rhs_type(params.type);
   iree_uk_ssize_t lhs_buffer_size =
diff --git a/runtime/src/iree/builtins/ukernel/tools/pack_benchmark.c b/runtime/src/iree/builtins/ukernel/tools/pack_benchmark.c
new file mode 100644
index 0000000..e22eccb
--- /dev/null
+++ b/runtime/src/iree/builtins/ukernel/tools/pack_benchmark.c
@@ -0,0 +1,260 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/cpu.h"
+#include "iree/base/internal/flags.h"
+#include "iree/builtins/ukernel/pack.h"
+#include "iree/builtins/ukernel/tools/ukernel_test_utils.h"
+#include "iree/testing/benchmark.h"
+
+IREE_FLAG(int64_t, batch_min_traversal_size, 1000000000,
+          "Minimum number of bytes to be traversed in each batch.");
+
+IREE_FLAG(
+    int64_t, working_set_size, 1000000,
+    "Number of bytes to be traversed by the benchmark workload (input and "
+    "output buffers together). Matrix shapes are computed accordingly.");
+IREE_FLAG(
+    int32_t, padding_size, 0,
+    "Padding size (same value used for both dimensions, 0 means no padding)");
+
+typedef struct iree_pack_benchmark_user_data_t {
+  iree_uk_pack_type_t type;
+  int size2;
+  int size3;
+  iree_uk_uint32_t flags;
+  const iree_uk_uint64_t* cpu_data;
+} iree_pack_benchmark_user_data_t;
+
+IREE_UK_ATTRIBUTE_NOINLINE static void iree_memcpy_noinline(
+    void* restrict dst, const void* restrict src, size_t size) {
+  memcpy(dst, src, size);
+}
+
+static iree_status_t iree_memcpy_benchmark(
+    const iree_benchmark_def_t* benchmark_def,
+    iree_benchmark_state_t* benchmark_state) {
+  iree_uk_int64_t total_iterations = 0;
+  iree_uk_int64_t batch_count =
+      (FLAG_batch_min_traversal_size + FLAG_working_set_size - 1) /
+      FLAG_working_set_size;
+  iree_uk_ssize_t buffer_size = FLAG_working_set_size / 2;
+  uint8_t* in_buffer = malloc(buffer_size);
+  uint8_t* out_buffer = malloc(buffer_size);
+  for (iree_uk_ssize_t i = 0; i < buffer_size; ++i) in_buffer[i] = (i & 0xFF);
+  while (iree_benchmark_keep_running(benchmark_state,
+                                     /*batch_count=*/batch_count)) {
+    for (int i = 0; i < batch_count; ++i) {
+      iree_memcpy_noinline(out_buffer, in_buffer, buffer_size);
+    }
+    total_iterations += batch_count;
+  }
+  // Report bytes per second, so that can be easily compared to known memory
+  // system performance metrics (e.g. RAM bandwidth, to tell whether this is
+  // memory-bound).
+  iree_benchmark_set_items_processed(benchmark_state,
+                                     total_iterations * buffer_size);
+  assert(!memcmp(in_buffer, out_buffer, buffer_size));
+  free(in_buffer);
+  free(out_buffer);
+  return iree_ok_status();
+}
+
+static iree_status_t iree_pack_benchmark(
+    const iree_benchmark_def_t* benchmark_def,
+    iree_benchmark_state_t* benchmark_state) {
+  const iree_pack_benchmark_user_data_t* user_data = benchmark_def->user_data;
+  iree_uk_type_t in_type = iree_uk_pack_in_type(user_data->type);
+  iree_uk_type_t out_type = iree_uk_pack_out_type(user_data->type);
+  iree_uk_ssize_t in_type_size = iree_uk_type_size(in_type);
+  iree_uk_ssize_t out_type_size = iree_uk_type_size(out_type);
+
+  // The inner dims 2, 3 are given to us as part of the benchmark user_data.
+  // The outer dims 0, 1 are to be determined based on FLAG_working_set_size.
+  iree_uk_ssize_t out_size0 = 1;
+  iree_uk_ssize_t out_size1 = 1;
+  iree_uk_ssize_t out_size2 = user_data->size2;
+  iree_uk_ssize_t out_size3 = user_data->size3;
+  int target_matrix_size_in_elems =
+      FLAG_working_set_size / (in_type_size + out_type_size);
+  int target_product_of_outer_sizes_0_1 =
+      target_matrix_size_in_elems / (out_size2 * out_size3);
+  while (target_product_of_outer_sizes_0_1 >= 4) {
+    target_product_of_outer_sizes_0_1 /= 4;
+    out_size0 *= 2;
+    out_size1 *= 2;
+  }
+  out_size1 *= target_product_of_outer_sizes_0_1;
+
+  iree_uk_pack_params_t params;
+  memset(&params, 0, sizeof params);
+  params.type = user_data->type;
+  params.flags = user_data->flags;
+  params.out_size0 = out_size0;
+  params.out_size1 = out_size1;
+  params.out_size2 = out_size2;
+  params.out_size3 = out_size3;
+  if (params.flags & IREE_UK_FLAG_PACK_TRANSPOSE_OUTER) {
+    iree_uk_ssize_swap(&out_size0, &out_size1);
+  }
+  if (params.flags & IREE_UK_FLAG_PACK_TRANSPOSE_INNER) {
+    iree_uk_ssize_swap(&out_size2, &out_size3);
+  }
+  params.in_size0 = iree_max(0, out_size0 * out_size2 - FLAG_padding_size);
+  params.in_size1 = iree_max(0, out_size1 * out_size3 - FLAG_padding_size);
+  params.in_stride0 = params.in_size1;
+  params.out_stride0 = params.out_size1 * params.out_size2 * params.out_size3;
+  iree_uk_ssize_t in_buffer_size = iree_uk_test_2d_buffer_length(
+      in_type, params.in_size0, params.in_stride0);
+  iree_uk_ssize_t out_buffer_size = iree_uk_test_2d_buffer_length(
+      out_type, params.out_size0, params.out_stride0);
+  void* in_buffer = malloc(in_buffer_size);
+  void* out_buffer = malloc(out_buffer_size);
+  void* padding_value_buffer = malloc(out_type_size);
+  iree_uk_test_random_engine_t* engine = iree_uk_test_random_engine_create();
+  // It's just about plausible that on some platform, for some number type,
+  // performance might be different on zero buffers vs random buffers. But it
+  // shouldn't matter that we recreate the random engine every time, getting
+  // the same random values again.
+  iree_uk_test_write_random_buffer(in_buffer, in_buffer_size, in_type, engine);
+  iree_uk_test_write_random_buffer(out_buffer, out_buffer_size, out_type,
+                                   engine);
+  iree_uk_test_write_random_buffer(padding_value_buffer, out_type_size,
+                                   out_type, engine);
+  iree_uk_test_random_engine_destroy(engine);
+  params.in_buffer = in_buffer;
+  params.out_buffer = out_buffer;
+  params.padding_value = padding_value_buffer;
+  iree_uk_int64_t total_iterations = 0;
+  iree_uk_int64_t batch_count =
+      (FLAG_batch_min_traversal_size + FLAG_working_set_size - 1) /
+      FLAG_working_set_size;
+  while (iree_benchmark_keep_running(benchmark_state,
+                                     /*batch_count=*/batch_count)) {
+    for (int i = 0; i < batch_count; ++i) {
+      iree_uk_status_t status = iree_uk_pack(&params);
+      if (status != iree_uk_status_ok) {
+        fprintf(stderr, "FATAL: iree_uk_pack failed: %s\n",
+                iree_uk_status_message(status));
+        iree_abort();
+      }
+    }
+    total_iterations += batch_count;
+  }
+  // Report bytes per second, so that can be easily compared to known memory
+  // system performance metrics (e.g. RAM bandwidth, to tell whether this is
+  // memory-bound).
+  iree_benchmark_set_items_processed(benchmark_state,
+                                     total_iterations * out_buffer_size);
+  free(in_buffer);
+  free(out_buffer);
+  free(padding_value_buffer);
+  return iree_ok_status();
+}
+
+static void iree_pack_benchmark_register(
+    const iree_pack_benchmark_user_data_t* user_data, const char* name) {
+  // Does this benchmark require an optional CPU feature?
+  if (user_data->cpu_data[0]) {
+    if ((iree_cpu_data_field(0) & user_data->cpu_data[0]) !=
+        user_data->cpu_data[0]) {
+      // The CPU does not meet this benchmark's requirements. The builtin
+      // would crash.
+      return;
+    }
+  }
+
+  // benchmark_def does not need to be static, it will be cloned.
+  const iree_benchmark_def_t benchmark_def = {
+      .flags = IREE_BENCHMARK_FLAG_USE_REAL_TIME,
+      .time_unit = IREE_BENCHMARK_UNIT_MICROSECOND,
+      .minimum_duration_ns = 0,
+      .iteration_count = 0,
+      .run = iree_pack_benchmark,
+      .user_data = user_data,
+  };
+  iree_benchmark_register(IREE_SV(name), &benchmark_def);
+}
+
+#define PACK_BENCHMARK_REGISTER_WITH_FLAGS(                                   \
+    _flags, _flags_suffix, _type, _size2, _size3, _cpu_data_field_0, _label)  \
+  do {                                                                        \
+    static const iree_uk_uint64_t local_cpu_data[IREE_CPU_DATA_FIELD_COUNT] = \
+        {_cpu_data_field_0};                                                  \
+    static const iree_pack_benchmark_user_data_t user_data = {                \
+        .type = iree_uk_pack_type_##_type,                                    \
+        .size2 = _size2,                                                      \
+        .size3 = _size3,                                                      \
+        .flags = _flags,                                                      \
+        .cpu_data = local_cpu_data,                                           \
+    };                                                                        \
+    iree_pack_benchmark_register(&user_data,                                  \
+                                 "iree_uk_pack_" #_type "_" #_size2           \
+                                 "x" #_size3 "_" _flags_suffix "_" #_label);  \
+  } while (0)
+
+#define PACK_BENCHMARK_REGISTER(...)                                         \
+  PACK_BENCHMARK_REGISTER_WITH_FLAGS(0, "TRANSPOSE_NONE", __VA_ARGS__);      \
+  PACK_BENCHMARK_REGISTER_WITH_FLAGS(IREE_UK_FLAG_PACK_TRANSPOSE_INNER,      \
+                                     "TRANSPOSE_INNER", __VA_ARGS__);        \
+  PACK_BENCHMARK_REGISTER_WITH_FLAGS(IREE_UK_FLAG_PACK_TRANSPOSE_OUTER,      \
+                                     "TRANSPOSE_OUTER", __VA_ARGS__);        \
+  PACK_BENCHMARK_REGISTER_WITH_FLAGS(                                        \
+      IREE_UK_FLAG_PACK_TRANSPOSE_INNER | IREE_UK_FLAG_PACK_TRANSPOSE_OUTER, \
+      "TRANSPOSE_BOTH", __VA_ARGS__);
+
+#define PACK_BENCHMARK_REGISTER_GENERIC(_type, _size2, _size3) \
+  PACK_BENCHMARK_REGISTER(_type, _size2, _size3, 0, generic)
+
+#define PACK_BENCHMARK_REGISTER_ARM_64(_type, _size2, _size3) \
+  PACK_BENCHMARK_REGISTER(_type, _size2, _size3, 0, arm_64)
+
+#define PACK_BENCHMARK_REGISTER_ARM_64_WITH_CPU_FEATURE(_type, _size2, _size3, \
+                                                        _cpu_feature)          \
+  PACK_BENCHMARK_REGISTER(_type, _size2, _size3,                               \
+                          IREE_CPU_DATA_FIELD_0_AARCH64_HAVE_##_cpu_feature,   \
+                          arm_64_##_cpu_feature)
+
+int main(int argc, char** argv) {
+  iree_flags_set_usage("pack_benchmark",
+                       "Benchmarks the pack microkernel.\n"
+                       "\n");
+
+  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK, &argc, &argv);
+  iree_benchmark_initialize(&argc, argv);
+  iree_cpu_initialize(iree_allocator_system());
+
+  const iree_benchmark_def_t memcpy_benchmark_def = {
+      .flags = IREE_BENCHMARK_FLAG_USE_REAL_TIME,
+      .time_unit = IREE_BENCHMARK_UNIT_MICROSECOND,
+      .minimum_duration_ns = 0,
+      .iteration_count = 0,
+      .run = iree_memcpy_benchmark,
+      .user_data = 0,
+  };
+  iree_benchmark_register(IREE_SV("memcpy"), &memcpy_benchmark_def);
+
+  // Generic code paths, not actually used, but interesting to get a sense
+  // of how slow generic code goes vs decent SIMD kernels.
+  PACK_BENCHMARK_REGISTER_GENERIC(f32f32, 4, 4);
+
+// ARM_64 benchmarks.
+#if defined(IREE_UK_ARCH_ARM_64)
+
+  PACK_BENCHMARK_REGISTER_ARM_64(f32f32, 8, 1);
+  PACK_BENCHMARK_REGISTER_ARM_64(i8i8, 8, 1);
+  PACK_BENCHMARK_REGISTER_ARM_64(i8i8, 8, 4);
+  PACK_BENCHMARK_REGISTER_ARM_64(i8i8, 8, 8);
+
+#endif  // defined(IREE_UK_ARCH_ARM_64)
+
+  iree_benchmark_run_specified();
+  return 0;
+}
diff --git a/runtime/src/iree/builtins/ukernel/tools/pack_test.cc b/runtime/src/iree/builtins/ukernel/tools/pack_test.cc
index fc3da46..7ccc944 100644
--- a/runtime/src/iree/builtins/ukernel/tools/pack_test.cc
+++ b/runtime/src/iree/builtins/ukernel/tools/pack_test.cc
@@ -6,11 +6,12 @@
 
 #include "iree/builtins/ukernel/pack.h"
 
+#include <algorithm>
 #include <cstring>
-#include <utility>
 #include <vector>
 
 #include "iree/base/api.h"
+#include "iree/base/internal/cpu.h"
 #include "iree/builtins/ukernel/tools/ukernel_test_utils.h"
 #include "iree/testing/gtest.h"
 #include "iree/testing/status_matchers.h"
@@ -19,32 +20,35 @@
   // For now, the input and output element types are always the same.
   iree_uk_type_t elem_type = iree_uk_pack_in_type(params.type);
   iree_uk_ssize_t elem_size = iree_uk_type_size(elem_type);
-  iree_uk_ssize_t lsize0 = params.out_size0;
-  iree_uk_ssize_t lsize1 = params.out_size1;
-  iree_uk_ssize_t lsize2 = params.out_size2;
-  iree_uk_ssize_t lsize3 = params.out_size3;
+  iree_uk_ssize_t outer_size0 = params.out_size0;
+  iree_uk_ssize_t outer_size1 = params.out_size1;
+  iree_uk_ssize_t tile_size0 = params.out_size2;
+  iree_uk_ssize_t tile_size1 = params.out_size3;
   iree_uk_ssize_t out_stride_l0 = params.out_stride0;
   iree_uk_ssize_t out_stride_l1 = params.out_size3 * params.out_size2;
   iree_uk_ssize_t out_stride_l2 = params.out_size3;
   iree_uk_ssize_t out_stride_l3 = 1;
   if (params.flags & IREE_UK_FLAG_PACK_TRANSPOSE_OUTER) {
-    std::swap(lsize0, lsize1);
+    std::swap(outer_size0, outer_size1);
     std::swap(out_stride_l0, out_stride_l1);
   }
   if (params.flags & IREE_UK_FLAG_PACK_TRANSPOSE_INNER) {
-    std::swap(lsize2, lsize3);
+    std::swap(tile_size0, tile_size1);
     std::swap(out_stride_l2, out_stride_l3);
   }
-  assert(lsize0 * lsize2 == params.in_size0);
-  assert(lsize1 * lsize3 == params.in_size1);
-  for (iree_uk_ssize_t l0 = 0; l0 < lsize0; ++l0) {
-    for (iree_uk_ssize_t l2 = 0; l2 < lsize2; ++l2) {
-      for (iree_uk_ssize_t l1 = 0; l1 < lsize1; ++l1) {
-        for (iree_uk_ssize_t l3 = 0; l3 < lsize3; ++l3) {
-          iree_uk_ssize_t out_offset = l0 * out_stride_l0 + l2 * out_stride_l2 +
-                                       l1 * out_stride_l1 + l3 * out_stride_l3;
-          iree_uk_ssize_t i0 = l0 * lsize2 + l2;
-          iree_uk_ssize_t i1 = l1 * lsize3 + l3;
+  assert(outer_size0 * tile_size0 >= params.in_size0);
+  assert(outer_size1 * tile_size1 >= params.in_size1);
+  assert((outer_size0 - 1) * tile_size0 < params.in_size0);
+  assert((outer_size1 - 1) * tile_size1 < params.in_size1);
+  for (iree_uk_ssize_t outer_i0 = 0; outer_i0 < outer_size0; ++outer_i0) {
+    for (iree_uk_ssize_t outer_i1 = 0; outer_i1 < outer_size1; ++outer_i1) {
+      for (iree_uk_ssize_t tile_i0 = 0; tile_i0 < tile_size0; ++tile_i0) {
+        for (iree_uk_ssize_t tile_i1 = 0; tile_i1 < tile_size1; ++tile_i1) {
+          iree_uk_ssize_t out_offset =
+              outer_i0 * out_stride_l0 + tile_i0 * out_stride_l2 +
+              outer_i1 * out_stride_l1 + tile_i1 * out_stride_l3;
+          iree_uk_ssize_t i0 = outer_i0 * tile_size0 + tile_i0;
+          iree_uk_ssize_t i1 = outer_i1 * tile_size1 + tile_i1;
           char* out_ptr = ((char*)params.out_buffer) + out_offset * elem_size;
           if (i0 >= params.in_size0 || i1 >= params.in_size1) {
             memcpy(out_ptr, params.padding_value, elem_size);
@@ -139,9 +143,8 @@
   // Populate strides first - we need them below to compute buffer lengths.
   // Randomly make strides either tight or not to exercise all cases.
   params.in_stride0 =
-      params.in_size1 + iree_uk_test_random_engine_get_0_or_1(engine);
+      params.in_size1 + iree_uk_test_random_engine_get_0_1(engine);
   params.out_stride0 = params.out_size1 * params.out_size2 * params.out_size3;
-  iree_uk_test_random_engine_get_0_or_1(engine);
   iree_uk_type_t in_type = iree_uk_pack_in_type(params.type);
   iree_uk_ssize_t in_buffer_size = iree_uk_test_2d_buffer_length(
       in_type, params.in_size0, params.in_stride0);
@@ -152,53 +155,133 @@
   free(in_buffer);
 }
 
-static void pack_test(const iree_uk_pack_type_t& type) {
-  iree_uk_test_random_engine_t* engine = iree_uk_test_random_engine_create();
-  struct untransposed_out_shape_t {
-    int size0, size1, size2, size3;
+static void pack_test_for_various_tile_shapes_and_flags(
+    iree_uk_pack_type_t type, int tile_size0, int tile_size1,
+    const iree_uk_uint64_t* cpu_data, iree_uk_test_random_engine_t* engine) {
+  struct outer_shape_t {
+    int size0, size1;
   };
-  std::vector<untransposed_out_shape_t> untransposed_out_shapes{
+  std::vector<outer_shape_t> outer_shapes{
       // Degenerate cases. Vacuous.
-      {0, 1, 1, 1},
-      {1, 0, 1, 1},
+      {0, 1},
+      {1, 0},
       // Non-degenerate cases.
-      {1, 1, 1, 1},
-      {2, 2, 2, 2},
-      {3, 3, 2, 2},
-      {2, 2, 3, 3},
-      {2, 3, 2, 3},
-      {11, 13, 7, 5},
-      {4, 8, 16, 32},
+      {1, 1},
+      {2, 2},
+      {3, 2},
+      {8, 8},
+      {11, 13},
+      {123, 45},
   };
-  for (const auto& shape : untransposed_out_shapes) {
+  for (const auto& outer_shape : outer_shapes) {
     for (bool transpose_inner : {false, true}) {
       for (bool transpose_outer : {false, true}) {
         iree_uk_pack_params_t params = {};
         params.type = type;
-        params.in_size0 = shape.size0 * shape.size2;
-        params.in_size1 = shape.size1 * shape.size3;
-        params.out_size0 = shape.size0;
-        params.out_size1 = shape.size1;
-        params.out_size2 = shape.size2;
-        params.out_size3 = shape.size3;
+        params.cpu_data = cpu_data;
+        iree_uk_ssize_t out_size0 = outer_shape.size0;
+        iree_uk_ssize_t out_size1 = outer_shape.size1;
+        iree_uk_ssize_t out_size2 = tile_size0;
+        iree_uk_ssize_t out_size3 = tile_size1;
+        params.out_size0 = out_size0;
+        params.out_size1 = out_size1;
+        params.out_size2 = out_size2;
+        params.out_size3 = out_size3;
         params.flags = 0;
         if (transpose_outer) {
           params.flags |= IREE_UK_FLAG_PACK_TRANSPOSE_OUTER;
-          std::swap(params.out_size0, params.out_size1);
+          std::swap(out_size0, out_size1);
         }
         if (transpose_inner) {
           params.flags |= IREE_UK_FLAG_PACK_TRANSPOSE_INNER;
-          std::swap(params.out_size2, params.out_size3);
+          std::swap(out_size2, out_size3);
         }
+        iree_uk_ssize_t pad_size0 =
+            iree_uk_test_random_engine_get_0_65535(engine) % out_size2;
+        iree_uk_ssize_t pad_size1 =
+            iree_uk_test_random_engine_get_0_65535(engine) % out_size3;
+        params.in_size0 =
+            std::max<iree_uk_ssize_t>(0, out_size0 * out_size2 - pad_size0);
+        params.in_size1 =
+            std::max<iree_uk_ssize_t>(0, out_size1 * out_size3 - pad_size1);
+        iree_uk_type_t out_type = iree_uk_pack_out_type(type);
+        int out_elem_size = iree_uk_type_size(out_type);
+        void* padding_value_buffer = malloc(out_elem_size);
+        iree_uk_test_write_random_buffer(padding_value_buffer, out_elem_size,
+                                         out_type, engine);
+        params.padding_value = padding_value_buffer;
         test_one_pack_creating_input_for_given_shape(params, engine);
+        free(padding_value_buffer);
       }
     }
   }
+}
+
+static void pack_test(iree_uk_pack_type_t type, int tile_size0, int tile_size1,
+                      iree_uk_uint64_t cpu_data_field_0_bit) {
+  const iree_uk_uint64_t local_cpu_data_default[IREE_CPU_DATA_FIELD_COUNT] = {
+      0};
+  iree_uk_test_random_engine_t* engine = iree_uk_test_random_engine_create();
+  // First try without any optional CPU feature. This matters even when the
+  // feature is supported by the CPU because we want to test the fallback to
+  // architecture-default or generic code.
+  pack_test_for_various_tile_shapes_and_flags(type, tile_size0, tile_size1,
+                                              local_cpu_data_default, engine);
+  // If this is nonzero, we are asked to test again with this CPU feature.
+  if (cpu_data_field_0_bit) {
+    const iree_uk_uint64_t local_cpu_data_with_bit[IREE_CPU_DATA_FIELD_COUNT] =
+        {cpu_data_field_0_bit};
+    // Check if the CPU supports the feature (otherwise, we crash).
+    bool supported = iree_cpu_data_field(0) & cpu_data_field_0_bit;
+    char cpu_feat_str[32];
+    iree_uk_test_cpu_features_str(cpu_feat_str, sizeof cpu_feat_str,
+                                  local_cpu_data_with_bit, 1);
+    if (supported) {
+      // Run with the optional CPU feature.
+      printf("Device supports CPU feature: %s\n", cpu_feat_str);
+      pack_test_for_various_tile_shapes_and_flags(
+          type, tile_size0, tile_size1, local_cpu_data_with_bit, engine);
+    } else {
+      printf("Skipped: device does not support CPU feature: %s\n",
+             cpu_feat_str);
+    }
+  }
+
   iree_uk_test_random_engine_destroy(engine);
 }
 
-TEST(PackTest, f32f32) { pack_test(iree_uk_pack_type_f32f32); }
+#define PACK_TEST(type, tile_size0, tile_size1, test_suffix, feature_bit)     \
+  TEST(PackTest, type##_tile_##tile_size0##x##tile_size1##_##test_suffix) {   \
+    pack_test(iree_uk_pack_type_##type, tile_size0, tile_size1, feature_bit); \
+  }
 
-TEST(PackTest, i8i8) { pack_test(iree_uk_pack_type_i8i8); }
+// Generic tests, not matching any particular CPU feature. This is the place to
+// test weird tile shapes to ensure e.g. that we haven't unwittingly baked in a
+// power-of-two assumption
+PACK_TEST(f32f32, 3, 5, generic, 0)
+PACK_TEST(i8i8, 4, 2, generic, 0)
+PACK_TEST(i32i32, 3, 4, generic, 0)
 
-TEST(PackTest, i32i32) { pack_test(iree_uk_pack_type_i32i32); }
+// ARM_64 tests.
+#if defined(IREE_UK_ARCH_ARM_64)
+
+#define PACK_ARM_64_TEST(type, tile_size0, tile_size1) \
+  PACK_TEST(type, tile_size0, tile_size1, arm_64, 0)
+
+#define PACK_ARM_64_TEST_WITH_CPU_FEATURE(type, tile_size0, tile_size1, \
+                                          FEATURE)                      \
+  PACK_TEST(type, tile_size0, tile_size1, arm_64_##FEATURE,             \
+            IREE_CPU_DATA_FIELD_0_AARCH64_HAVE_##FEATURE)
+
+PACK_ARM_64_TEST(f32f32, 8, 1)
+PACK_ARM_64_TEST(i8i8, 8, 1)
+PACK_ARM_64_TEST_WITH_CPU_FEATURE(i8i8, 8, 4, DOTPROD)
+PACK_ARM_64_TEST_WITH_CPU_FEATURE(i8i8, 8, 8, I8MM)
+
+#endif  // defined(IREE_UK_ARCH_ARM_64)
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  iree_cpu_initialize(iree_allocator_system());
+  return RUN_ALL_TESTS();
+}
diff --git a/runtime/src/iree/builtins/ukernel/tools/ukernel_test_utils.cc b/runtime/src/iree/builtins/ukernel/tools/ukernel_test_utils.cc
index a6c3333..e7e77d4 100644
--- a/runtime/src/iree/builtins/ukernel/tools/ukernel_test_utils.cc
+++ b/runtime/src/iree/builtins/ukernel/tools/ukernel_test_utils.cc
@@ -31,22 +31,21 @@
   delete e;
 }
 
-static int iree_uk_test_random_engine_get_in_uint16_range(
-    iree_uk_test_random_engine_t* e) {
+int iree_uk_test_random_engine_get_0_65535(iree_uk_test_random_engine_t* e) {
   iree_uk_uint32_t v = e->cpp_random_engine();
   // Return the middle two out of the 4 bytes of state. It avoids
   // some mild issues with the least-significant and most-significant bytes.
   return (v >> 8) & 0xffff;
 }
 
-int iree_uk_test_random_engine_get_0_or_1(iree_uk_test_random_engine_t* e) {
-  int v = iree_uk_test_random_engine_get_in_uint16_range(e);
+int iree_uk_test_random_engine_get_0_1(iree_uk_test_random_engine_t* e) {
+  int v = iree_uk_test_random_engine_get_0_65535(e);
   return v & 1;
 }
 
-int iree_uk_test_random_engine_get_between_minus16_and_plus15(
+int iree_uk_test_random_engine_get_minus16_plus15(
     iree_uk_test_random_engine_t* e) {
-  int v = iree_uk_test_random_engine_get_in_uint16_range(e);
+  int v = iree_uk_test_random_engine_get_0_65535(e);
   return (v % 32) - 16;
 }
 
@@ -60,8 +59,7 @@
     // Small integers, should work for now for all the types we currently have
     // and enable exact float arithmetic, allowing to keep tests simpler for
     // now. Watch out for when we'll do float16!
-    T random_val =
-        iree_uk_test_random_engine_get_between_minus16_and_plus15(engine);
+    T random_val = iree_uk_test_random_engine_get_minus16_plus15(engine);
     buffer[i] = random_val;
   }
 }
@@ -104,7 +102,7 @@
       return "bf";
     default:
       assert(false && "unknown type category");
-      return "(unknown type category)";
+      return "(?)";
   }
 }
 
diff --git a/runtime/src/iree/builtins/ukernel/tools/ukernel_test_utils.h b/runtime/src/iree/builtins/ukernel/tools/ukernel_test_utils.h
index ff49415..429910e 100644
--- a/runtime/src/iree/builtins/ukernel/tools/ukernel_test_utils.h
+++ b/runtime/src/iree/builtins/ukernel/tools/ukernel_test_utils.h
@@ -24,8 +24,9 @@
 typedef struct iree_uk_test_random_engine_t iree_uk_test_random_engine_t;
 iree_uk_test_random_engine_t* iree_uk_test_random_engine_create();
 void iree_uk_test_random_engine_destroy(iree_uk_test_random_engine_t* e);
-int iree_uk_test_random_engine_get_0_or_1(iree_uk_test_random_engine_t* e);
-int iree_uk_test_random_engine_get_between_minus16_and_plus15(
+int iree_uk_test_random_engine_get_0_65535(iree_uk_test_random_engine_t* e);
+int iree_uk_test_random_engine_get_0_1(iree_uk_test_random_engine_t* e);
+int iree_uk_test_random_engine_get_minus16_plus15(
     iree_uk_test_random_engine_t* e);
 void iree_uk_test_write_random_buffer(void* buffer,
                                       iree_uk_ssize_t size_in_bytes,