Refactor microkernel headers (#11920)

- Expose a unified `api.h` header.
- Fold separate `_types.h` headers.
- Fold some internal build targets.
- Remove some stray `#include <assert.h>` directives.
diff --git a/runtime/src/iree/builtins/ukernel/BUILD b/runtime/src/iree/builtins/ukernel/BUILD
index e251719..fbe09ec 100644
--- a/runtime/src/iree/builtins/ukernel/BUILD
+++ b/runtime/src/iree/builtins/ukernel/BUILD
@@ -23,17 +23,20 @@
     deps = [":static_assert"],
 )
 
+ukernel_headers = [
+    "common.h",
+    "elementwise.h",
+    "mmt4d.h",
+    "pack.h",
+    "query_tile_sizes.h",
+    "unpack.h",
+]
+
 # :types is the type declarations used by both the entry points and the
 # internal implementation functions.
 iree_runtime_cc_library(
-    name = "common",
-    hdrs = [
-        "common.h",
-        "mmt4d_types.h",
-        "pack_types.h",
-        "query_tile_sizes_types.h",
-        "unpack_types.h",
-    ],
+    name = "headers",
+    hdrs = ukernel_headers,
     deps = [
         ":exported_bits",
         ":static_assert",
@@ -42,39 +45,6 @@
     ],
 )
 
-# :generic contains non-architecture-specific implementations.
-iree_runtime_cc_library(
-    name = "generic",
-    srcs = [
-        "mmt4d_generic.c",
-        "pack_generic.c",
-    ],
-    hdrs = [
-        "mmt4d_generic.h",
-        "pack_generic.h",
-    ],
-    deps = [
-        ":common",
-    ],
-)
-
-# elementwise code is structured differently from other kernels. In fact it's
-# profoundly different: it carries its own custom shims. For now, we keep it
-# separate from the rest.
-iree_runtime_cc_library(
-    name = "elementwise",
-    srcs = [
-        "elementwise_generic.c",
-        "elementwise_impl.c.inc",
-    ],
-    hdrs = [
-        "elementwise.h",
-    ],
-    deps = [
-        ":common",
-    ],
-)
-
 # Entry points.
 iree_runtime_cc_library(
     name = "ukernel",
@@ -83,18 +53,20 @@
         "pack.c",
         "query_tile_sizes.c",
         "unpack.c",
-    ],
-    hdrs = [
-        "elementwise.h",
-        "mmt4d.h",
-        "pack.h",
-        "query_tile_sizes.h",
-        "unpack.h",
-    ],
+        "elementwise_generic.c",
+        "elementwise_impl.c.inc",
+        "mmt4d_generic.c",
+        "pack_generic.c",
+        "mmt4d_generic.h",
+        "pack_generic.h",
+    ] + ukernel_headers,
+    hdrs = ["api.h"],
     deps = [
-        ":common",
-        ":elementwise",
-        ":generic",
+        ":exported_bits",
+        ":headers",
+        ":static_assert",
+        "//runtime/src/iree/base:core_headers",
+        "//runtime/src/iree/builtins/ukernel/arch:config",
         "//runtime/src/iree/builtins/ukernel/arch:ukernel_arch",
     ],
 )
diff --git a/runtime/src/iree/builtins/ukernel/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/CMakeLists.txt
index 59556d7..f6ff27e 100644
--- a/runtime/src/iree/builtins/ukernel/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/CMakeLists.txt
@@ -32,13 +32,14 @@
 
 iree_cc_library(
   NAME
-    common
+    headers
   HDRS
     "common.h"
-    "mmt4d_types.h"
-    "pack_types.h"
-    "query_tile_sizes_types.h"
-    "unpack_types.h"
+    "elementwise.h"
+    "mmt4d.h"
+    "pack.h"
+    "query_tile_sizes.h"
+    "unpack.h"
   DEPS
     ::exported_bits
     ::static_assert
@@ -49,49 +50,32 @@
 
 iree_cc_library(
   NAME
-    generic
-  HDRS
-    "mmt4d_generic.h"
-    "pack_generic.h"
-  SRCS
-    "mmt4d_generic.c"
-    "pack_generic.c"
-  DEPS
-    ::common
-  PUBLIC
-)
-
-iree_cc_library(
-  NAME
-    elementwise
-  HDRS
-    "elementwise.h"
-  SRCS
-    "elementwise_generic.c"
-    "elementwise_impl.c.inc"
-  DEPS
-    ::common
-  PUBLIC
-)
-
-iree_cc_library(
-  NAME
     ukernel
   HDRS
-    "elementwise.h"
-    "mmt4d.h"
-    "pack.h"
-    "query_tile_sizes.h"
-    "unpack.h"
+    "api.h"
   SRCS
+    "common.h"
+    "elementwise.h"
+    "elementwise_generic.c"
+    "elementwise_impl.c.inc"
     "mmt4d.c"
+    "mmt4d.h"
+    "mmt4d_generic.c"
+    "mmt4d_generic.h"
     "pack.c"
+    "pack.h"
+    "pack_generic.c"
+    "pack_generic.h"
     "query_tile_sizes.c"
+    "query_tile_sizes.h"
     "unpack.c"
+    "unpack.h"
   DEPS
-    ::common
-    ::elementwise
-    ::generic
+    ::exported_bits
+    ::headers
+    ::static_assert
+    iree::base::core_headers
+    iree::builtins::ukernel::arch::config
     iree::builtins::ukernel::arch::ukernel_arch
   PUBLIC
 )
diff --git a/runtime/src/iree/builtins/ukernel/api.h b/runtime/src/iree/builtins/ukernel/api.h
new file mode 100644
index 0000000..85af566
--- /dev/null
+++ b/runtime/src/iree/builtins/ukernel/api.h
@@ -0,0 +1,16 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BUILTINS_UKERNEL_API_H_
+#define IREE_BUILTINS_UKERNEL_API_H_
+
+#include "iree/builtins/ukernel/elementwise.h"
+#include "iree/builtins/ukernel/mmt4d.h"
+#include "iree/builtins/ukernel/pack.h"
+#include "iree/builtins/ukernel/query_tile_sizes.h"
+#include "iree/builtins/ukernel/unpack.h"
+
+#endif  // IREE_BUILTINS_UKERNEL_API_H_
diff --git a/runtime/src/iree/builtins/ukernel/arch/BUILD b/runtime/src/iree/builtins/ukernel/arch/BUILD
index f66dab2..d9041e4 100644
--- a/runtime/src/iree/builtins/ukernel/arch/BUILD
+++ b/runtime/src/iree/builtins/ukernel/arch/BUILD
@@ -39,6 +39,6 @@
         "query_tile_sizes_arch.h",
     ],
     deps = [
-        "//runtime/src/iree/builtins/ukernel:common",
+        "//runtime/src/iree/builtins/ukernel:headers",
     ],
 )
diff --git a/runtime/src/iree/builtins/ukernel/arch/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/arch/CMakeLists.txt
index ff71558..4a99996 100644
--- a/runtime/src/iree/builtins/ukernel/arch/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/arch/CMakeLists.txt
@@ -58,7 +58,7 @@
     "pack_arch.c"
     "query_tile_sizes_arch.c"
   DEPS
-    iree::builtins::ukernel::common
+    iree::builtins::ukernel::headers
     ${IREE_UK_ARCH_DEPS}
   PUBLIC
 )
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt
index 11f7c7d..ec31de1 100644
--- a/runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt
@@ -85,7 +85,7 @@
   DEPS
     iree::base::core_headers
     iree::schemas::cpu_data
-    iree::builtins::ukernel::common
+    iree::builtins::ukernel::headers
     ${IREE_UK_MMT4D_TILE_ARM_64_DEPS}
   PUBLIC
 )
@@ -119,7 +119,7 @@
   DEPS
     iree::base::core_headers
     iree::schemas::cpu_data
-    iree::builtins::ukernel::common
+    iree::builtins::ukernel::headers
     ::pack_tile_arm_64
   PUBLIC
 )
@@ -138,6 +138,6 @@
   DEPS
     iree::base::core_headers
     iree::schemas::cpu_data
-    iree::builtins::ukernel::common
+    iree::builtins::ukernel::headers
   PUBLIC
 )
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.h b/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.h
index 39e331c..5c162f3 100644
--- a/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.h
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.h
@@ -7,7 +7,7 @@
 #ifndef IREE_BUILTINS_UKERNEL_ARCH_ARM_64_MMT4D_ARM_64_H_
 #define IREE_BUILTINS_UKERNEL_ARCH_ARM_64_MMT4D_ARM_64_H_
 
-#include "iree/builtins/ukernel/mmt4d_types.h"
+#include "iree/builtins/ukernel/mmt4d.h"
 
 // Returns the arm64 tile function to use for the mmt4d with given params, or
 // NULL if no suitable arm64 tile function exists for these params, in which
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_tile_arm_64.h b/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_tile_arm_64.h
index 5da3cb5..87e89e8 100644
--- a/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_tile_arm_64.h
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_tile_arm_64.h
@@ -7,7 +7,7 @@
 #ifndef IREE_BUILTINS_UKERNEL_ARCH_ARM_64_MMT4D_TILE_ARM_64_H_
 #define IREE_BUILTINS_UKERNEL_ARCH_ARM_64_MMT4D_TILE_ARM_64_H_
 
-#include "iree/builtins/ukernel/mmt4d_types.h"
+#include "iree/builtins/ukernel/mmt4d.h"
 
 IREE_UK_MMT4D_TILE_FUNC_DECL(iree_uk_mmt4d_tile_f32f32f32_8x8x1_arm_64)
 IREE_UK_MMT4D_TILE_FUNC_DECL(iree_uk_mmt4d_tile_i8i8i32_8x8x1_arm_64)
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64.h b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64.h
index d61cc99..52719b1 100644
--- a/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64.h
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64.h
@@ -7,7 +7,7 @@
 #ifndef IREE_BUILTINS_UKERNEL_ARCH_ARM_64_PACK_ARM_64_H_
 #define IREE_BUILTINS_UKERNEL_ARCH_ARM_64_PACK_ARM_64_H_
 
-#include "iree/builtins/ukernel/pack_types.h"
+#include "iree/builtins/ukernel/pack.h"
 
 // Returns the arm64 tile function to use for the pack op with given params, or
 // NULL if no suitable arm64 tile function exists for these params, in which
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_tile_arm_64.h b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_tile_arm_64.h
index ac6e385..c468e1d 100644
--- a/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_tile_arm_64.h
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_tile_arm_64.h
@@ -7,7 +7,7 @@
 #ifndef IREE_BUILTINS_UKERNEL_ARCH_ARM_64_PACK_TILE_ARM_64_H_
 #define IREE_BUILTINS_UKERNEL_ARCH_ARM_64_PACK_TILE_ARM_64_H_
 
-#include "iree/builtins/ukernel/pack_types.h"
+#include "iree/builtins/ukernel/pack.h"
 
 IREE_UK_PACK_TILE_FUNC_DECL(iree_uk_pack_tile_8x1_x32_arm_64_direct)
 IREE_UK_PACK_TILE_FUNC_DECL(iree_uk_pack_tile_8x1_x32_arm_64_transpose)
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/query_tile_sizes_arm_64.h b/runtime/src/iree/builtins/ukernel/arch/arm_64/query_tile_sizes_arm_64.h
index 3242384..530e4d7 100644
--- a/runtime/src/iree/builtins/ukernel/arch/arm_64/query_tile_sizes_arm_64.h
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/query_tile_sizes_arm_64.h
@@ -7,7 +7,7 @@
 #ifndef IREE_BUILTINS_UKERNEL_ARCH_ARM_64_QUERY_TILE_SIZES_ARM_64_H_
 #define IREE_BUILTINS_UKERNEL_ARCH_ARM_64_QUERY_TILE_SIZES_ARM_64_H_
 
-#include "iree/builtins/ukernel/query_tile_sizes_types.h"
+#include "iree/builtins/ukernel/query_tile_sizes.h"
 
 bool iree_uk_query_matmul_tile_sizes_arm_64(
     const iree_uk_query_tile_sizes_2d_params_t* params,
diff --git a/runtime/src/iree/builtins/ukernel/arch/mmt4d_arch.h b/runtime/src/iree/builtins/ukernel/arch/mmt4d_arch.h
index 996b4d4..dd2638f 100644
--- a/runtime/src/iree/builtins/ukernel/arch/mmt4d_arch.h
+++ b/runtime/src/iree/builtins/ukernel/arch/mmt4d_arch.h
@@ -7,7 +7,7 @@
 #ifndef IREE_BUILTINS_UKERNEL_ARCH_MMT4D_ARCH_H_
 #define IREE_BUILTINS_UKERNEL_ARCH_MMT4D_ARCH_H_
 
-#include "iree/builtins/ukernel/mmt4d_types.h"
+#include "iree/builtins/ukernel/mmt4d.h"
 
 // Returns the architecture-specific tile function to use for the mmt4d with
 // given params, or NULL if no suitable architecture-specific tile function
diff --git a/runtime/src/iree/builtins/ukernel/arch/pack_arch.h b/runtime/src/iree/builtins/ukernel/arch/pack_arch.h
index 16119b0..442c803 100644
--- a/runtime/src/iree/builtins/ukernel/arch/pack_arch.h
+++ b/runtime/src/iree/builtins/ukernel/arch/pack_arch.h
@@ -7,7 +7,7 @@
 #ifndef IREE_BUILTINS_UKERNEL_ARCH_PACK_ARCH_H_
 #define IREE_BUILTINS_UKERNEL_ARCH_PACK_ARCH_H_
 
-#include "iree/builtins/ukernel/pack_types.h"
+#include "iree/builtins/ukernel/pack.h"
 
 // Returns the architecture-specific tile function to use for the pack op with
 // given params, or NULL if no suitable architecture-specific tile function
diff --git a/runtime/src/iree/builtins/ukernel/arch/query_tile_sizes_arch.h b/runtime/src/iree/builtins/ukernel/arch/query_tile_sizes_arch.h
index 7eebdb7..a970534 100644
--- a/runtime/src/iree/builtins/ukernel/arch/query_tile_sizes_arch.h
+++ b/runtime/src/iree/builtins/ukernel/arch/query_tile_sizes_arch.h
@@ -7,7 +7,7 @@
 #ifndef IREE_BUILTINS_UKERNEL_ARCH_QUERY_TILE_SIZES_ARCH_H_
 #define IREE_BUILTINS_UKERNEL_ARCH_QUERY_TILE_SIZES_ARCH_H_
 
-#include "iree/builtins/ukernel/query_tile_sizes_types.h"
+#include "iree/builtins/ukernel/query_tile_sizes.h"
 
 bool iree_uk_query_matmul_tile_sizes_arch(
     const iree_uk_query_tile_sizes_2d_params_t* params,
diff --git a/runtime/src/iree/builtins/ukernel/mmt4d.h b/runtime/src/iree/builtins/ukernel/mmt4d.h
index 5db0336..6e9f140 100644
--- a/runtime/src/iree/builtins/ukernel/mmt4d.h
+++ b/runtime/src/iree/builtins/ukernel/mmt4d.h
@@ -7,12 +7,91 @@
 #ifndef IREE_BUILTINS_UKERNEL_MMT4D_H_
 #define IREE_BUILTINS_UKERNEL_MMT4D_H_
 
-#include "iree/builtins/ukernel/mmt4d_types.h"
+#include "iree/builtins/ukernel/common.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 
+typedef enum iree_uk_mmt4d_type_t {
+  iree_uk_mmt4d_type_f32f32f32 =
+      IREE_UK_TIE_3_TYPES_LITERAL(FLOAT_32, FLOAT_32, FLOAT_32),
+  iree_uk_mmt4d_type_i8i8i32 =
+      IREE_UK_TIE_3_TYPES_LITERAL(INT_8, INT_8, INT_32),
+} iree_uk_mmt4d_type_t;
+
+static inline iree_uk_type_t iree_uk_mmt4d_lhs_type(iree_uk_mmt4d_type_t type) {
+  return iree_uk_untie_type(0, type);
+}
+
+static inline iree_uk_type_t iree_uk_mmt4d_rhs_type(iree_uk_mmt4d_type_t type) {
+  return iree_uk_untie_type(1, type);
+}
+
+static inline iree_uk_type_t iree_uk_mmt4d_out_type(iree_uk_mmt4d_type_t type) {
+  return iree_uk_untie_type(2, type);
+}
+
+// Parameters for a mmt4d operation.
+typedef struct iree_uk_mmt4d_params_t {
+  iree_uk_mmt4d_type_t type;
+  iree_uk_uint32_t flags;
+  iree_uk_ssize_t lhs_stride;
+  iree_uk_ssize_t rhs_stride;
+  iree_uk_ssize_t out_stride;
+  iree_uk_ssize_t M;
+  iree_uk_ssize_t N;
+  iree_uk_ssize_t K;
+  iree_uk_int32_t M0;
+  iree_uk_int32_t N0;
+  iree_uk_int32_t K0;
+  const void* lhs_buffer;
+  const void* rhs_buffer;
+  void* out_buffer;
+  const iree_uk_uint64_t* cpu_data;
+} iree_uk_mmt4d_params_t;
+
+// Function pointer type for tile functions, i.e. typically architecture
+// specific functions computing one M0xN0 tile of the output matrix. This is
+// the inner-most loop of the matmul, the thing that we should actually
+// be calling "micro kernel" except that the name is already taken by the
+// higher-level builtin.
+//
+// The 'params' argument is only used by generic kernels. Actual optimized
+// kernels are already specialized for a given tile shape (M0xN0xK0), so the
+// first five arguments here are the only information that they need. Not having
+// to address 'params' struct fields in the middle of assembly kernels is
+// good, because it's hard to get the struct field offsets right in assembly
+// and keep that in sync with future struct changes.
+typedef void (*iree_uk_mmt4d_tile_func_t)(
+    void* /*out_tile*/, const void* /*lhs_panel*/, const void* /*rhs_panel*/,
+    iree_uk_int32_t /*K*/, iree_uk_uint32_t /*flags*/,
+    const iree_uk_mmt4d_params_t* /*params*/);
+
+// Tile kernel declarations. Prototype matches iree_uk_mmt4d_tile_func_t.
+#define IREE_UK_MMT4D_TILE_FUNC_DECL(NAME)                                \
+  void NAME(void* out_tile, const void* lhs_panel, const void* rhs_panel, \
+            iree_uk_int32_t K, iree_uk_uint32_t flags,                    \
+            const iree_uk_mmt4d_params_t* params);
+
+// In order to be helpful as a reference for future architecture-specific
+// kernels, the generic kernels are structured like an actual optimized kernel,
+// using an "accumulator tile" that in this case is a stack array (which would
+// become a group of SIMD registers in an actual optimized kernel). The downside
+// of this approach is that we have to set a fixed max size for the accumulator
+// tile, but for now all known cases are comfortably far below where trouble
+// would happen. For reference:
+// - On ARM NEON, the entire register space is 512 bytes, so the accumulator
+//   tile is less than that, typically 256 to 384 bytes.
+// - On ARM SME, we will be working with an accumulator tile as large as 4096
+//   bytes (IIUC).
+// - The smallest stack frame size limit that we know we may have to deal with
+//   on certain targets is 16 kilobytes.
+// The size of architecture-specific tiles is relevant here because this
+// generic code is what will be run as a fallback if the device is found not to
+// support the CPU feature that the tile sizes were picked to target.
+enum { iree_uk_mmt4d_tile_generic_max_bytes = 4096 };
+
 // Main entry point.
 IREE_UK_EXPORT void iree_uk_mmt4d(const iree_uk_mmt4d_params_t* params);
 
diff --git a/runtime/src/iree/builtins/ukernel/mmt4d_generic.h b/runtime/src/iree/builtins/ukernel/mmt4d_generic.h
index 848edf7..0c20bab 100644
--- a/runtime/src/iree/builtins/ukernel/mmt4d_generic.h
+++ b/runtime/src/iree/builtins/ukernel/mmt4d_generic.h
@@ -7,7 +7,7 @@
 #ifndef IREE_BUILTINS_UKERNEL_MMT4D_GENERIC_H_
 #define IREE_BUILTINS_UKERNEL_MMT4D_GENERIC_H_
 
-#include "iree/builtins/ukernel/mmt4d_types.h"
+#include "iree/builtins/ukernel/mmt4d.h"
 
 // Returns the generic tile function to use to perform the mmt4d with the given
 // *params. The caller may want to first try to get an optimized
diff --git a/runtime/src/iree/builtins/ukernel/mmt4d_types.h b/runtime/src/iree/builtins/ukernel/mmt4d_types.h
deleted file mode 100644
index 723723f..0000000
--- a/runtime/src/iree/builtins/ukernel/mmt4d_types.h
+++ /dev/null
@@ -1,91 +0,0 @@
-// Copyright 2022 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#ifndef IREE_BUILTINS_UKERNEL_MMT4D_TYPES_H_
-#define IREE_BUILTINS_UKERNEL_MMT4D_TYPES_H_
-
-#include "iree/builtins/ukernel/common.h"
-
-typedef enum iree_uk_mmt4d_type_t {
-  iree_uk_mmt4d_type_f32f32f32 =
-      IREE_UK_TIE_3_TYPES_LITERAL(FLOAT_32, FLOAT_32, FLOAT_32),
-  iree_uk_mmt4d_type_i8i8i32 =
-      IREE_UK_TIE_3_TYPES_LITERAL(INT_8, INT_8, INT_32),
-} iree_uk_mmt4d_type_t;
-
-static inline iree_uk_type_t iree_uk_mmt4d_lhs_type(iree_uk_mmt4d_type_t type) {
-  return iree_uk_untie_type(0, type);
-}
-
-static inline iree_uk_type_t iree_uk_mmt4d_rhs_type(iree_uk_mmt4d_type_t type) {
-  return iree_uk_untie_type(1, type);
-}
-
-static inline iree_uk_type_t iree_uk_mmt4d_out_type(iree_uk_mmt4d_type_t type) {
-  return iree_uk_untie_type(2, type);
-}
-
-// Parameters for a mmt4d operation.
-typedef struct iree_uk_mmt4d_params_t {
-  iree_uk_mmt4d_type_t type;
-  iree_uk_uint32_t flags;
-  iree_uk_ssize_t lhs_stride;
-  iree_uk_ssize_t rhs_stride;
-  iree_uk_ssize_t out_stride;
-  iree_uk_ssize_t M;
-  iree_uk_ssize_t N;
-  iree_uk_ssize_t K;
-  iree_uk_int32_t M0;
-  iree_uk_int32_t N0;
-  iree_uk_int32_t K0;
-  const void* lhs_buffer;
-  const void* rhs_buffer;
-  void* out_buffer;
-  const iree_uk_uint64_t* cpu_data;
-} iree_uk_mmt4d_params_t;
-
-// Function pointer type for tile functions, i.e. typically architecture
-// specific functions computing one M0xN0 tile of the output matrix, i.e.
-// the inner-most loop of the matmul, i.e. the thing that we should actually
-// be calling "micro kernel" except that the name is already taken by the
-// higher-level builtin name.
-//
-// The 'params' argument is only used by generic kernels. Actual optimized
-// kernels are already specialized for a given tile shape (M0xN0xK0), so the
-// five first arguments here are the only information that they need. Not having
-// to address 'params' struct fields in the middle of assembly kernels is
-// good, because it's hard to get the struct field offsets right in assembly
-// and keep that in sync with future struct changes.
-typedef void (*iree_uk_mmt4d_tile_func_t)(
-    void* /*out_tile*/, const void* /*lhs_panel*/, const void* /*rhs_panel*/,
-    iree_uk_int32_t /*K*/, iree_uk_uint32_t /*flags*/,
-    const iree_uk_mmt4d_params_t* /*params*/);
-
-// Tile kernel declarations. Prototype matches iree_uk_mmt4d_tile_func_t.
-#define IREE_UK_MMT4D_TILE_FUNC_DECL(NAME)                                \
-  void NAME(void* out_tile, const void* lhs_panel, const void* rhs_panel, \
-            iree_uk_int32_t K, iree_uk_uint32_t flags,                    \
-            const iree_uk_mmt4d_params_t* params);
-
-// In order to be helpful as a reference for future architecture-specific
-// kernels, the generic kernels are structured like an actual optimized kernel,
-// using an "accumulator tile" that in this case is a stack array (which would
-// become a group of SIMD registers in an actual optimized kernel). The downside
-// of this approach is that we have to set a fixed max size for the accumulator
-// tile, but for now all known cases are comfortably far below where trouble
-// would happen. For reference:
-// - On ARM NEON, the entire register space is 512 bytes, so the accumulator
-//   tile is less than that, typically 256 to 384 bytes.
-// - On ARM SME, we will be working with an accumulator tile as large as 4096
-//   bytes (IIUC).
-// - The smallest stack frame size limit that we know we may have to deal with
-//   on certain targets is 16 kilobytes.
-// The size or architecture-specific tiles is relevant here because this
-// generic code is what will be run as a fallback if the device is found not to
-// support the CPU feature that the tile sizes were picked to target.
-enum { iree_uk_mmt4d_tile_generic_max_bytes = 4096 };
-
-#endif  // IREE_BUILTINS_UKERNEL_MMT4D_TYPES_H_
diff --git a/runtime/src/iree/builtins/ukernel/pack.h b/runtime/src/iree/builtins/ukernel/pack.h
index 9d14249..218706d 100644
--- a/runtime/src/iree/builtins/ukernel/pack.h
+++ b/runtime/src/iree/builtins/ukernel/pack.h
@@ -7,12 +7,59 @@
 #ifndef IREE_BUILTINS_UKERNEL_PACK_H_
 #define IREE_BUILTINS_UKERNEL_PACK_H_
 
-#include "iree/builtins/ukernel/pack_types.h"
+#include "iree/builtins/ukernel/common.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 
+typedef enum iree_uk_pack_type_t {
+  iree_uk_pack_type_f32f32 = IREE_UK_TIE_2_TYPES_LITERAL(FLOAT_32, FLOAT_32),
+  iree_uk_pack_type_i8i8 = IREE_UK_TIE_2_TYPES_LITERAL(INT_8, INT_8),
+  iree_uk_pack_type_i32i32 = IREE_UK_TIE_2_TYPES_LITERAL(INT_32, INT_32),
+} iree_uk_pack_type_t;
+
+static inline iree_uk_type_t iree_uk_pack_in_type(iree_uk_pack_type_t type) {
+  return iree_uk_untie_type(0, type);
+}
+
+static inline iree_uk_type_t iree_uk_pack_out_type(iree_uk_pack_type_t type) {
+  return iree_uk_untie_type(1, type);
+}
+
+// Parameters for a pack operation.
+typedef struct iree_uk_pack_params_t {
+  iree_uk_pack_type_t type;
+  iree_uk_uint32_t flags;
+  iree_uk_ssize_t in_stride0;
+  iree_uk_ssize_t out_stride0;
+  iree_uk_ssize_t in_size0;
+  iree_uk_ssize_t in_size1;
+  iree_uk_ssize_t out_size0;
+  iree_uk_ssize_t out_size1;
+  iree_uk_ssize_t out_size2;
+  iree_uk_ssize_t out_size3;
+  const void* in_buffer;
+  void* out_buffer;
+  const void* padding_value;
+  const iree_uk_uint64_t* cpu_data;
+} iree_uk_pack_params_t;
+
+typedef void* (*iree_uk_pack_tile_func_t)(
+    void* IREE_UK_RESTRICT /*out_tile_ptr*/,
+    const void* IREE_UK_RESTRICT /*in_tile_ptr*/,
+    iree_uk_ssize_t /*outer_size1*/, iree_uk_ssize_t /*out_stride_l1*/,
+    iree_uk_ssize_t /*in_stride0*/, iree_uk_ssize_t /*elem_size*/,
+    iree_uk_ssize_t /*tile_size0*/, iree_uk_ssize_t /*tile_size1*/);
+
+// Tile kernel declarations. Prototype matches iree_uk_pack_tile_func_t.
+#define IREE_UK_PACK_TILE_FUNC_DECL(NAME)                                \
+  void* NAME(void* IREE_UK_RESTRICT out_tile_ptr,                        \
+             const void* IREE_UK_RESTRICT in_tile_ptr,                   \
+             iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1, \
+             iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size,      \
+             iree_uk_ssize_t tile_size0, iree_uk_ssize_t tile_size1);
+
 // Main entry point.
 IREE_UK_EXPORT void iree_uk_pack(const iree_uk_pack_params_t* params);
 
diff --git a/runtime/src/iree/builtins/ukernel/pack_generic.h b/runtime/src/iree/builtins/ukernel/pack_generic.h
index 50bee3e..3e9f863 100644
--- a/runtime/src/iree/builtins/ukernel/pack_generic.h
+++ b/runtime/src/iree/builtins/ukernel/pack_generic.h
@@ -7,7 +7,7 @@
 #ifndef IREE_BUILTINS_UKERNEL_PACK_GENERIC_H_
 #define IREE_BUILTINS_UKERNEL_PACK_GENERIC_H_
 
-#include "iree/builtins/ukernel/pack_types.h"
+#include "iree/builtins/ukernel/pack.h"
 
 // Returns the generic tile function to use to perform the mmt4d with the given
 // *params. The caller may want to first try to get an optimized
diff --git a/runtime/src/iree/builtins/ukernel/pack_types.h b/runtime/src/iree/builtins/ukernel/pack_types.h
deleted file mode 100644
index f0c4bc5..0000000
--- a/runtime/src/iree/builtins/ukernel/pack_types.h
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2022 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#ifndef IREE_BUILTINS_UKERNEL_PACK_TYPES_H_
-#define IREE_BUILTINS_UKERNEL_PACK_TYPES_H_
-
-#include <assert.h>
-
-#include "iree/builtins/ukernel/common.h"
-
-typedef enum iree_uk_pack_type_t {
-  iree_uk_pack_type_f32f32 = IREE_UK_TIE_2_TYPES_LITERAL(FLOAT_32, FLOAT_32),
-  iree_uk_pack_type_i8i8 = IREE_UK_TIE_2_TYPES_LITERAL(INT_8, INT_8),
-  iree_uk_pack_type_i32i32 = IREE_UK_TIE_2_TYPES_LITERAL(INT_32, INT_32),
-} iree_uk_pack_type_t;
-
-static inline iree_uk_type_t iree_uk_pack_in_type(iree_uk_pack_type_t type) {
-  return iree_uk_untie_type(0, type);
-}
-
-static inline iree_uk_type_t iree_uk_pack_out_type(iree_uk_pack_type_t type) {
-  return iree_uk_untie_type(1, type);
-}
-
-// Parameters for a pack operation.
-typedef struct iree_uk_pack_params_t {
-  iree_uk_pack_type_t type;
-  iree_uk_uint32_t flags;
-  iree_uk_ssize_t in_stride0;
-  iree_uk_ssize_t out_stride0;
-  iree_uk_ssize_t in_size0;
-  iree_uk_ssize_t in_size1;
-  iree_uk_ssize_t out_size0;
-  iree_uk_ssize_t out_size1;
-  iree_uk_ssize_t out_size2;
-  iree_uk_ssize_t out_size3;
-  const void* in_buffer;
-  void* out_buffer;
-  const void* padding_value;
-  const iree_uk_uint64_t* cpu_data;
-} iree_uk_pack_params_t;
-
-typedef void* (*iree_uk_pack_tile_func_t)(
-    void* IREE_UK_RESTRICT /*out_tile_ptr*/,
-    const void* IREE_UK_RESTRICT /*in_tile_ptr*/,
-    iree_uk_ssize_t /*outer_size1*/, iree_uk_ssize_t /*out_stride_l1*/,
-    iree_uk_ssize_t /*in_stride0*/, iree_uk_ssize_t /*elem_size*/,
-    iree_uk_ssize_t /*tile_size0*/, iree_uk_ssize_t /*tile_size1*/);
-
-// Tile kernel declarations. Prototype matches iree_uk_pack_tile_func_t.
-#define IREE_UK_PACK_TILE_FUNC_DECL(NAME)                                \
-  void* NAME(void* IREE_UK_RESTRICT out_tile_ptr,                        \
-             const void* IREE_UK_RESTRICT in_tile_ptr,                   \
-             iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1, \
-             iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size,      \
-             iree_uk_ssize_t tile_size0, iree_uk_ssize_t tile_size1);
-
-#endif  // IREE_BUILTINS_UKERNEL_PACK_TYPES_H_
diff --git a/runtime/src/iree/builtins/ukernel/query_tile_sizes.h b/runtime/src/iree/builtins/ukernel/query_tile_sizes.h
index 128c079..54dc84b 100644
--- a/runtime/src/iree/builtins/ukernel/query_tile_sizes.h
+++ b/runtime/src/iree/builtins/ukernel/query_tile_sizes.h
@@ -7,12 +7,41 @@
 #ifndef IREE_BUILTINS_UKERNEL_QUERY_TILE_SIZES_H_
 #define IREE_BUILTINS_UKERNEL_QUERY_TILE_SIZES_H_
 
-#include "iree/builtins/ukernel/query_tile_sizes_types.h"
+#include "iree/builtins/ukernel/common.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 
+// Parameters for a query_tile_sizes operation.
+typedef struct iree_uk_query_tile_sizes_2d_params_t {
+  iree_uk_uint32_t flags;
+  iree_uk_ssize_t size0;
+  iree_uk_ssize_t size1;
+  const iree_uk_uint64_t* cpu_data;
+} iree_uk_query_tile_sizes_2d_params_t;
+
+typedef struct iree_uk_query_tile_sizes_2d_out_params_t {
+  iree_uk_ssize_t tile_size0;
+  iree_uk_ssize_t tile_size1;
+} iree_uk_query_tile_sizes_2d_out_params_t;
+
+static inline iree_uk_uint32_t iree_uk_query_tile_sizes_operand_role(
+    iree_uk_uint32_t flags) {
+  return flags & IREE_UK_FLAG_QUERY_TILE_SIZES_OPERAND_ROLE_MASK_INTERNAL;
+}
+
+static inline iree_uk_uint32_t iree_uk_query_tile_sizes_operation(
+    iree_uk_uint32_t flags) {
+  return flags & IREE_UK_FLAG_QUERY_TILE_SIZES_OPERATION_MASK_INTERNAL;
+}
+
+// Internal use only. Holds matmul tile params as returned from architecture
+// specific backend code.
+typedef struct iree_uk_matmul_tile_sizes_t {
+  int M, K, N;
+} iree_uk_matmul_tile_sizes_t;
+
 // Main entry point.
 IREE_UK_EXPORT void iree_uk_query_tile_sizes_2d(
     const iree_uk_query_tile_sizes_2d_params_t* params,
diff --git a/runtime/src/iree/builtins/ukernel/query_tile_sizes_types.h b/runtime/src/iree/builtins/ukernel/query_tile_sizes_types.h
deleted file mode 100644
index a156339..0000000
--- a/runtime/src/iree/builtins/ukernel/query_tile_sizes_types.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2022 The IREE Aupackthors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#ifndef IREE_BUILTINS_QUERY_TILE_SIZES_TYPES_H_
-#define IREE_BUILTINS_QUERY_TILE_SIZES_TYPES_H_
-
-#include <assert.h>
-
-#include "iree/builtins/ukernel/common.h"
-
-// Parameters for a query_tile_sizes operation.
-typedef struct iree_uk_query_tile_sizes_2d_params_t {
-  iree_uk_uint32_t flags;
-  iree_uk_ssize_t size0;
-  iree_uk_ssize_t size1;
-  const iree_uk_uint64_t* cpu_data;
-} iree_uk_query_tile_sizes_2d_params_t;
-
-typedef struct iree_uk_query_tile_sizes_2d_out_params_t {
-  iree_uk_ssize_t tile_size0;
-  iree_uk_ssize_t tile_size1;
-} iree_uk_query_tile_sizes_2d_out_params_t;
-
-static inline iree_uk_uint32_t iree_uk_query_tile_sizes_operand_role(
-    iree_uk_uint32_t flags) {
-  return flags & IREE_UK_FLAG_QUERY_TILE_SIZES_OPERAND_ROLE_MASK_INTERNAL;
-}
-
-static inline iree_uk_uint32_t iree_uk_query_tile_sizes_operation(
-    iree_uk_uint32_t flags) {
-  return flags & IREE_UK_FLAG_QUERY_TILE_SIZES_OPERATION_MASK_INTERNAL;
-}
-
-// Internal use only. Holds matmul tile params as returned from architecture
-// specific backend code.
-typedef struct iree_uk_matmul_tile_sizes_t {
-  int M, K, N;
-} iree_uk_matmul_tile_sizes_t;
-
-#endif  // IREE_BUILTINS_QUERY_TILE_SIZES_TYPES_H_
diff --git a/runtime/src/iree/builtins/ukernel/tools/BUILD b/runtime/src/iree/builtins/ukernel/tools/BUILD
index 1f7da3e..d29f04d 100644
--- a/runtime/src/iree/builtins/ukernel/tools/BUILD
+++ b/runtime/src/iree/builtins/ukernel/tools/BUILD
@@ -19,7 +19,7 @@
     hdrs = ["ukernel_test_utils.h"],
     deps = [
         "//runtime/src/iree/base",
-        "//runtime/src/iree/builtins/ukernel:common",
+        "//runtime/src/iree/builtins/ukernel:headers",
         "//runtime/src/iree/schemas:cpu_data",
     ],
 )
diff --git a/runtime/src/iree/builtins/ukernel/tools/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/tools/CMakeLists.txt
index fe16eab..a0829dd 100644
--- a/runtime/src/iree/builtins/ukernel/tools/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/tools/CMakeLists.txt
@@ -19,7 +19,7 @@
     "ukernel_test_utils.cc"
   DEPS
     iree::base
-    iree::builtins::ukernel::common
+    iree::builtins::ukernel::headers
     iree::schemas::cpu_data
   PUBLIC
 )
diff --git a/runtime/src/iree/builtins/ukernel/tools/mmt4d_benchmark.c b/runtime/src/iree/builtins/ukernel/tools/mmt4d_benchmark.c
index efe1602..e8021ac 100644
--- a/runtime/src/iree/builtins/ukernel/tools/mmt4d_benchmark.c
+++ b/runtime/src/iree/builtins/ukernel/tools/mmt4d_benchmark.c
@@ -10,7 +10,7 @@
 #include "iree/base/api.h"
 #include "iree/base/internal/cpu.h"
 #include "iree/base/internal/flags.h"
-#include "iree/builtins/ukernel/mmt4d.h"
+#include "iree/builtins/ukernel/api.h"
 #include "iree/builtins/ukernel/tools/ukernel_test_utils.h"
 #include "iree/testing/benchmark.h"
 
diff --git a/runtime/src/iree/builtins/ukernel/tools/mmt4d_test.cc b/runtime/src/iree/builtins/ukernel/tools/mmt4d_test.cc
index 1177ac5..027cad1 100644
--- a/runtime/src/iree/builtins/ukernel/tools/mmt4d_test.cc
+++ b/runtime/src/iree/builtins/ukernel/tools/mmt4d_test.cc
@@ -43,12 +43,11 @@
 // things that we would otherwise prefer to keep internal in the mmt4d builtin
 // implementation, and would make e2e/matmul tests even more expensive.
 
-#include "iree/builtins/ukernel/mmt4d.h"
-
 #include <vector>
 
 #include "iree/base/api.h"
 #include "iree/base/internal/cpu.h"
+#include "iree/builtins/ukernel/api.h"
 #include "iree/builtins/ukernel/tools/ukernel_test_utils.h"
 #include "iree/testing/gtest.h"
 #include "iree/testing/status_matchers.h"
diff --git a/runtime/src/iree/builtins/ukernel/tools/pack_benchmark.c b/runtime/src/iree/builtins/ukernel/tools/pack_benchmark.c
index bf0c0aa..8de72d1 100644
--- a/runtime/src/iree/builtins/ukernel/tools/pack_benchmark.c
+++ b/runtime/src/iree/builtins/ukernel/tools/pack_benchmark.c
@@ -10,7 +10,7 @@
 #include "iree/base/api.h"
 #include "iree/base/internal/cpu.h"
 #include "iree/base/internal/flags.h"
-#include "iree/builtins/ukernel/pack.h"
+#include "iree/builtins/ukernel/api.h"
 #include "iree/builtins/ukernel/tools/ukernel_test_utils.h"
 #include "iree/testing/benchmark.h"
 
diff --git a/runtime/src/iree/builtins/ukernel/tools/pack_test.cc b/runtime/src/iree/builtins/ukernel/tools/pack_test.cc
index 9de8c9d..0da9d5e 100644
--- a/runtime/src/iree/builtins/ukernel/tools/pack_test.cc
+++ b/runtime/src/iree/builtins/ukernel/tools/pack_test.cc
@@ -4,14 +4,13 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/builtins/ukernel/pack.h"
-
 #include <algorithm>
 #include <cstring>
 #include <vector>
 
 #include "iree/base/api.h"
 #include "iree/base/internal/cpu.h"
+#include "iree/builtins/ukernel/api.h"
 #include "iree/builtins/ukernel/tools/ukernel_test_utils.h"
 #include "iree/testing/gtest.h"
 #include "iree/testing/status_matchers.h"
diff --git a/runtime/src/iree/builtins/ukernel/unpack.c b/runtime/src/iree/builtins/ukernel/unpack.c
index d22fa8c..5e51d3b 100644
--- a/runtime/src/iree/builtins/ukernel/unpack.c
+++ b/runtime/src/iree/builtins/ukernel/unpack.c
@@ -66,10 +66,6 @@
     iree_uk_ssize_swap(&tile_size0, &tile_size1);
     iree_uk_ssize_swap(&in_stride_l2, &in_stride_l3);
   }
-  assert(outer_size0 * tile_size0 >= params->out_size0);
-  assert(outer_size1 * tile_size1 >= params->out_size1);
-  assert((outer_size0 - 1) * tile_size0 < params->out_size0);
-  assert((outer_size1 - 1) * tile_size1 < params->out_size1);
   for (iree_uk_ssize_t outer_i0 = 0; outer_i0 < outer_size0; ++outer_i0) {
     for (iree_uk_ssize_t outer_i1 = 0; outer_i1 < outer_size1; ++outer_i1) {
       for (iree_uk_ssize_t tile_i0 = 0; tile_i0 < tile_size0; ++tile_i0) {
diff --git a/runtime/src/iree/builtins/ukernel/unpack.h b/runtime/src/iree/builtins/ukernel/unpack.h
index ceffbbb..c51047a 100644
--- a/runtime/src/iree/builtins/ukernel/unpack.h
+++ b/runtime/src/iree/builtins/ukernel/unpack.h
@@ -7,12 +7,60 @@
 #ifndef IREE_BUILTINS_UKERNEL_UNPACK_H_
 #define IREE_BUILTINS_UKERNEL_UNPACK_H_
 
-#include "iree/builtins/ukernel/unpack_types.h"
+#include "iree/builtins/ukernel/common.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 
+typedef enum iree_uk_unpack_type_t {
+  iree_uk_unpack_type_f32f32 = IREE_UK_TIE_2_TYPES_LITERAL(FLOAT_32, FLOAT_32),
+  iree_uk_unpack_type_i8i8 = IREE_UK_TIE_2_TYPES_LITERAL(INT_8, INT_8),
+  iree_uk_unpack_type_i32i32 = IREE_UK_TIE_2_TYPES_LITERAL(INT_32, INT_32),
+} iree_uk_unpack_type_t;
+
+static inline iree_uk_type_t iree_uk_unpack_in_type(
+    iree_uk_unpack_type_t type) {
+  return iree_uk_untie_type(0, type);
+}
+
+static inline iree_uk_type_t iree_uk_unpack_out_type(
+    iree_uk_unpack_type_t type) {
+  return iree_uk_untie_type(1, type);
+}
+
+// Parameters for an unpack operation.
+typedef struct iree_uk_unpack_params_t {
+  iree_uk_unpack_type_t type;
+  iree_uk_uint32_t flags;
+  iree_uk_ssize_t in_stride0;
+  iree_uk_ssize_t out_stride0;
+  iree_uk_ssize_t in_size0;
+  iree_uk_ssize_t in_size1;
+  iree_uk_ssize_t in_size2;
+  iree_uk_ssize_t in_size3;
+  iree_uk_ssize_t out_size0;
+  iree_uk_ssize_t out_size1;
+  const void* in_buffer;
+  void* out_buffer;
+  const iree_uk_uint64_t* cpu_data;
+} iree_uk_unpack_params_t;
+
+typedef void* (*iree_uk_unpack_tile_func_t)(
+    void* IREE_UK_RESTRICT /*out_tile_ptr*/,
+    const void* IREE_UK_RESTRICT /*in_tile_ptr*/,
+    iree_uk_ssize_t /*outer_size1*/, iree_uk_ssize_t /*out_stride_l1*/,
+    iree_uk_ssize_t /*in_stride0*/, iree_uk_ssize_t /*elem_size*/,
+    iree_uk_ssize_t /*tile_size0*/, iree_uk_ssize_t /*tile_size1*/);
+
+// Tile kernel declarations. Prototype matches iree_uk_unpack_tile_func_t.
+#define IREE_UK_UNPACK_TILE_FUNC_DECL(NAME)                              \
+  void* NAME(void* IREE_UK_RESTRICT out_tile_ptr,                        \
+             const void* IREE_UK_RESTRICT in_tile_ptr,                   \
+             iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1, \
+             iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size,      \
+             iree_uk_ssize_t tile_size0, iree_uk_ssize_t tile_size1);
+
 // Main entry point.
 IREE_UK_EXPORT void iree_uk_unpack(const iree_uk_unpack_params_t* params);
 
diff --git a/runtime/src/iree/builtins/ukernel/unpack_types.h b/runtime/src/iree/builtins/ukernel/unpack_types.h
deleted file mode 100644
index 472b820..0000000
--- a/runtime/src/iree/builtins/ukernel/unpack_types.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2022 The IREE Aupackthors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#ifndef IREE_BUILTINS_UKERNEL_UNPACK_TYPES_H_
-#define IREE_BUILTINS_UKERNEL_UNPACK_TYPES_H_
-
-#include <assert.h>
-
-#include "iree/builtins/ukernel/common.h"
-
-typedef enum iree_uk_unpack_type_t {
-  iree_uk_unpack_type_f32f32 = IREE_UK_TIE_2_TYPES_LITERAL(FLOAT_32, FLOAT_32),
-  iree_uk_unpack_type_i8i8 = IREE_UK_TIE_2_TYPES_LITERAL(INT_8, INT_8),
-  iree_uk_unpack_type_i32i32 = IREE_UK_TIE_2_TYPES_LITERAL(INT_32, INT_32),
-} iree_uk_unpack_type_t;
-
-static inline iree_uk_type_t iree_uk_unpack_in_type(
-    iree_uk_unpack_type_t type) {
-  return iree_uk_untie_type(0, type);
-}
-
-static inline iree_uk_type_t iree_uk_unpack_out_type(
-    iree_uk_unpack_type_t type) {
-  return iree_uk_untie_type(1, type);
-}
-
-// Parameters for a unpack operation.
-typedef struct iree_uk_unpack_params_t {
-  iree_uk_unpack_type_t type;
-  iree_uk_uint32_t flags;
-  iree_uk_ssize_t in_stride0;
-  iree_uk_ssize_t out_stride0;
-  iree_uk_ssize_t in_size0;
-  iree_uk_ssize_t in_size1;
-  iree_uk_ssize_t in_size2;
-  iree_uk_ssize_t in_size3;
-  iree_uk_ssize_t out_size0;
-  iree_uk_ssize_t out_size1;
-  const void* in_buffer;
-  void* out_buffer;
-  const iree_uk_uint64_t* cpu_data;
-} iree_uk_unpack_params_t;
-
-typedef void* (*iree_uk_unpack_tile_func_t)(
-    void* IREE_UK_RESTRICT /*out_tile_ptr*/,
-    const void* IREE_UK_RESTRICT /*in_tile_ptr*/,
-    iree_uk_ssize_t /*outer_size1*/, iree_uk_ssize_t /*out_stride_l1*/,
-    iree_uk_ssize_t /*in_stride0*/, iree_uk_ssize_t /*elem_size*/,
-    iree_uk_ssize_t /*tile_size0*/, iree_uk_ssize_t /*tile_size1*/);
-
-// Tile kernel declarations. Prototype matches iree_uk_unpack_tile_func_t.
-#define IREE_UK_UNPACK_TILE_FUNC_DECL(NAME)                              \
-  void* NAME(void* IREE_UK_RESTRICT out_tile_ptr,                        \
-             const void* IREE_UK_RESTRICT in_tile_ptr,                   \
-             iree_uk_ssize_t outer_size1, iree_uk_ssize_t out_stride_l1, \
-             iree_uk_ssize_t in_stride0, iree_uk_ssize_t elem_size,      \
-             iree_uk_ssize_t tile_size0, iree_uk_ssize_t tile_size1);
-
-#endif  // IREE_BUILTINS_UKERNEL_UNPACK_TYPES_H_
diff --git a/runtime/src/iree/modules/vmvx/module.c b/runtime/src/iree/modules/vmvx/module.c
index 198574b..f795185 100644
--- a/runtime/src/iree/modules/vmvx/module.c
+++ b/runtime/src/iree/modules/vmvx/module.c
@@ -19,11 +19,7 @@
 // Include the ukernel support library so that we can use its implementations
 // as fixed-function components of the runtime.
 #include "iree/base/internal/cpu.h"
-#include "iree/builtins/ukernel/elementwise.h"
-#include "iree/builtins/ukernel/mmt4d.h"
-#include "iree/builtins/ukernel/pack.h"
-#include "iree/builtins/ukernel/query_tile_sizes.h"
-#include "iree/builtins/ukernel/unpack.h"
+#include "iree/builtins/ukernel/api.h"
 
 #define IREE_VMVX_MODULE_VERSION_0_0 0x00000000u
 #define IREE_VMVX_MODULE_VERSION_LATEST IREE_VMVX_MODULE_VERSION_0_0