CPU ukernels as bitcode (x86-only for now) (#13460)

While #13433 enabled the use of micro kernels within codegen backends
using the plugin mechanism, here the ukernel code is compiled into a
bitcode library. This bitcode library is linked with the generated code
at compilation time. The lowering to LLVM inlines the exact micro kernel
needed from the micro kernel library for a particular architecture. To
enable this end-to-end flow, the following changes are needed

- Add an enum attribute to HAL : `IREE::HAL::CallingConvention` that
allows specifying what calling convention to use for the micro kernel
call. `Default` leaves the params as is, `ParameterStruct` packs all the
returns and arguments into a parameter struct to mimic ABIs like
https://github.com/openxla/iree/blob/6cf092d022810d4347353b23e5ce2688a166dd67/runtime/src/iree/builtins/ukernel/mmt4d.h#L16
- Couple of patterns are added to `ConvertToLLVM` pass, to handle the
lowering of the function definition and function call, in keeping with
the specified ABI
- Allow specification of `hal.import.fields` to specify `processor_data`
and `processor_id` on ukernel function defn. This then generates the
code to forward this information to the microkernels (similar to what is
done for external calls using the plugin mechanism)
- Propagate the target CPU features in `hal.executable.target` of the
dispatch into the micro kernel call. This allows the LLVM passes to walk
through the branching used to pick the right micro kernel function and
effectively inline that.

Co-authored-by: Benoit Jacob <benoitjacob@google.com>
diff --git a/build_tools/bazel/iree_bitcode_library.bzl b/build_tools/bazel/iree_bitcode_library.bzl
index a38fa45..b4f9b38 100644
--- a/build_tools/bazel/iree_bitcode_library.bzl
+++ b/build_tools/bazel/iree_bitcode_library.bzl
@@ -40,23 +40,25 @@
     for bitcode_src in srcs:
         bitcode_out = "%s_%s.bc" % (name, bitcode_src)
         bitcode_files.append(bitcode_out)
+        system_headers = ["immintrin.h"]
         native.genrule(
             name = "gen_%s" % (bitcode_out),
-            srcs = [bitcode_src],
+            srcs = [bitcode_src] + hdrs + [builtin_headers_dep],
             outs = [bitcode_out],
             cmd = " && ".join([
                 " ".join([
                     "$(location %s)" % (clang_tool),
-                    "-isystem $(BINDIR)/%s" % (builtin_headers_path),
+                    "-isystem $(BINDIR)/%s" % builtin_headers_path,
                     " ".join(copts),
                     " ".join(["-D%s" % (define) for define in defines]),
+                    " ".join(["-I $(BINDIR)/runtime/src"]),
+                    " ".join(["-I runtime/src"]),
                     "-o $(location %s)" % (bitcode_out),
                     "$(location %s)" % (bitcode_src),
                 ]),
             ]),
-            tools = hdrs + data + [
+            tools = data + [
                 clang_tool,
-                builtin_headers_dep,
             ],
             message = "Compiling %s to %s..." % (bitcode_src, bitcode_out),
             output_to_bindir = 1,
@@ -81,3 +83,40 @@
         output_to_bindir = 1,
         **kwargs
     )
+
+def iree_link_bitcode(
+        name,
+        bitcode_files,
+        out = None,
+        link_tool = "@llvm-project//llvm:llvm-link",
+        **kwargs):
+    """Builds an LLVM bitcode library from an input file via clang.
+
+    Args:
+        name: Name of the target.
+        bitcode_files: bitcode files to link together.
+        out: output file name (defaults to name.bc).
+        link_tool: llvm-link tool used for linking bitcode files.
+        **kwargs: any additional attributes to pass to the underlying rules.
+    """
+
+    bitcode_files_qualified = [(("//" + native.package_name() + "/" + b) if b.count(":") else b) for b in bitcode_files]
+
+    if not out:
+        out = "%s.bc" % (name)
+    native.genrule(
+        name = name,
+        srcs = bitcode_files_qualified,
+        outs = [out],
+        cmd = " && ".join([
+            " ".join([
+                "$(location %s)" % (link_tool),
+                "-o $(location %s)" % (out),
+                " ".join(["$(locations %s)" % (src) for src in bitcode_files_qualified]),
+            ]),
+        ]),
+        tools = [link_tool],
+        message = "Linking bitcode library %s to %s..." % (name, out),
+        output_to_bindir = 1,
+        **kwargs
+    )
diff --git a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
index 4cbc856..d10af72 100644
--- a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
+++ b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
@@ -515,6 +515,20 @@
                              f"{testonly_block}"
                              f"  PUBLIC\n)\n\n")
 
+  def iree_link_bitcode(self, name, bitcode_files, data=None, testonly=None):
+    name_block = self._convert_string_arg_block("NAME", name, quote=False)
+    bitcode_files_block = self._convert_srcs_block(
+        [f.replace(":", "/") for f in bitcode_files])
+    data_block = self._convert_target_list_block("DATA", data)
+    testonly_block = self._convert_option_block("TESTONLY", testonly)
+
+    self._converter.body += (f"iree_link_bitcode(\n"
+                             f"{name_block}"
+                             f"{bitcode_files_block}"
+                             f"{data_block}"
+                             f"{testonly_block}"
+                             f"  PUBLIC\n)\n\n")
+
   def iree_bytecode_module(self,
                            name,
                            src,
diff --git a/build_tools/cmake/iree_bitcode_library.cmake b/build_tools/cmake/iree_bitcode_library.cmake
index 0f17151..05ee273 100644
--- a/build_tools/cmake/iree_bitcode_library.cmake
+++ b/build_tools/cmake/iree_bitcode_library.cmake
@@ -34,10 +34,6 @@
   set(_CLANG_TOOL "$<TARGET_FILE:${IREE_CLANG_TARGET}>")
   set(_LINK_TOOL "$<TARGET_FILE:${IREE_LLVM_LINK_TARGET}>")
 
-  # These are copied as part of the clang build; we could allow the user to
-  # override this but it should be harmless.
-  set(_BUILTIN_HEADERS_PATH "${IREE_BINARY_DIR}/third_party/llvm-project/llvm/lib/clang/${CLANG_VERSION_MAJOR}/include/")
-
   if(_RULE_TESTONLY AND NOT IREE_BUILD_TESTS)
     return()
   endif()
@@ -48,7 +44,21 @@
     set(_OUT "${_RULE_NAME}.bc")
   endif()
 
-  set(_ARGS "-isystem ${_BUILTIN_HEADERS_PATH}")
+  # We need CLANG_VERSION_MAJOR to set up include directories. Unfortunately,
+  # Clang's own CMakeLists do not expose CLANG_VERSION_MAJOR to PARENT_SCOPE.
+  # Likewise with LLVM_VERSION_MAJOR. However, CLANG_EXECUTABLE_VERSION is
+  # CACHE'd, so we can access it, and it currently has the same value.
+  set(_CLANG_VERSION_MAJOR "${CLANG_EXECUTABLE_VERSION}")
+
+  # These are copied as part of the clang build; we could allow the user to
+  # override this but it should be harmless.
+  set(_BUILTIN_HEADERS_PATH "${IREE_BINARY_DIR}/llvm-project/lib/clang/${_CLANG_VERSION_MAJOR}/include/")
+
+  set(_ARGS "")
+  list(APPEND _ARGS "-isystem" "${_BUILTIN_HEADERS_PATH}")
+  list(APPEND _ARGS "-I" "${IREE_SOURCE_DIR}/runtime/src")
+  list(APPEND _ARGS "-I" "${IREE_BINARY_DIR}/runtime/src")
+  
   list(APPEND _ARGS "${_RULE_COPTS}")
   foreach(_DEFINE ${_RULE_DEFINES})
     list(APPEND _ARGS "-D${_DEFINE}")
@@ -70,6 +80,7 @@
         "${_BITCODE_FILE}"
       DEPENDS
         ${_CLANG_TOOL}
+        ${_LINK_TOOL}
         ${_BITCODE_SRC}
       COMMENT
         "Compiling ${_BITCODE_SRC} to ${_BITCODE_FILE}"
@@ -101,3 +112,66 @@
     DEPENDS "${_OUT}"
   )
 endfunction()
+
+# iree_link_bitcode()
+#
+# Builds an LLVM bitcode library from an input file via clang
+#
+# Parameters:
+# NAME: Name of target (see Note).
+# SRCS: Source files to pass to clang.
+# HDRS: Additional headers included by the source files.
+# COPTS: additional flags to pass to clang.
+# DEFINES: Preprocessor definitions to pass to clang.
+# DATA: Additional data required during compilation.
+# OUT: Output file name (defaults to NAME.bc).
+# PUBLIC: Add this so that this library will be exported under ${PACKAGE}::
+#     Also in IDE, target will appear in ${PACKAGE} folder while non PUBLIC
+#     will be in ${PACKAGE}/internal.
+# TESTONLY: When added, this target will only be built if IREE_BUILD_TESTS=ON.
+function(iree_link_bitcode)
+  cmake_parse_arguments(
+    _RULE
+    "PUBLIC;TESTONLY"
+    "NAME;OUT"
+    "SRCS;DEFINES;DATA"
+    ${ARGN}
+  )
+
+  set(_LINK_TOOL "$<TARGET_FILE:${IREE_LLVM_LINK_TARGET}>")
+
+  if(_RULE_TESTONLY AND NOT IREE_BUILD_TESTS)
+    return()
+  endif()
+
+  if(DEFINED _RULE_OUT)
+    set(_OUT "${_RULE_OUT}")
+  else()
+    set(_OUT "${_RULE_NAME}.bc")
+  endif()
+
+  set(_BITCODE_FILES "${_RULE_SRCS}")
+
+  add_custom_command(
+    OUTPUT
+      ${_OUT}
+    COMMAND
+      ${_LINK_TOOL}
+      ${_BITCODE_FILES}
+      "-o"
+      "${_OUT}"
+    DEPENDS
+      ${_LINK_TOOL}
+      ${_BITCODE_FILES}
+    COMMENT
+      "Linking bitcode to ${_OUT}"
+    VERBATIM
+  )
+
+  # Only add iree_${NAME} as custom target doesn't support aliasing to
+  # iree::${NAME}.
+  iree_package_name(_PACKAGE_NAME)
+  add_custom_target("${_PACKAGE_NAME}_${_RULE_NAME}"
+    DEPENDS "${_OUT}"
+  )
+endfunction()
diff --git a/build_tools/cmake/iree_trace_runner_test.cmake b/build_tools/cmake/iree_trace_runner_test.cmake
index 96ef411..eff4d41 100644
--- a/build_tools/cmake/iree_trace_runner_test.cmake
+++ b/build_tools/cmake/iree_trace_runner_test.cmake
@@ -47,6 +47,10 @@
     ${ARGN}
   )
 
+  if(CMAKE_CROSSCOMPILING AND "hostonly" IN_LIST _RULE_LABELS)
+    return()
+  endif()
+
   iree_package_name(_PACKAGE_NAME)
   set(_NAME "${_PACKAGE_NAME}_${_RULE_NAME}")
 
@@ -149,6 +153,10 @@
     ${ARGN}
   )
 
+  if(CMAKE_CROSSCOMPILING AND "hostonly" IN_LIST _RULE_LABELS)
+    return()
+  endif()
+
   # Omit tests for which the specified driver or target backend is not enabled.
   # This overlaps with directory exclusions and other filtering mechanisms.
   string(TOUPPER ${_RULE_DRIVER} _UPPERCASE_DRIVER)
@@ -293,6 +301,10 @@
     ${ARGN}
   )
 
+  if(CMAKE_CROSSCOMPILING AND "hostonly" IN_LIST _RULE_LABELS)
+    return()
+  endif()
+
   if(NOT DEFINED _RULE_TARGET_BACKENDS AND NOT DEFINED _RULE_DRIVERS)
     set(_RULE_TARGET_BACKENDS "vmvx" "vulkan-spirv" "llvm-cpu")
     set(_RULE_DRIVERS "local-task" "vulkan" "local-task")
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
index c4127a3..2899ee9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
@@ -72,6 +72,7 @@
         "//llvm-external-projects/iree-dialects:IREELinalgTransformDialect",
         "//llvm-external-projects/iree-dialects:IREELinalgTransformDialectPasses",
         "//runtime/src/iree/builtins/ukernel:exported_bits",
+        "//runtime/src/iree/schemas:cpu_data",
         "//runtime/src/iree/schemas/instruments",
         "@llvm-project//llvm:BinaryFormat",
         "@llvm-project//llvm:Support",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index d7af1fc..4c2bea1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -120,6 +120,7 @@
     iree::compiler::Dialect::Util::IR
     iree::compiler::Dialect::Util::Transforms
     iree::compiler::Utils
+    iree::schemas::cpu_data
     iree::schemas::instruments
   PUBLIC
 )
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
index e30ec4d..bc50628 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
@@ -666,6 +666,137 @@
   }
 };
 
+/// Helper method to get information about extra operands that need to be
+/// appended to a function defn/call operation.
+static SmallVector<StringRef> getExtraFields(Operation *forOp) {
+  SmallVector<StringRef> extraFields;
+  if (auto extraFieldsAttr =
+          forOp->getAttrOfType<ArrayAttr>("hal.import.fields")) {
+    extraFields = llvm::to_vector(llvm::map_range(
+        extraFieldsAttr.getValue(),
+        [](Attribute attr) { return attr.cast<StringAttr>().getValue(); }));
+  }
+  return extraFields;
+}
+
+/// Return calling convention to use for the operation.
+static IREE::HAL::CallingConvention getCallingConvention(Operation *forOp) {
+  auto cConv = IREE::HAL::CallingConvention::Default;
+  if (auto cConvAttr = forOp->getAttrOfType<IREE::HAL::CallingConventionAttr>(
+          "hal.import.cconv")) {
+    cConv = cConvAttr.getValue();
+  }
+  return cConv;
+}
+
+/// Lower func ops with specified ABI. Currently this pattern is triggered
+/// only for operations with the `hal.import.bitcode` attribute set.
+///
+/// Note: this is an LLVM::CallOp -> LLVM::CallOp rewrite that is introduced
+/// after all conversions are done. Importantly, this is not a conversion
+/// pattern.
+struct RewriteFuncOpABI : public OpRewritePattern<LLVM::LLVMFuncOp> {
+  RewriteFuncOpABI(HALDispatchABI &abi, LLVMTypeConverter &typeConverter)
+      : OpRewritePattern(&typeConverter.getContext()),
+        abi(abi),
+        typeConverter(typeConverter) {}
+
+  LogicalResult matchAndRewrite(LLVM::LLVMFuncOp funcOp,
+                                PatternRewriter &rewriter) const override {
+    if (!funcOp.isExternal()) {
+      return rewriter.notifyMatchFailure(funcOp, "skipping non-external calls");
+    }
+    if (!funcOp->hasAttr("hal.import.bitcode")) {
+      return rewriter.notifyMatchFailure(
+          funcOp, "callee is not imported using bitcode linkage; skipping");
+    }
+    IREE::HAL::CallingConvention cConv = getCallingConvention(funcOp);
+
+    SmallVector<StringRef> extraFields = getExtraFields(funcOp);
+    auto funcType = funcOp.getFunctionType();
+    FailureOr<LLVM::LLVMFunctionType> expectedType =
+        abi.getABIFunctionType(funcOp, cConv, funcType.getReturnTypes(),
+                               funcType.getParams(), extraFields);
+    if (failed(expectedType)) {
+      return rewriter.notifyMatchFailure(
+          funcOp,
+          "unable to get function type to match the calling convention");
+    }
+    if (abi.hasCompatibleFunctionSignature(
+            rewriter.getContext(), expectedType.value(),
+            funcType.getReturnTypes(), funcType.getParams())) {
+      return failure();
+    }
+    auto attrs = getPrunedAttributeList(
+        funcOp, llvm::to_vector(LLVM::LLVMFuncOp::getAttributeNames()));
+    SmallVector<DictionaryAttr> argAttrs;
+    if (auto currArgAttrs = funcOp.getArgAttrsAttr()) {
+      argAttrs =
+          llvm::to_vector(llvm::map_range(currArgAttrs, [](Attribute attr) {
+            return attr.cast<DictionaryAttr>();
+          }));
+    }
+    rewriter.create<LLVM::LLVMFuncOp>(
+        funcOp.getLoc(), funcOp.getName(), expectedType.value(),
+        funcOp.getLinkage(), funcOp.getDsoLocal(), funcOp.getCConv(), attrs,
+        argAttrs, funcOp.getFunctionEntryCount());
+    rewriter.eraseOp(funcOp);
+    return success();
+  }
+
+ private:
+  HALDispatchABI &abi;
+  LLVMTypeConverter &typeConverter;
+};
+
+/// Lower call ops with specified ABI. The ABI to use is looked up from the
+/// callee. Currently this pattern is triggered only for operations where the
+/// callee has the `hal.import.bitcode` attribute set.
+///
+/// Note: this is an LLVM::CallOp -> LLVM::CallOp rewrite that is introduced
+/// after all conversions are done. Importantly, this is not a conversion
+/// pattern.
+struct RewriteCallOpABI : public OpRewritePattern<LLVM::CallOp> {
+  RewriteCallOpABI(HALDispatchABI &abi, LLVMTypeConverter &typeConverter)
+      : OpRewritePattern(&typeConverter.getContext()),
+        abi(abi),
+        typeConverter(typeConverter) {}
+
+  LogicalResult matchAndRewrite(LLVM::CallOp callOp,
+                                PatternRewriter &rewriter) const override {
+    auto symbol = callOp.getCallableForCallee().dyn_cast<SymbolRefAttr>();
+    auto flatSymbol = symbol.dyn_cast_or_null<FlatSymbolRefAttr>();
+    if (!flatSymbol) return failure();
+
+    // Ensure the target function is extern.
+    // To support conversion inserting calls in local patterns that can't add
+    // global function symbols we assume any missing callee is extern.
+    auto calleeOp =
+        SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(callOp, symbol);
+    if (!calleeOp || !calleeOp->hasAttr("hal.import.bitcode") ||
+        !calleeOp.isExternal()) {
+      return rewriter.notifyMatchFailure(
+          callOp, "callee is not imported using bitcode linakge; skipping");
+    }
+
+    IREE::HAL::CallingConvention cConv = getCallingConvention(calleeOp);
+    SmallVector<StringRef> extraFields = getExtraFields(calleeOp);
+
+    FailureOr<SmallVector<Value>> results = abi.materializeABI(
+        callOp, calleeOp.getSymName(), cConv, callOp->getResultTypes(),
+        callOp->getOperands(), extraFields, rewriter);
+    if (failed(results)) {
+      return failure();
+    }
+    rewriter.replaceOp(callOp, *results);
+    return success();
+  }
+
+ private:
+  HALDispatchABI &abi;
+  LLVMTypeConverter &typeConverter;
+};
+
 /// Rewrites calls to extern functions to dynamic library import calls.
 /// The parent LLVMFuncOp must be compatible with HALDispatchABI.
 ///
@@ -702,12 +833,22 @@
     // let it fall through to the linker stage where it can be picked up either
     // from the runtime build (in the case of us producing static libraries) or
     // the user-specified object files (when producing dynamic libraries).
-    if (calleeOp->hasAttr("hal.import.static")) {
+    if (calleeOp->hasAttr("hal.import.static") ||
+        calleeOp->hasAttr("hal.import.bitcode")) {
       return rewriter.notifyMatchFailure(callOp,
                                          "external function is marked static "
                                          "and does not need an import wrapper");
     }
 
+    // The call may need some additional internal fields appended.
+    SmallVector<StringRef> extraFields;
+    if (auto extraFieldsAttr =
+            calleeOp->getAttrOfType<ArrayAttr>("hal.import.fields")) {
+      for (auto extraFieldAttr : extraFieldsAttr) {
+        extraFields.push_back(extraFieldAttr.cast<StringAttr>().getValue());
+      }
+    }
+
     // Allow multiple imports to alias by having their name explicitly
     // specified.
     StringRef importName = flatSymbol.getValue();
@@ -719,15 +860,6 @@
     // TODO(benvanik): way to determine if weak (maybe via linkage?).
     bool weak = false;
 
-    // The call may need some additional internal fields appended.
-    SmallVector<StringRef> extraFields;
-    if (auto extraFieldsAttr =
-            calleeOp->getAttrOfType<ArrayAttr>("hal.import.fields")) {
-      for (auto extraFieldAttr : extraFieldsAttr) {
-        extraFields.push_back(extraFieldAttr.cast<StringAttr>().getValue());
-      }
-    }
-
     // Rewrite the call to a dynamic import call.
     SmallVector<Value> results = abi.wrapAndCallImport(
         callOp, importName, weak, callOp->getResultTypes(),
@@ -955,8 +1087,8 @@
   // Rewrite any extern calls emitted to dynamic library imports.
   {
     RewritePatternSet patterns(&getContext());
-    patterns.insert<RewriteExternCallOpToDynamicImportCallOp>(abi,
-                                                              typeConverter);
+    patterns.insert<RewriteExternCallOpToDynamicImportCallOp, RewriteCallOpABI,
+                    RewriteFuncOpABI>(abi, typeConverter);
     if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns))))
       return signalPassFailure();
   }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
index da116a6..4b00a2d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
@@ -6,6 +6,7 @@
 
 #include "iree/compiler/Codegen/LLVMCPU/DispatchABI.h"
 
+#include "iree/schemas/cpu_data.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -13,6 +14,7 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
 
 static llvm::cl::opt<bool> clVerboseDebugInfo(
     "iree-codegen-llvm-verbose-debug-info",
@@ -22,6 +24,39 @@
 namespace mlir {
 namespace iree_compiler {
 
+// List of all defined llvm feature-name to bit pattern used to represent it.
+// This is derived based on the schema in `runtime/src/iree/schemas/`.
+// TODO(ravishankarm): This link to the runtime schemas needs to be broken.
+// Instead we should use a reflection callback to resolve arch guarded features
+// directly in the compiler.
+
+// Struct to capture tuple of llvm feature-name to bit pattern used to represent
+// it.
+struct iree_llvm_name_and_bit_pattern_t {
+  const char *llvm_name;
+  unsigned long long bit_pattern;
+};
+
+#define IREE_CPU_FEATURE_BIT_NAME(arch, field_index, bit_name) \
+  IREE_CPU_DATA##field_index##_##arch##_##bit_name
+
+#define IREE_CPU_FEATURE_NAME_AND_BIT_PATTERN(arch, field_index, bit_name, \
+                                              llvm_name)                   \
+  {llvm_name, IREE_CPU_FEATURE_BIT_NAME(arch, field_index, bit_name)},
+
+static const struct iree_llvm_name_and_bit_pattern_t
+    iree_llvm_name_and_bit_pattern_list[] = {
+
+#define IREE_CPU_FEATURE_BIT(arch, field_index, bit_pos, bit_name, llvm_name) \
+  IREE_CPU_FEATURE_NAME_AND_BIT_PATTERN(arch, field_index, bit_name, llvm_name)
+#include "iree/schemas/cpu_feature_bits.inl"
+#undef IREE_CPU_FEATURE_BIT
+
+};
+
+#undef IREE_CPU_FEATURE_NAME_AND_BIT_PATTERN
+#undef IREE_CPU_FEATURE_BIT_NAME
+
 //------------------------------------------------------------------------------
 // ExecutableLibraryDI
 //------------------------------------------------------------------------------
@@ -823,6 +858,10 @@
   }
 }
 
+Type HALDispatchABI::getProcessorIDType() {
+  return getFieldType(WorkgroupStateField::processor_id);
+}
+
 Value HALDispatchABI::loadProcessorID(Operation *forOp, OpBuilder &builder) {
   auto resultValue =
       loadFieldValue(forOp, WorkgroupStateField::processor_id, builder);
@@ -830,6 +869,79 @@
                       di.getBasicType(resultValue.getType()), builder);
 }
 
+Value HALDispatchABI::updateProcessorDataFromTargetAttr(
+    Operation *forOp, Value processorDataPtrValue, OpBuilder &builder) {
+  // Get the target attr.
+  IREE::HAL::ExecutableTargetAttr targetAttr =
+      IREE::HAL::ExecutableTargetAttr::lookup(forOp);
+  if (!targetAttr) {
+    return processorDataPtrValue;
+  }
+
+  // Lookup CPU features.
+  std::optional<NamedAttribute> cpuFeatures =
+      targetAttr.getConfiguration().getNamed("cpu_features");
+  if (!cpuFeatures) {
+    return processorDataPtrValue;
+  }
+
+  SmallVector<uint64_t> specifiedFeatureBitPatterns;
+  {
+    llvm::StringMap<uint64_t> featureToBitPattern;
+    for (auto [llvmName, bitPattern] : iree_llvm_name_and_bit_pattern_list) {
+      featureToBitPattern[llvmName] = bitPattern;
+    }
+    SmallVector<StringRef> cpuFeatureStrings;
+    cpuFeatures->getValue().cast<StringAttr>().getValue().split(
+        cpuFeatureStrings, ',', /*MakeSplit=*/-1, /*KeepEmpty=*/false);
+    for (auto featureString : cpuFeatureStrings) {
+      if (featureToBitPattern.count(featureString.drop_front())) {
+        specifiedFeatureBitPatterns.push_back(
+            featureToBitPattern.lookup(featureString.drop_front()));
+      }
+    }
+  }
+  if (specifiedFeatureBitPatterns.empty()) {
+    return processorDataPtrValue;
+  }
+
+  // Create a new stack allocation for the bit pattern.
+  Location loc = forOp->getLoc();
+  MLIRContext *context = forOp->getContext();
+  auto ptrType = LLVM::LLVMPointerType::get(context);
+  auto i64Ty = builder.getI64Type();
+  Value arraySize = builder.create<LLVM::ConstantOp>(
+      loc, i64Ty, builder.getI64IntegerAttr(ProcessorDataCapacity));
+  Value alloca = builder.create<LLVM::AllocaOp>(loc, ptrType, i64Ty, arraySize,
+                                                /*alignment=*/sizeof(uint64_t));
+  // Load the 0-th value.
+  Value srcData0 =
+      builder.create<LLVM::LoadOp>(loc, i64Ty, processorDataPtrValue);
+  // Set the specified CPU arch data.
+  for (auto bitPattern : specifiedFeatureBitPatterns) {
+    Value bitPatternVal = builder.create<LLVM::ConstantOp>(
+        loc, i64Ty, builder.getI64IntegerAttr(bitPattern));
+    srcData0 = builder.create<LLVM::OrOp>(loc, srcData0, bitPatternVal);
+  }
+  builder.create<LLVM::StoreOp>(loc, srcData0, alloca);
+  // Copy over the rest.
+  for (int64_t i = 1, e = ProcessorDataCapacity; i < e; ++i) {
+    Value loadPtr = builder.create<LLVM::GEPOp>(
+        loc, processorDataPtrValue.getType(), i64Ty, processorDataPtrValue,
+        LLVM::GEPArg(int32_t(i)), /*inbounds =*/true);
+    Value loadVal = builder.create<LLVM::LoadOp>(loc, i64Ty, loadPtr);
+    Value storePtr = builder.create<LLVM::GEPOp>(
+        loc, alloca.getType(), i64Ty, alloca, LLVM::GEPArg(int32_t(i)),
+        /*inbounds =*/true);
+    builder.create<LLVM::StoreOp>(loc, loadVal, storePtr);
+  }
+  return alloca;
+}
+
+Type HALDispatchABI::getProcessorDataType() {
+  return LLVM::LLVMPointerType::get(processorType.getContext());
+}
+
 Value HALDispatchABI::loadProcessorData(Operation *forOp, OpBuilder &builder) {
   // To get a pointer to the processor data we need to track pointers all the
   // way from the environment argument. This is redundant with loadFieldValue
@@ -848,7 +960,9 @@
       LLVM::LLVMPointerType::get(processorType), processorPtrValue,
       LLVM::GEPArg(int32_t(ProcessorField::data)),
       /*inbounds=*/true);
-  return buildValueDI(forOp, processorDataPtrValue, "processor_data",
+  Value updatedProcessorData =
+      updateProcessorDataFromTargetAttr(forOp, processorDataPtrValue, builder);
+  return buildValueDI(forOp, updatedProcessorData, "processor_data",
                       di.getPtrOf(di.getConstOf(di.getArrayOf(
                           di.getUint64T(), ProcessorDataCapacity))),
                       builder);
@@ -993,59 +1107,207 @@
   return callOp.getResult();
 }
 
-SmallVector<Value> HALDispatchABI::wrapAndCallImport(
-    Operation *forOp, StringRef importName, bool weak, TypeRange resultTypes,
-    ValueRange args, ArrayRef<StringRef> extraFields, OpBuilder &builder) {
-  auto loc = forOp->getLoc();
-  auto context = builder.getContext();
-
+// static
+std::optional<Type> HALDispatchABI::getParameterStructType(
+    TypeRange resultTypes, ValueRange args, TypeRange extraFieldsTypes) {
   // Struct types are ordered [results..., args...].
   SmallVector<Type> types(resultTypes);
   types.reserve(resultTypes.size() + args.size());
   for (Value arg : args) {
     types.push_back(typeConverter->convertType(arg.getType()));
   }
+  types.append(extraFieldsTypes.begin(), extraFieldsTypes.end());
+
+  if (types.empty()) {
+    return std::nullopt;
+  }
+  return LLVM::LLVMStructType::getLiteral(context, types);
+}
+
+// static
+std::tuple<Type, Value> HALDispatchABI::packIntoParameterStruct(
+    Operation *forOp, TypeRange resultTypes, ValueRange args,
+    ValueRange extraFields, OpBuilder &builder) {
+  Location loc = forOp->getLoc();
+  MLIRContext *context = builder.getContext();
 
   // Query any extra fields that were requested and append them to the struct.
-  SmallVector<Value> extraFieldValues;
-  for (auto extraField : extraFields) {
-    auto extraFieldValue = getExtraField(forOp, extraField, builder);
-    extraFieldValues.push_back(extraFieldValue);
-    types.push_back(extraFieldValue.getType());
+  auto extraFieldsTypes = llvm::to_vector(
+      llvm::map_range(extraFields, [](Value v) { return v.getType(); }));
+
+  std::optional<Type> structType =
+      getParameterStructType(resultTypes, args, extraFieldsTypes);
+
+  if (!structType) {
+    Type voidPtrType = LLVM::LLVMPointerType::get(context);
+    return {voidPtrType,
+            builder.create<LLVM::UndefOp>(loc, voidPtrType).getResult()};
   }
 
-  // Pack parameter structure.
-  Type structType;
-  Value paramsPtr, voidPtr;
-  auto voidPtrTy = LLVM::LLVMPointerType::get(context);
-  if (!types.empty()) {
-    // TODO(benvanik): set specific layout to match runtime.
-    structType = LLVM::LLVMStructType::getLiteral(context, types);
-    auto ptrStructType = LLVM::LLVMPointerType::get(context);
-    Value one = builder.create<LLVM::ConstantOp>(loc, builder.getI64Type(),
-                                                 builder.getIndexAttr(1));
-    paramsPtr =
-        builder.create<LLVM::AllocaOp>(loc, ptrStructType, structType, one,
-                                       /*alignment=*/0);
-    Value structVal = builder.create<LLVM::UndefOp>(loc, structType);
-    for (int64_t i = 0, e = args.size(); i < e; ++i) {
-      structVal = builder.create<LLVM::InsertValueOp>(loc, structVal, args[i],
-                                                      i + resultTypes.size());
-    }
-    for (int64_t i = 0, e = extraFieldValues.size(); i < e; ++i) {
-      structVal = builder.create<LLVM::InsertValueOp>(
-          loc, structVal, extraFieldValues[i],
-          i + resultTypes.size() + args.size());
-    }
-    // Store into the alloca'ed descriptor.
-    builder.create<LLVM::StoreOp>(loc, structVal, paramsPtr);
-    voidPtr = builder.create<LLVM::BitcastOp>(loc, voidPtrTy, paramsPtr);
-  } else {
-    voidPtr = builder.create<LLVM::UndefOp>(loc, voidPtrTy);
+  auto ptrStructType = LLVM::LLVMPointerType::get(context);
+  Value one = builder.create<LLVM::ConstantOp>(loc, builder.getI64Type(),
+                                               builder.getIndexAttr(1));
+  Value paramsPtr =
+      builder.create<LLVM::AllocaOp>(loc, ptrStructType, *structType, one,
+                                     /*alignment=*/0);
+  Value structVal = builder.create<LLVM::UndefOp>(loc, *structType);
+  for (int64_t i = 0, e = args.size(); i < e; ++i) {
+    structVal = builder.create<LLVM::InsertValueOp>(loc, structVal, args[i],
+                                                    i + resultTypes.size());
   }
+  for (int64_t i = 0, e = extraFields.size(); i < e; ++i) {
+    structVal = builder.create<LLVM::InsertValueOp>(
+        loc, structVal, extraFields[i], i + resultTypes.size() + args.size());
+  }
+  // Store into the alloca'ed descriptor.
+  builder.create<LLVM::StoreOp>(loc, structVal, paramsPtr);
+  return {*structType, paramsPtr};
+}
+
+// static
+FailureOr<LLVM::LLVMFunctionType> HALDispatchABI::getABIFunctionType(
+    Operation *forOp, IREE::HAL::CallingConvention cConv, TypeRange resultTypes,
+    TypeRange argTypes, ArrayRef<StringRef> extraFields) {
+  MLIRContext *context = forOp->getContext();
+  SmallVector<Type> extraFieldsTypes = llvm::to_vector(llvm::map_range(
+      extraFields, [&](StringRef name) { return getExtraFieldType(name); }));
+
+  // Check for extra fields already added.
+  if (argTypes.size() >= extraFieldsTypes.size()) {
+    if (llvm::all_of(llvm::zip(argTypes.take_back(extraFieldsTypes.size()),
+                               extraFieldsTypes),
+                     [](auto it) {
+                       auto lhsType = std::get<0>(it);
+                       auto rhsType = std::get<1>(it);
+                       return (lhsType.template isa<LLVM::LLVMPointerType>() &&
+                               rhsType.template isa<LLVM::LLVMPointerType>()) ||
+                              std::get<0>(it) == std::get<1>(it);
+                     })) {
+      // Extra fields already added. Drop them.
+      extraFieldsTypes.clear();
+    }
+  }
+
+  switch (cConv) {
+    case IREE::HAL::CallingConvention::Default: {
+      if (resultTypes.size() > 1) {
+        return forOp->emitOpError(
+            "Cannot have multiple return values for function");
+      }
+      Type resultType = resultTypes.size() == 1
+                            ? resultTypes[0]
+                            : LLVM::LLVMVoidType::get(context);
+      SmallVector<Type> allArgTypes = argTypes;
+      allArgTypes.append(extraFieldsTypes.begin(), extraFieldsTypes.end());
+      return LLVM::LLVMFunctionType::get(resultType, allArgTypes);
+    }
+    case IREE::HAL::CallingConvention::ParameterStruct:
+      return LLVM::LLVMFunctionType::get(LLVM::LLVMVoidType::get(context),
+                                         LLVM::LLVMPointerType::get(context));
+  }
+}
+
+// static
+bool HALDispatchABI::hasCompatibleFunctionSignature(
+    MLIRContext *context, LLVM::LLVMFunctionType funcType,
+    TypeRange resultTypes, TypeRange paramTypes) {
+  TypeRange funcParamTypes = funcType.getParams();
+  if (funcParamTypes.size() != paramTypes.size()) {
+    return false;
+  }
+  if (!llvm::all_of(llvm::zip(funcParamTypes, paramTypes), [](auto it) {
+        auto lhsType = std::get<0>(it);
+        auto rhsType = std::get<1>(it);
+        return (lhsType.template isa<LLVM::LLVMPointerType>() &&
+                rhsType.template isa<LLVM::LLVMPointerType>()) ||
+               std::get<0>(it) == std::get<1>(it);
+      })) {
+    return false;
+  }
+  if (resultTypes.size() > 1) {
+    return false;
+  }
+  Type funcResultType = funcType.getReturnType();
+  if (resultTypes.empty() &&
+      funcResultType != LLVM::LLVMVoidType::get(context)) {
+    return false;
+  }
+  if (resultTypes.size() == 1 && resultTypes[0] != funcResultType) {
+    return false;
+  }
+  return true;
+}
+
+FailureOr<SmallVector<Value>> HALDispatchABI::materializeABI(
+    Operation *forOp, StringRef symbolName, IREE::HAL::CallingConvention cConv,
+    TypeRange resultTypes, ValueRange args, ArrayRef<StringRef> extraFields,
+    RewriterBase &rewriter) {
+  auto argTypes = llvm::to_vector(
+      llvm::map_range(args, [](Value v) { return v.getType(); }));
+  FailureOr<LLVM::LLVMFunctionType> abiFunctionType =
+      getABIFunctionType(forOp, cConv, resultTypes, argTypes, extraFields);
+  if (failed(abiFunctionType)) {
+    return forOp->emitOpError(
+        "failed to get function type for calling convention");
+  }
+  if (hasCompatibleFunctionSignature(rewriter.getContext(),
+                                     abiFunctionType.value(), resultTypes,
+                                     argTypes)) {
+    return rewriter.notifyMatchFailure(
+        forOp, "no change in function signature. skipping");
+  }
+
+  // Combined args list.
+  SmallVector<Value> allArgsList = llvm::to_vector(args);
+  SmallVector<Value> extraFieldVals =
+      llvm::to_vector(llvm::map_range(extraFields, [&](StringRef fieldName) {
+        return getExtraField(forOp, fieldName, rewriter);
+      }));
+  allArgsList.append(extraFieldVals);
+
+  Location loc = forOp->getLoc();
+  if (cConv == IREE::HAL::CallingConvention::Default) {
+    auto callOp = rewriter.create<LLVM::CallOp>(
+        loc, abiFunctionType->getReturnTypes(), allArgsList, forOp->getAttrs());
+    return llvm::to_vector(llvm::map_range(
+        callOp.getResults(), [](OpResult v) -> Value { return v; }));
+  }
+
+  if (cConv == IREE::HAL::CallingConvention::ParameterStruct) {
+    auto [structType, paramsStructPtr] = packIntoParameterStruct(
+        forOp, resultTypes, args, extraFieldVals, rewriter);
+    rewriter.create<LLVM::CallOp>(loc, TypeRange{}, paramsStructPtr,
+                                  forOp->getAttrs());
+    SmallVector<Value> results;
+    if (!resultTypes.empty()) {
+      results.reserve(resultTypes.size());
+      Value structVal =
+          rewriter.create<LLVM::LoadOp>(loc, structType, paramsStructPtr);
+      for (int64_t i = 0, e = resultTypes.size(); i < e; ++i) {
+        results.push_back(
+            rewriter.create<LLVM::ExtractValueOp>(loc, structVal, i));
+      }
+    }
+    return results;
+  }
+  return forOp->emitOpError("unhandled calling convention");
+}
+
+SmallVector<Value> HALDispatchABI::wrapAndCallImport(
+    Operation *forOp, StringRef importName, bool weak, TypeRange resultTypes,
+    ValueRange args, ArrayRef<StringRef> extraFields, OpBuilder &builder) {
+  auto loc = forOp->getLoc();
+
+  SmallVector<Value> extraFieldVals =
+      llvm::to_vector(llvm::map_range(extraFields, [&](StringRef fieldName) {
+        return getExtraField(forOp, fieldName, builder);
+      }));
+
+  auto [structType, paramsPtr] = packIntoParameterStruct(
+      forOp, resultTypes, args, extraFieldVals, builder);
 
   // Calls return 0 (success) or non-zero (failure).
-  auto callResult = callImport(forOp, importName, weak, voidPtr, builder);
+  auto callResult = callImport(forOp, importName, weak, paramsPtr, builder);
   Block *trueDest =
       builder.getInsertionBlock()->splitBlock(++builder.getInsertionPoint());
   Block *falseDest = builder.createBlock(trueDest);
@@ -1129,6 +1391,10 @@
   return builder.create<LLVM::ExtractValueOp>(loc, stateValue, position);
 }
 
+Type HALDispatchABI::getFieldType(WorkgroupStateField field) {
+  return workgroupStateType.getBody()[int64_t(field)];
+}
+
 Value HALDispatchABI::loadFieldValue(Operation *forOp,
                                      WorkgroupStateField field,
                                      OpBuilder &builder) {
@@ -1142,6 +1408,17 @@
   return builder.create<LLVM::ExtractValueOp>(loc, stateValue, position);
 }
 
+Type HALDispatchABI::getExtraFieldType(StringRef extraField) {
+  if (extraField == "processor_id") {
+    return getProcessorIDType();
+  }
+  if (extraField == "processor_data") {
+    return getProcessorDataType();
+  }
+  assert(false && "unhandled extra filed");
+  return {};
+}
+
 Value HALDispatchABI::getExtraField(Operation *forOp, StringRef extraField,
                                     OpBuilder &builder) {
   if (extraField == "processor_id") {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.h
index 07278b8..e21cb59 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.h
@@ -267,9 +267,11 @@
   // Loads the processor ID the code is (most likely) being run on.
   // Equivalent to:
   //   uint32_t processor_id = state->processor_id;
+  Type getProcessorIDType();
   Value loadProcessorID(Operation *forOp, OpBuilder &builder);
 
   // Loads a pointer to the processor information data fields.
+  Type getProcessorDataType();
   Value loadProcessorData(Operation *forOp, OpBuilder &builder);
 
   // Loads a processor information data field at the given index.
@@ -322,6 +324,39 @@
                                        ArrayRef<StringRef> extraFields,
                                        OpBuilder &builder);
 
+  //===--------------------------------------------------------------------==//
+  // External/bitcode function ABI handling methods.
+  //===--------------------------------------------------------------------==//
+  // Methods required for handling ABI for functions whose definitions are
+  // external.
+
+  /// Check if the `funcType` is equivalent to a function with return being
+  /// of type `resultTypes` and operands of type `paramTypes`.
+  static bool hasCompatibleFunctionSignature(MLIRContext *context,
+                                             LLVM::LLVMFunctionType funcType,
+                                             TypeRange resultTypes,
+                                             TypeRange paramTypes);
+
+  /// Given a calling convention `cConv`, and callee with return of
+  /// `resultTypes` and operands with type `argTypes`, along with extra fields
+  /// to append to argument list specified in `extraFields`; return the function
+  /// type of use for the function that implements the specified calling
+  /// convention.
+  FailureOr<LLVM::LLVMFunctionType> getABIFunctionType(
+      Operation *forOp, IREE::HAL::CallingConvention cConv,
+      TypeRange resultTypes, TypeRange argTypes,
+      ArrayRef<StringRef> extraFields);
+
+  /// Given a calling convention `cConv`, and callee with return of
+  /// `resultTypes` and operands with type `argTypes`, along with extra fields
+  /// to append to argument list specified in `extraFields`; modify the `callOp`
+  /// to implement the specified ABI. The calleee signature is expected to have
+  /// been/to be modified separately, i.e. it isnt done within this method.
+  FailureOr<SmallVector<Value>> materializeABI(
+      Operation *callOp, StringRef symbolName,
+      IREE::HAL::CallingConvention cConv, TypeRange resultTypes,
+      ValueRange args, ArrayRef<StringRef> extraFields, RewriterBase &builder);
+
  private:
   Value getIndexValue(Location loc, int64_t value, OpBuilder &builder);
 
@@ -334,12 +369,38 @@
                        OpBuilder &builder);
   Value loadFieldValue(Operation *forOp, DispatchStateField field,
                        OpBuilder &builder);
+  Type getFieldType(WorkgroupStateField field);
   Value loadFieldValue(Operation *forOp, WorkgroupStateField field,
                        OpBuilder &builder);
 
+  Type getExtraFieldType(StringRef extraField);
   Value getExtraField(Operation *forOp, StringRef extraField,
                       OpBuilder &builder);
 
+  // Update the processor data based on the `cpu_features` present in
+  // `executable.target` attribute.
+  Value updateProcessorDataFromTargetAttr(Operation *forOp,
+                                          Value processorDataPtrValue,
+                                          OpBuilder &builder);
+
+  // Return LLVM Struct type that represents a container for arguments
+  // and return types. The struct type are ordered [results..., args...]
+  std::optional<Type> getParameterStructType(TypeRange resultTypes,
+                                             ValueRange args,
+                                             TypeRange extraFieldsTypes);
+  // For a given call operation, generate the struct that is the container
+  // for passing the arguments.
+  //
+  // The provided |resultTypes| and |args| are packed in a struct and transit
+  // through memory so that we can expose a single void* argument. Optionally
+  // |extraFields| can be specified with an ordered list of field names to be
+  // appended to the end of the struct.
+  std::tuple<Type, Value> packIntoParameterStruct(Operation *forOp,
+                                                  TypeRange resultTypes,
+                                                  ValueRange args,
+                                                  ValueRange extraFields,
+                                                  OpBuilder &builder);
+
   mlir::MLIRContext *context;
   LLVMTypeConverter *typeConverter;
   LLVM::LLVMStructType processorType;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerToUKernels.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerToUKernels.cpp
index 1d0a210..01688b8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerToUKernels.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerToUKernels.cpp
@@ -63,6 +63,13 @@
     result.defAttrs.emplace_back(
         rewriter.getStringAttr("hal.import.fields"),
         rewriter.getArrayAttr({rewriter.getStringAttr("processor_data")}));
+    result.defAttrs.emplace_back(rewriter.getStringAttr("hal.import.bitcode"),
+                                 rewriter.getBoolAttr(true));
+    result.defAttrs.emplace_back(
+        rewriter.getStringAttr("hal.import.cconv"),
+        IREE::HAL::CallingConventionAttr::get(
+            rewriter.getContext(),
+            IREE::HAL::CallingConvention::ParameterStruct));
   }
   return result;
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
index 38017ed..7acae65 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt -iree-convert-to-llvm %s | FileCheck %s
+// RUN: iree-opt -iree-convert-to-llvm --split-input-file %s | FileCheck %s
 
 builtin.module {
   func.func private @extern_public()
@@ -13,3 +13,163 @@
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32
 //      CHECK:     llvm.return %{{.+}} : i32
 
+// -----
+
+module {
+  func.func private @default_cconv_with_extra_fields(memref<f32>, i32, f64) -> (f32) attributes {
+      hal.import.bitcode = true,
+      hal.import.cconv = 0 : i32,
+      hal.import.fields = ["processor_data", "processor_id"],
+      llvm.bareptr = true
+  }
+  func.func @bar() {
+    %c0 = arith.constant 42 : i32
+    %c1 = arith.constant 42.0 : f64
+    %0 = memref.alloca() : memref<f32>
+    %1 = call @default_cconv_with_extra_fields(%0, %c0, %c1) : (memref<f32>, i32, f64) -> (f32)
+    return
+  }
+}
+//      CHECK: llvm.func @default_cconv_with_extra_fields(!llvm.ptr, i32, f64, !llvm.ptr, i32) -> f32
+//      CHECK: llvm.func @bar
+//  CHECK-DAG:   %[[Ci32:.+]] = llvm.mlir.constant(42 : i32) : i32
+//  CHECK-DAG:   %[[Cf64:.+]] = llvm.mlir.constant(4.200000e+01 : f64) : f64
+//  CHECK-DAG:   %[[ALLOCA:.+]] = llvm.alloca
+//  CHECK-DAG:   %[[DATA:.+]] = llvm.getelementptr inbounds %arg0[4]
+//  CHECK-DAG:   %[[PROCESSOR_INFO:.+]] = llvm.load %arg2
+//      CHECK:   %[[PROCESSOR_ID:.+]] = llvm.extractvalue %[[PROCESSOR_INFO]][4]
+//      CHECK: %[[VAL:.+]] = llvm.call @default_cconv_with_extra_fields
+// CHECK-SAME:     (%[[ALLOCA]], %[[Ci32]], %[[Cf64]], %[[DATA]], %[[PROCESSOR_ID]])
+
+// -----
+
+module {
+  func.func private @paramstruct_cconv_with_extra_fields(memref<f32>, i32, f64) -> (f32) attributes {
+      hal.import.bitcode = true,
+      hal.import.cconv = 1 : i32,
+      hal.import.fields = ["processor_data", "processor_id"],
+      llvm.bareptr = true
+  }
+  func.func @bar() {
+    %c0 = arith.constant 42 : i32
+    %c1 = arith.constant 42.0 : f64
+    %0 = memref.alloca() : memref<f32>
+    %1 = call @paramstruct_cconv_with_extra_fields(%0, %c0, %c1) : (memref<f32>, i32, f64) -> (f32)
+    return
+  }
+}
+//      CHECK: llvm.func @paramstruct_cconv_with_extra_fields(!llvm.ptr)
+//      CHECK: llvm.func @bar
+//  CHECK-DAG:   %[[C1:.+]] = llvm.mlir.constant(1 : index) : i64
+//  CHECK-DAG:   %[[Ci32:.+]] = llvm.mlir.constant(42 : i32) : i32
+//  CHECK-DAG:   %[[Cf64:.+]] = llvm.mlir.constant(4.200000e+01 : f64) : f64
+//  CHECK-DAG:   %[[ALLOCA:.+]] = llvm.alloca
+//  CHECK-DAG:   %[[DATA:.+]] = llvm.getelementptr inbounds %arg0[4]
+//  CHECK-DAG:   %[[PROCESSOR_INFO:.+]] = llvm.load %arg2
+//      CHECK:   %[[PROCESSOR_ID:.+]] = llvm.extractvalue %[[PROCESSOR_INFO]][4]
+//      CHECK:   %[[PARAMSTRUCT_ALLOCA:.+]] = llvm.alloca %[[C1]] x !llvm.struct<(f32, ptr, i32, f64, ptr, i32)>
+//      CHECK:   %[[PARAMSTRUCT:.+]] = llvm.mlir.undef : !llvm.struct<(f32, ptr, i32, f64, ptr, i32)>
+//      CHECK:   %[[INSERT_ARG0:.+]] = llvm.insertvalue %[[ALLOCA]], %[[PARAMSTRUCT]][1]
+//      CHECK:   %[[INSERT_ARG1:.+]] = llvm.insertvalue %[[Ci32]], %[[INSERT_ARG0]][2]
+//      CHECK:   %[[INSERT_ARG2:.+]] = llvm.insertvalue %[[Cf64]], %[[INSERT_ARG1]][3]
+//      CHECK:   %[[INSERT_ARG3:.+]] = llvm.insertvalue %[[DATA]], %[[INSERT_ARG2]][4]
+//      CHECK:   %[[INSERT_ARG4:.+]] = llvm.insertvalue %[[PROCESSOR_ID]], %[[INSERT_ARG3]][5]
+//      CHECK:   llvm.store %[[INSERT_ARG4]], %[[PARAMSTRUCT_ALLOCA]]
+//      CHECK:   llvm.call @paramstruct_cconv_with_extra_fields(%[[PARAMSTRUCT_ALLOCA]])
+
+// -----
+
+module attributes {
+  hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
+      cpu = "haswell",
+      cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+fma,+bmi,+bmi2,+pclmul,+cx16,+cx8,+crc32,+f16c,+fsgsbase,+fxsr,+invpcid,+lzcnt,+movbe,+rdrnd,+sahf,+x87,+xsave,+xsaveopt",
+      data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+      native_vector_size = 32 : index,
+      target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>} {
+  func.func private @paramstruct_cconv_with_extra_fields_and_executable_target(memref<f32>, i32, f64) -> (f32) attributes {
+      hal.import.bitcode = true,
+      hal.import.cconv = 1 : i32,
+      hal.import.fields = ["processor_data", "processor_id"],
+      llvm.bareptr = true
+  }
+  func.func @bar() {
+    %c0 = arith.constant 42 : i32
+    %c1 = arith.constant 42.0 : f64
+    %0 = memref.alloca() : memref<f32>
+    %1 = call @paramstruct_cconv_with_extra_fields_and_executable_target(%0, %c0, %c1) : (memref<f32>, i32, f64) -> (f32)
+    return
+  }
+}
+//      CHECK: llvm.func @paramstruct_cconv_with_extra_fields_and_executable_target(!llvm.ptr)
+//      CHECK: llvm.func @bar
+//  CHECK-DAG:   %[[F16C:.+]] = llvm.mlir.constant(16384 : i64) : i64
+//  CHECK-DAG:   %[[FMA:.+]] = llvm.mlir.constant(2048 : i64) : i64
+//  CHECK-DAG:   %[[AVX2:.+]] = llvm.mlir.constant(32768 : i64) : i64
+//  CHECK-DAG:   %[[AVX:.+]] = llvm.mlir.constant(1024 : i64) : i64
+//  CHECK-DAG:   %[[SSE41:.+]] = llvm.mlir.constant(4 : i64) : i64
+//  CHECK-DAG:   %[[SSSE3:.+]] = llvm.mlir.constant(2 : i64) : i64
+//  CHECK-DAG:   %[[SSE3:.+]] = llvm.mlir.constant(1 : i64) : i64
+//  CHECK-DAG:   %[[C8:.+]] = llvm.mlir.constant(8 : i64) : i64
+//  CHECK-DAG:   %[[C1:.+]] = llvm.mlir.constant(1 : index) : i64
+//  CHECK-DAG:   %[[Ci32:.+]] = llvm.mlir.constant(42 : i32) : i32
+//  CHECK-DAG:   %[[Cf64:.+]] = llvm.mlir.constant(4.200000e+01 : f64) : f64
+//  CHECK-DAG:   %[[ALLOCA:.+]] = llvm.alloca
+//  CHECK-DAG:   %[[DATA_PTR:.+]] = llvm.getelementptr inbounds %arg0[4]
+//      CHECK:   %[[PROCESSOR_DATA_ALLOCA:.+]] = llvm.alloca %[[C8]] x i64 {alignment = 8 : i64}
+//  CHECK-DAG:   %[[DATA:.+]] = llvm.load %[[DATA_PTR]]
+//  CHECK-DAG:   %[[OR0:.+]] = llvm.or %[[DATA]], %[[SSE3]]
+//  CHECK-DAG:   %[[OR1:.+]] = llvm.or %[[OR0]], %[[SSSE3]]
+//  CHECK-DAG:   %[[OR2:.+]] = llvm.or %[[OR1]], %[[SSE41]]
+//  CHECK-DAG:   %[[OR3:.+]] = llvm.or %[[OR2]], %[[C8]]
+//  CHECK-DAG:   %[[OR4:.+]] = llvm.or %[[OR3]], %[[AVX]]
+//  CHECK-DAG:   %[[OR5:.+]] = llvm.or %[[OR4]], %[[AVX2]]
+//  CHECK-DAG:   %[[OR6:.+]] = llvm.or %[[OR5]], %[[FMA]]
+//  CHECK-DAG:   %[[OR7:.+]] = llvm.or %[[OR6]], %[[F16C]]
+//      CHECK:   llvm.store %[[OR7]], %[[PROCESSOR_DATA_ALLOCA]]
+
+//      CHECK:   %[[PROCESSOR_DATA_PTR_1:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][1]
+//      CHECK:   %[[PROCESSOR_DATA_1:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_1]]
+//      CHECK:   %[[PROCESSOR_DATA_ALLOCA_PTR_1:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][1]
+//      CHECK:   llvm.store %[[PROCESSOR_DATA_1]], %[[PROCESSOR_DATA_ALLOCA_PTR_1]]
+
+//      CHECK:   %[[PROCESSOR_DATA_PTR_2:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][2]
+//      CHECK:   %[[PROCESSOR_DATA_2:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_2]]
+//      CHECK:   %[[PROCESSOR_DATA_ALLOCA_PTR_2:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][2]
+//      CHECK:   llvm.store %[[PROCESSOR_DATA_2]], %[[PROCESSOR_DATA_ALLOCA_PTR_2]]
+
+//      CHECK:   %[[PROCESSOR_DATA_PTR_3:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][3]
+//      CHECK:   %[[PROCESSOR_DATA_3:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_3]]
+//      CHECK:   %[[PROCESSOR_DATA_ALLOCA_PTR_3:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][3]
+//      CHECK:   llvm.store %[[PROCESSOR_DATA_3]], %[[PROCESSOR_DATA_ALLOCA_PTR_3]]
+
+//      CHECK:   %[[PROCESSOR_DATA_PTR_4:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][4]
+//      CHECK:   %[[PROCESSOR_DATA_4:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_4]]
+//      CHECK:   %[[PROCESSOR_DATA_ALLOCA_PTR_4:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][4]
+//      CHECK:   llvm.store %[[PROCESSOR_DATA_4]], %[[PROCESSOR_DATA_ALLOCA_PTR_4]]
+
+//      CHECK:   %[[PROCESSOR_DATA_PTR_5:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][5]
+//      CHECK:   %[[PROCESSOR_DATA_5:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_5]]
+//      CHECK:   %[[PROCESSOR_DATA_ALLOCA_PTR_5:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][5]
+//      CHECK:   llvm.store %[[PROCESSOR_DATA_5]], %[[PROCESSOR_DATA_ALLOCA_PTR_5]]
+
+//      CHECK:   %[[PROCESSOR_DATA_PTR_6:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][6]
+//      CHECK:   %[[PROCESSOR_DATA_6:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_6]]
+//      CHECK:   %[[PROCESSOR_DATA_ALLOCA_PTR_6:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][6]
+//      CHECK:   llvm.store %[[PROCESSOR_DATA_6]], %[[PROCESSOR_DATA_ALLOCA_PTR_6]]
+
+//      CHECK:   %[[PROCESSOR_DATA_PTR_7:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][7]
+//      CHECK:   %[[PROCESSOR_DATA_7:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_7]]
+//      CHECK:   %[[PROCESSOR_DATA_ALLOCA_PTR_7:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][7]
+//      CHECK:   llvm.store %[[PROCESSOR_DATA_7]], %[[PROCESSOR_DATA_ALLOCA_PTR_7]]
+
+//  CHECK-DAG:   %[[PROCESSOR_INFO:.+]] = llvm.load %arg2
+//      CHECK:   %[[PROCESSOR_ID:.+]] = llvm.extractvalue %[[PROCESSOR_INFO]][4]
+//      CHECK:   %[[PARAMSTRUCT_ALLOCA:.+]] = llvm.alloca %[[C1]] x !llvm.struct<(f32, ptr, i32, f64, ptr, i32)>
+//      CHECK:   %[[PARAMSTRUCT:.+]] = llvm.mlir.undef : !llvm.struct<(f32, ptr, i32, f64, ptr, i32)>
+//      CHECK:   %[[INSERT_ARG0:.+]] = llvm.insertvalue %[[ALLOCA]], %[[PARAMSTRUCT]][1]
+//      CHECK:   %[[INSERT_ARG1:.+]] = llvm.insertvalue %[[Ci32]], %[[INSERT_ARG0]][2]
+//      CHECK:   %[[INSERT_ARG2:.+]] = llvm.insertvalue %[[Cf64]], %[[INSERT_ARG1]][3]
+//      CHECK:   %[[INSERT_ARG3:.+]] = llvm.insertvalue %[[PROCESSOR_DATA_ALLOCA]], %[[INSERT_ARG2]][4]
+//      CHECK:   %[[INSERT_ARG4:.+]] = llvm.insertvalue %[[PROCESSOR_ID]], %[[INSERT_ARG3]][5]
+//      CHECK:   llvm.store %[[INSERT_ARG4]], %[[PARAMSTRUCT_ALLOCA]]
+//      CHECK:   llvm.call @paramstruct_cconv_with_extra_fields_and_executable_target(%[[PARAMSTRUCT_ALLOCA]])
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td
index 6794a0c..39735f0 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td
@@ -306,6 +306,18 @@
   let cppNamespace = "::mlir::iree_compiler::IREE::HAL";
 }
 
+def HAL_CallingConvention_Default : I32EnumAttrCase<"Default", 0>;
+def HAL_CallingConvention_ParameterStruct : I32EnumAttrCase<"ParameterStruct", 1>;
+def HAL_CallingConventionAttr :
+    I32EnumAttr<
+        "CallingConvention",
+        "Calling conversions for linked functions",[
+          HAL_CallingConvention_Default,
+          HAL_CallingConvention_ParameterStruct,
+        ]>{
+  let cppNamespace = "::mlir::iree_compiler::IREE::HAL";
+}
+
 //===----------------------------------------------------------------------===//
 // HAL types
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/BUILD.bazel
index eaa2367..aa1c5b2 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/BUILD.bazel
@@ -17,15 +17,18 @@
     srcs = [
         "Device.cpp",
         "Musl.cpp",
+        "UKernel.cpp",
     ],
     hdrs = [
         "Device.h",
         "Musl.h",
+        "UKernel.h",
     ],
     deps = [
         "//compiler/src/iree/compiler/Dialect/HAL/IR",
         "//runtime/src/iree/builtins/device:libdevice_bitcode",
         "//runtime/src/iree/builtins/musl/bin:libmusl",
+        "//runtime/src/iree/builtins/ukernel:libukernel_bitcode",
         "@llvm-project//llvm:BitReader",
         "@llvm-project//llvm:Core",
         "@llvm-project//llvm:Support",
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/CMakeLists.txt
index 75b4838..c89659c 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/CMakeLists.txt
@@ -16,9 +16,11 @@
   HDRS
     "Device.h"
     "Musl.h"
+    "UKernel.h"
   SRCS
     "Device.cpp"
     "Musl.cpp"
+    "UKernel.cpp"
   DEPS
     LLVMBitReader
     LLVMCore
@@ -27,6 +29,7 @@
     MLIRSupport
     iree::builtins::device::libdevice_bitcode
     iree::builtins::musl::bin::libmusl
+    iree::builtins::ukernel::libukernel_bitcode
     iree::compiler::Dialect::HAL::IR
   PUBLIC
 )
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.cpp
new file mode 100644
index 0000000..e06b687
--- /dev/null
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.cpp
@@ -0,0 +1,64 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h"
+
+#include "iree/builtins/ukernel/libukernel.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Support/MemoryBufferRef.h"
+#include "mlir/Support/LLVM.h"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace HAL {
+
+static const iree_file_toc_t *lookupUKernelFile(StringRef filename) {
+  for (size_t i = 0; i < iree_builtins_libukernel_size(); ++i) {
+    const auto &file_toc = iree_builtins_libukernel_create()[i];
+    if (filename == file_toc.name) return &file_toc;
+  }
+  return nullptr;
+}
+
+static const iree_file_toc_t *lookupUKernelFile(
+    llvm::TargetMachine *targetMachine) {
+  const auto &triple = targetMachine->getTargetTriple();
+
+  // NOTE: other arch-specific checks go here.
+
+  // Fallback path using the generic wasm variants as they are largely
+  // machine-agnostic.
+  if (triple.isX86()) {
+    return lookupUKernelFile("ukernel_bitcode.bc");
+  } else {
+    return nullptr;
+  }
+}
+
+std::optional<std::unique_ptr<llvm::Module>> loadUKernelBitcode(
+    llvm::TargetMachine *targetMachine, llvm::LLVMContext &context) {
+  // Find a bitcode file for the current architecture.
+  const auto *file = lookupUKernelFile(targetMachine);
+  if (!file) {
+    return std::nullopt;
+  }
+
+  // Load the generic bitcode file contents.
+  llvm::MemoryBufferRef bitcodeBufferRef(
+      llvm::StringRef(file->data, file->size), file->name);
+  auto bitcodeFile = llvm::parseBitcodeFile(bitcodeBufferRef, context);
+  if (!bitcodeFile) {
+    // TODO: Do we want to error out here or silently proceed.
+    return std::nullopt;
+  }
+  return std::move(*bitcodeFile);
+}
+
+}  // namespace HAL
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h
new file mode 100644
index 0000000..af662b2
--- /dev/null
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h
@@ -0,0 +1,26 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_DIALECT_HAL_TARGET_LLVMCPU_BUILTINS_UKERNEL_H_
+#define IREE_COMPILER_DIALECT_HAL_TARGET_LLVMCPU_BUILTINS_UKERNEL_H_
+
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace HAL {
+
+std::optional<std::unique_ptr<llvm::Module>> loadUKernelBitcode(
+    llvm::TargetMachine *targetMachine, llvm::LLVMContext &context);
+
+}  // namespace HAL
+}  // namespace IREE
+}  // namespace iree_compiler
+}  // namespace mlir
+
+#endif  // IREE_COMPILER_DIALECT_HAL_TARGET_LLVMCPU_BUILTINS_UKERNEL_H_
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp
index f7f7e2c..460cde8 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp
@@ -14,6 +14,7 @@
 #include "iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h"
 #include "iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/Device.h"
 #include "iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/Musl.h"
+#include "iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h"
 #include "iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMIRPasses.h"
 #include "iree/compiler/Dialect/HAL/Target/LLVMCPU/LibraryBuilder.h"
 #include "iree/compiler/Dialect/HAL/Target/LLVMCPU/LinkerTool.h"
@@ -125,6 +126,22 @@
   return success();
 }
 
+/// Helper method to check if the variant op has a `ukernel` attribute
+/// in its `hal.executable.target`. If so, load the ukernel library
+/// for that target and link.
+// Note: This is duplicate of a similar function in Codegen/. For
+// now duplicating this to avoid false linking issues. Eventually
+// presence of this attribute in the `hal.executable.target` should
+// drive everything.
+static bool hasMicrokernel(IREE::HAL::ExecutableVariantOp variantOp) {
+  IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.getTarget();
+  if (!targetAttr) return false;
+  auto config = targetAttr.getConfiguration();
+  if (!config) return false;
+  auto attr = config.getAs<BoolAttr>("ukernels");
+  return attr && attr.getValue();
+}
+
 class LLVMCPUTargetBackend final : public TargetBackend {
  public:
   explicit LLVMCPUTargetBackend(LLVMTargetOptions options)
@@ -417,6 +434,26 @@
              << targetTriple.str() << "'";
     }
 
+    // Link in ukernel file.
+    if (hasMicrokernel(variantOp)) {
+      auto setAlwaysInline = [&](llvm::Module &module) {
+        for (auto &func : module.getFunctionList()) {
+          func.addFnAttr(llvm::Attribute::AlwaysInline);
+        }
+      };
+      if (std::optional<std::unique_ptr<llvm::Module>> libUKernel =
+              loadUKernelBitcode(targetMachine.get(), context)) {
+        if (failed(linkBitcodeModule(
+                variantOp.getLoc(), moduleLinker, llvm::Linker::LinkOnlyNeeded,
+                *targetMachine, "libukernel", std::move(libUKernel.value()),
+                setAlwaysInline))) {
+          return mlir::emitError(variantOp.getLoc())
+                 << "failed linking in ukernel library for target triple '"
+                 << targetTriple.str() << "'";
+        }
+      }
+    }
+
     // Strip any compiler identifiers that may have snuck in. We let the linker
     // tag the module.
     auto *llvmIdent = llvmModule->getNamedMetadata("llvm.ident");
diff --git a/experimental/cpu_ukernel/iree_experimental_standalone_plugin.cmake b/experimental/cpu_ukernel/iree_experimental_standalone_plugin.cmake
index 079ffc4..7d48068 100644
--- a/experimental/cpu_ukernel/iree_experimental_standalone_plugin.cmake
+++ b/experimental/cpu_ukernel/iree_experimental_standalone_plugin.cmake
@@ -168,6 +168,11 @@
 #           `FILE_COPTS_VAR_NAME`.
 #           Example:  "x86_64:some_file_for_x86_64_using_avx512_instructions.c:NAME_OF_VARIABLE_CONTAINING_COPTS_FOR_X86_64_AVX512".
 function(iree_experimental_standalone_plugin)
+  # Early return if we don't have our own build of Clang and LLD available.
+  if (NOT (IREE_CLANG_TARGET AND IREE_LLD_TARGET))
+    return()
+  endif()
+
   cmake_parse_arguments(
     _RULE
     ""
diff --git a/runtime/src/iree/builtins/ukernel/BUILD.bazel b/runtime/src/iree/builtins/ukernel/BUILD.bazel
index 7ed6aa2..d4727c9 100644
--- a/runtime/src/iree/builtins/ukernel/BUILD.bazel
+++ b/runtime/src/iree/builtins/ukernel/BUILD.bazel
@@ -4,7 +4,9 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-load("//build_tools/bazel:build_defs.oss.bzl", "iree_runtime_cc_library")
+load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library")
+load("//build_tools/bazel:iree_bitcode_library.bzl", "iree_bitcode_library", "iree_link_bitcode")
+load("//build_tools/embed_data:build_defs.bzl", "c_embed_data")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -70,3 +72,113 @@
         "//runtime/src/iree/builtins/ukernel/arch:ukernel_arch",
     ],
 )
+
+#===------------------------------------------------------------------------===#
+# UKernel bitcode files
+#===------------------------------------------------------------------------===#
+
+iree_cmake_extra_content(
+    content = """
+if(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
+""",
+    inline = True,
+)
+
+BITCODE_COPTS = [
+    # C17 with no system deps.
+    "-std=c17",
+    "-nostdinc",
+    "-ffreestanding",
+
+    # Optimized and unstamped.
+    "-O3",
+    "-DNDEBUG",
+    "-fno-ident",
+    "-fdiscard-value-names",
+
+    # Object file only in bitcode format:
+    "-c",
+    "-emit-llvm",
+
+    # Force the library into standalone mode (not depending on build-directory
+    # configuration).
+    "-DIREE_UK_STANDALONE=1",
+
+    # Force configure for X86_64 with own Clang
+    "-DIREE_UK_ARCH_X86_64",
+    "-DIREE_UK_POINTER_SIZE=8",
+    "-DIREE_UK_BUILD_X86_64_AVX2_FMA",
+    "-DIREE_UK_BUILD_X86_64_AVX512_BASE",
+    "-DIREE_UK_BUILD_X86_64_AVX512_VNNI",
+]
+
+UKERNEL_BASE_SRCS = [
+    "mmt4d.c",
+    "mmt4d_tile.c",
+    "pack.c",
+    "pack_tile.c",
+    "query_tile_sizes.c",
+    "unpack_tile.c",
+]
+
+UKERNEL_HDRS = [
+    "//runtime/src/iree/builtins/ukernel:common.h",
+    "//runtime/src/iree/builtins/ukernel:pack.h",
+    "//runtime/src/iree/builtins/ukernel:exported_bits.h",
+    "//runtime/src/iree/builtins/ukernel:static_assert.h",
+    "//runtime/src/iree/builtins/ukernel:api.h",
+    "//runtime/src/iree/builtins/ukernel:unpack.h",
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h",
+    "//runtime/src/iree/builtins/ukernel:unpack_internal.h",
+    "//runtime/src/iree/builtins/ukernel:mmt4d.h",
+    "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h",
+    "//runtime/src/iree/builtins/ukernel:pack_internal.h",
+    "//runtime/src/iree/builtins/ukernel:elementwise.h",
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h",
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h",
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h",
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h",
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h",
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h",
+    "//runtime/src/iree/schemas:cpu_data.h",
+    "//runtime/src/iree/schemas:cpu_feature_bits.inl",
+]
+
+iree_bitcode_library(
+    name = "ukernel_bitcode_base",
+    srcs = UKERNEL_BASE_SRCS,
+    hdrs = UKERNEL_HDRS,
+    copts = BITCODE_COPTS,
+)
+
+iree_link_bitcode(
+    name = "ukernel_bitcode",
+    bitcode_files = [
+        "ukernel_bitcode_base.bc",
+        "arch/x86_64:ukernel_bitcode_x86_64_base.bc",
+        "arch/x86_64:ukernel_bitcode_x86_64_avx2_fma.bc",
+        "arch/x86_64:ukernel_bitcode_x86_64_avx512_base.bc",
+        "arch/x86_64:ukernel_bitcode_x86_64_avx512_vnni.bc",
+    ],
+)
+
+c_embed_data(
+    name = "libukernel_bitcode",
+    srcs = [
+        ":ukernel_bitcode.bc",
+    ],
+    c_file_output = "libukernel.c",
+    flatten = True,
+    h_file_output = "libukernel.h",
+    identifier = "iree_builtins_libukernel",
+    deps = [
+        "//runtime/src:runtime_defines",
+    ],
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()  # IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU
+""",
+    inline = True,
+)
diff --git a/runtime/src/iree/builtins/ukernel/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/CMakeLists.txt
index 9db16e0..14d5449 100644
--- a/runtime/src/iree/builtins/ukernel/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/CMakeLists.txt
@@ -85,4 +85,87 @@
   PUBLIC
 )
 
+if(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
+
+iree_bitcode_library(
+  NAME
+    ukernel_bitcode_base
+  SRCS
+    "mmt4d.c"
+    "mmt4d_tile.c"
+    "pack.c"
+    "pack_tile.c"
+    "query_tile_sizes.c"
+    "unpack_tile.c"
+  HDRS
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel:api.h"
+    "//runtime/src/iree/builtins/ukernel:common.h"
+    "//runtime/src/iree/builtins/ukernel:elementwise.h"
+    "//runtime/src/iree/builtins/ukernel:exported_bits.h"
+    "//runtime/src/iree/builtins/ukernel:mmt4d.h"
+    "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h"
+    "//runtime/src/iree/builtins/ukernel:pack.h"
+    "//runtime/src/iree/builtins/ukernel:pack_internal.h"
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h"
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h"
+    "//runtime/src/iree/builtins/ukernel:static_assert.h"
+    "//runtime/src/iree/builtins/ukernel:unpack.h"
+    "//runtime/src/iree/builtins/ukernel:unpack_internal.h"
+    "//runtime/src/iree/schemas:cpu_data.h"
+    "//runtime/src/iree/schemas:cpu_feature_bits.inl"
+  COPTS
+    "-std=c17"
+    "-nostdinc"
+    "-ffreestanding"
+    "-O3"
+    "-DNDEBUG"
+    "-fno-ident"
+    "-fdiscard-value-names"
+    "-c"
+    "-emit-llvm"
+    "-DIREE_UK_STANDALONE=1"
+    "-DIREE_UK_ARCH_X86_64"
+    "-DIREE_UK_POINTER_SIZE=8"
+    "-DIREE_UK_BUILD_X86_64_AVX2_FMA"
+    "-DIREE_UK_BUILD_X86_64_AVX512_BASE"
+    "-DIREE_UK_BUILD_X86_64_AVX512_VNNI"
+  PUBLIC
+)
+
+iree_link_bitcode(
+  NAME
+    ukernel_bitcode
+  SRCS
+    "arch/x86_64/ukernel_bitcode_x86_64_avx2_fma.bc"
+    "arch/x86_64/ukernel_bitcode_x86_64_avx512_base.bc"
+    "arch/x86_64/ukernel_bitcode_x86_64_avx512_vnni.bc"
+    "arch/x86_64/ukernel_bitcode_x86_64_base.bc"
+    "ukernel_bitcode_base.bc"
+  PUBLIC
+)
+
+iree_c_embed_data(
+  NAME
+    libukernel_bitcode
+  GENERATED_SRCS
+    "ukernel_bitcode.bc"
+  DEPS
+
+  C_FILE_OUTPUT
+    "libukernel.c"
+  H_FILE_OUTPUT
+    "libukernel.h"
+  IDENTIFIER
+    "iree_builtins_libukernel"
+  FLATTEN
+  PUBLIC
+)
+
+endif()  # IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU
+
 ### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel b/runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel
index 62611b7..fd6d113 100644
--- a/runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel
+++ b/runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel
@@ -4,7 +4,8 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-load("//build_tools/bazel:build_defs.oss.bzl", "iree_runtime_cc_library")
+load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content")
+load("//build_tools/bazel:iree_bitcode_library.bzl", "iree_bitcode_library")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -12,34 +13,138 @@
     licenses = ["notice"],  # Apache 2.0
 )
 
-iree_runtime_cc_library(
-    name = "mmt4d_x86_64",
-    hdrs = [
-        "mmt4d_x86_64.h",
-    ],
-    deps = ["//runtime/src/iree/builtins/ukernel:internal_headers"],
+#===------------------------------------------------------------------------===#
+# UKernel bitcode files
+#===------------------------------------------------------------------------===#
+
+iree_cmake_extra_content(
+    content = """
+if(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
+""",
+    inline = True,
 )
 
-iree_runtime_cc_library(
-    name = "pack_x86_64",
-    hdrs = [
-        "pack_x86_64.h",
-    ],
-    deps = ["//runtime/src/iree/builtins/ukernel:internal_headers"],
+BITCODE_COPTS = [
+    # C17 with no system deps.
+    "-std=c17",
+    "-nostdinc",
+    "-ffreestanding",
+
+    # Optimized and unstamped.
+    "-O3",
+    "-DNDEBUG",
+    "-fno-ident",
+    "-fdiscard-value-names",
+
+    # Object file only in bitcode format:
+    "-c",
+    "-emit-llvm",
+
+    # Force the library into standalone mode (not depending on build-directory
+    # configuration).
+    "-DIREE_UK_STANDALONE=1",
+
+    # Force configure for X86_64 with own Clang
+    "-DIREE_UK_ARCH_X86_64",
+    "-DIREE_UK_POINTER_SIZE=8",
+    "-DIREE_UK_BUILD_X86_64_AVX2_FMA",
+    "-DIREE_UK_BUILD_X86_64_AVX512_BASE",
+    "-DIREE_UK_BUILD_X86_64_AVX512_VNNI",
+]
+
+UKERNEL_HDRS = [
+    "//runtime/src/iree/builtins/ukernel:common.h",
+    "//runtime/src/iree/builtins/ukernel:pack.h",
+    "//runtime/src/iree/builtins/ukernel:exported_bits.h",
+    "//runtime/src/iree/builtins/ukernel:static_assert.h",
+    "//runtime/src/iree/builtins/ukernel:api.h",
+    "//runtime/src/iree/builtins/ukernel:unpack.h",
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h",
+    "//runtime/src/iree/builtins/ukernel:unpack_internal.h",
+    "//runtime/src/iree/builtins/ukernel:mmt4d.h",
+    "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h",
+    "//runtime/src/iree/builtins/ukernel:pack_internal.h",
+    "//runtime/src/iree/builtins/ukernel:elementwise.h",
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h",
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h",
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h",
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h",
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h",
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h",
+    "//runtime/src/iree/schemas:cpu_data.h",
+    "//runtime/src/iree/schemas:cpu_feature_bits.inl",
+]
+
+UKERNEL_X86_64_BASE_SRCS = [
+    "mmt4d_x86_64.c",
+    "pack_x86_64.c",
+    "query_tile_sizes_x86_64.c",
+    "unpack_x86_64.c",
+]
+
+iree_bitcode_library(
+    name = "ukernel_bitcode_x86_64_base",
+    srcs = UKERNEL_X86_64_BASE_SRCS,
+    hdrs = UKERNEL_HDRS,
+    copts = BITCODE_COPTS,
 )
 
-iree_runtime_cc_library(
-    name = "query_tile_sizes_x86_64",
-    hdrs = [
-        "query_tile_sizes_x86_64.h",
+UKERNEL_X86_64_AVX2_FMA_SRCS = [
+    "mmt4d_x86_64_avx2_fma.c",
+    "pack_x86_64_avx2_fma.c",
+    "unpack_x86_64_avx2_fma.c",
+]
+
+iree_bitcode_library(
+    name = "ukernel_bitcode_x86_64_avx2_fma",
+    srcs = UKERNEL_X86_64_AVX2_FMA_SRCS,
+    hdrs = UKERNEL_HDRS,
+    copts = BITCODE_COPTS + [
+        "-mavx2",
+        "-mfma",
     ],
-    deps = ["//runtime/src/iree/builtins/ukernel:internal_headers"],
 )
 
-iree_runtime_cc_library(
-    name = "unpack_x86_64",
-    hdrs = [
-        "unpack_x86_64.h",
+UKERNEL_X86_64_AVX512_BASE_SRCS = [
+    "mmt4d_x86_64_avx512_base.c",
+    "pack_x86_64_avx512_base.c",
+    "unpack_x86_64_avx512_base.c",
+]
+
+iree_bitcode_library(
+    name = "ukernel_bitcode_x86_64_avx512_base",
+    srcs = UKERNEL_X86_64_AVX512_BASE_SRCS,
+    hdrs = UKERNEL_HDRS,
+    copts = BITCODE_COPTS + [
+        "-mavx512f",
+        "-mavx512vl",
+        "-mavx512cd",
+        "-mavx512bw",
+        "-mavx512dq",
     ],
-    deps = ["//runtime/src/iree/builtins/ukernel:internal_headers"],
+)
+
+UKERNEL_X86_64_AVX512_VNNI_SRCS = [
+    "mmt4d_x86_64_avx512_vnni.c",
+]
+
+iree_bitcode_library(
+    name = "ukernel_bitcode_x86_64_avx512_vnni",
+    srcs = UKERNEL_X86_64_AVX512_VNNI_SRCS,
+    hdrs = UKERNEL_HDRS,
+    copts = BITCODE_COPTS + [
+        "-mavx512f",
+        "-mavx512vl",
+        "-mavx512cd",
+        "-mavx512bw",
+        "-mavx512dq",
+        "-mavx512vnni",
+    ],
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()  # IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU
+""",
+    inline = True,
 )
diff --git a/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt
index 7460134..38dea86 100644
--- a/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt
@@ -1,8 +1,220 @@
-# Copyright 2023 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel                    #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+if(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
+
+iree_bitcode_library(
+  NAME
+    ukernel_bitcode_x86_64_base
+  SRCS
+    "mmt4d_x86_64.c"
+    "pack_x86_64.c"
+    "query_tile_sizes_x86_64.c"
+    "unpack_x86_64.c"
+  HDRS
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel:api.h"
+    "//runtime/src/iree/builtins/ukernel:common.h"
+    "//runtime/src/iree/builtins/ukernel:elementwise.h"
+    "//runtime/src/iree/builtins/ukernel:exported_bits.h"
+    "//runtime/src/iree/builtins/ukernel:mmt4d.h"
+    "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h"
+    "//runtime/src/iree/builtins/ukernel:pack.h"
+    "//runtime/src/iree/builtins/ukernel:pack_internal.h"
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h"
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h"
+    "//runtime/src/iree/builtins/ukernel:static_assert.h"
+    "//runtime/src/iree/builtins/ukernel:unpack.h"
+    "//runtime/src/iree/builtins/ukernel:unpack_internal.h"
+    "//runtime/src/iree/schemas:cpu_data.h"
+    "//runtime/src/iree/schemas:cpu_feature_bits.inl"
+  COPTS
+    "-std=c17"
+    "-nostdinc"
+    "-ffreestanding"
+    "-O3"
+    "-DNDEBUG"
+    "-fno-ident"
+    "-fdiscard-value-names"
+    "-c"
+    "-emit-llvm"
+    "-DIREE_UK_STANDALONE=1"
+    "-DIREE_UK_ARCH_X86_64"
+    "-DIREE_UK_POINTER_SIZE=8"
+    "-DIREE_UK_BUILD_X86_64_AVX2_FMA"
+    "-DIREE_UK_BUILD_X86_64_AVX512_BASE"
+    "-DIREE_UK_BUILD_X86_64_AVX512_VNNI"
+  PUBLIC
+)
+
+iree_bitcode_library(
+  NAME
+    ukernel_bitcode_x86_64_avx2_fma
+  SRCS
+    "mmt4d_x86_64_avx2_fma.c"
+    "pack_x86_64_avx2_fma.c"
+    "unpack_x86_64_avx2_fma.c"
+  HDRS
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel:api.h"
+    "//runtime/src/iree/builtins/ukernel:common.h"
+    "//runtime/src/iree/builtins/ukernel:elementwise.h"
+    "//runtime/src/iree/builtins/ukernel:exported_bits.h"
+    "//runtime/src/iree/builtins/ukernel:mmt4d.h"
+    "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h"
+    "//runtime/src/iree/builtins/ukernel:pack.h"
+    "//runtime/src/iree/builtins/ukernel:pack_internal.h"
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h"
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h"
+    "//runtime/src/iree/builtins/ukernel:static_assert.h"
+    "//runtime/src/iree/builtins/ukernel:unpack.h"
+    "//runtime/src/iree/builtins/ukernel:unpack_internal.h"
+    "//runtime/src/iree/schemas:cpu_data.h"
+    "//runtime/src/iree/schemas:cpu_feature_bits.inl"
+  COPTS
+    "-std=c17"
+    "-nostdinc"
+    "-ffreestanding"
+    "-O3"
+    "-DNDEBUG"
+    "-fno-ident"
+    "-fdiscard-value-names"
+    "-c"
+    "-emit-llvm"
+    "-DIREE_UK_STANDALONE=1"
+    "-DIREE_UK_ARCH_X86_64"
+    "-DIREE_UK_POINTER_SIZE=8"
+    "-DIREE_UK_BUILD_X86_64_AVX2_FMA"
+    "-DIREE_UK_BUILD_X86_64_AVX512_BASE"
+    "-DIREE_UK_BUILD_X86_64_AVX512_VNNI"
+    "-mavx2"
+    "-mfma"
+  PUBLIC
+)
+
+iree_bitcode_library(
+  NAME
+    ukernel_bitcode_x86_64_avx512_base
+  SRCS
+    "mmt4d_x86_64_avx512_base.c"
+    "pack_x86_64_avx512_base.c"
+    "unpack_x86_64_avx512_base.c"
+  HDRS
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel:api.h"
+    "//runtime/src/iree/builtins/ukernel:common.h"
+    "//runtime/src/iree/builtins/ukernel:elementwise.h"
+    "//runtime/src/iree/builtins/ukernel:exported_bits.h"
+    "//runtime/src/iree/builtins/ukernel:mmt4d.h"
+    "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h"
+    "//runtime/src/iree/builtins/ukernel:pack.h"
+    "//runtime/src/iree/builtins/ukernel:pack_internal.h"
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h"
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h"
+    "//runtime/src/iree/builtins/ukernel:static_assert.h"
+    "//runtime/src/iree/builtins/ukernel:unpack.h"
+    "//runtime/src/iree/builtins/ukernel:unpack_internal.h"
+    "//runtime/src/iree/schemas:cpu_data.h"
+    "//runtime/src/iree/schemas:cpu_feature_bits.inl"
+  COPTS
+    "-std=c17"
+    "-nostdinc"
+    "-ffreestanding"
+    "-O3"
+    "-DNDEBUG"
+    "-fno-ident"
+    "-fdiscard-value-names"
+    "-c"
+    "-emit-llvm"
+    "-DIREE_UK_STANDALONE=1"
+    "-DIREE_UK_ARCH_X86_64"
+    "-DIREE_UK_POINTER_SIZE=8"
+    "-DIREE_UK_BUILD_X86_64_AVX2_FMA"
+    "-DIREE_UK_BUILD_X86_64_AVX512_BASE"
+    "-DIREE_UK_BUILD_X86_64_AVX512_VNNI"
+    "-mavx512f"
+    "-mavx512vl"
+    "-mavx512cd"
+    "-mavx512bw"
+    "-mavx512dq"
+  PUBLIC
+)
+
+iree_bitcode_library(
+  NAME
+    ukernel_bitcode_x86_64_avx512_vnni
+  SRCS
+    "mmt4d_x86_64_avx512_vnni.c"
+  HDRS
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h"
+    "//runtime/src/iree/builtins/ukernel:api.h"
+    "//runtime/src/iree/builtins/ukernel:common.h"
+    "//runtime/src/iree/builtins/ukernel:elementwise.h"
+    "//runtime/src/iree/builtins/ukernel:exported_bits.h"
+    "//runtime/src/iree/builtins/ukernel:mmt4d.h"
+    "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h"
+    "//runtime/src/iree/builtins/ukernel:pack.h"
+    "//runtime/src/iree/builtins/ukernel:pack_internal.h"
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h"
+    "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h"
+    "//runtime/src/iree/builtins/ukernel:static_assert.h"
+    "//runtime/src/iree/builtins/ukernel:unpack.h"
+    "//runtime/src/iree/builtins/ukernel:unpack_internal.h"
+    "//runtime/src/iree/schemas:cpu_data.h"
+    "//runtime/src/iree/schemas:cpu_feature_bits.inl"
+  COPTS
+    "-std=c17"
+    "-nostdinc"
+    "-ffreestanding"
+    "-O3"
+    "-DNDEBUG"
+    "-fno-ident"
+    "-fdiscard-value-names"
+    "-c"
+    "-emit-llvm"
+    "-DIREE_UK_STANDALONE=1"
+    "-DIREE_UK_ARCH_X86_64"
+    "-DIREE_UK_POINTER_SIZE=8"
+    "-DIREE_UK_BUILD_X86_64_AVX2_FMA"
+    "-DIREE_UK_BUILD_X86_64_AVX512_BASE"
+    "-DIREE_UK_BUILD_X86_64_AVX512_VNNI"
+    "-mavx512f"
+    "-mavx512vl"
+    "-mavx512cd"
+    "-mavx512bw"
+    "-mavx512dq"
+    "-mavx512vnni"
+  PUBLIC
+)
+
+endif()  # IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
 
 # Target CPUs supporting AVX2+FMA3. That includes Intel Haswell (2013) and newer
 # and AMD Excavator (2015) and newer. There is no current plan to look after
diff --git a/runtime/src/iree/schemas/cpu_data.h b/runtime/src/iree/schemas/cpu_data.h
index 46df305..0677606 100644
--- a/runtime/src/iree/schemas/cpu_data.h
+++ b/runtime/src/iree/schemas/cpu_data.h
@@ -70,7 +70,8 @@
   IREE_CPU_FEATURE_BIT_NAME(arch, field_index, bit_name) = 1ull << bit_pos,
 #include "iree/schemas/cpu_feature_bits.inl"
 #undef IREE_CPU_FEATURE_BIT
-
 };
 
+#undef IREE_CPU_FEATURE_BIT_NAME
+
 #endif  // IREE_SCHEMAS_CPU_DATA_H_
diff --git a/tests/e2e/matmul/BUILD.bazel b/tests/e2e/matmul/BUILD.bazel
index 392d1bc..2cb329f 100644
--- a/tests/e2e/matmul/BUILD.bazel
+++ b/tests/e2e/matmul/BUILD.bazel
@@ -160,6 +160,31 @@
     "f32",
 ]]
 
+# Test x86_64+ukernel, mmt4d, with target CPU features variants relevant to each
+# lhs_rhs_type.
+[iree_generated_trace_runner_test(
+    name = "e2e_matmul_mmt4d_%s_small_ukernel_x86" % lhs_rhs_type,
+    compiler_flags = [
+        "--iree-llvmcpu-enable-microkernels",
+        "--iree-flow-enable-data-tiling",
+    ],
+    generator = ":generate_e2e_matmul_tests",
+    generator_args = [
+        "--lhs_rhs_type=%s" % lhs_rhs_type,
+        "--shapes=small",
+    ],
+    tags = [
+        "hostonly",
+    ],
+    target_backends_and_drivers = [
+        ("llvm-cpu", "local-task"),
+    ],
+    trace_runner = "//tools:iree-e2e-matmul-test",
+) for lhs_rhs_type in [
+    "i8",
+    "f32",
+]]
+
 [iree_generated_trace_runner_test(
     name = "e2e_matmul_direct_f32_gpu_large_%s" % compilation_info,
     generator = ":generate_e2e_matmul_tests",
diff --git a/tests/e2e/matmul/CMakeLists.txt b/tests/e2e/matmul/CMakeLists.txt
index 6445221..357dfae 100644
--- a/tests/e2e/matmul/CMakeLists.txt
+++ b/tests/e2e/matmul/CMakeLists.txt
@@ -250,6 +250,48 @@
 
 iree_generated_trace_runner_test(
   NAME
+    e2e_matmul_mmt4d_i8_small_ukernel_x86
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=i8"
+    "--shapes=small"
+  TRACE_RUNNER
+    iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "llvm-cpu"
+  DRIVERS
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-llvmcpu-enable-microkernels"
+    "--iree-flow-enable-data-tiling"
+  LABELS
+    "hostonly"
+)
+
+iree_generated_trace_runner_test(
+  NAME
+    e2e_matmul_mmt4d_f32_small_ukernel_x86
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f32"
+    "--shapes=small"
+  TRACE_RUNNER
+    iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "llvm-cpu"
+  DRIVERS
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-llvmcpu-enable-microkernels"
+    "--iree-flow-enable-data-tiling"
+  LABELS
+    "hostonly"
+)
+
+iree_generated_trace_runner_test(
+  NAME
     e2e_matmul_direct_f32_gpu_large_LLVMGPUMatmulSimt
   GENERATOR
     "generate_e2e_matmul_tests.py"