CPU ukernels as bitcode (x86-only for now) (#13460)

While #13433 enabled the use of micro kernels within codegen backends
using the plugin mechanism, here the ukernel code is compiled into a
bitcode library. This bitcode library is linked with the generated code
at compilation time. The lowering to LLVM inlines the exact micro kernel
needed from the micro kernel library for a particular architecture. To
enable this end-to-end flow, the following changes are needed

- Add an enum attribute to HAL : `IREE::HAL::CallingConvention` that
allows specifying what calling convention to use for the micro kernel
call. `Default` leaves the params as is, `ParameterStruct` packs all the
returns and arguments into a parameter struct to mimic ABIs like
https://github.com/openxla/iree/blob/6cf092d022810d4347353b23e5ce2688a166dd67/runtime/src/iree/builtins/ukernel/mmt4d.h#L16
- Couple of patterns are added to `ConvertToLLVM` pass, to handle the
lowering of the function definition and function call, in keeping with
the specified ABI
- Allow specification of `hal.import.fields` to specify `processor_data`
and `processor_id` on ukernel function defn. This then generates the
code to forward this information to the microkernels (similar to what is
done for external calls using the plugin mechanism)
- Propagate the target CPU features in `hal.executable.target` of the
dispatch into the micro kernel call. This allows the LLVM passes to walk
through the branching used to pick the right micro kernel function and
effectively inline that.

Co-authored-by: Benoit Jacob <benoitjacob@google.com>
diff --git a/build_tools/bazel/iree_bitcode_library.bzl b/build_tools/bazel/iree_bitcode_library.bzl
index a38fa45..b4f9b38 100644
--- a/build_tools/bazel/iree_bitcode_library.bzl
+++ b/build_tools/bazel/iree_bitcode_library.bzl
@@ -40,23 +40,25 @@
     for bitcode_src in srcs:
         bitcode_out = "%s_%s.bc" % (name, bitcode_src)
         bitcode_files.append(bitcode_out)
+        system_headers = ["immintrin.h"]
         native.genrule(
             name = "gen_%s" % (bitcode_out),
-            srcs = [bitcode_src],
+            srcs = [bitcode_src] + hdrs + [builtin_headers_dep],
             outs = [bitcode_out],
             cmd = " && ".join([
                 " ".join([
                     "$(location %s)" % (clang_tool),
-                    "-isystem $(BINDIR)/%s" % (builtin_headers_path),
+                    "-isystem $(BINDIR)/%s" % builtin_headers_path,
                     " ".join(copts),
                     " ".join(["-D%s" % (define) for define in defines]),
+                    " ".join(["-I $(BINDIR)/runtime/src"]),
+                    " ".join(["-I runtime/src"]),
                     "-o $(location %s)" % (bitcode_out),
                     "$(location %s)" % (bitcode_src),
                 ]),
             ]),
-            tools = hdrs + data + [
+            tools = data + [
                 clang_tool,
-                builtin_headers_dep,
             ],
             message = "Compiling %s to %s..." % (bitcode_src, bitcode_out),
             output_to_bindir = 1,
@@ -81,3 +83,40 @@
         output_to_bindir = 1,
         **kwargs
     )
+
+def iree_link_bitcode(
+        name,
+        bitcode_files,
+        out = None,
+        link_tool = "@llvm-project//llvm:llvm-link",
+        **kwargs):
+    """Builds an LLVM bitcode library from an input file via clang.
+
+    Args:
+        name: Name of the target.
+        bitcode_files: bitcode files to link together.
+        out: output file name (defaults to name.bc).
+        link_tool: llvm-link tool used for linking bitcode files.
+        **kwargs: any additional attributes to pass to the underlying rules.
+    """
+
+    bitcode_files_qualified = [(("//" + native.package_name() + "/" + b) if b.count(":") else b) for b in bitcode_files]
+
+    if not out:
+        out = "%s.bc" % (name)
+    native.genrule(
+        name = name,
+        srcs = bitcode_files_qualified,
+        outs = [out],
+        cmd = " && ".join([
+            " ".join([
+                "$(location %s)" % (link_tool),
+                "-o $(location %s)" % (out),
+                " ".join(["$(locations %s)" % (src) for src in bitcode_files_qualified]),
+            ]),
+        ]),
+        tools = [link_tool],
+        message = "Linking bitcode library %s to %s..." % (name, out),
+        output_to_bindir = 1,
+        **kwargs
+    )
diff --git a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
index 4cbc856..d10af72 100644
--- a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
+++ b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
@@ -515,6 +515,20 @@
                              f"{testonly_block}"
                              f"  PUBLIC\n)\n\n")
 
+  def iree_link_bitcode(self, name, bitcode_files, data=None, testonly=None):
+    name_block = self._convert_string_arg_block("NAME", name, quote=False)
+    bitcode_files_block = self._convert_srcs_block(
+        [f.replace(":", "/") for f in bitcode_files])
+    data_block = self._convert_target_list_block("DATA", data)
+    testonly_block = self._convert_option_block("TESTONLY", testonly)
+
+    self._converter.body += (f"iree_link_bitcode(\n"
+                             f"{name_block}"
+                             f"{bitcode_files_block}"
+                             f"{data_block}"
+                             f"{testonly_block}"
+                             f"  PUBLIC\n)\n\n")
+
   def iree_bytecode_module(self,
                            name,
                            src,
diff --git a/build_tools/cmake/iree_bitcode_library.cmake b/build_tools/cmake/iree_bitcode_library.cmake
index 0f17151..05ee273 100644
--- a/build_tools/cmake/iree_bitcode_library.cmake
+++ b/build_tools/cmake/iree_bitcode_library.cmake
@@ -34,10 +34,6 @@
   set(_CLANG_TOOL "$<TARGET_FILE:${IREE_CLANG_TARGET}>")
   set(_LINK_TOOL "$<TARGET_FILE:${IREE_LLVM_LINK_TARGET}>")
 
-  # These are copied as part of the clang build; we could allow the user to
-  # override this but it should be harmless.
-  set(_BUILTIN_HEADERS_PATH "${IREE_BINARY_DIR}/third_party/llvm-project/llvm/lib/clang/${CLANG_VERSION_MAJOR}/include/")
-
   if(_RULE_TESTONLY AND NOT IREE_BUILD_TESTS)
     return()
   endif()
@@ -48,7 +44,21 @@
     set(_OUT "${_RULE_NAME}.bc")
   endif()
 
-  set(_ARGS "-isystem ${_BUILTIN_HEADERS_PATH}")
+  # We need CLANG_VERSION_MAJOR to set up include directories. Unfortunately,
+  # Clang's own CMakeLists do not expose CLANG_VERSION_MAJOR to PARENT_SCOPE.
+  # Likewise with LLVM_VERSION_MAJOR. However, CLANG_EXECUTABLE_VERSION is
+  # CACHE'd, so we can access it, and it currently has the same value.
+  set(_CLANG_VERSION_MAJOR "${CLANG_EXECUTABLE_VERSION}")
+
+  # These are copied as part of the clang build; we could allow the user to
+  # override this but it should be harmless.
+  set(_BUILTIN_HEADERS_PATH "${IREE_BINARY_DIR}/llvm-project/lib/clang/${_CLANG_VERSION_MAJOR}/include/")
+
+  set(_ARGS "")
+  list(APPEND _ARGS "-isystem" "${_BUILTIN_HEADERS_PATH}")
+  list(APPEND _ARGS "-I" "${IREE_SOURCE_DIR}/runtime/src")
+  list(APPEND _ARGS "-I" "${IREE_BINARY_DIR}/runtime/src")
+  
   list(APPEND _ARGS "${_RULE_COPTS}")
   foreach(_DEFINE ${_RULE_DEFINES})
     list(APPEND _ARGS "-D${_DEFINE}")
@@ -70,6 +80,7 @@
         "${_BITCODE_FILE}"
       DEPENDS
         ${_CLANG_TOOL}
+        ${_LINK_TOOL}
         ${_BITCODE_SRC}
       COMMENT
         "Compiling ${_BITCODE_SRC} to ${_BITCODE_FILE}"
@@ -101,3 +112,66 @@
     DEPENDS "${_OUT}"
   )
 endfunction()
+
+# iree_link_bitcode()
+#
+# Builds an LLVM bitcode library from an input file via clang
+#
+# Parameters:
+# NAME: Name of target (see Note).
+# SRCS: Source files to pass to clang.
+# HDRS: Additional headers included by the source files.
+# COPTS: additional flags to pass to clang.
+# DEFINES: Preprocessor definitions to pass to clang.
+# DATA: Additional data required during compilation.
+# OUT: Output file name (defaults to NAME.bc).
+# PUBLIC: Add this so that this library will be exported under ${PACKAGE}::
+#     Also in IDE, target will appear in ${PACKAGE} folder while non PUBLIC
+#     will be in ${PACKAGE}/internal.
+# TESTONLY: When added, this target will only be built if IREE_BUILD_TESTS=ON.
+function(iree_link_bitcode)
+  cmake_parse_arguments(
+    _RULE
+    "PUBLIC;TESTONLY"
+    "NAME;OUT"
+    "SRCS;DEFINES;DATA"
+    ${ARGN}
+  )
+
+  set(_LINK_TOOL "$<TARGET_FILE:${IREE_LLVM_LINK_TARGET}>")
+
+  if(_RULE_TESTONLY AND NOT IREE_BUILD_TESTS)
+    return()
+  endif()
+
+  if(DEFINED _RULE_OUT)
+    set(_OUT "${_RULE_OUT}")
+  else()
+    set(_OUT "${_RULE_NAME}.bc")
+  endif()
+
+  set(_BITCODE_FILES "${_RULE_SRCS}")
+
+  add_custom_command(
+    OUTPUT
+      ${_OUT}
+    COMMAND
+      ${_LINK_TOOL}
+      ${_BITCODE_FILES}
+      "-o"
+      "${_OUT}"
+    DEPENDS
+      ${_LINK_TOOL}
+      ${_BITCODE_FILES}
+    COMMENT
+      "Linking bitcode to ${_OUT}"
+    VERBATIM
+  )
+
+  # Only add iree_${NAME} as custom target doesn't support aliasing to
+  # iree::${NAME}.
+  iree_package_name(_PACKAGE_NAME)
+  add_custom_target("${_PACKAGE_NAME}_${_RULE_NAME}"
+    DEPENDS "${_OUT}"
+  )
+endfunction()
diff --git a/build_tools/cmake/iree_trace_runner_test.cmake b/build_tools/cmake/iree_trace_runner_test.cmake
index 96ef411..eff4d41 100644
--- a/build_tools/cmake/iree_trace_runner_test.cmake
+++ b/build_tools/cmake/iree_trace_runner_test.cmake
@@ -47,6 +47,10 @@
     ${ARGN}
   )
 
+  if(CMAKE_CROSSCOMPILING AND "hostonly" IN_LIST _RULE_LABELS)
+    return()
+  endif()
+
   iree_package_name(_PACKAGE_NAME)
   set(_NAME "${_PACKAGE_NAME}_${_RULE_NAME}")
 
@@ -149,6 +153,10 @@
     ${ARGN}
   )
 
+  if(CMAKE_CROSSCOMPILING AND "hostonly" IN_LIST _RULE_LABELS)
+    return()
+  endif()
+
   # Omit tests for which the specified driver or target backend is not enabled.
   # This overlaps with directory exclusions and other filtering mechanisms.
   string(TOUPPER ${_RULE_DRIVER} _UPPERCASE_DRIVER)
@@ -293,6 +301,10 @@
     ${ARGN}
   )
 
+  if(CMAKE_CROSSCOMPILING AND "hostonly" IN_LIST _RULE_LABELS)
+    return()
+  endif()
+
   if(NOT DEFINED _RULE_TARGET_BACKENDS AND NOT DEFINED _RULE_DRIVERS)
     set(_RULE_TARGET_BACKENDS "vmvx" "vulkan-spirv" "llvm-cpu")
     set(_RULE_DRIVERS "local-task" "vulkan" "local-task")