Ukernels: simplify the architecture-specific bitcode build. (#16126)
This PR basically re-does how we build and link architecture-specific
and generic bitcode, for CPU ukernels.
The main change is that we now build one separate ukernel bitcode file
for each CPU architecture, and it's the only thing that `iree-compile`
needs to load and link to. Before, it had to load and link together 3
different bitcode modules, two of which were actually compiled as
WebAssembly for genericity, and it involved weak symbols. Full inlining
is really important for performance here, and in this system, it
depended on two fragile mechanisms:
1. Reinterpreting a WebAssembly module as a native module.
2. Weak symbols.
That was done out of a concern for sharing generic code across
same-bitness architectures, but that turned out to be substantially more
complex in both the ukernels and the compiler, and to suffer from
problems described in #16000, which this PR fixes. Namely, the inlining
across these modules and down to the calling dispatch function was
brittle, never worked outside of x86, and to make even that work, we had
to do some illegal target-attribute stripping that was effectively UB
and caused issues described in
https://github.com/llvm/llvm-project/issues/78206 (originally filed as an
LLVM bug but then understood to be caused by our own illegal
attribute-stripping).
So, we now build completely separate bitcode for each target CPU
architecture that we are supporting in `iree-compile`. At the moment,
that's roughly 20k of code for each additional architecture for which we
don't have dedicated optimized code paths. For comparison, each
architecture that does have dedicated code weighs roughly 100k. I found
that surprisingly large, until I remembered that IR is far more verbose
than object code. There are ways to control that if and when that
becomes a major concern:
* We could further trim the supported CPU architectures, but note that
we gain simplicity from not doing so at the moment.
* We could leave bitcode files separate rather than embedding them in
`iree-compile`, allowing each user to obtain their own for their target
architecture.
* We could just embed the ukernels *source* code, as pure C is so cheap
to compile - the main cost would be now having to essentially build/ship
`clang` as part of iree-compile, but supporting only C and not C++
shouldn't be that bad.
* One problem that this would also solve: at the moment, for CPU
architectures for which `iree-compile` allows the user to pick from
multiple ABIs (looking at you, RISC-V --- you need to grow up, pick an
ABI and settle in life), prebuilt ukernel bitcode can only match one of
them. Compiling ukernels from source at that time would make it trivial
to have ukernels built for the right ABI. For now in this PR we just
pick whichever ABI our tests are currently using.
Note: this PR is broken into a few thematic commits to help reviewing.
Fixes #16000
diff --git a/build_tools/bazel/iree_bitcode_library.bzl b/build_tools/bazel/iree_bitcode_library.bzl
index 2475b60..27b1ed6 100644
--- a/build_tools/bazel/iree_bitcode_library.bzl
+++ b/build_tools/bazel/iree_bitcode_library.bzl
@@ -101,6 +101,19 @@
"-DIREE_DEVICE_STANDALONE=1",
]
+ if arch == "arm_32":
+ # Silence "warning: unknown platform, assuming -mfloat-abi=soft"
+ base_copts.append("-mfloat-abi=soft")
+ elif arch == "riscv_32":
+ # On RISC-V, linking LLVM modules requires matching target-abi.
+ # https://lists.llvm.org/pipermail/llvm-dev/2020-January/138450.html
+ # The choice of ilp32d is simply what we have in existing riscv_32 tests.
+ # Open question - how do we scale to supporting all RISC-V ABIs?
+ base_copts.append("-mabi=ilp32d")
+ elif arch == "riscv_64":
+ # Same comments as above riscv_32 case.
+ base_copts.append("-mabi=lp64d")
+
bitcode_files = []
for src in srcs:
bitcode_out = "%s_%s.bc" % (name, src)
@@ -258,7 +271,7 @@
**kwargs: any additional attributes to pass to the underlying rules.
"""
- bitcode_files_qualified = [(("//" + native.package_name() + "/" + b) if b.count(":") else b) for b in bitcode_files]
+ bitcode_files_qualified = [("//" + native.package_name() + b) if b.startswith(":") else ("//" + native.package_name() + "/" + b) if b.count(":") else b for b in bitcode_files]
if not out:
out = "%s.bc" % (name)
diff --git a/build_tools/cmake/iree_bitcode_library.cmake b/build_tools/cmake/iree_bitcode_library.cmake
index 7aae3cd..0c68562 100644
--- a/build_tools/cmake/iree_bitcode_library.cmake
+++ b/build_tools/cmake/iree_bitcode_library.cmake
@@ -31,6 +31,13 @@
set(_OUT "${_RULE_NAME}.bc")
endif()
+ # Produce an empty file if the compiler wouldn't use bitcode for this arch anyway.
+ iree_compiler_targeting_iree_arch(_IREE_COMPILER_TARGETING_THIS_ARCH "${_RULE_ARCH}")
+ if (NOT _IREE_COMPILER_TARGETING_THIS_ARCH)
+ iree_make_empty_file("${_OUT}")
+ return()
+ endif()
+
iree_arch_to_llvm_arch(_LLVM_ARCH "${_RULE_ARCH}")
set(_COPTS
@@ -69,6 +76,20 @@
list(APPEND _COPTS "-I" "${IREE_BINARY_DIR}/runtime/src")
list(APPEND _COPTS "${_RULE_COPTS}")
+ if (_RULE_ARCH STREQUAL "arm_32")
+ # Silence "warning: unknown platform, assuming -mfloat-abi=soft"
+ list(APPEND _COPTS "-mfloat-abi=soft")
+ elseif(_RULE_ARCH STREQUAL "riscv_32")
+ # On RISC-V, linking LLVM modules requires matching target-abi.
+ # https://lists.llvm.org/pipermail/llvm-dev/2020-January/138450.html
+ # The choice of ilp32d is simply what we have in existing riscv_32 tests.
+ # Open question - how do we scale to supporting all RISC-V ABIs?
+ list(APPEND _COPTS "-mabi=ilp32d")
+ elseif(_RULE_ARCH STREQUAL "riscv_64")
+ # Same comments as above riscv_32 case.
+ list(APPEND _COPTS "-mabi=lp64d")
+ endif()
+
set(_BITCODE_FILES)
foreach(_SRC ${_RULE_SRCS})
get_filename_component(_BITCODE_SRC_PATH "${_SRC}" REALPATH)
diff --git a/build_tools/cmake/iree_macros.cmake b/build_tools/cmake/iree_macros.cmake
index 84a4e7a..bc9a303 100644
--- a/build_tools/cmake/iree_macros.cmake
+++ b/build_tools/cmake/iree_macros.cmake
@@ -152,7 +152,8 @@
endif()
iree_arch_to_llvm_target(_LLVM_TARGET "${SRC_ARCH}")
- if (_LLVM_TARGET IN_LIST LLVM_TARGETS_TO_BUILD)
+ # WebAssembly is unconditionally enabled, and not enumerated in LLVM_TARGETS_TO_BUILD.
+ if (_LLVM_TARGET IN_LIST LLVM_TARGETS_TO_BUILD OR _LLVM_TARGET STREQUAL "WebAssembly")
set(${DST_VAR} ON PARENT_SCOPE)
else()
set(${DST_VAR} OFF PARENT_SCOPE)
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.cpp
index dd23c36..46d1978 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.cpp
@@ -27,96 +27,38 @@
return llvm::parseBitcodeFile(bitcodeBufferRef, context);
}
}
+
// Some bitcode files are optional: we don't have arch-specific ukernel code
// for all architectures. So it's normal to be returning nullptr here.
return nullptr;
}
-static void removeTargetAttributes(llvm::Module &module) {
- // Copied from Device.cpp - TODO: move this to a shared utility.
- // Clang adds its own per-function attributes that we need to strip so that
- // our current executable variant target is used instead.
- for (auto &func : module.functions()) {
- func.removeFnAttr("target-cpu");
- func.removeFnAttr("tune-cpu");
- func.removeFnAttr("target-features");
- }
-}
-
llvm::Expected<std::unique_ptr<llvm::Module>>
-loadUKernelBaseBitcode(llvm::TargetMachine *targetMachine,
- llvm::LLVMContext &context) {
- llvm::Triple triple = targetMachine->getTargetTriple();
- StringRef filename;
- if (triple.isArch64Bit()) {
- filename = "ukernel_bitcode_64bit_base.bc";
- } else if (triple.isArch32Bit()) {
- filename = "ukernel_bitcode_32bit_base.bc";
- } else {
- return llvm::createStringError(
- llvm::inconvertibleErrorCode(),
- "Don't know what ukernel bitcode file to load.");
- }
- llvm::Expected<std::unique_ptr<llvm::Module>> bitcode =
- loadUKernelBitcodeFile(filename, context);
- if (!bitcode) {
- // Propagate the error to the caller.
- return bitcode;
- }
-
- if (!bitcode.get()) {
- // File not found. For base bitcode, this shouldn't happen.
- return llvm::createStringError(llvm::inconvertibleErrorCode(),
- "Base ukernel bitcode file not found: %s",
- filename.str().c_str());
- }
-
- // Base bitcode is compiled for any reasonable architecture of the right
- // bitness, as we don't care about anything else than bitness here.
- removeTargetAttributes(*bitcode.get());
- return bitcode;
-}
-
-llvm::Expected<std::unique_ptr<llvm::Module>>
-loadUKernelArchEntryPointsBitcode(llvm::TargetMachine *targetMachine,
- llvm::LLVMContext &context) {
+loadUKernelBitcode(llvm::TargetMachine *targetMachine,
+ llvm::LLVMContext &context) {
const char *archName =
getIreeArchNameForTargetTriple(targetMachine->getTargetTriple());
- char filename[64];
- snprintf(filename, sizeof filename, "ukernel_bitcode_%s_entry_points.bc",
- archName);
- llvm::Expected<std::unique_ptr<llvm::Module>> bitcode =
+ std::string filename = std::string("ukernel_bitcode_") + archName + ".bc";
+ llvm::Expected<std::unique_ptr<llvm::Module>> module =
loadUKernelBitcodeFile(filename, context);
- if (!bitcode) {
- // Propagate the error to the caller.
- return bitcode;
+ if (!module) {
+ // Error. Propagate to the caller.
+ return module;
}
-
- if (!bitcode.get()) {
- // File not found. This is normal: arch-specific bitcode is optional.
- return bitcode;
+ if (!module.get()) {
+ // File not found. Just means that we don't have bitcode for that
+ // architecture. Return the null module as a success case.
+ return module;
}
-
- // Architecture entry-point functions should be inlinable into base (non-arch)
- // functions, so that their logic selecting specific "tile functions" can
- // evaluate at compile time based on constant argument values in the caller,
- // so that unused tile functions (e.g. for other data types, other CPU feature
- // variants, etc) get DCE'd. In order for these entry points to be inlinable,
- // they must have matching target attributes, so, just like we call
- // removeTargetAttributes in loadUKernelBaseBitcode, we need to do that also
- // here.
- removeTargetAttributes(*bitcode.get());
- return bitcode;
-}
-
-llvm::Expected<std::unique_ptr<llvm::Module>>
-loadUKernelArchBitcode(llvm::TargetMachine *targetMachine,
- llvm::LLVMContext &context) {
- const char *archName =
- getIreeArchNameForTargetTriple(targetMachine->getTargetTriple());
- char filename[64];
- snprintf(filename, sizeof filename, "ukernel_bitcode_%s.bc", archName);
- return loadUKernelBitcodeFile(filename, context);
+ // Ukernels rely fundamentally on always getting inlined, for their logic
+ // to specialize at compile time, including specialization for a specific
+ // combination of data types, a specific SIMD ISA variant, etc. Then all the
+ // unused code paths can get DCE'd. That's why failure to inline a ukernel
+ // can result in a large penalty in both performance and code size.
+ for (auto &func : module.get()->functions()) {
+ func.addFnAttr(llvm::Attribute::AlwaysInline);
+ }
+ return module;
}
} // namespace mlir::iree_compiler::IREE::HAL
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h
index 10e6f6b..d788661 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h
@@ -13,16 +13,8 @@
namespace mlir::iree_compiler::IREE::HAL {
llvm::Expected<std::unique_ptr<llvm::Module>>
-loadUKernelBaseBitcode(llvm::TargetMachine *targetMachine,
- llvm::LLVMContext &context);
-
-llvm::Expected<std::unique_ptr<llvm::Module>>
-loadUKernelArchEntryPointsBitcode(llvm::TargetMachine *targetMachine,
- llvm::LLVMContext &context);
-
-llvm::Expected<std::unique_ptr<llvm::Module>>
-loadUKernelArchBitcode(llvm::TargetMachine *targetMachine,
- llvm::LLVMContext &context);
+loadUKernelBitcode(llvm::TargetMachine *targetMachine,
+ llvm::LLVMContext &context);
} // namespace mlir::iree_compiler::IREE::HAL
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp
index 2ce3076..634e7ed 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp
@@ -484,133 +484,28 @@
}
if (clLinkCPUUKernelBitcode) {
- // Tracks ukernel functions, in order to set their linkage to internal
- // after ukernel bitcode modules are linked but before runLLVMIRPasses, so
- // that unused ukernel code paths get DCE'd. Notes:
- // 1. We can't rely on fixupVisibility to do this, because fixupVisibility
- // is called after runLLVMIRPasses, which is what performs DCE. The
- // reason why fixupVisibility can't be moved before runLLVMIRPasses is
- // that causes all math functions to be DCE'd, as references to them
- // get introduced only later down. The basic difference here between
- // ukernel functions and math functions is that any references to
- // ukernel functions already exist at this point.
- // 2. We can't just set internal linkage right away upon loading ukernel
- // bitcode modules, because some ukernel symbols have to override weak
- // symbols, and that's disabled when linkage is set to internal.
- std::unordered_set<std::string> ukernelFunctions;
-
// Link in ukernel bitcode.
if (hasUkernel(variantOp.getTarget())) {
- auto setAlwaysInline = [&](llvm::Module &module) {
- for (auto &func : module.getFunctionList()) {
- func.addFnAttr(llvm::Attribute::AlwaysInline);
- }
- };
- auto addUkernelFunctions = [&](const llvm::Module &module) {
- for (auto &func : module.getFunctionList()) {
- if (func.isDeclaration()) {
- continue;
- }
- ukernelFunctions.insert(func.getName().str());
- }
- };
-
- llvm::Expected<std::unique_ptr<llvm::Module>> archBitcode =
- loadUKernelArchBitcode(targetMachine.get(), context);
- if (!archBitcode) {
+ llvm::Expected<std::unique_ptr<llvm::Module>> bitcode =
+ loadUKernelBitcode(targetMachine.get(), context);
+ if (!bitcode) {
return mlir::emitError(variantOp.getLoc())
- << "failed to load architecture-specific ukernel bitcode: "
- << llvm::toString(archBitcode.takeError());
+ << "failed to load ukernel bitcode: "
+ << llvm::toString(bitcode.takeError());
}
- llvm::Expected<std::unique_ptr<llvm::Module>> archEntryPointsBitcode =
- loadUKernelArchEntryPointsBitcode(targetMachine.get(), context);
- if (!archEntryPointsBitcode) {
- return mlir::emitError(variantOp.getLoc())
- << "failed to load architecture-specific ukernel entry points "
- "bitcode: "
- << llvm::toString(archEntryPointsBitcode.takeError());
- }
-
- // archBitcode and archEntryPointsBitcode are optional, may be null if
- // there is none for the target architecture. However, they should
- // simultaneously be null or non-null.
- if ((archBitcode.get() == nullptr) !=
- (archEntryPointsBitcode.get() == nullptr)) {
- return mlir::emitError(variantOp.getLoc())
- << "there should be architecture-specific ukernel bit code "
- "if, "
- "and only if there is architecture-specific ukernels entry "
- "points bitcode.";
- }
-
- if (archBitcode.get()) {
- addUkernelFunctions(*archBitcode.get());
- addUkernelFunctions(*archEntryPointsBitcode.get());
-
- // archEntryPointsBitcode contains overrides for weak symbols that
- // will come in the baseBitcode below. So we link it before
- // baseBitcode, with OverrideFromSrc.
- StringRef archEntryPointsBitcodeName =
- archEntryPointsBitcode.get()->getName();
- if (failed(linkBitcodeModule(
- variantOp.getLoc(), moduleLinker, 0, *targetMachine,
- archEntryPointsBitcodeName, std::move(archEntryPointsBitcode),
- setAlwaysInline))) {
- return mlir::emitError(variantOp.getLoc())
- << "failed linking in architecture-specific ukernel entry "
- "points bitcode "
- "for target triple '"
- << targetTriple.str() << "'";
- }
-
- // archEntryPointsBitcode references symbols defined in archBitcode,
- // so we link that now. We can apply LinkOnlyNeeded, since the only
- // purpose of archBitcode is to satisfy references made in
- // archEntryPointsBitcode.
- StringRef archBitcodeName = archBitcode.get()->getName();
+ if (bitcode.get()) {
+ StringRef bitcodeName = bitcode.get()->getName();
if (failed(linkBitcodeModule(variantOp.getLoc(), moduleLinker,
llvm::Linker::LinkOnlyNeeded,
- *targetMachine, archBitcodeName,
- std::move(archBitcode), {}))) {
+ *targetMachine, bitcodeName,
+ std::move(bitcode), {}))) {
return mlir::emitError(variantOp.getLoc())
<< "failed linking in architecture-specific ukernel bitcode "
"for target triple '"
<< targetTriple.str() << "'";
}
}
-
- // The baseBitcode module contains weak symbols for fallbacks,
- // potentially overridden by symbols defined in archEntryPointsBitcode
- // above. So this must be linked after archEntryPointsBitcode. The
- // baseBitcode module contains the actual ukernel entry points as seen
- // from the MLIR module, and its purpose is to satisfy these references,
- // so we can apply LinkOnlyNeeded here.
- llvm::Expected<std::unique_ptr<llvm::Module>> baseBitcode =
- loadUKernelBaseBitcode(targetMachine.get(), context);
- if (baseBitcode) {
- addUkernelFunctions(*baseBitcode.get());
- }
- // Sequence that access before we std::move(baseBitcode)!
- StringRef baseBitcodeName =
- baseBitcode ? baseBitcode.get()->getName() : "";
- if (failed(linkBitcodeModule(
- variantOp.getLoc(), moduleLinker, llvm::Linker::LinkOnlyNeeded,
- *targetMachine, baseBitcodeName, std::move(baseBitcode),
- setAlwaysInline))) {
- return mlir::emitError(variantOp.getLoc())
- << "failed linking in base ukernel bitcode";
- }
- }
-
- // Set internal linkage on all ukernel functions. No new references to
- // ukernels will be created past this point, so any unreferenced ukernel
- // symbol is safe to DCE, which will happen below in runLLVMIRPasses, so
- // we need to set internal linkage before that.
- for (auto &func : llvmModule->getFunctionList()) {
- if (ukernelFunctions.count(func.getName().str())) {
- func.setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage);
- }
}
}
diff --git a/runtime/src/iree/builtins/ukernel/BUILD.bazel b/runtime/src/iree/builtins/ukernel/BUILD.bazel
index 0cbdf64..d713803 100644
--- a/runtime/src/iree/builtins/ukernel/BUILD.bazel
+++ b/runtime/src/iree/builtins/ukernel/BUILD.bazel
@@ -5,7 +5,7 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library")
-load("//build_tools/bazel:iree_bitcode_library.bzl", "iree_bitcode_library")
+load("//build_tools/bazel:iree_bitcode_library.bzl", "iree_bitcode_library", "iree_link_bitcode")
load("//build_tools/embed_data:build_defs.bzl", "c_embed_data")
package(
@@ -47,9 +47,16 @@
],
)
+iree_runtime_cc_library(
+ name = "fallback",
+ srcs = ["fallback.c"],
+ hdrs = internal_headers,
+ visibility = [":__subpackages__"],
+)
+
# Entry points.
iree_runtime_cc_library(
- name = "ukernel_noweak",
+ name = "ukernel",
srcs = [
"mmt4d.c",
"mmt4d_tile.c",
@@ -60,7 +67,6 @@
"unpack_tile.c",
] + internal_headers,
hdrs = ["api.h"],
- visibility = ["//visibility:private"],
deps = [
":exported_bits",
"//runtime/src/iree/base:core_headers",
@@ -68,22 +74,6 @@
],
)
-iree_runtime_cc_library(
- name = "zzz_weak_linklast",
- srcs = ["weak.c"],
- visibility = ["//visibility:private"],
- deps = [":internal_headers"],
-)
-
-iree_runtime_cc_library(
- name = "ukernel",
- hdrs = ["api.h"],
- deps = [
- ":ukernel_noweak",
- ":zzz_weak_linklast",
- ],
-)
-
#===------------------------------------------------------------------------===#
# UKernel bitcode files
#===------------------------------------------------------------------------===#
@@ -95,48 +85,56 @@
inline = True,
)
+# Enumerate all archs for which to generate the build-system logic and targets
+# to build ukernel bitcode. This doesn't necessarily imply actually building
+# bitcode for all these architectures, as we can still have e.g. a CMake option
+# to control which archs to build bitcode for, and even without that, we still
+# have CMake logic omitting bitcode for archs which we are not targeting, so that
+# disabling a target arch also disables the corresponding bitcode.
+# As of early 2024, generic bitcode for an additional arch is ~ 20 kB, while
+# bitcode for archs for which we have dedicated optimized ukernel code is ~ 100 kB.
+bitcode_generic_archs = [
+ "x86_64",
+ "arm_64",
+ "arm_32",
+ "riscv_64",
+ "riscv_32",
+]
+
+# Enumerate all archs for which we have arch-specific dedicated ukernel code,
+# in an arch/${arch} subdirectory.
+bitcode_specific_archs = [
+ "x86_64",
+ "arm_64",
+]
+
[iree_bitcode_library(
- name = "ukernel_bitcode_%sbit_base" % bitness,
+ name = "ukernel_bitcode_generic_%s" % arch,
srcs = [
- # Note: only some of these ukernels are actually used in LLVMCPU, which
- # is the backend that consumes this bitcode, while other ukernels are
- # only used in VMVX, which links to the native build, not bitcode.
- # We still keep all ukernels here to ensure that they all build as
- # bitcode, so we could easily let LLVMCPU use them at any time. This
- # unused bitcode should be only a small inflation of the IREE compiler
- # (where it is embedded as data). It should have no effect on generated
- # modules.
"mmt4d.c",
"mmt4d_tile.c",
- "pack.c",
- "pack_tile.c",
- "query_tile_sizes.c",
- "unpack_tile.c",
- "weak.c",
- ],
- # wasm_X here is a proxy for "some reasonable X-bit architecture". The
- # exact architecture should not matter (other than bitness) as the code here
- # is not architecture-specialized, and ukernel source code is carefully
- # written to be completely stand-alone.
- arch = "wasm_%s" % bitness,
+ ] + ([] if arch in bitcode_specific_archs else ["fallback.c"]),
+ arch = arch,
internal_hdrs = [
":internal_headers_filegroup",
"//runtime/src/iree/schemas:cpu_data_headers_filegroup",
],
-) for bitness in [
- "32",
- "64",
-]]
+) for arch in bitcode_generic_archs]
+
+[iree_link_bitcode(
+ name = "ukernel_bitcode_%s" % arch,
+ bitcode_files = [
+ ":ukernel_bitcode_generic_%s.bc" % arch,
+ ] + ([
+ "arch/%s:ukernel_bitcode_arch_%s.bc" % (arch, arch),
+ ] if arch in bitcode_specific_archs else []),
+) for arch in bitcode_generic_archs]
c_embed_data(
name = "embed_ukernel_bitcode",
srcs = [
- ":ukernel_bitcode_32bit_base.bc",
- ":ukernel_bitcode_64bit_base.bc",
- "//runtime/src/iree/builtins/ukernel/arch/arm_64:ukernel_bitcode_arm_64.bc",
- "//runtime/src/iree/builtins/ukernel/arch/arm_64:ukernel_bitcode_arm_64_entry_points.bc",
- "//runtime/src/iree/builtins/ukernel/arch/x86_64:ukernel_bitcode_x86_64.bc",
- "//runtime/src/iree/builtins/ukernel/arch/x86_64:ukernel_bitcode_x86_64_entry_points.bc",
+ ":ukernel_bitcode_%s.bc" % arch
+ for arch in bitcode_generic_archs
],
c_file_output = "ukernel_bitcode.c",
flatten = True,
diff --git a/runtime/src/iree/builtins/ukernel/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/CMakeLists.txt
index 78c7511..63b4bc7 100644
--- a/runtime/src/iree/builtins/ukernel/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/CMakeLists.txt
@@ -60,7 +60,28 @@
iree_cc_library(
NAME
- ukernel_noweak
+ fallback
+ HDRS
+ "common.h"
+ "exported_bits.h"
+ "mmt4d.h"
+ "mmt4d_internal.h"
+ "pack.h"
+ "pack_internal.h"
+ "query_tile_sizes.h"
+ "query_tile_sizes_internal.h"
+ "unpack.h"
+ "unpack_internal.h"
+ SRCS
+ "fallback.c"
+ DEPS
+
+ PUBLIC
+)
+
+iree_cc_library(
+ NAME
+ ukernel
HDRS
"api.h"
SRCS
@@ -88,75 +109,127 @@
PUBLIC
)
-iree_cc_library(
- NAME
- zzz_weak_linklast
- SRCS
- "weak.c"
- DEPS
- ::internal_headers
- PUBLIC
-)
-
-iree_cc_library(
- NAME
- ukernel
- HDRS
- "api.h"
- DEPS
- ::ukernel_noweak
- ::zzz_weak_linklast
- PUBLIC
-)
-
if(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
iree_bitcode_library(
NAME
- ukernel_bitcode_32bit_base
+ ukernel_bitcode_generic_x86_64
ARCH
- wasm_32
+ x86_64
INTERNAL_HDRS
"${PROJECT_BINARY_DIR}/runtime/src/iree/schemas/cpu_data_headers_filegroup.stamp"
"internal_headers_filegroup.stamp"
SRCS
"mmt4d.c"
"mmt4d_tile.c"
- "pack.c"
- "pack_tile.c"
- "query_tile_sizes.c"
- "unpack_tile.c"
- "weak.c"
)
iree_bitcode_library(
NAME
- ukernel_bitcode_64bit_base
+ ukernel_bitcode_generic_arm_64
ARCH
- wasm_64
+ arm_64
INTERNAL_HDRS
"${PROJECT_BINARY_DIR}/runtime/src/iree/schemas/cpu_data_headers_filegroup.stamp"
"internal_headers_filegroup.stamp"
SRCS
"mmt4d.c"
"mmt4d_tile.c"
- "pack.c"
- "pack_tile.c"
- "query_tile_sizes.c"
- "unpack_tile.c"
- "weak.c"
+)
+
+iree_bitcode_library(
+ NAME
+ ukernel_bitcode_generic_arm_32
+ ARCH
+ arm_32
+ INTERNAL_HDRS
+ "${PROJECT_BINARY_DIR}/runtime/src/iree/schemas/cpu_data_headers_filegroup.stamp"
+ "internal_headers_filegroup.stamp"
+ SRCS
+ "fallback.c"
+ "mmt4d.c"
+ "mmt4d_tile.c"
+)
+
+iree_bitcode_library(
+ NAME
+ ukernel_bitcode_generic_riscv_64
+ ARCH
+ riscv_64
+ INTERNAL_HDRS
+ "${PROJECT_BINARY_DIR}/runtime/src/iree/schemas/cpu_data_headers_filegroup.stamp"
+ "internal_headers_filegroup.stamp"
+ SRCS
+ "fallback.c"
+ "mmt4d.c"
+ "mmt4d_tile.c"
+)
+
+iree_bitcode_library(
+ NAME
+ ukernel_bitcode_generic_riscv_32
+ ARCH
+ riscv_32
+ INTERNAL_HDRS
+ "${PROJECT_BINARY_DIR}/runtime/src/iree/schemas/cpu_data_headers_filegroup.stamp"
+ "internal_headers_filegroup.stamp"
+ SRCS
+ "fallback.c"
+ "mmt4d.c"
+ "mmt4d_tile.c"
+)
+
+iree_link_bitcode(
+ NAME
+ ukernel_bitcode_x86_64
+ SRCS
+ "arch/x86_64/ukernel_bitcode_arch_x86_64.bc"
+ "ukernel_bitcode_generic_x86_64.bc"
+
+)
+
+iree_link_bitcode(
+ NAME
+ ukernel_bitcode_arm_64
+ SRCS
+ "arch/arm_64/ukernel_bitcode_arch_arm_64.bc"
+ "ukernel_bitcode_generic_arm_64.bc"
+
+)
+
+iree_link_bitcode(
+ NAME
+ ukernel_bitcode_arm_32
+ SRCS
+ "ukernel_bitcode_generic_arm_32.bc"
+
+)
+
+iree_link_bitcode(
+ NAME
+ ukernel_bitcode_riscv_64
+ SRCS
+ "ukernel_bitcode_generic_riscv_64.bc"
+
+)
+
+iree_link_bitcode(
+ NAME
+ ukernel_bitcode_riscv_32
+ SRCS
+ "ukernel_bitcode_generic_riscv_32.bc"
+
)
iree_c_embed_data(
NAME
embed_ukernel_bitcode
SRCS
- "${PROJECT_BINARY_DIR}/runtime/src/iree/builtins/ukernel/arch/arm_64/ukernel_bitcode_arm_64.bc"
- "${PROJECT_BINARY_DIR}/runtime/src/iree/builtins/ukernel/arch/arm_64/ukernel_bitcode_arm_64_entry_points.bc"
- "${PROJECT_BINARY_DIR}/runtime/src/iree/builtins/ukernel/arch/x86_64/ukernel_bitcode_x86_64.bc"
- "${PROJECT_BINARY_DIR}/runtime/src/iree/builtins/ukernel/arch/x86_64/ukernel_bitcode_x86_64_entry_points.bc"
- "ukernel_bitcode_32bit_base.bc"
- "ukernel_bitcode_64bit_base.bc"
+ "ukernel_bitcode_arm_32.bc"
+ "ukernel_bitcode_arm_64.bc"
+ "ukernel_bitcode_riscv_32.bc"
+ "ukernel_bitcode_riscv_64.bc"
+ "ukernel_bitcode_x86_64.bc"
DEPS
C_FILE_OUTPUT
diff --git a/runtime/src/iree/builtins/ukernel/arch/BUILD.bazel b/runtime/src/iree/builtins/ukernel/arch/BUILD.bazel
index 03e6756..a222685 100644
--- a/runtime/src/iree/builtins/ukernel/arch/BUILD.bazel
+++ b/runtime/src/iree/builtins/ukernel/arch/BUILD.bazel
@@ -13,7 +13,10 @@
)
# In the CMake build, this library provides the arch-dependent symbols.
-# Not yet implemented on Bazel.
+# For now in Bazel, this just unconditionally uses the fallback to generic code.
iree_runtime_cc_library(
name = "ukernel_arch",
+ deps = [
+ "//runtime/src/iree/builtins/ukernel:fallback",
+ ],
)
diff --git a/runtime/src/iree/builtins/ukernel/arch/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/arch/CMakeLists.txt
index a905838..273fd25 100644
--- a/runtime/src/iree/builtins/ukernel/arch/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/arch/CMakeLists.txt
@@ -6,6 +6,10 @@
iree_add_all_subdirs()
+if (NOT IREE_UK_ARCH_DEPS)
+ set(IREE_UK_ARCH_DEPS "iree::builtins::ukernel::fallback")
+endif()
+
iree_cc_library(
NAME
ukernel_arch
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/BUILD.bazel b/runtime/src/iree/builtins/ukernel/arch/arm_64/BUILD.bazel
index 3898b9f..1103c01 100644
--- a/runtime/src/iree/builtins/ukernel/arch/arm_64/BUILD.bazel
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/BUILD.bazel
@@ -30,44 +30,30 @@
"common_arm_64.h",
"common_arm_64_entry_point.h",
"mmt4d_arm_64_internal.h",
- "pack_arm_64_internal.h",
- "unpack_arm_64_internal.h",
"//runtime/src/iree/builtins/ukernel:internal_headers_filegroup",
"//runtime/src/iree/schemas:cpu_data_headers_filegroup",
]
iree_bitcode_library(
- name = "ukernel_bitcode_arm_64_entry_points",
+ name = "ukernel_bitcode_arch_arm_64_entry_points",
srcs = [
"mmt4d_arm_64_entry_point.c",
- "pack_arm_64_entry_point.c",
- "query_tile_sizes_arm_64_entry_point.c",
- "unpack_arm_64_entry_point.c",
- ],
- # wasm_64 here is a proxy for "some reasonable 64-bit architecture". This
- # should match the `ukernel_bitcode_64bit_base` bitcode library, because we
- # want this code inlined into the ukernel entry points so that code path
- # selection logic can evaluate at compile time and unused code paths can be
- # DCE'd. The entry points don't need to be architecture-specialized, all
- # they do is return function pointers to the actual architecture-specialized
- # symbols defined in the other iree_bitcode_library's below.
- arch = "wasm_64",
- internal_hdrs = UKERNEL_ARM_64_INTERNAL_HEADERS,
-)
-
-iree_bitcode_library(
- name = "ukernel_bitcode_arm_64_base",
- srcs = [
- "mmt4d_arm_64.c",
- "pack_arm_64.c",
- "unpack_arm_64.c",
],
arch = "arm_64",
internal_hdrs = UKERNEL_ARM_64_INTERNAL_HEADERS,
)
iree_bitcode_library(
- name = "ukernel_bitcode_arm_64_fullfp16",
+ name = "ukernel_bitcode_arch_arm_64_base",
+ srcs = [
+ "mmt4d_arm_64_base.c",
+ ],
+ arch = "arm_64",
+ internal_hdrs = UKERNEL_ARM_64_INTERNAL_HEADERS,
+)
+
+iree_bitcode_library(
+ name = "ukernel_bitcode_arch_arm_64_fullfp16",
srcs = ["mmt4d_arm_64_fullfp16.c"],
arch = "arm_64",
copts = ["-march=armv8.2-a+fp16"],
@@ -75,7 +61,7 @@
)
iree_bitcode_library(
- name = "ukernel_bitcode_arm_64_fp16fml",
+ name = "ukernel_bitcode_arch_arm_64_fp16fml",
srcs = ["mmt4d_arm_64_fp16fml.c"],
arch = "arm_64",
copts = ["-march=armv8.2-a+fp16fml"],
@@ -83,7 +69,7 @@
)
iree_bitcode_library(
- name = "ukernel_bitcode_arm_64_bf16",
+ name = "ukernel_bitcode_arch_arm_64_bf16",
srcs = ["mmt4d_arm_64_bf16.c"],
arch = "arm_64",
copts = ["-march=armv8.2-a+bf16"],
@@ -91,7 +77,7 @@
)
iree_bitcode_library(
- name = "ukernel_bitcode_arm_64_dotprod",
+ name = "ukernel_bitcode_arch_arm_64_dotprod",
srcs = ["mmt4d_arm_64_dotprod.c"],
arch = "arm_64",
copts = ["-march=armv8.2-a+dotprod"],
@@ -99,7 +85,7 @@
)
iree_bitcode_library(
- name = "ukernel_bitcode_arm_64_i8mm",
+ name = "ukernel_bitcode_arch_arm_64_i8mm",
srcs = ["mmt4d_arm_64_i8mm.c"],
arch = "arm_64",
copts = ["-march=armv8.2-a+i8mm"],
@@ -107,22 +93,22 @@
)
iree_link_bitcode(
- name = "ukernel_bitcode_arm_64",
+ name = "ukernel_bitcode_arch_arm_64",
bitcode_files = [
- "ukernel_bitcode_arm_64_base.bc",
- "ukernel_bitcode_arm_64_fullfp16.bc",
- "ukernel_bitcode_arm_64_fp16fml.bc",
- "ukernel_bitcode_arm_64_bf16.bc",
- "ukernel_bitcode_arm_64_dotprod.bc",
- "ukernel_bitcode_arm_64_i8mm.bc",
+ "ukernel_bitcode_arch_arm_64_entry_points.bc",
+ "ukernel_bitcode_arch_arm_64_base.bc",
+ "ukernel_bitcode_arch_arm_64_fullfp16.bc",
+ "ukernel_bitcode_arch_arm_64_fp16fml.bc",
+ "ukernel_bitcode_arch_arm_64_bf16.bc",
+ "ukernel_bitcode_arch_arm_64_dotprod.bc",
+ "ukernel_bitcode_arch_arm_64_i8mm.bc",
],
)
iree_cmake_extra_content(
content = """
elseif(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
-iree_make_empty_file("${CMAKE_CURRENT_BINARY_DIR}/ukernel_bitcode_arm_64.bc")
-iree_make_empty_file("${CMAKE_CURRENT_BINARY_DIR}/ukernel_bitcode_arm_64_entry_points.bc")
+iree_make_empty_file("${CMAKE_CURRENT_BINARY_DIR}/ukernel_bitcode_arch_arm_64.bc")
endif() # _IREE_UKERNEL_BITCODE_BUILD_ARM_64
""",
inline = True,
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt
index fd8f1b6..93019fe 100644
--- a/runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt
@@ -15,27 +15,22 @@
iree_bitcode_library(
NAME
- ukernel_bitcode_arm_64_entry_points
+ ukernel_bitcode_arch_arm_64_entry_points
ARCH
- wasm_64
+ arm_64
INTERNAL_HDRS
"${PROJECT_BINARY_DIR}/runtime/src/iree/builtins/ukernel/internal_headers_filegroup.stamp"
"${PROJECT_BINARY_DIR}/runtime/src/iree/schemas/cpu_data_headers_filegroup.stamp"
"common_arm_64.h"
"common_arm_64_entry_point.h"
"mmt4d_arm_64_internal.h"
- "pack_arm_64_internal.h"
- "unpack_arm_64_internal.h"
SRCS
"mmt4d_arm_64_entry_point.c"
- "pack_arm_64_entry_point.c"
- "query_tile_sizes_arm_64_entry_point.c"
- "unpack_arm_64_entry_point.c"
)
iree_bitcode_library(
NAME
- ukernel_bitcode_arm_64_base
+ ukernel_bitcode_arch_arm_64_base
ARCH
arm_64
INTERNAL_HDRS
@@ -44,17 +39,13 @@
"common_arm_64.h"
"common_arm_64_entry_point.h"
"mmt4d_arm_64_internal.h"
- "pack_arm_64_internal.h"
- "unpack_arm_64_internal.h"
SRCS
- "mmt4d_arm_64.c"
- "pack_arm_64.c"
- "unpack_arm_64.c"
+ "mmt4d_arm_64_base.c"
)
iree_bitcode_library(
NAME
- ukernel_bitcode_arm_64_fullfp16
+ ukernel_bitcode_arch_arm_64_fullfp16
ARCH
arm_64
INTERNAL_HDRS
@@ -63,8 +54,6 @@
"common_arm_64.h"
"common_arm_64_entry_point.h"
"mmt4d_arm_64_internal.h"
- "pack_arm_64_internal.h"
- "unpack_arm_64_internal.h"
SRCS
"mmt4d_arm_64_fullfp16.c"
COPTS
@@ -73,7 +62,7 @@
iree_bitcode_library(
NAME
- ukernel_bitcode_arm_64_fp16fml
+ ukernel_bitcode_arch_arm_64_fp16fml
ARCH
arm_64
INTERNAL_HDRS
@@ -82,8 +71,6 @@
"common_arm_64.h"
"common_arm_64_entry_point.h"
"mmt4d_arm_64_internal.h"
- "pack_arm_64_internal.h"
- "unpack_arm_64_internal.h"
SRCS
"mmt4d_arm_64_fp16fml.c"
COPTS
@@ -92,7 +79,7 @@
iree_bitcode_library(
NAME
- ukernel_bitcode_arm_64_bf16
+ ukernel_bitcode_arch_arm_64_bf16
ARCH
arm_64
INTERNAL_HDRS
@@ -101,8 +88,6 @@
"common_arm_64.h"
"common_arm_64_entry_point.h"
"mmt4d_arm_64_internal.h"
- "pack_arm_64_internal.h"
- "unpack_arm_64_internal.h"
SRCS
"mmt4d_arm_64_bf16.c"
COPTS
@@ -111,7 +96,7 @@
iree_bitcode_library(
NAME
- ukernel_bitcode_arm_64_dotprod
+ ukernel_bitcode_arch_arm_64_dotprod
ARCH
arm_64
INTERNAL_HDRS
@@ -120,8 +105,6 @@
"common_arm_64.h"
"common_arm_64_entry_point.h"
"mmt4d_arm_64_internal.h"
- "pack_arm_64_internal.h"
- "unpack_arm_64_internal.h"
SRCS
"mmt4d_arm_64_dotprod.c"
COPTS
@@ -130,7 +113,7 @@
iree_bitcode_library(
NAME
- ukernel_bitcode_arm_64_i8mm
+ ukernel_bitcode_arch_arm_64_i8mm
ARCH
arm_64
INTERNAL_HDRS
@@ -139,8 +122,6 @@
"common_arm_64.h"
"common_arm_64_entry_point.h"
"mmt4d_arm_64_internal.h"
- "pack_arm_64_internal.h"
- "unpack_arm_64_internal.h"
SRCS
"mmt4d_arm_64_i8mm.c"
COPTS
@@ -149,20 +130,20 @@
iree_link_bitcode(
NAME
- ukernel_bitcode_arm_64
+ ukernel_bitcode_arch_arm_64
SRCS
- "ukernel_bitcode_arm_64_base.bc"
- "ukernel_bitcode_arm_64_bf16.bc"
- "ukernel_bitcode_arm_64_dotprod.bc"
- "ukernel_bitcode_arm_64_fp16fml.bc"
- "ukernel_bitcode_arm_64_fullfp16.bc"
- "ukernel_bitcode_arm_64_i8mm.bc"
+ "ukernel_bitcode_arch_arm_64_base.bc"
+ "ukernel_bitcode_arch_arm_64_bf16.bc"
+ "ukernel_bitcode_arch_arm_64_dotprod.bc"
+ "ukernel_bitcode_arch_arm_64_entry_points.bc"
+ "ukernel_bitcode_arch_arm_64_fp16fml.bc"
+ "ukernel_bitcode_arch_arm_64_fullfp16.bc"
+ "ukernel_bitcode_arch_arm_64_i8mm.bc"
)
elseif(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
-iree_make_empty_file("${CMAKE_CURRENT_BINARY_DIR}/ukernel_bitcode_arm_64.bc")
-iree_make_empty_file("${CMAKE_CURRENT_BINARY_DIR}/ukernel_bitcode_arm_64_entry_points.bc")
+iree_make_empty_file("${CMAKE_CURRENT_BINARY_DIR}/ukernel_bitcode_arch_arm_64.bc")
endif() # _IREE_UKERNEL_BITCODE_BUILD_ARM_64
### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
@@ -290,12 +271,12 @@
arm_64
SRCS
"mmt4d_arm_64_entry_point.c"
- "mmt4d_arm_64.c"
+ "mmt4d_arm_64_base.c"
"pack_arm_64_entry_point.c"
- "pack_arm_64.c"
+ "pack_arm_64_base.c"
"query_tile_sizes_arm_64_entry_point.c"
"unpack_arm_64_entry_point.c"
- "unpack_arm_64.c"
+ "unpack_arm_64_base.c"
DEPS
::common_arm_64
iree::base::core_headers
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.c b/runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64_base.c
similarity index 100%
rename from runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64.c
rename to runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64_base.c
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64.c b/runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64_base.c
similarity index 100%
rename from runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64.c
rename to runtime/src/iree/builtins/ukernel/arch/arm_64/pack_arm_64_base.c
diff --git a/runtime/src/iree/builtins/ukernel/arch/arm_64/unpack_arm_64.c b/runtime/src/iree/builtins/ukernel/arch/arm_64/unpack_arm_64_base.c
similarity index 100%
rename from runtime/src/iree/builtins/ukernel/arch/arm_64/unpack_arm_64.c
rename to runtime/src/iree/builtins/ukernel/arch/arm_64/unpack_arm_64_base.c
diff --git a/runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel b/runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel
index 88a4cc8..ff6f7b1 100644
--- a/runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel
+++ b/runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel
@@ -30,28 +30,16 @@
"common_x86_64.h",
"common_x86_64_entry_point.h",
"mmt4d_x86_64_internal.h",
- "pack_x86_64_internal.h",
- "unpack_x86_64_internal.h",
"//runtime/src/iree/builtins/ukernel:internal_headers_filegroup",
"//runtime/src/iree/schemas:cpu_data_headers_filegroup",
]
iree_bitcode_library(
- name = "ukernel_bitcode_x86_64_entry_points",
+ name = "ukernel_bitcode_arch_x86_64_entry_points",
srcs = [
"mmt4d_x86_64_entry_point.c",
- "pack_x86_64_entry_point.c",
- "query_tile_sizes_x86_64_entry_point.c",
- "unpack_x86_64_entry_point.c",
],
- # wasm_64 here is a proxy for "some reasonable 64-bit architecture". This
- # should match the `ukernel_bitcode_64bit_base` bitcode library, because we
- # want this code inlined into the ukernel entry points so that code path
- # selection logic can evaluate at compile time and unused code paths can be
- # DCE'd. The entry points don't need to be architecture-specialized, all
- # they do is return function pointers to the actual architecture-specialized
- # symbols defined in the other iree_bitcode_library's below.
- arch = "wasm_64",
+ arch = "x86_64",
internal_hdrs = UKERNEL_X86_64_INTERNAL_HEADERS,
)
@@ -63,11 +51,9 @@
]
iree_bitcode_library(
- name = "ukernel_bitcode_x86_64_avx2_fma",
+ name = "ukernel_bitcode_arch_x86_64_avx2_fma",
srcs = [
"mmt4d_x86_64_avx2_fma.c",
- "pack_x86_64_avx2_fma.c",
- "unpack_x86_64_avx2_fma.c",
],
arch = "x86_64",
copts = UKERNEL_X86_64_AVX2_FMA_COPTS,
@@ -83,11 +69,9 @@
]
iree_bitcode_library(
- name = "ukernel_bitcode_x86_64_avx512_base",
+ name = "ukernel_bitcode_arch_x86_64_avx512_base",
srcs = [
"mmt4d_x86_64_avx512_base.c",
- "pack_x86_64_avx512_base.c",
- "unpack_x86_64_avx512_base.c",
],
arch = "x86_64",
copts = UKERNEL_X86_64_AVX512_BASE_COPTS,
@@ -99,7 +83,7 @@
]
iree_bitcode_library(
- name = "ukernel_bitcode_x86_64_avx512_vnni",
+ name = "ukernel_bitcode_arch_x86_64_avx512_vnni",
srcs = [
"mmt4d_x86_64_avx512_vnni.c",
],
@@ -113,7 +97,7 @@
]
iree_bitcode_library(
- name = "ukernel_bitcode_x86_64_avx512_bf16",
+ name = "ukernel_bitcode_arch_x86_64_avx512_bf16",
srcs = [
"mmt4d_x86_64_avx512_bf16.c",
],
@@ -123,20 +107,20 @@
)
iree_link_bitcode(
- name = "ukernel_bitcode_x86_64",
+ name = "ukernel_bitcode_arch_x86_64",
bitcode_files = [
- "ukernel_bitcode_x86_64_avx2_fma.bc",
- "ukernel_bitcode_x86_64_avx512_base.bc",
- "ukernel_bitcode_x86_64_avx512_vnni.bc",
- "ukernel_bitcode_x86_64_avx512_bf16.bc",
+ "ukernel_bitcode_arch_x86_64_entry_points.bc",
+ "ukernel_bitcode_arch_x86_64_avx2_fma.bc",
+ "ukernel_bitcode_arch_x86_64_avx512_base.bc",
+ "ukernel_bitcode_arch_x86_64_avx512_vnni.bc",
+ "ukernel_bitcode_arch_x86_64_avx512_bf16.bc",
],
)
iree_cmake_extra_content(
content = """
elseif(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
-iree_make_empty_file("${CMAKE_CURRENT_BINARY_DIR}/ukernel_bitcode_x86_64.bc")
-iree_make_empty_file("${CMAKE_CURRENT_BINARY_DIR}/ukernel_bitcode_x86_64_entry_points.bc")
+iree_make_empty_file("${CMAKE_CURRENT_BINARY_DIR}/ukernel_bitcode_arch_x86_64.bc")
endif() # _IREE_UKERNEL_BITCODE_BUILD_X86_64
""",
inline = True,
diff --git a/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt
index 2d5b155..18c8274 100644
--- a/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt
@@ -15,27 +15,7 @@
iree_bitcode_library(
NAME
- ukernel_bitcode_x86_64_entry_points
- ARCH
- wasm_64
- INTERNAL_HDRS
- "${PROJECT_BINARY_DIR}/runtime/src/iree/builtins/ukernel/internal_headers_filegroup.stamp"
- "${PROJECT_BINARY_DIR}/runtime/src/iree/schemas/cpu_data_headers_filegroup.stamp"
- "common_x86_64.h"
- "common_x86_64_entry_point.h"
- "mmt4d_x86_64_internal.h"
- "pack_x86_64_internal.h"
- "unpack_x86_64_internal.h"
- SRCS
- "mmt4d_x86_64_entry_point.c"
- "pack_x86_64_entry_point.c"
- "query_tile_sizes_x86_64_entry_point.c"
- "unpack_x86_64_entry_point.c"
-)
-
-iree_bitcode_library(
- NAME
- ukernel_bitcode_x86_64_avx2_fma
+ ukernel_bitcode_arch_x86_64_entry_points
ARCH
x86_64
INTERNAL_HDRS
@@ -44,12 +24,23 @@
"common_x86_64.h"
"common_x86_64_entry_point.h"
"mmt4d_x86_64_internal.h"
- "pack_x86_64_internal.h"
- "unpack_x86_64_internal.h"
+ SRCS
+ "mmt4d_x86_64_entry_point.c"
+)
+
+iree_bitcode_library(
+ NAME
+ ukernel_bitcode_arch_x86_64_avx2_fma
+ ARCH
+ x86_64
+ INTERNAL_HDRS
+ "${PROJECT_BINARY_DIR}/runtime/src/iree/builtins/ukernel/internal_headers_filegroup.stamp"
+ "${PROJECT_BINARY_DIR}/runtime/src/iree/schemas/cpu_data_headers_filegroup.stamp"
+ "common_x86_64.h"
+ "common_x86_64_entry_point.h"
+ "mmt4d_x86_64_internal.h"
SRCS
"mmt4d_x86_64_avx2_fma.c"
- "pack_x86_64_avx2_fma.c"
- "unpack_x86_64_avx2_fma.c"
COPTS
"-mavx"
"-mavx2"
@@ -59,7 +50,7 @@
iree_bitcode_library(
NAME
- ukernel_bitcode_x86_64_avx512_base
+ ukernel_bitcode_arch_x86_64_avx512_base
ARCH
x86_64
INTERNAL_HDRS
@@ -68,12 +59,8 @@
"common_x86_64.h"
"common_x86_64_entry_point.h"
"mmt4d_x86_64_internal.h"
- "pack_x86_64_internal.h"
- "unpack_x86_64_internal.h"
SRCS
"mmt4d_x86_64_avx512_base.c"
- "pack_x86_64_avx512_base.c"
- "unpack_x86_64_avx512_base.c"
COPTS
"-mavx"
"-mavx2"
@@ -88,7 +75,7 @@
iree_bitcode_library(
NAME
- ukernel_bitcode_x86_64_avx512_vnni
+ ukernel_bitcode_arch_x86_64_avx512_vnni
ARCH
x86_64
INTERNAL_HDRS
@@ -97,8 +84,6 @@
"common_x86_64.h"
"common_x86_64_entry_point.h"
"mmt4d_x86_64_internal.h"
- "pack_x86_64_internal.h"
- "unpack_x86_64_internal.h"
SRCS
"mmt4d_x86_64_avx512_vnni.c"
COPTS
@@ -116,7 +101,7 @@
iree_bitcode_library(
NAME
- ukernel_bitcode_x86_64_avx512_bf16
+ ukernel_bitcode_arch_x86_64_avx512_bf16
ARCH
x86_64
INTERNAL_HDRS
@@ -125,8 +110,6 @@
"common_x86_64.h"
"common_x86_64_entry_point.h"
"mmt4d_x86_64_internal.h"
- "pack_x86_64_internal.h"
- "unpack_x86_64_internal.h"
SRCS
"mmt4d_x86_64_avx512_bf16.c"
COPTS
@@ -144,18 +127,18 @@
iree_link_bitcode(
NAME
- ukernel_bitcode_x86_64
+ ukernel_bitcode_arch_x86_64
SRCS
- "ukernel_bitcode_x86_64_avx2_fma.bc"
- "ukernel_bitcode_x86_64_avx512_base.bc"
- "ukernel_bitcode_x86_64_avx512_bf16.bc"
- "ukernel_bitcode_x86_64_avx512_vnni.bc"
+ "ukernel_bitcode_arch_x86_64_avx2_fma.bc"
+ "ukernel_bitcode_arch_x86_64_avx512_base.bc"
+ "ukernel_bitcode_arch_x86_64_avx512_bf16.bc"
+ "ukernel_bitcode_arch_x86_64_avx512_vnni.bc"
+ "ukernel_bitcode_arch_x86_64_entry_points.bc"
)
elseif(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
-iree_make_empty_file("${CMAKE_CURRENT_BINARY_DIR}/ukernel_bitcode_x86_64.bc")
-iree_make_empty_file("${CMAKE_CURRENT_BINARY_DIR}/ukernel_bitcode_x86_64_entry_points.bc")
+iree_make_empty_file("${CMAKE_CURRENT_BINARY_DIR}/ukernel_bitcode_arch_x86_64.bc")
endif() # _IREE_UKERNEL_BITCODE_BUILD_X86_64
### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/builtins/ukernel/common.h b/runtime/src/iree/builtins/ukernel/common.h
index 21f0342..afa2563 100644
--- a/runtime/src/iree/builtins/ukernel/common.h
+++ b/runtime/src/iree/builtins/ukernel/common.h
@@ -175,14 +175,6 @@
#define IREE_UK_ATTRIBUTE_ALIGNED(N)
#endif // IREE_UK_HAVE_ATTRIBUTE(noinline)
-#if IREE_UK_HAVE_ATTRIBUTE(weak) || defined(IREE_UK_COMPILER_GCC)
-#define IREE_UK_WEAK __attribute__((weak))
-#define IREE_UK_HAVE_WEAK 1
-#else
-#define IREE_UK_WEAK
-#define IREE_UK_HAVE_WEAK 0
-#endif // IREE_UK_HAVE_ATTRIBUTE(noinline)
-
#if IREE_UK_HAVE_ATTRIBUTE(maybe_unused) && defined(IREE_UK_COMPILER_CLANG)
#define IREE_UK_ATTRIBUTE_UNUSED __attribute__((maybe_unused))
#elif IREE_UK_HAVE_ATTRIBUTE(unused) || defined(IREE_UK_COMPILER_CLANG_OR_GCC)
diff --git a/runtime/src/iree/builtins/ukernel/fallback.c b/runtime/src/iree/builtins/ukernel/fallback.c
new file mode 100644
index 0000000..eaee5b2
--- /dev/null
+++ b/runtime/src/iree/builtins/ukernel/fallback.c
@@ -0,0 +1,31 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/builtins/ukernel/mmt4d_internal.h"
+#include "iree/builtins/ukernel/pack_internal.h"
+#include "iree/builtins/ukernel/query_tile_sizes_internal.h"
+#include "iree/builtins/ukernel/unpack_internal.h"
+
+iree_uk_mmt4d_tile_func_t iree_uk_mmt4d_select_tile_func_arch(
+ const iree_uk_mmt4d_params_t* params) {
+ return 0;
+}
+
+iree_uk_pack_tile_func_t iree_uk_pack_select_tile_func_arch(
+ const iree_uk_pack_params_t* params) {
+ return 0;
+}
+
+iree_uk_unpack_tile_func_t iree_uk_unpack_select_tile_func_arch(
+ const iree_uk_unpack_params_t* params) {
+ return 0;
+}
+
+bool iree_uk_query_matmul_tile_sizes_arch(
+ const iree_uk_query_tile_sizes_2d_params_t* params,
+ iree_uk_matmul_tile_sizes_t* out_matmul_tile_sizes) {
+ return false;
+}
diff --git a/runtime/src/iree/builtins/ukernel/weak.c b/runtime/src/iree/builtins/ukernel/weak.c
deleted file mode 100644
index 6ac9840..0000000
--- a/runtime/src/iree/builtins/ukernel/weak.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright 2023 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include "iree/builtins/ukernel/mmt4d_internal.h"
-#include "iree/builtins/ukernel/pack_internal.h"
-#include "iree/builtins/ukernel/query_tile_sizes_internal.h"
-#include "iree/builtins/ukernel/unpack_internal.h"
-
-#if defined(IREE_UK_HAVE_WEAK)
-
-IREE_UK_WEAK iree_uk_mmt4d_tile_func_t
-iree_uk_mmt4d_select_tile_func_arch(const iree_uk_mmt4d_params_t* params) {
- return 0;
-}
-
-IREE_UK_WEAK iree_uk_pack_tile_func_t
-iree_uk_pack_select_tile_func_arch(const iree_uk_pack_params_t* params) {
- return 0;
-}
-
-IREE_UK_WEAK iree_uk_unpack_tile_func_t
-iree_uk_unpack_select_tile_func_arch(const iree_uk_unpack_params_t* params) {
- return 0;
-}
-
-IREE_UK_WEAK bool iree_uk_query_matmul_tile_sizes_arch(
- const iree_uk_query_tile_sizes_2d_params_t* params,
- iree_uk_matmul_tile_sizes_t* out_matmul_tile_sizes) {
- return false;
-}
-
-#endif // defined(IREE_UK_HAVE_WEAK)