CPU ukernels as bitcode (x86-only for now) (#13460)
While #13433 enabled the use of micro kernels within codegen backends
using the plugin mechanism, here the ukernel code is compiled into a
bitcode library. This bitcode library is linked with the generated code
at compilation time. The lowering to LLVM inlines the exact micro kernel
needed from the micro kernel library for a particular architecture. To
enable this end-to-end flow, the following changes are needed
- Add an enum attribute to HAL : `IREE::HAL::CallingConvention` that
allows specifying what calling convention to use for the micro kernel
call. `Default` leaves the params as is, `ParameterStruct` packs all the
returns and arguments into a parameter struct to mimic ABIs like
https://github.com/openxla/iree/blob/6cf092d022810d4347353b23e5ce2688a166dd67/runtime/src/iree/builtins/ukernel/mmt4d.h#L16
- Couple of patterns are added to `ConvertToLLVM` pass, to handle the
lowering of the function definition and function call, in keeping with
the specified ABI
- Allow specification of `hal.import.fields` to specify `processor_data`
and `processor_id` on ukernel function defn. This then generates the
code to forward this information to the microkernels (similar to what is
done for external calls using the plugin mechanism)
- Propagate the target CPU features in `hal.executable.target` of the
dispatch into the micro kernel call. This allows the LLVM passes to walk
through the branching used to pick the right micro kernel function and
effectively inline that.
Co-authored-by: Benoit Jacob <benoitjacob@google.com>
diff --git a/build_tools/bazel/iree_bitcode_library.bzl b/build_tools/bazel/iree_bitcode_library.bzl
index a38fa45..b4f9b38 100644
--- a/build_tools/bazel/iree_bitcode_library.bzl
+++ b/build_tools/bazel/iree_bitcode_library.bzl
@@ -40,23 +40,25 @@
for bitcode_src in srcs:
bitcode_out = "%s_%s.bc" % (name, bitcode_src)
bitcode_files.append(bitcode_out)
+ system_headers = ["immintrin.h"]
native.genrule(
name = "gen_%s" % (bitcode_out),
- srcs = [bitcode_src],
+ srcs = [bitcode_src] + hdrs + [builtin_headers_dep],
outs = [bitcode_out],
cmd = " && ".join([
" ".join([
"$(location %s)" % (clang_tool),
- "-isystem $(BINDIR)/%s" % (builtin_headers_path),
+ "-isystem $(BINDIR)/%s" % builtin_headers_path,
" ".join(copts),
" ".join(["-D%s" % (define) for define in defines]),
+ " ".join(["-I $(BINDIR)/runtime/src"]),
+ " ".join(["-I runtime/src"]),
"-o $(location %s)" % (bitcode_out),
"$(location %s)" % (bitcode_src),
]),
]),
- tools = hdrs + data + [
+ tools = data + [
clang_tool,
- builtin_headers_dep,
],
message = "Compiling %s to %s..." % (bitcode_src, bitcode_out),
output_to_bindir = 1,
@@ -81,3 +83,40 @@
output_to_bindir = 1,
**kwargs
)
+
+def iree_link_bitcode(
+ name,
+ bitcode_files,
+ out = None,
+ link_tool = "@llvm-project//llvm:llvm-link",
+ **kwargs):
+ """Builds an LLVM bitcode library from an input file via clang.
+
+ Args:
+ name: Name of the target.
+ bitcode_files: bitcode files to link together.
+ out: output file name (defaults to name.bc).
+ link_tool: llvm-link tool used for linking bitcode files.
+ **kwargs: any additional attributes to pass to the underlying rules.
+ """
+
+ bitcode_files_qualified = [(("//" + native.package_name() + "/" + b) if b.count(":") else b) for b in bitcode_files]
+
+ if not out:
+ out = "%s.bc" % (name)
+ native.genrule(
+ name = name,
+ srcs = bitcode_files_qualified,
+ outs = [out],
+ cmd = " && ".join([
+ " ".join([
+ "$(location %s)" % (link_tool),
+ "-o $(location %s)" % (out),
+ " ".join(["$(locations %s)" % (src) for src in bitcode_files_qualified]),
+ ]),
+ ]),
+ tools = [link_tool],
+ message = "Linking bitcode library %s to %s..." % (name, out),
+ output_to_bindir = 1,
+ **kwargs
+ )
diff --git a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
index 4cbc856..d10af72 100644
--- a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
+++ b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
@@ -515,6 +515,20 @@
f"{testonly_block}"
f" PUBLIC\n)\n\n")
+ def iree_link_bitcode(self, name, bitcode_files, data=None, testonly=None):
+ name_block = self._convert_string_arg_block("NAME", name, quote=False)
+ bitcode_files_block = self._convert_srcs_block(
+ [f.replace(":", "/") for f in bitcode_files])
+ data_block = self._convert_target_list_block("DATA", data)
+ testonly_block = self._convert_option_block("TESTONLY", testonly)
+
+ self._converter.body += (f"iree_link_bitcode(\n"
+ f"{name_block}"
+ f"{bitcode_files_block}"
+ f"{data_block}"
+ f"{testonly_block}"
+ f" PUBLIC\n)\n\n")
+
def iree_bytecode_module(self,
name,
src,
diff --git a/build_tools/cmake/iree_bitcode_library.cmake b/build_tools/cmake/iree_bitcode_library.cmake
index 0f17151..05ee273 100644
--- a/build_tools/cmake/iree_bitcode_library.cmake
+++ b/build_tools/cmake/iree_bitcode_library.cmake
@@ -34,10 +34,6 @@
set(_CLANG_TOOL "$<TARGET_FILE:${IREE_CLANG_TARGET}>")
set(_LINK_TOOL "$<TARGET_FILE:${IREE_LLVM_LINK_TARGET}>")
- # These are copied as part of the clang build; we could allow the user to
- # override this but it should be harmless.
- set(_BUILTIN_HEADERS_PATH "${IREE_BINARY_DIR}/third_party/llvm-project/llvm/lib/clang/${CLANG_VERSION_MAJOR}/include/")
-
if(_RULE_TESTONLY AND NOT IREE_BUILD_TESTS)
return()
endif()
@@ -48,7 +44,21 @@
set(_OUT "${_RULE_NAME}.bc")
endif()
- set(_ARGS "-isystem ${_BUILTIN_HEADERS_PATH}")
+ # We need CLANG_VERSION_MAJOR to set up include directories. Unfortunately,
+ # Clang's own CMakeLists do not expose CLANG_VERSION_MAJOR to PARENT_SCOPE.
+ # Likewise with LLVM_VERSION_MAJOR. However, CLANG_EXECUTABLE_VERSION is
+ # CACHE'd, so we can access it, and it currently has the same value.
+ set(_CLANG_VERSION_MAJOR "${CLANG_EXECUTABLE_VERSION}")
+
+ # These are copied as part of the clang build; we could allow the user to
+ # override this but it should be harmless.
+ set(_BUILTIN_HEADERS_PATH "${IREE_BINARY_DIR}/llvm-project/lib/clang/${_CLANG_VERSION_MAJOR}/include/")
+
+ set(_ARGS "")
+ list(APPEND _ARGS "-isystem" "${_BUILTIN_HEADERS_PATH}")
+ list(APPEND _ARGS "-I" "${IREE_SOURCE_DIR}/runtime/src")
+ list(APPEND _ARGS "-I" "${IREE_BINARY_DIR}/runtime/src")
+
list(APPEND _ARGS "${_RULE_COPTS}")
foreach(_DEFINE ${_RULE_DEFINES})
list(APPEND _ARGS "-D${_DEFINE}")
@@ -70,6 +80,7 @@
"${_BITCODE_FILE}"
DEPENDS
${_CLANG_TOOL}
+ ${_LINK_TOOL}
${_BITCODE_SRC}
COMMENT
"Compiling ${_BITCODE_SRC} to ${_BITCODE_FILE}"
@@ -101,3 +112,66 @@
DEPENDS "${_OUT}"
)
endfunction()
+
+# iree_link_bitcode()
+#
+# Builds an LLVM bitcode library from an input file via clang
+#
+# Parameters:
+# NAME: Name of target (see Note).
+# SRCS: Source files to pass to clang.
+# HDRS: Additional headers included by the source files.
+# COPTS: additional flags to pass to clang.
+# DEFINES: Preprocessor definitions to pass to clang.
+# DATA: Additional data required during compilation.
+# OUT: Output file name (defaults to NAME.bc).
+# PUBLIC: Add this so that this library will be exported under ${PACKAGE}::
+# Also in IDE, target will appear in ${PACKAGE} folder while non PUBLIC
+# will be in ${PACKAGE}/internal.
+# TESTONLY: When added, this target will only be built if IREE_BUILD_TESTS=ON.
+function(iree_link_bitcode)
+ cmake_parse_arguments(
+ _RULE
+ "PUBLIC;TESTONLY"
+ "NAME;OUT"
+ "SRCS;DEFINES;DATA"
+ ${ARGN}
+ )
+
+ set(_LINK_TOOL "$<TARGET_FILE:${IREE_LLVM_LINK_TARGET}>")
+
+ if(_RULE_TESTONLY AND NOT IREE_BUILD_TESTS)
+ return()
+ endif()
+
+ if(DEFINED _RULE_OUT)
+ set(_OUT "${_RULE_OUT}")
+ else()
+ set(_OUT "${_RULE_NAME}.bc")
+ endif()
+
+ set(_BITCODE_FILES "${_RULE_SRCS}")
+
+ add_custom_command(
+ OUTPUT
+ ${_OUT}
+ COMMAND
+ ${_LINK_TOOL}
+ ${_BITCODE_FILES}
+ "-o"
+ "${_OUT}"
+ DEPENDS
+ ${_LINK_TOOL}
+ ${_BITCODE_FILES}
+ COMMENT
+ "Linking bitcode to ${_OUT}"
+ VERBATIM
+ )
+
+ # Only add iree_${NAME} as custom target doesn't support aliasing to
+ # iree::${NAME}.
+ iree_package_name(_PACKAGE_NAME)
+ add_custom_target("${_PACKAGE_NAME}_${_RULE_NAME}"
+ DEPENDS "${_OUT}"
+ )
+endfunction()
diff --git a/build_tools/cmake/iree_trace_runner_test.cmake b/build_tools/cmake/iree_trace_runner_test.cmake
index 96ef411..eff4d41 100644
--- a/build_tools/cmake/iree_trace_runner_test.cmake
+++ b/build_tools/cmake/iree_trace_runner_test.cmake
@@ -47,6 +47,10 @@
${ARGN}
)
+ if(CMAKE_CROSSCOMPILING AND "hostonly" IN_LIST _RULE_LABELS)
+ return()
+ endif()
+
iree_package_name(_PACKAGE_NAME)
set(_NAME "${_PACKAGE_NAME}_${_RULE_NAME}")
@@ -149,6 +153,10 @@
${ARGN}
)
+ if(CMAKE_CROSSCOMPILING AND "hostonly" IN_LIST _RULE_LABELS)
+ return()
+ endif()
+
# Omit tests for which the specified driver or target backend is not enabled.
# This overlaps with directory exclusions and other filtering mechanisms.
string(TOUPPER ${_RULE_DRIVER} _UPPERCASE_DRIVER)
@@ -293,6 +301,10 @@
${ARGN}
)
+ if(CMAKE_CROSSCOMPILING AND "hostonly" IN_LIST _RULE_LABELS)
+ return()
+ endif()
+
if(NOT DEFINED _RULE_TARGET_BACKENDS AND NOT DEFINED _RULE_DRIVERS)
set(_RULE_TARGET_BACKENDS "vmvx" "vulkan-spirv" "llvm-cpu")
set(_RULE_DRIVERS "local-task" "vulkan" "local-task")
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
index c4127a3..2899ee9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
@@ -72,6 +72,7 @@
"//llvm-external-projects/iree-dialects:IREELinalgTransformDialect",
"//llvm-external-projects/iree-dialects:IREELinalgTransformDialectPasses",
"//runtime/src/iree/builtins/ukernel:exported_bits",
+ "//runtime/src/iree/schemas:cpu_data",
"//runtime/src/iree/schemas/instruments",
"@llvm-project//llvm:BinaryFormat",
"@llvm-project//llvm:Support",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index d7af1fc..4c2bea1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -120,6 +120,7 @@
iree::compiler::Dialect::Util::IR
iree::compiler::Dialect::Util::Transforms
iree::compiler::Utils
+ iree::schemas::cpu_data
iree::schemas::instruments
PUBLIC
)
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
index e30ec4d..bc50628 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
@@ -666,6 +666,137 @@
}
};
+/// Helper method to get information about extra operands that need to be
+/// appended to a function defn/call operation.
+static SmallVector<StringRef> getExtraFields(Operation *forOp) {
+ SmallVector<StringRef> extraFields;
+ if (auto extraFieldsAttr =
+ forOp->getAttrOfType<ArrayAttr>("hal.import.fields")) {
+ extraFields = llvm::to_vector(llvm::map_range(
+ extraFieldsAttr.getValue(),
+ [](Attribute attr) { return attr.cast<StringAttr>().getValue(); }));
+ }
+ return extraFields;
+}
+
+/// Return calling convention to use for the operation.
+static IREE::HAL::CallingConvention getCallingConvention(Operation *forOp) {
+ auto cConv = IREE::HAL::CallingConvention::Default;
+ if (auto cConvAttr = forOp->getAttrOfType<IREE::HAL::CallingConventionAttr>(
+ "hal.import.cconv")) {
+ cConv = cConvAttr.getValue();
+ }
+ return cConv;
+}
+
+/// Lower func ops with specified ABI. Currently this pattern is triggered
+/// only for operations with the `hal.import.bitcode` attribute set.
+///
+/// Note: this is an LLVM::CallOp -> LLVM::CallOp rewrite that is introduced
+/// after all conversions are done. Importantly, this is not a conversion
+/// pattern.
+struct RewriteFuncOpABI : public OpRewritePattern<LLVM::LLVMFuncOp> {
+ RewriteFuncOpABI(HALDispatchABI &abi, LLVMTypeConverter &typeConverter)
+ : OpRewritePattern(&typeConverter.getContext()),
+ abi(abi),
+ typeConverter(typeConverter) {}
+
+ LogicalResult matchAndRewrite(LLVM::LLVMFuncOp funcOp,
+ PatternRewriter &rewriter) const override {
+ if (!funcOp.isExternal()) {
+ return rewriter.notifyMatchFailure(funcOp, "skipping non-external calls");
+ }
+ if (!funcOp->hasAttr("hal.import.bitcode")) {
+ return rewriter.notifyMatchFailure(
+ funcOp, "callee is not imported using bitcode linkage; skipping");
+ }
+ IREE::HAL::CallingConvention cConv = getCallingConvention(funcOp);
+
+ SmallVector<StringRef> extraFields = getExtraFields(funcOp);
+ auto funcType = funcOp.getFunctionType();
+ FailureOr<LLVM::LLVMFunctionType> expectedType =
+ abi.getABIFunctionType(funcOp, cConv, funcType.getReturnTypes(),
+ funcType.getParams(), extraFields);
+ if (failed(expectedType)) {
+ return rewriter.notifyMatchFailure(
+ funcOp,
+ "unable to get function type to match the calling convention");
+ }
+ if (abi.hasCompatibleFunctionSignature(
+ rewriter.getContext(), expectedType.value(),
+ funcType.getReturnTypes(), funcType.getParams())) {
+ return failure();
+ }
+ auto attrs = getPrunedAttributeList(
+ funcOp, llvm::to_vector(LLVM::LLVMFuncOp::getAttributeNames()));
+ SmallVector<DictionaryAttr> argAttrs;
+ if (auto currArgAttrs = funcOp.getArgAttrsAttr()) {
+ argAttrs =
+ llvm::to_vector(llvm::map_range(currArgAttrs, [](Attribute attr) {
+ return attr.cast<DictionaryAttr>();
+ }));
+ }
+ rewriter.create<LLVM::LLVMFuncOp>(
+ funcOp.getLoc(), funcOp.getName(), expectedType.value(),
+ funcOp.getLinkage(), funcOp.getDsoLocal(), funcOp.getCConv(), attrs,
+ argAttrs, funcOp.getFunctionEntryCount());
+ rewriter.eraseOp(funcOp);
+ return success();
+ }
+
+ private:
+ HALDispatchABI &abi;
+ LLVMTypeConverter &typeConverter;
+};
+
+/// Lower call ops with specified ABI. The ABI to use is looked up from the
+/// callee. Currently this pattern is triggered only for operations where the
+/// callee has the `hal.import.bitcode` attribute set.
+///
+/// Note: this is an LLVM::CallOp -> LLVM::CallOp rewrite that is introduced
+/// after all conversions are done. Importantly, this is not a conversion
+/// pattern.
+struct RewriteCallOpABI : public OpRewritePattern<LLVM::CallOp> {
+ RewriteCallOpABI(HALDispatchABI &abi, LLVMTypeConverter &typeConverter)
+ : OpRewritePattern(&typeConverter.getContext()),
+ abi(abi),
+ typeConverter(typeConverter) {}
+
+ LogicalResult matchAndRewrite(LLVM::CallOp callOp,
+ PatternRewriter &rewriter) const override {
+ auto symbol = callOp.getCallableForCallee().dyn_cast<SymbolRefAttr>();
+ auto flatSymbol = symbol.dyn_cast_or_null<FlatSymbolRefAttr>();
+ if (!flatSymbol) return failure();
+
+ // Ensure the target function is extern.
+ // To support conversion inserting calls in local patterns that can't add
+ // global function symbols we assume any missing callee is extern.
+ auto calleeOp =
+ SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(callOp, symbol);
+ if (!calleeOp || !calleeOp->hasAttr("hal.import.bitcode") ||
+ !calleeOp.isExternal()) {
+ return rewriter.notifyMatchFailure(
+ callOp, "callee is not imported using bitcode linakge; skipping");
+ }
+
+ IREE::HAL::CallingConvention cConv = getCallingConvention(calleeOp);
+ SmallVector<StringRef> extraFields = getExtraFields(calleeOp);
+
+ FailureOr<SmallVector<Value>> results = abi.materializeABI(
+ callOp, calleeOp.getSymName(), cConv, callOp->getResultTypes(),
+ callOp->getOperands(), extraFields, rewriter);
+ if (failed(results)) {
+ return failure();
+ }
+ rewriter.replaceOp(callOp, *results);
+ return success();
+ }
+
+ private:
+ HALDispatchABI &abi;
+ LLVMTypeConverter &typeConverter;
+};
+
/// Rewrites calls to extern functions to dynamic library import calls.
/// The parent LLVMFuncOp must be compatible with HALDispatchABI.
///
@@ -702,12 +833,22 @@
// let it fall through to the linker stage where it can be picked up either
// from the runtime build (in the case of us producing static libraries) or
// the user-specified object files (when producing dynamic libraries).
- if (calleeOp->hasAttr("hal.import.static")) {
+ if (calleeOp->hasAttr("hal.import.static") ||
+ calleeOp->hasAttr("hal.import.bitcode")) {
return rewriter.notifyMatchFailure(callOp,
"external function is marked static "
"and does not need an import wrapper");
}
+ // The call may need some additional internal fields appended.
+ SmallVector<StringRef> extraFields;
+ if (auto extraFieldsAttr =
+ calleeOp->getAttrOfType<ArrayAttr>("hal.import.fields")) {
+ for (auto extraFieldAttr : extraFieldsAttr) {
+ extraFields.push_back(extraFieldAttr.cast<StringAttr>().getValue());
+ }
+ }
+
// Allow multiple imports to alias by having their name explicitly
// specified.
StringRef importName = flatSymbol.getValue();
@@ -719,15 +860,6 @@
// TODO(benvanik): way to determine if weak (maybe via linkage?).
bool weak = false;
- // The call may need some additional internal fields appended.
- SmallVector<StringRef> extraFields;
- if (auto extraFieldsAttr =
- calleeOp->getAttrOfType<ArrayAttr>("hal.import.fields")) {
- for (auto extraFieldAttr : extraFieldsAttr) {
- extraFields.push_back(extraFieldAttr.cast<StringAttr>().getValue());
- }
- }
-
// Rewrite the call to a dynamic import call.
SmallVector<Value> results = abi.wrapAndCallImport(
callOp, importName, weak, callOp->getResultTypes(),
@@ -955,8 +1087,8 @@
// Rewrite any extern calls emitted to dynamic library imports.
{
RewritePatternSet patterns(&getContext());
- patterns.insert<RewriteExternCallOpToDynamicImportCallOp>(abi,
- typeConverter);
+ patterns.insert<RewriteExternCallOpToDynamicImportCallOp, RewriteCallOpABI,
+ RewriteFuncOpABI>(abi, typeConverter);
if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns))))
return signalPassFailure();
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
index da116a6..4b00a2d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
@@ -6,6 +6,7 @@
#include "iree/compiler/Codegen/LLVMCPU/DispatchABI.h"
+#include "iree/schemas/cpu_data.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -13,6 +14,7 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
static llvm::cl::opt<bool> clVerboseDebugInfo(
"iree-codegen-llvm-verbose-debug-info",
@@ -22,6 +24,39 @@
namespace mlir {
namespace iree_compiler {
+// List of all defined llvm feature-name to bit pattern used to represent it.
+// This is derived based on the schema in `runtime/src/iree/schemas/`.
+// TODO(ravishankarm): This link to the runtime schemas needs to be broken.
+// Instead we should use a reflection callback to resolve arch guarded features
+// directly in the compiler.
+
+// Struct to capture tuple of llvm feature-name to bit pattern used to represent
+// it.
+struct iree_llvm_name_and_bit_pattern_t {
+ const char *llvm_name;
+ unsigned long long bit_pattern;
+};
+
+#define IREE_CPU_FEATURE_BIT_NAME(arch, field_index, bit_name) \
+ IREE_CPU_DATA##field_index##_##arch##_##bit_name
+
+#define IREE_CPU_FEATURE_NAME_AND_BIT_PATTERN(arch, field_index, bit_name, \
+ llvm_name) \
+ {llvm_name, IREE_CPU_FEATURE_BIT_NAME(arch, field_index, bit_name)},
+
+static const struct iree_llvm_name_and_bit_pattern_t
+ iree_llvm_name_and_bit_pattern_list[] = {
+
+#define IREE_CPU_FEATURE_BIT(arch, field_index, bit_pos, bit_name, llvm_name) \
+ IREE_CPU_FEATURE_NAME_AND_BIT_PATTERN(arch, field_index, bit_name, llvm_name)
+#include "iree/schemas/cpu_feature_bits.inl"
+#undef IREE_CPU_FEATURE_BIT
+
+};
+
+#undef IREE_CPU_FEATURE_NAME_AND_BIT_PATTERN
+#undef IREE_CPU_FEATURE_BIT_NAME
+
//------------------------------------------------------------------------------
// ExecutableLibraryDI
//------------------------------------------------------------------------------
@@ -823,6 +858,10 @@
}
}
+Type HALDispatchABI::getProcessorIDType() {
+ return getFieldType(WorkgroupStateField::processor_id);
+}
+
Value HALDispatchABI::loadProcessorID(Operation *forOp, OpBuilder &builder) {
auto resultValue =
loadFieldValue(forOp, WorkgroupStateField::processor_id, builder);
@@ -830,6 +869,79 @@
di.getBasicType(resultValue.getType()), builder);
}
+Value HALDispatchABI::updateProcessorDataFromTargetAttr(
+ Operation *forOp, Value processorDataPtrValue, OpBuilder &builder) {
+ // Get the target attr.
+ IREE::HAL::ExecutableTargetAttr targetAttr =
+ IREE::HAL::ExecutableTargetAttr::lookup(forOp);
+ if (!targetAttr) {
+ return processorDataPtrValue;
+ }
+
+ // Lookup CPU features.
+ std::optional<NamedAttribute> cpuFeatures =
+ targetAttr.getConfiguration().getNamed("cpu_features");
+ if (!cpuFeatures) {
+ return processorDataPtrValue;
+ }
+
+ SmallVector<uint64_t> specifiedFeatureBitPatterns;
+ {
+ llvm::StringMap<uint64_t> featureToBitPattern;
+ for (auto [llvmName, bitPattern] : iree_llvm_name_and_bit_pattern_list) {
+ featureToBitPattern[llvmName] = bitPattern;
+ }
+ SmallVector<StringRef> cpuFeatureStrings;
+ cpuFeatures->getValue().cast<StringAttr>().getValue().split(
+ cpuFeatureStrings, ',', /*MakeSplit=*/-1, /*KeepEmpty=*/false);
+ for (auto featureString : cpuFeatureStrings) {
+ if (featureToBitPattern.count(featureString.drop_front())) {
+ specifiedFeatureBitPatterns.push_back(
+ featureToBitPattern.lookup(featureString.drop_front()));
+ }
+ }
+ }
+ if (specifiedFeatureBitPatterns.empty()) {
+ return processorDataPtrValue;
+ }
+
+ // Create a new stack allocation for the bit pattern.
+ Location loc = forOp->getLoc();
+ MLIRContext *context = forOp->getContext();
+ auto ptrType = LLVM::LLVMPointerType::get(context);
+ auto i64Ty = builder.getI64Type();
+ Value arraySize = builder.create<LLVM::ConstantOp>(
+ loc, i64Ty, builder.getI64IntegerAttr(ProcessorDataCapacity));
+ Value alloca = builder.create<LLVM::AllocaOp>(loc, ptrType, i64Ty, arraySize,
+ /*alignment=*/sizeof(uint64_t));
+ // Load the 0-th value.
+ Value srcData0 =
+ builder.create<LLVM::LoadOp>(loc, i64Ty, processorDataPtrValue);
+ // Set the specified CPU arch data.
+ for (auto bitPattern : specifiedFeatureBitPatterns) {
+ Value bitPatternVal = builder.create<LLVM::ConstantOp>(
+ loc, i64Ty, builder.getI64IntegerAttr(bitPattern));
+ srcData0 = builder.create<LLVM::OrOp>(loc, srcData0, bitPatternVal);
+ }
+ builder.create<LLVM::StoreOp>(loc, srcData0, alloca);
+ // Copy over the rest.
+ for (int64_t i = 1, e = ProcessorDataCapacity; i < e; ++i) {
+ Value loadPtr = builder.create<LLVM::GEPOp>(
+ loc, processorDataPtrValue.getType(), i64Ty, processorDataPtrValue,
+ LLVM::GEPArg(int32_t(i)), /*inbounds =*/true);
+ Value loadVal = builder.create<LLVM::LoadOp>(loc, i64Ty, loadPtr);
+ Value storePtr = builder.create<LLVM::GEPOp>(
+ loc, alloca.getType(), i64Ty, alloca, LLVM::GEPArg(int32_t(i)),
+ /*inbounds =*/true);
+ builder.create<LLVM::StoreOp>(loc, loadVal, storePtr);
+ }
+ return alloca;
+}
+
+Type HALDispatchABI::getProcessorDataType() {
+ return LLVM::LLVMPointerType::get(processorType.getContext());
+}
+
Value HALDispatchABI::loadProcessorData(Operation *forOp, OpBuilder &builder) {
// To get a pointer to the processor data we need to track pointers all the
// way from the environment argument. This is redundant with loadFieldValue
@@ -848,7 +960,9 @@
LLVM::LLVMPointerType::get(processorType), processorPtrValue,
LLVM::GEPArg(int32_t(ProcessorField::data)),
/*inbounds=*/true);
- return buildValueDI(forOp, processorDataPtrValue, "processor_data",
+ Value updatedProcessorData =
+ updateProcessorDataFromTargetAttr(forOp, processorDataPtrValue, builder);
+ return buildValueDI(forOp, updatedProcessorData, "processor_data",
di.getPtrOf(di.getConstOf(di.getArrayOf(
di.getUint64T(), ProcessorDataCapacity))),
builder);
@@ -993,59 +1107,207 @@
return callOp.getResult();
}
-SmallVector<Value> HALDispatchABI::wrapAndCallImport(
- Operation *forOp, StringRef importName, bool weak, TypeRange resultTypes,
- ValueRange args, ArrayRef<StringRef> extraFields, OpBuilder &builder) {
- auto loc = forOp->getLoc();
- auto context = builder.getContext();
-
+// static
+std::optional<Type> HALDispatchABI::getParameterStructType(
+ TypeRange resultTypes, ValueRange args, TypeRange extraFieldsTypes) {
// Struct types are ordered [results..., args...].
SmallVector<Type> types(resultTypes);
types.reserve(resultTypes.size() + args.size());
for (Value arg : args) {
types.push_back(typeConverter->convertType(arg.getType()));
}
+ types.append(extraFieldsTypes.begin(), extraFieldsTypes.end());
+
+ if (types.empty()) {
+ return std::nullopt;
+ }
+ return LLVM::LLVMStructType::getLiteral(context, types);
+}
+
+// static
+std::tuple<Type, Value> HALDispatchABI::packIntoParameterStruct(
+ Operation *forOp, TypeRange resultTypes, ValueRange args,
+ ValueRange extraFields, OpBuilder &builder) {
+ Location loc = forOp->getLoc();
+ MLIRContext *context = builder.getContext();
// Query any extra fields that were requested and append them to the struct.
- SmallVector<Value> extraFieldValues;
- for (auto extraField : extraFields) {
- auto extraFieldValue = getExtraField(forOp, extraField, builder);
- extraFieldValues.push_back(extraFieldValue);
- types.push_back(extraFieldValue.getType());
+ auto extraFieldsTypes = llvm::to_vector(
+ llvm::map_range(extraFields, [](Value v) { return v.getType(); }));
+
+ std::optional<Type> structType =
+ getParameterStructType(resultTypes, args, extraFieldsTypes);
+
+ if (!structType) {
+ Type voidPtrType = LLVM::LLVMPointerType::get(context);
+ return {voidPtrType,
+ builder.create<LLVM::UndefOp>(loc, voidPtrType).getResult()};
}
- // Pack parameter structure.
- Type structType;
- Value paramsPtr, voidPtr;
- auto voidPtrTy = LLVM::LLVMPointerType::get(context);
- if (!types.empty()) {
- // TODO(benvanik): set specific layout to match runtime.
- structType = LLVM::LLVMStructType::getLiteral(context, types);
- auto ptrStructType = LLVM::LLVMPointerType::get(context);
- Value one = builder.create<LLVM::ConstantOp>(loc, builder.getI64Type(),
- builder.getIndexAttr(1));
- paramsPtr =
- builder.create<LLVM::AllocaOp>(loc, ptrStructType, structType, one,
- /*alignment=*/0);
- Value structVal = builder.create<LLVM::UndefOp>(loc, structType);
- for (int64_t i = 0, e = args.size(); i < e; ++i) {
- structVal = builder.create<LLVM::InsertValueOp>(loc, structVal, args[i],
- i + resultTypes.size());
- }
- for (int64_t i = 0, e = extraFieldValues.size(); i < e; ++i) {
- structVal = builder.create<LLVM::InsertValueOp>(
- loc, structVal, extraFieldValues[i],
- i + resultTypes.size() + args.size());
- }
- // Store into the alloca'ed descriptor.
- builder.create<LLVM::StoreOp>(loc, structVal, paramsPtr);
- voidPtr = builder.create<LLVM::BitcastOp>(loc, voidPtrTy, paramsPtr);
- } else {
- voidPtr = builder.create<LLVM::UndefOp>(loc, voidPtrTy);
+ auto ptrStructType = LLVM::LLVMPointerType::get(context);
+ Value one = builder.create<LLVM::ConstantOp>(loc, builder.getI64Type(),
+ builder.getIndexAttr(1));
+ Value paramsPtr =
+ builder.create<LLVM::AllocaOp>(loc, ptrStructType, *structType, one,
+ /*alignment=*/0);
+ Value structVal = builder.create<LLVM::UndefOp>(loc, *structType);
+ for (int64_t i = 0, e = args.size(); i < e; ++i) {
+ structVal = builder.create<LLVM::InsertValueOp>(loc, structVal, args[i],
+ i + resultTypes.size());
}
+ for (int64_t i = 0, e = extraFields.size(); i < e; ++i) {
+ structVal = builder.create<LLVM::InsertValueOp>(
+ loc, structVal, extraFields[i], i + resultTypes.size() + args.size());
+ }
+ // Store into the alloca'ed descriptor.
+ builder.create<LLVM::StoreOp>(loc, structVal, paramsPtr);
+ return {*structType, paramsPtr};
+}
+
+// static
+FailureOr<LLVM::LLVMFunctionType> HALDispatchABI::getABIFunctionType(
+ Operation *forOp, IREE::HAL::CallingConvention cConv, TypeRange resultTypes,
+ TypeRange argTypes, ArrayRef<StringRef> extraFields) {
+ MLIRContext *context = forOp->getContext();
+ SmallVector<Type> extraFieldsTypes = llvm::to_vector(llvm::map_range(
+ extraFields, [&](StringRef name) { return getExtraFieldType(name); }));
+
+ // Check for extra fields already added.
+ if (argTypes.size() >= extraFieldsTypes.size()) {
+ if (llvm::all_of(llvm::zip(argTypes.take_back(extraFieldsTypes.size()),
+ extraFieldsTypes),
+ [](auto it) {
+ auto lhsType = std::get<0>(it);
+ auto rhsType = std::get<1>(it);
+ return (lhsType.template isa<LLVM::LLVMPointerType>() &&
+ rhsType.template isa<LLVM::LLVMPointerType>()) ||
+ std::get<0>(it) == std::get<1>(it);
+ })) {
+ // Extra fields already added. Drop them.
+ extraFieldsTypes.clear();
+ }
+ }
+
+ switch (cConv) {
+ case IREE::HAL::CallingConvention::Default: {
+ if (resultTypes.size() > 1) {
+ return forOp->emitOpError(
+ "Cannot have multiple return values for function");
+ }
+ Type resultType = resultTypes.size() == 1
+ ? resultTypes[0]
+ : LLVM::LLVMVoidType::get(context);
+ SmallVector<Type> allArgTypes = argTypes;
+ allArgTypes.append(extraFieldsTypes.begin(), extraFieldsTypes.end());
+ return LLVM::LLVMFunctionType::get(resultType, allArgTypes);
+ }
+ case IREE::HAL::CallingConvention::ParameterStruct:
+ return LLVM::LLVMFunctionType::get(LLVM::LLVMVoidType::get(context),
+ LLVM::LLVMPointerType::get(context));
+ }
+}
+
+// static
+bool HALDispatchABI::hasCompatibleFunctionSignature(
+ MLIRContext *context, LLVM::LLVMFunctionType funcType,
+ TypeRange resultTypes, TypeRange paramTypes) {
+ TypeRange funcParamTypes = funcType.getParams();
+ if (funcParamTypes.size() != paramTypes.size()) {
+ return false;
+ }
+ if (!llvm::all_of(llvm::zip(funcParamTypes, paramTypes), [](auto it) {
+ auto lhsType = std::get<0>(it);
+ auto rhsType = std::get<1>(it);
+ return (lhsType.template isa<LLVM::LLVMPointerType>() &&
+ rhsType.template isa<LLVM::LLVMPointerType>()) ||
+ std::get<0>(it) == std::get<1>(it);
+ })) {
+ return false;
+ }
+ if (resultTypes.size() > 1) {
+ return false;
+ }
+ Type funcResultType = funcType.getReturnType();
+ if (resultTypes.empty() &&
+ funcResultType != LLVM::LLVMVoidType::get(context)) {
+ return false;
+ }
+ if (resultTypes.size() == 1 && resultTypes[0] != funcResultType) {
+ return false;
+ }
+ return true;
+}
+
+FailureOr<SmallVector<Value>> HALDispatchABI::materializeABI(
+ Operation *forOp, StringRef symbolName, IREE::HAL::CallingConvention cConv,
+ TypeRange resultTypes, ValueRange args, ArrayRef<StringRef> extraFields,
+ RewriterBase &rewriter) {
+ auto argTypes = llvm::to_vector(
+ llvm::map_range(args, [](Value v) { return v.getType(); }));
+ FailureOr<LLVM::LLVMFunctionType> abiFunctionType =
+ getABIFunctionType(forOp, cConv, resultTypes, argTypes, extraFields);
+ if (failed(abiFunctionType)) {
+ return forOp->emitOpError(
+ "failed to get function type for calling convention");
+ }
+ if (hasCompatibleFunctionSignature(rewriter.getContext(),
+ abiFunctionType.value(), resultTypes,
+ argTypes)) {
+ return rewriter.notifyMatchFailure(
+ forOp, "no change in function signature. skipping");
+ }
+
+ // Combined args list.
+ SmallVector<Value> allArgsList = llvm::to_vector(args);
+ SmallVector<Value> extraFieldVals =
+ llvm::to_vector(llvm::map_range(extraFields, [&](StringRef fieldName) {
+ return getExtraField(forOp, fieldName, rewriter);
+ }));
+ allArgsList.append(extraFieldVals);
+
+ Location loc = forOp->getLoc();
+ if (cConv == IREE::HAL::CallingConvention::Default) {
+ auto callOp = rewriter.create<LLVM::CallOp>(
+ loc, abiFunctionType->getReturnTypes(), allArgsList, forOp->getAttrs());
+ return llvm::to_vector(llvm::map_range(
+ callOp.getResults(), [](OpResult v) -> Value { return v; }));
+ }
+
+ if (cConv == IREE::HAL::CallingConvention::ParameterStruct) {
+ auto [structType, paramsStructPtr] = packIntoParameterStruct(
+ forOp, resultTypes, args, extraFieldVals, rewriter);
+ rewriter.create<LLVM::CallOp>(loc, TypeRange{}, paramsStructPtr,
+ forOp->getAttrs());
+ SmallVector<Value> results;
+ if (!resultTypes.empty()) {
+ results.reserve(resultTypes.size());
+ Value structVal =
+ rewriter.create<LLVM::LoadOp>(loc, structType, paramsStructPtr);
+ for (int64_t i = 0, e = resultTypes.size(); i < e; ++i) {
+ results.push_back(
+ rewriter.create<LLVM::ExtractValueOp>(loc, structVal, i));
+ }
+ }
+ return results;
+ }
+ return forOp->emitOpError("unhandled calling convention");
+}
+
+SmallVector<Value> HALDispatchABI::wrapAndCallImport(
+ Operation *forOp, StringRef importName, bool weak, TypeRange resultTypes,
+ ValueRange args, ArrayRef<StringRef> extraFields, OpBuilder &builder) {
+ auto loc = forOp->getLoc();
+
+ SmallVector<Value> extraFieldVals =
+ llvm::to_vector(llvm::map_range(extraFields, [&](StringRef fieldName) {
+ return getExtraField(forOp, fieldName, builder);
+ }));
+
+ auto [structType, paramsPtr] = packIntoParameterStruct(
+ forOp, resultTypes, args, extraFieldVals, builder);
// Calls return 0 (success) or non-zero (failure).
- auto callResult = callImport(forOp, importName, weak, voidPtr, builder);
+ auto callResult = callImport(forOp, importName, weak, paramsPtr, builder);
Block *trueDest =
builder.getInsertionBlock()->splitBlock(++builder.getInsertionPoint());
Block *falseDest = builder.createBlock(trueDest);
@@ -1129,6 +1391,10 @@
return builder.create<LLVM::ExtractValueOp>(loc, stateValue, position);
}
+Type HALDispatchABI::getFieldType(WorkgroupStateField field) {
+ return workgroupStateType.getBody()[int64_t(field)];
+}
+
Value HALDispatchABI::loadFieldValue(Operation *forOp,
WorkgroupStateField field,
OpBuilder &builder) {
@@ -1142,6 +1408,17 @@
return builder.create<LLVM::ExtractValueOp>(loc, stateValue, position);
}
+Type HALDispatchABI::getExtraFieldType(StringRef extraField) {
+ if (extraField == "processor_id") {
+ return getProcessorIDType();
+ }
+ if (extraField == "processor_data") {
+ return getProcessorDataType();
+ }
+ assert(false && "unhandled extra filed");
+ return {};
+}
+
Value HALDispatchABI::getExtraField(Operation *forOp, StringRef extraField,
OpBuilder &builder) {
if (extraField == "processor_id") {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.h
index 07278b8..e21cb59 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.h
@@ -267,9 +267,11 @@
// Loads the processor ID the code is (most likely) being run on.
// Equivalent to:
// uint32_t processor_id = state->processor_id;
+ Type getProcessorIDType();
Value loadProcessorID(Operation *forOp, OpBuilder &builder);
// Loads a pointer to the processor information data fields.
+ Type getProcessorDataType();
Value loadProcessorData(Operation *forOp, OpBuilder &builder);
// Loads a processor information data field at the given index.
@@ -322,6 +324,39 @@
ArrayRef<StringRef> extraFields,
OpBuilder &builder);
+ //===--------------------------------------------------------------------==//
+ // External/bitcode function ABI handling methods.
+ //===--------------------------------------------------------------------==//
+ // Methods required for handling ABI for functions whose definitions are
+ // external.
+
+ /// Check if the `funcType` is equivalent to a function with return being
+ /// of type `resultTypes` and operands of type `paramTypes`.
+ static bool hasCompatibleFunctionSignature(MLIRContext *context,
+ LLVM::LLVMFunctionType funcType,
+ TypeRange resultTypes,
+ TypeRange paramTypes);
+
+ /// Given a calling convention `cConv`, and callee with return of
+ /// `resultTypes` and operands with type `argTypes`, along with extra fields
+ /// to append to argument list specified in `extraFields`; return the function
+ /// type of use for the function that implements the specified calling
+ /// convention.
+ FailureOr<LLVM::LLVMFunctionType> getABIFunctionType(
+ Operation *forOp, IREE::HAL::CallingConvention cConv,
+ TypeRange resultTypes, TypeRange argTypes,
+ ArrayRef<StringRef> extraFields);
+
+ /// Given a calling convention `cConv`, and callee with return of
+ /// `resultTypes` and operands with type `argTypes`, along with extra fields
+ /// to append to argument list specified in `extraFields`; modify the `callOp`
+ /// to implement the specified ABI. The calleee signature is expected to have
+ /// been/to be modified separately, i.e. it isnt done within this method.
+ FailureOr<SmallVector<Value>> materializeABI(
+ Operation *callOp, StringRef symbolName,
+ IREE::HAL::CallingConvention cConv, TypeRange resultTypes,
+ ValueRange args, ArrayRef<StringRef> extraFields, RewriterBase &builder);
+
private:
Value getIndexValue(Location loc, int64_t value, OpBuilder &builder);
@@ -334,12 +369,38 @@
OpBuilder &builder);
Value loadFieldValue(Operation *forOp, DispatchStateField field,
OpBuilder &builder);
+ Type getFieldType(WorkgroupStateField field);
Value loadFieldValue(Operation *forOp, WorkgroupStateField field,
OpBuilder &builder);
+ Type getExtraFieldType(StringRef extraField);
Value getExtraField(Operation *forOp, StringRef extraField,
OpBuilder &builder);
+ // Update the processor data based on the `cpu_features` present in
+ // `executable.target` attribute.
+ Value updateProcessorDataFromTargetAttr(Operation *forOp,
+ Value processorDataPtrValue,
+ OpBuilder &builder);
+
+ // Return LLVM Struct type that represents a container for arguments
+ // and return types. The struct type are ordered [results..., args...]
+ std::optional<Type> getParameterStructType(TypeRange resultTypes,
+ ValueRange args,
+ TypeRange extraFieldsTypes);
+ // For a given call operation, generate the struct that is the container
+ // for passing the arguments.
+ //
+ // The provided |resultTypes| and |args| are packed in a struct and transit
+ // through memory so that we can expose a single void* argument. Optionally
+ // |extraFields| can be specified with an ordered list of field names to be
+ // appended to the end of the struct.
+ std::tuple<Type, Value> packIntoParameterStruct(Operation *forOp,
+ TypeRange resultTypes,
+ ValueRange args,
+ ValueRange extraFields,
+ OpBuilder &builder);
+
mlir::MLIRContext *context;
LLVMTypeConverter *typeConverter;
LLVM::LLVMStructType processorType;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerToUKernels.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerToUKernels.cpp
index 1d0a210..01688b8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerToUKernels.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerToUKernels.cpp
@@ -63,6 +63,13 @@
result.defAttrs.emplace_back(
rewriter.getStringAttr("hal.import.fields"),
rewriter.getArrayAttr({rewriter.getStringAttr("processor_data")}));
+ result.defAttrs.emplace_back(rewriter.getStringAttr("hal.import.bitcode"),
+ rewriter.getBoolAttr(true));
+ result.defAttrs.emplace_back(
+ rewriter.getStringAttr("hal.import.cconv"),
+ IREE::HAL::CallingConventionAttr::get(
+ rewriter.getContext(),
+ IREE::HAL::CallingConvention::ParameterStruct));
}
return result;
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
index 38017ed..7acae65 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt -iree-convert-to-llvm %s | FileCheck %s
+// RUN: iree-opt -iree-convert-to-llvm --split-input-file %s | FileCheck %s
builtin.module {
func.func private @extern_public()
@@ -13,3 +13,163 @@
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: !llvm.ptr {llvm.align = 16 : i64, llvm.noalias}) -> i32
// CHECK: llvm.return %{{.+}} : i32
+// -----
+
+module {
+ func.func private @default_cconv_with_extra_fields(memref<f32>, i32, f64) -> (f32) attributes {
+ hal.import.bitcode = true,
+ hal.import.cconv = 0 : i32,
+ hal.import.fields = ["processor_data", "processor_id"],
+ llvm.bareptr = true
+ }
+ func.func @bar() {
+ %c0 = arith.constant 42 : i32
+ %c1 = arith.constant 42.0 : f64
+ %0 = memref.alloca() : memref<f32>
+ %1 = call @default_cconv_with_extra_fields(%0, %c0, %c1) : (memref<f32>, i32, f64) -> (f32)
+ return
+ }
+}
+// CHECK: llvm.func @default_cconv_with_extra_fields(!llvm.ptr, i32, f64, !llvm.ptr, i32) -> f32
+// CHECK: llvm.func @bar
+// CHECK-DAG: %[[Ci32:.+]] = llvm.mlir.constant(42 : i32) : i32
+// CHECK-DAG: %[[Cf64:.+]] = llvm.mlir.constant(4.200000e+01 : f64) : f64
+// CHECK-DAG: %[[ALLOCA:.+]] = llvm.alloca
+// CHECK-DAG: %[[DATA:.+]] = llvm.getelementptr inbounds %arg0[4]
+// CHECK-DAG: %[[PROCESSOR_INFO:.+]] = llvm.load %arg2
+// CHECK: %[[PROCESSOR_ID:.+]] = llvm.extractvalue %[[PROCESSOR_INFO]][4]
+// CHECK: %[[VAL:.+]] = llvm.call @default_cconv_with_extra_fields
+// CHECK-SAME: (%[[ALLOCA]], %[[Ci32]], %[[Cf64]], %[[DATA]], %[[PROCESSOR_ID]])
+
+// -----
+
+module {
+ func.func private @paramstruct_cconv_with_extra_fields(memref<f32>, i32, f64) -> (f32) attributes {
+ hal.import.bitcode = true,
+ hal.import.cconv = 1 : i32,
+ hal.import.fields = ["processor_data", "processor_id"],
+ llvm.bareptr = true
+ }
+ func.func @bar() {
+ %c0 = arith.constant 42 : i32
+ %c1 = arith.constant 42.0 : f64
+ %0 = memref.alloca() : memref<f32>
+ %1 = call @paramstruct_cconv_with_extra_fields(%0, %c0, %c1) : (memref<f32>, i32, f64) -> (f32)
+ return
+ }
+}
+// CHECK: llvm.func @paramstruct_cconv_with_extra_fields(!llvm.ptr)
+// CHECK: llvm.func @bar
+// CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-DAG: %[[Ci32:.+]] = llvm.mlir.constant(42 : i32) : i32
+// CHECK-DAG: %[[Cf64:.+]] = llvm.mlir.constant(4.200000e+01 : f64) : f64
+// CHECK-DAG: %[[ALLOCA:.+]] = llvm.alloca
+// CHECK-DAG: %[[DATA:.+]] = llvm.getelementptr inbounds %arg0[4]
+// CHECK-DAG: %[[PROCESSOR_INFO:.+]] = llvm.load %arg2
+// CHECK: %[[PROCESSOR_ID:.+]] = llvm.extractvalue %[[PROCESSOR_INFO]][4]
+// CHECK: %[[PARAMSTRUCT_ALLOCA:.+]] = llvm.alloca %[[C1]] x !llvm.struct<(f32, ptr, i32, f64, ptr, i32)>
+// CHECK: %[[PARAMSTRUCT:.+]] = llvm.mlir.undef : !llvm.struct<(f32, ptr, i32, f64, ptr, i32)>
+// CHECK: %[[INSERT_ARG0:.+]] = llvm.insertvalue %[[ALLOCA]], %[[PARAMSTRUCT]][1]
+// CHECK: %[[INSERT_ARG1:.+]] = llvm.insertvalue %[[Ci32]], %[[INSERT_ARG0]][2]
+// CHECK: %[[INSERT_ARG2:.+]] = llvm.insertvalue %[[Cf64]], %[[INSERT_ARG1]][3]
+// CHECK: %[[INSERT_ARG3:.+]] = llvm.insertvalue %[[DATA]], %[[INSERT_ARG2]][4]
+// CHECK: %[[INSERT_ARG4:.+]] = llvm.insertvalue %[[PROCESSOR_ID]], %[[INSERT_ARG3]][5]
+// CHECK: llvm.store %[[INSERT_ARG4]], %[[PARAMSTRUCT_ALLOCA]]
+// CHECK: llvm.call @paramstruct_cconv_with_extra_fields(%[[PARAMSTRUCT_ALLOCA]])
+
+// -----
+
+module attributes {
+ hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
+ cpu = "haswell",
+ cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+fma,+bmi,+bmi2,+pclmul,+cx16,+cx8,+crc32,+f16c,+fsgsbase,+fxsr,+invpcid,+lzcnt,+movbe,+rdrnd,+sahf,+x87,+xsave,+xsaveopt",
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ native_vector_size = 32 : index,
+ target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>} {
+ func.func private @paramstruct_cconv_with_extra_fields_and_executable_target(memref<f32>, i32, f64) -> (f32) attributes {
+ hal.import.bitcode = true,
+ hal.import.cconv = 1 : i32,
+ hal.import.fields = ["processor_data", "processor_id"],
+ llvm.bareptr = true
+ }
+ func.func @bar() {
+ %c0 = arith.constant 42 : i32
+ %c1 = arith.constant 42.0 : f64
+ %0 = memref.alloca() : memref<f32>
+ %1 = call @paramstruct_cconv_with_extra_fields_and_executable_target(%0, %c0, %c1) : (memref<f32>, i32, f64) -> (f32)
+ return
+ }
+}
+// CHECK: llvm.func @paramstruct_cconv_with_extra_fields_and_executable_target(!llvm.ptr)
+// CHECK: llvm.func @bar
+// CHECK-DAG: %[[F16C:.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK-DAG: %[[FMA:.+]] = llvm.mlir.constant(2048 : i64) : i64
+// CHECK-DAG: %[[AVX2:.+]] = llvm.mlir.constant(32768 : i64) : i64
+// CHECK-DAG: %[[AVX:.+]] = llvm.mlir.constant(1024 : i64) : i64
+// CHECK-DAG: %[[SSE41:.+]] = llvm.mlir.constant(4 : i64) : i64
+// CHECK-DAG: %[[SSSE3:.+]] = llvm.mlir.constant(2 : i64) : i64
+// CHECK-DAG: %[[SSE3:.+]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK-DAG: %[[C8:.+]] = llvm.mlir.constant(8 : i64) : i64
+// CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-DAG: %[[Ci32:.+]] = llvm.mlir.constant(42 : i32) : i32
+// CHECK-DAG: %[[Cf64:.+]] = llvm.mlir.constant(4.200000e+01 : f64) : f64
+// CHECK-DAG: %[[ALLOCA:.+]] = llvm.alloca
+// CHECK-DAG: %[[DATA_PTR:.+]] = llvm.getelementptr inbounds %arg0[4]
+// CHECK: %[[PROCESSOR_DATA_ALLOCA:.+]] = llvm.alloca %[[C8]] x i64 {alignment = 8 : i64}
+// CHECK-DAG: %[[DATA:.+]] = llvm.load %[[DATA_PTR]]
+// CHECK-DAG: %[[OR0:.+]] = llvm.or %[[DATA]], %[[SSE3]]
+// CHECK-DAG: %[[OR1:.+]] = llvm.or %[[OR0]], %[[SSSE3]]
+// CHECK-DAG: %[[OR2:.+]] = llvm.or %[[OR1]], %[[SSE41]]
+// CHECK-DAG: %[[OR3:.+]] = llvm.or %[[OR2]], %[[C8]]
+// CHECK-DAG: %[[OR4:.+]] = llvm.or %[[OR3]], %[[AVX]]
+// CHECK-DAG: %[[OR5:.+]] = llvm.or %[[OR4]], %[[AVX2]]
+// CHECK-DAG: %[[OR6:.+]] = llvm.or %[[OR5]], %[[FMA]]
+// CHECK-DAG: %[[OR7:.+]] = llvm.or %[[OR6]], %[[F16C]]
+// CHECK: llvm.store %[[OR7]], %[[PROCESSOR_DATA_ALLOCA]]
+
+// CHECK: %[[PROCESSOR_DATA_PTR_1:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][1]
+// CHECK: %[[PROCESSOR_DATA_1:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_1]]
+// CHECK: %[[PROCESSOR_DATA_ALLOCA_PTR_1:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][1]
+// CHECK: llvm.store %[[PROCESSOR_DATA_1]], %[[PROCESSOR_DATA_ALLOCA_PTR_1]]
+
+// CHECK: %[[PROCESSOR_DATA_PTR_2:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][2]
+// CHECK: %[[PROCESSOR_DATA_2:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_2]]
+// CHECK: %[[PROCESSOR_DATA_ALLOCA_PTR_2:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][2]
+// CHECK: llvm.store %[[PROCESSOR_DATA_2]], %[[PROCESSOR_DATA_ALLOCA_PTR_2]]
+
+// CHECK: %[[PROCESSOR_DATA_PTR_3:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][3]
+// CHECK: %[[PROCESSOR_DATA_3:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_3]]
+// CHECK: %[[PROCESSOR_DATA_ALLOCA_PTR_3:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][3]
+// CHECK: llvm.store %[[PROCESSOR_DATA_3]], %[[PROCESSOR_DATA_ALLOCA_PTR_3]]
+
+// CHECK: %[[PROCESSOR_DATA_PTR_4:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][4]
+// CHECK: %[[PROCESSOR_DATA_4:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_4]]
+// CHECK: %[[PROCESSOR_DATA_ALLOCA_PTR_4:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][4]
+// CHECK: llvm.store %[[PROCESSOR_DATA_4]], %[[PROCESSOR_DATA_ALLOCA_PTR_4]]
+
+// CHECK: %[[PROCESSOR_DATA_PTR_5:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][5]
+// CHECK: %[[PROCESSOR_DATA_5:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_5]]
+// CHECK: %[[PROCESSOR_DATA_ALLOCA_PTR_5:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][5]
+// CHECK: llvm.store %[[PROCESSOR_DATA_5]], %[[PROCESSOR_DATA_ALLOCA_PTR_5]]
+
+// CHECK: %[[PROCESSOR_DATA_PTR_6:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][6]
+// CHECK: %[[PROCESSOR_DATA_6:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_6]]
+// CHECK: %[[PROCESSOR_DATA_ALLOCA_PTR_6:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][6]
+// CHECK: llvm.store %[[PROCESSOR_DATA_6]], %[[PROCESSOR_DATA_ALLOCA_PTR_6]]
+
+// CHECK: %[[PROCESSOR_DATA_PTR_7:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][7]
+// CHECK: %[[PROCESSOR_DATA_7:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_7]]
+// CHECK: %[[PROCESSOR_DATA_ALLOCA_PTR_7:.+]] = llvm.getelementptr inbounds %[[PROCESSOR_DATA_ALLOCA]][7]
+// CHECK: llvm.store %[[PROCESSOR_DATA_7]], %[[PROCESSOR_DATA_ALLOCA_PTR_7]]
+
+// CHECK-DAG: %[[PROCESSOR_INFO:.+]] = llvm.load %arg2
+// CHECK: %[[PROCESSOR_ID:.+]] = llvm.extractvalue %[[PROCESSOR_INFO]][4]
+// CHECK: %[[PARAMSTRUCT_ALLOCA:.+]] = llvm.alloca %[[C1]] x !llvm.struct<(f32, ptr, i32, f64, ptr, i32)>
+// CHECK: %[[PARAMSTRUCT:.+]] = llvm.mlir.undef : !llvm.struct<(f32, ptr, i32, f64, ptr, i32)>
+// CHECK: %[[INSERT_ARG0:.+]] = llvm.insertvalue %[[ALLOCA]], %[[PARAMSTRUCT]][1]
+// CHECK: %[[INSERT_ARG1:.+]] = llvm.insertvalue %[[Ci32]], %[[INSERT_ARG0]][2]
+// CHECK: %[[INSERT_ARG2:.+]] = llvm.insertvalue %[[Cf64]], %[[INSERT_ARG1]][3]
+// CHECK: %[[INSERT_ARG3:.+]] = llvm.insertvalue %[[PROCESSOR_DATA_ALLOCA]], %[[INSERT_ARG2]][4]
+// CHECK: %[[INSERT_ARG4:.+]] = llvm.insertvalue %[[PROCESSOR_ID]], %[[INSERT_ARG3]][5]
+// CHECK: llvm.store %[[INSERT_ARG4]], %[[PARAMSTRUCT_ALLOCA]]
+// CHECK: llvm.call @paramstruct_cconv_with_extra_fields_and_executable_target(%[[PARAMSTRUCT_ALLOCA]])
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td
index 6794a0c..39735f0 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td
@@ -306,6 +306,18 @@
let cppNamespace = "::mlir::iree_compiler::IREE::HAL";
}
+def HAL_CallingConvention_Default : I32EnumAttrCase<"Default", 0>;
+def HAL_CallingConvention_ParameterStruct : I32EnumAttrCase<"ParameterStruct", 1>;
+def HAL_CallingConventionAttr :
+ I32EnumAttr<
+ "CallingConvention",
+ "Calling conversions for linked functions",[
+ HAL_CallingConvention_Default,
+ HAL_CallingConvention_ParameterStruct,
+ ]>{
+ let cppNamespace = "::mlir::iree_compiler::IREE::HAL";
+}
+
//===----------------------------------------------------------------------===//
// HAL types
//===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/BUILD.bazel
index eaa2367..aa1c5b2 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/BUILD.bazel
@@ -17,15 +17,18 @@
srcs = [
"Device.cpp",
"Musl.cpp",
+ "UKernel.cpp",
],
hdrs = [
"Device.h",
"Musl.h",
+ "UKernel.h",
],
deps = [
"//compiler/src/iree/compiler/Dialect/HAL/IR",
"//runtime/src/iree/builtins/device:libdevice_bitcode",
"//runtime/src/iree/builtins/musl/bin:libmusl",
+ "//runtime/src/iree/builtins/ukernel:libukernel_bitcode",
"@llvm-project//llvm:BitReader",
"@llvm-project//llvm:Core",
"@llvm-project//llvm:Support",
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/CMakeLists.txt
index 75b4838..c89659c 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/CMakeLists.txt
@@ -16,9 +16,11 @@
HDRS
"Device.h"
"Musl.h"
+ "UKernel.h"
SRCS
"Device.cpp"
"Musl.cpp"
+ "UKernel.cpp"
DEPS
LLVMBitReader
LLVMCore
@@ -27,6 +29,7 @@
MLIRSupport
iree::builtins::device::libdevice_bitcode
iree::builtins::musl::bin::libmusl
+ iree::builtins::ukernel::libukernel_bitcode
iree::compiler::Dialect::HAL::IR
PUBLIC
)
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.cpp
new file mode 100644
index 0000000..e06b687
--- /dev/null
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.cpp
@@ -0,0 +1,64 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h"
+
+#include "iree/builtins/ukernel/libukernel.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Support/MemoryBufferRef.h"
+#include "mlir/Support/LLVM.h"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace HAL {
+
+static const iree_file_toc_t *lookupUKernelFile(StringRef filename) {
+ for (size_t i = 0; i < iree_builtins_libukernel_size(); ++i) {
+ const auto &file_toc = iree_builtins_libukernel_create()[i];
+ if (filename == file_toc.name) return &file_toc;
+ }
+ return nullptr;
+}
+
+static const iree_file_toc_t *lookupUKernelFile(
+ llvm::TargetMachine *targetMachine) {
+ const auto &triple = targetMachine->getTargetTriple();
+
+ // NOTE: other arch-specific checks go here.
+
+ // Fallback path using the generic wasm variants as they are largely
+ // machine-agnostic.
+ if (triple.isX86()) {
+ return lookupUKernelFile("ukernel_bitcode.bc");
+ } else {
+ return nullptr;
+ }
+}
+
+std::optional<std::unique_ptr<llvm::Module>> loadUKernelBitcode(
+ llvm::TargetMachine *targetMachine, llvm::LLVMContext &context) {
+ // Find a bitcode file for the current architecture.
+ const auto *file = lookupUKernelFile(targetMachine);
+ if (!file) {
+ return std::nullopt;
+ }
+
+ // Load the generic bitcode file contents.
+ llvm::MemoryBufferRef bitcodeBufferRef(
+ llvm::StringRef(file->data, file->size), file->name);
+ auto bitcodeFile = llvm::parseBitcodeFile(bitcodeBufferRef, context);
+ if (!bitcodeFile) {
+ // TODO: Do we want to error out here or silently proceed.
+ return std::nullopt;
+ }
+ return std::move(*bitcodeFile);
+}
+
+} // namespace HAL
+} // namespace IREE
+} // namespace iree_compiler
+} // namespace mlir
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h
new file mode 100644
index 0000000..af662b2
--- /dev/null
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h
@@ -0,0 +1,26 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_DIALECT_HAL_TARGET_LLVMCPU_BUILTINS_UKERNEL_H_
+#define IREE_COMPILER_DIALECT_HAL_TARGET_LLVMCPU_BUILTINS_UKERNEL_H_
+
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace mlir {
+namespace iree_compiler {
+namespace IREE {
+namespace HAL {
+
+std::optional<std::unique_ptr<llvm::Module>> loadUKernelBitcode(
+ llvm::TargetMachine *targetMachine, llvm::LLVMContext &context);
+
+} // namespace HAL
+} // namespace IREE
+} // namespace iree_compiler
+} // namespace mlir
+
+#endif // IREE_COMPILER_DIALECT_HAL_TARGET_LLVMCPU_BUILTINS_UKERNEL_H_
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp
index f7f7e2c..460cde8 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp
@@ -14,6 +14,7 @@
#include "iree/compiler/Codegen/LLVMCPU/LLVMCPUPasses.h"
#include "iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/Device.h"
#include "iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/Musl.h"
+#include "iree/compiler/Dialect/HAL/Target/LLVMCPU/Builtins/UKernel.h"
#include "iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMIRPasses.h"
#include "iree/compiler/Dialect/HAL/Target/LLVMCPU/LibraryBuilder.h"
#include "iree/compiler/Dialect/HAL/Target/LLVMCPU/LinkerTool.h"
@@ -125,6 +126,22 @@
return success();
}
+/// Helper method to check if the variant op has a `ukernel` attribute
+/// in its `hal.executable.target`. If so, load the ukernel library
+/// for that target and link.
+// Note: This is duplicate of a similar function in Codegen/. For
+// now duplicating this to avoid false linking issues. Eventually
+// presence of this attribute in the `hal.executable.target` should
+// drive everything.
+static bool hasMicrokernel(IREE::HAL::ExecutableVariantOp variantOp) {
+ IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.getTarget();
+ if (!targetAttr) return false;
+ auto config = targetAttr.getConfiguration();
+ if (!config) return false;
+ auto attr = config.getAs<BoolAttr>("ukernels");
+ return attr && attr.getValue();
+}
+
class LLVMCPUTargetBackend final : public TargetBackend {
public:
explicit LLVMCPUTargetBackend(LLVMTargetOptions options)
@@ -417,6 +434,26 @@
<< targetTriple.str() << "'";
}
+ // Link in ukernel file.
+ if (hasMicrokernel(variantOp)) {
+ auto setAlwaysInline = [&](llvm::Module &module) {
+ for (auto &func : module.getFunctionList()) {
+ func.addFnAttr(llvm::Attribute::AlwaysInline);
+ }
+ };
+ if (std::optional<std::unique_ptr<llvm::Module>> libUKernel =
+ loadUKernelBitcode(targetMachine.get(), context)) {
+ if (failed(linkBitcodeModule(
+ variantOp.getLoc(), moduleLinker, llvm::Linker::LinkOnlyNeeded,
+ *targetMachine, "libukernel", std::move(libUKernel.value()),
+ setAlwaysInline))) {
+ return mlir::emitError(variantOp.getLoc())
+ << "failed linking in ukernel library for target triple '"
+ << targetTriple.str() << "'";
+ }
+ }
+ }
+
// Strip any compiler identifiers that may have snuck in. We let the linker
// tag the module.
auto *llvmIdent = llvmModule->getNamedMetadata("llvm.ident");
diff --git a/experimental/cpu_ukernel/iree_experimental_standalone_plugin.cmake b/experimental/cpu_ukernel/iree_experimental_standalone_plugin.cmake
index 079ffc4..7d48068 100644
--- a/experimental/cpu_ukernel/iree_experimental_standalone_plugin.cmake
+++ b/experimental/cpu_ukernel/iree_experimental_standalone_plugin.cmake
@@ -168,6 +168,11 @@
# `FILE_COPTS_VAR_NAME`.
# Example: "x86_64:some_file_for_x86_64_using_avx512_instructions.c:NAME_OF_VARIABLE_CONTAINING_COPTS_FOR_X86_64_AVX512".
function(iree_experimental_standalone_plugin)
+ # Early return if we don't have our own build of Clang and LLD available.
+ if (NOT (IREE_CLANG_TARGET AND IREE_LLD_TARGET))
+ return()
+ endif()
+
cmake_parse_arguments(
_RULE
""
diff --git a/runtime/src/iree/builtins/ukernel/BUILD.bazel b/runtime/src/iree/builtins/ukernel/BUILD.bazel
index 7ed6aa2..d4727c9 100644
--- a/runtime/src/iree/builtins/ukernel/BUILD.bazel
+++ b/runtime/src/iree/builtins/ukernel/BUILD.bazel
@@ -4,7 +4,9 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-load("//build_tools/bazel:build_defs.oss.bzl", "iree_runtime_cc_library")
+load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content", "iree_runtime_cc_library")
+load("//build_tools/bazel:iree_bitcode_library.bzl", "iree_bitcode_library", "iree_link_bitcode")
+load("//build_tools/embed_data:build_defs.bzl", "c_embed_data")
package(
default_visibility = ["//visibility:public"],
@@ -70,3 +72,113 @@
"//runtime/src/iree/builtins/ukernel/arch:ukernel_arch",
],
)
+
+#===------------------------------------------------------------------------===#
+# UKernel bitcode files
+#===------------------------------------------------------------------------===#
+
+iree_cmake_extra_content(
+ content = """
+if(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
+""",
+ inline = True,
+)
+
+BITCODE_COPTS = [
+ # C17 with no system deps.
+ "-std=c17",
+ "-nostdinc",
+ "-ffreestanding",
+
+ # Optimized and unstamped.
+ "-O3",
+ "-DNDEBUG",
+ "-fno-ident",
+ "-fdiscard-value-names",
+
+ # Object file only in bitcode format:
+ "-c",
+ "-emit-llvm",
+
+ # Force the library into standalone mode (not depending on build-directory
+ # configuration).
+ "-DIREE_UK_STANDALONE=1",
+
+ # Force configure for X86_64 with own Clang
+ "-DIREE_UK_ARCH_X86_64",
+ "-DIREE_UK_POINTER_SIZE=8",
+ "-DIREE_UK_BUILD_X86_64_AVX2_FMA",
+ "-DIREE_UK_BUILD_X86_64_AVX512_BASE",
+ "-DIREE_UK_BUILD_X86_64_AVX512_VNNI",
+]
+
+UKERNEL_BASE_SRCS = [
+ "mmt4d.c",
+ "mmt4d_tile.c",
+ "pack.c",
+ "pack_tile.c",
+ "query_tile_sizes.c",
+ "unpack_tile.c",
+]
+
+UKERNEL_HDRS = [
+ "//runtime/src/iree/builtins/ukernel:common.h",
+ "//runtime/src/iree/builtins/ukernel:pack.h",
+ "//runtime/src/iree/builtins/ukernel:exported_bits.h",
+ "//runtime/src/iree/builtins/ukernel:static_assert.h",
+ "//runtime/src/iree/builtins/ukernel:api.h",
+ "//runtime/src/iree/builtins/ukernel:unpack.h",
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h",
+ "//runtime/src/iree/builtins/ukernel:unpack_internal.h",
+ "//runtime/src/iree/builtins/ukernel:mmt4d.h",
+ "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h",
+ "//runtime/src/iree/builtins/ukernel:pack_internal.h",
+ "//runtime/src/iree/builtins/ukernel:elementwise.h",
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h",
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h",
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h",
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h",
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h",
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h",
+ "//runtime/src/iree/schemas:cpu_data.h",
+ "//runtime/src/iree/schemas:cpu_feature_bits.inl",
+]
+
+iree_bitcode_library(
+ name = "ukernel_bitcode_base",
+ srcs = UKERNEL_BASE_SRCS,
+ hdrs = UKERNEL_HDRS,
+ copts = BITCODE_COPTS,
+)
+
+iree_link_bitcode(
+ name = "ukernel_bitcode",
+ bitcode_files = [
+ "ukernel_bitcode_base.bc",
+ "arch/x86_64:ukernel_bitcode_x86_64_base.bc",
+ "arch/x86_64:ukernel_bitcode_x86_64_avx2_fma.bc",
+ "arch/x86_64:ukernel_bitcode_x86_64_avx512_base.bc",
+ "arch/x86_64:ukernel_bitcode_x86_64_avx512_vnni.bc",
+ ],
+)
+
+c_embed_data(
+ name = "libukernel_bitcode",
+ srcs = [
+ ":ukernel_bitcode.bc",
+ ],
+ c_file_output = "libukernel.c",
+ flatten = True,
+ h_file_output = "libukernel.h",
+ identifier = "iree_builtins_libukernel",
+ deps = [
+ "//runtime/src:runtime_defines",
+ ],
+)
+
+iree_cmake_extra_content(
+ content = """
+endif() # IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU
+""",
+ inline = True,
+)
diff --git a/runtime/src/iree/builtins/ukernel/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/CMakeLists.txt
index 9db16e0..14d5449 100644
--- a/runtime/src/iree/builtins/ukernel/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/CMakeLists.txt
@@ -85,4 +85,87 @@
PUBLIC
)
+if(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
+
+iree_bitcode_library(
+ NAME
+ ukernel_bitcode_base
+ SRCS
+ "mmt4d.c"
+ "mmt4d_tile.c"
+ "pack.c"
+ "pack_tile.c"
+ "query_tile_sizes.c"
+ "unpack_tile.c"
+ HDRS
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel:api.h"
+ "//runtime/src/iree/builtins/ukernel:common.h"
+ "//runtime/src/iree/builtins/ukernel:elementwise.h"
+ "//runtime/src/iree/builtins/ukernel:exported_bits.h"
+ "//runtime/src/iree/builtins/ukernel:mmt4d.h"
+ "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h"
+ "//runtime/src/iree/builtins/ukernel:pack.h"
+ "//runtime/src/iree/builtins/ukernel:pack_internal.h"
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h"
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h"
+ "//runtime/src/iree/builtins/ukernel:static_assert.h"
+ "//runtime/src/iree/builtins/ukernel:unpack.h"
+ "//runtime/src/iree/builtins/ukernel:unpack_internal.h"
+ "//runtime/src/iree/schemas:cpu_data.h"
+ "//runtime/src/iree/schemas:cpu_feature_bits.inl"
+ COPTS
+ "-std=c17"
+ "-nostdinc"
+ "-ffreestanding"
+ "-O3"
+ "-DNDEBUG"
+ "-fno-ident"
+ "-fdiscard-value-names"
+ "-c"
+ "-emit-llvm"
+ "-DIREE_UK_STANDALONE=1"
+ "-DIREE_UK_ARCH_X86_64"
+ "-DIREE_UK_POINTER_SIZE=8"
+ "-DIREE_UK_BUILD_X86_64_AVX2_FMA"
+ "-DIREE_UK_BUILD_X86_64_AVX512_BASE"
+ "-DIREE_UK_BUILD_X86_64_AVX512_VNNI"
+ PUBLIC
+)
+
+iree_link_bitcode(
+ NAME
+ ukernel_bitcode
+ SRCS
+ "arch/x86_64/ukernel_bitcode_x86_64_avx2_fma.bc"
+ "arch/x86_64/ukernel_bitcode_x86_64_avx512_base.bc"
+ "arch/x86_64/ukernel_bitcode_x86_64_avx512_vnni.bc"
+ "arch/x86_64/ukernel_bitcode_x86_64_base.bc"
+ "ukernel_bitcode_base.bc"
+ PUBLIC
+)
+
+iree_c_embed_data(
+ NAME
+ libukernel_bitcode
+ GENERATED_SRCS
+ "ukernel_bitcode.bc"
+ DEPS
+
+ C_FILE_OUTPUT
+ "libukernel.c"
+ H_FILE_OUTPUT
+ "libukernel.h"
+ IDENTIFIER
+ "iree_builtins_libukernel"
+ FLATTEN
+ PUBLIC
+)
+
+endif() # IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU
+
### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel b/runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel
index 62611b7..fd6d113 100644
--- a/runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel
+++ b/runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel
@@ -4,7 +4,8 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-load("//build_tools/bazel:build_defs.oss.bzl", "iree_runtime_cc_library")
+load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content")
+load("//build_tools/bazel:iree_bitcode_library.bzl", "iree_bitcode_library")
package(
default_visibility = ["//visibility:public"],
@@ -12,34 +13,138 @@
licenses = ["notice"], # Apache 2.0
)
-iree_runtime_cc_library(
- name = "mmt4d_x86_64",
- hdrs = [
- "mmt4d_x86_64.h",
- ],
- deps = ["//runtime/src/iree/builtins/ukernel:internal_headers"],
+#===------------------------------------------------------------------------===#
+# UKernel bitcode files
+#===------------------------------------------------------------------------===#
+
+iree_cmake_extra_content(
+ content = """
+if(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
+""",
+ inline = True,
)
-iree_runtime_cc_library(
- name = "pack_x86_64",
- hdrs = [
- "pack_x86_64.h",
- ],
- deps = ["//runtime/src/iree/builtins/ukernel:internal_headers"],
+BITCODE_COPTS = [
+ # C17 with no system deps.
+ "-std=c17",
+ "-nostdinc",
+ "-ffreestanding",
+
+ # Optimized and unstamped.
+ "-O3",
+ "-DNDEBUG",
+ "-fno-ident",
+ "-fdiscard-value-names",
+
+ # Object file only in bitcode format:
+ "-c",
+ "-emit-llvm",
+
+ # Force the library into standalone mode (not depending on build-directory
+ # configuration).
+ "-DIREE_UK_STANDALONE=1",
+
+ # Force configure for X86_64 with own Clang
+ "-DIREE_UK_ARCH_X86_64",
+ "-DIREE_UK_POINTER_SIZE=8",
+ "-DIREE_UK_BUILD_X86_64_AVX2_FMA",
+ "-DIREE_UK_BUILD_X86_64_AVX512_BASE",
+ "-DIREE_UK_BUILD_X86_64_AVX512_VNNI",
+]
+
+UKERNEL_HDRS = [
+ "//runtime/src/iree/builtins/ukernel:common.h",
+ "//runtime/src/iree/builtins/ukernel:pack.h",
+ "//runtime/src/iree/builtins/ukernel:exported_bits.h",
+ "//runtime/src/iree/builtins/ukernel:static_assert.h",
+ "//runtime/src/iree/builtins/ukernel:api.h",
+ "//runtime/src/iree/builtins/ukernel:unpack.h",
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h",
+ "//runtime/src/iree/builtins/ukernel:unpack_internal.h",
+ "//runtime/src/iree/builtins/ukernel:mmt4d.h",
+ "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h",
+ "//runtime/src/iree/builtins/ukernel:pack_internal.h",
+ "//runtime/src/iree/builtins/ukernel:elementwise.h",
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h",
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h",
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h",
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h",
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h",
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h",
+ "//runtime/src/iree/schemas:cpu_data.h",
+ "//runtime/src/iree/schemas:cpu_feature_bits.inl",
+]
+
+UKERNEL_X86_64_BASE_SRCS = [
+ "mmt4d_x86_64.c",
+ "pack_x86_64.c",
+ "query_tile_sizes_x86_64.c",
+ "unpack_x86_64.c",
+]
+
+iree_bitcode_library(
+ name = "ukernel_bitcode_x86_64_base",
+ srcs = UKERNEL_X86_64_BASE_SRCS,
+ hdrs = UKERNEL_HDRS,
+ copts = BITCODE_COPTS,
)
-iree_runtime_cc_library(
- name = "query_tile_sizes_x86_64",
- hdrs = [
- "query_tile_sizes_x86_64.h",
+UKERNEL_X86_64_AVX2_FMA_SRCS = [
+ "mmt4d_x86_64_avx2_fma.c",
+ "pack_x86_64_avx2_fma.c",
+ "unpack_x86_64_avx2_fma.c",
+]
+
+iree_bitcode_library(
+ name = "ukernel_bitcode_x86_64_avx2_fma",
+ srcs = UKERNEL_X86_64_AVX2_FMA_SRCS,
+ hdrs = UKERNEL_HDRS,
+ copts = BITCODE_COPTS + [
+ "-mavx2",
+ "-mfma",
],
- deps = ["//runtime/src/iree/builtins/ukernel:internal_headers"],
)
-iree_runtime_cc_library(
- name = "unpack_x86_64",
- hdrs = [
- "unpack_x86_64.h",
+UKERNEL_X86_64_AVX512_BASE_SRCS = [
+ "mmt4d_x86_64_avx512_base.c",
+ "pack_x86_64_avx512_base.c",
+ "unpack_x86_64_avx512_base.c",
+]
+
+iree_bitcode_library(
+ name = "ukernel_bitcode_x86_64_avx512_base",
+ srcs = UKERNEL_X86_64_AVX512_BASE_SRCS,
+ hdrs = UKERNEL_HDRS,
+ copts = BITCODE_COPTS + [
+ "-mavx512f",
+ "-mavx512vl",
+ "-mavx512cd",
+ "-mavx512bw",
+ "-mavx512dq",
],
- deps = ["//runtime/src/iree/builtins/ukernel:internal_headers"],
+)
+
+UKERNEL_X86_64_AVX512_VNNI_SRCS = [
+ "mmt4d_x86_64_avx512_vnni.c",
+]
+
+iree_bitcode_library(
+ name = "ukernel_bitcode_x86_64_avx512_vnni",
+ srcs = UKERNEL_X86_64_AVX512_VNNI_SRCS,
+ hdrs = UKERNEL_HDRS,
+ copts = BITCODE_COPTS + [
+ "-mavx512f",
+ "-mavx512vl",
+ "-mavx512cd",
+ "-mavx512bw",
+ "-mavx512dq",
+ "-mavx512vnni",
+ ],
+)
+
+iree_cmake_extra_content(
+ content = """
+endif() # IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU
+""",
+ inline = True,
)
diff --git a/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt b/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt
index 7460134..38dea86 100644
--- a/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt
+++ b/runtime/src/iree/builtins/ukernel/arch/x86_64/CMakeLists.txt
@@ -1,8 +1,220 @@
-# Copyright 2023 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
+# runtime/src/iree/builtins/ukernel/arch/x86_64/BUILD.bazel #
+# #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
+# CMake-only content. #
+# #
+# To disable autogeneration for this file entirely, delete this header. #
+################################################################################
+
+iree_add_all_subdirs()
+
+if(IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU)
+
+iree_bitcode_library(
+ NAME
+ ukernel_bitcode_x86_64_base
+ SRCS
+ "mmt4d_x86_64.c"
+ "pack_x86_64.c"
+ "query_tile_sizes_x86_64.c"
+ "unpack_x86_64.c"
+ HDRS
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel:api.h"
+ "//runtime/src/iree/builtins/ukernel:common.h"
+ "//runtime/src/iree/builtins/ukernel:elementwise.h"
+ "//runtime/src/iree/builtins/ukernel:exported_bits.h"
+ "//runtime/src/iree/builtins/ukernel:mmt4d.h"
+ "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h"
+ "//runtime/src/iree/builtins/ukernel:pack.h"
+ "//runtime/src/iree/builtins/ukernel:pack_internal.h"
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h"
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h"
+ "//runtime/src/iree/builtins/ukernel:static_assert.h"
+ "//runtime/src/iree/builtins/ukernel:unpack.h"
+ "//runtime/src/iree/builtins/ukernel:unpack_internal.h"
+ "//runtime/src/iree/schemas:cpu_data.h"
+ "//runtime/src/iree/schemas:cpu_feature_bits.inl"
+ COPTS
+ "-std=c17"
+ "-nostdinc"
+ "-ffreestanding"
+ "-O3"
+ "-DNDEBUG"
+ "-fno-ident"
+ "-fdiscard-value-names"
+ "-c"
+ "-emit-llvm"
+ "-DIREE_UK_STANDALONE=1"
+ "-DIREE_UK_ARCH_X86_64"
+ "-DIREE_UK_POINTER_SIZE=8"
+ "-DIREE_UK_BUILD_X86_64_AVX2_FMA"
+ "-DIREE_UK_BUILD_X86_64_AVX512_BASE"
+ "-DIREE_UK_BUILD_X86_64_AVX512_VNNI"
+ PUBLIC
+)
+
+iree_bitcode_library(
+ NAME
+ ukernel_bitcode_x86_64_avx2_fma
+ SRCS
+ "mmt4d_x86_64_avx2_fma.c"
+ "pack_x86_64_avx2_fma.c"
+ "unpack_x86_64_avx2_fma.c"
+ HDRS
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel:api.h"
+ "//runtime/src/iree/builtins/ukernel:common.h"
+ "//runtime/src/iree/builtins/ukernel:elementwise.h"
+ "//runtime/src/iree/builtins/ukernel:exported_bits.h"
+ "//runtime/src/iree/builtins/ukernel:mmt4d.h"
+ "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h"
+ "//runtime/src/iree/builtins/ukernel:pack.h"
+ "//runtime/src/iree/builtins/ukernel:pack_internal.h"
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h"
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h"
+ "//runtime/src/iree/builtins/ukernel:static_assert.h"
+ "//runtime/src/iree/builtins/ukernel:unpack.h"
+ "//runtime/src/iree/builtins/ukernel:unpack_internal.h"
+ "//runtime/src/iree/schemas:cpu_data.h"
+ "//runtime/src/iree/schemas:cpu_feature_bits.inl"
+ COPTS
+ "-std=c17"
+ "-nostdinc"
+ "-ffreestanding"
+ "-O3"
+ "-DNDEBUG"
+ "-fno-ident"
+ "-fdiscard-value-names"
+ "-c"
+ "-emit-llvm"
+ "-DIREE_UK_STANDALONE=1"
+ "-DIREE_UK_ARCH_X86_64"
+ "-DIREE_UK_POINTER_SIZE=8"
+ "-DIREE_UK_BUILD_X86_64_AVX2_FMA"
+ "-DIREE_UK_BUILD_X86_64_AVX512_BASE"
+ "-DIREE_UK_BUILD_X86_64_AVX512_VNNI"
+ "-mavx2"
+ "-mfma"
+ PUBLIC
+)
+
+iree_bitcode_library(
+ NAME
+ ukernel_bitcode_x86_64_avx512_base
+ SRCS
+ "mmt4d_x86_64_avx512_base.c"
+ "pack_x86_64_avx512_base.c"
+ "unpack_x86_64_avx512_base.c"
+ HDRS
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel:api.h"
+ "//runtime/src/iree/builtins/ukernel:common.h"
+ "//runtime/src/iree/builtins/ukernel:elementwise.h"
+ "//runtime/src/iree/builtins/ukernel:exported_bits.h"
+ "//runtime/src/iree/builtins/ukernel:mmt4d.h"
+ "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h"
+ "//runtime/src/iree/builtins/ukernel:pack.h"
+ "//runtime/src/iree/builtins/ukernel:pack_internal.h"
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h"
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h"
+ "//runtime/src/iree/builtins/ukernel:static_assert.h"
+ "//runtime/src/iree/builtins/ukernel:unpack.h"
+ "//runtime/src/iree/builtins/ukernel:unpack_internal.h"
+ "//runtime/src/iree/schemas:cpu_data.h"
+ "//runtime/src/iree/schemas:cpu_feature_bits.inl"
+ COPTS
+ "-std=c17"
+ "-nostdinc"
+ "-ffreestanding"
+ "-O3"
+ "-DNDEBUG"
+ "-fno-ident"
+ "-fdiscard-value-names"
+ "-c"
+ "-emit-llvm"
+ "-DIREE_UK_STANDALONE=1"
+ "-DIREE_UK_ARCH_X86_64"
+ "-DIREE_UK_POINTER_SIZE=8"
+ "-DIREE_UK_BUILD_X86_64_AVX2_FMA"
+ "-DIREE_UK_BUILD_X86_64_AVX512_BASE"
+ "-DIREE_UK_BUILD_X86_64_AVX512_VNNI"
+ "-mavx512f"
+ "-mavx512vl"
+ "-mavx512cd"
+ "-mavx512bw"
+ "-mavx512dq"
+ PUBLIC
+)
+
+iree_bitcode_library(
+ NAME
+ ukernel_bitcode_x86_64_avx512_vnni
+ SRCS
+ "mmt4d_x86_64_avx512_vnni.c"
+ HDRS
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:common_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:mmt4d_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:pack_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:query_tile_sizes_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel/arch/x86_64:unpack_x86_64.h"
+ "//runtime/src/iree/builtins/ukernel:api.h"
+ "//runtime/src/iree/builtins/ukernel:common.h"
+ "//runtime/src/iree/builtins/ukernel:elementwise.h"
+ "//runtime/src/iree/builtins/ukernel:exported_bits.h"
+ "//runtime/src/iree/builtins/ukernel:mmt4d.h"
+ "//runtime/src/iree/builtins/ukernel:mmt4d_internal.h"
+ "//runtime/src/iree/builtins/ukernel:pack.h"
+ "//runtime/src/iree/builtins/ukernel:pack_internal.h"
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes.h"
+ "//runtime/src/iree/builtins/ukernel:query_tile_sizes_internal.h"
+ "//runtime/src/iree/builtins/ukernel:static_assert.h"
+ "//runtime/src/iree/builtins/ukernel:unpack.h"
+ "//runtime/src/iree/builtins/ukernel:unpack_internal.h"
+ "//runtime/src/iree/schemas:cpu_data.h"
+ "//runtime/src/iree/schemas:cpu_feature_bits.inl"
+ COPTS
+ "-std=c17"
+ "-nostdinc"
+ "-ffreestanding"
+ "-O3"
+ "-DNDEBUG"
+ "-fno-ident"
+ "-fdiscard-value-names"
+ "-c"
+ "-emit-llvm"
+ "-DIREE_UK_STANDALONE=1"
+ "-DIREE_UK_ARCH_X86_64"
+ "-DIREE_UK_POINTER_SIZE=8"
+ "-DIREE_UK_BUILD_X86_64_AVX2_FMA"
+ "-DIREE_UK_BUILD_X86_64_AVX512_BASE"
+ "-DIREE_UK_BUILD_X86_64_AVX512_VNNI"
+ "-mavx512f"
+ "-mavx512vl"
+ "-mavx512cd"
+ "-mavx512bw"
+ "-mavx512dq"
+ "-mavx512vnni"
+ PUBLIC
+)
+
+endif() # IREE_BUILD_COMPILER AND IREE_TARGET_BACKEND_LLVM_CPU
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
# Target CPUs supporting AVX2+FMA3. That includes Intel Haswell (2013) and newer
# and AMD Excavator (2015) and newer. There is no current plan to look after
diff --git a/runtime/src/iree/schemas/cpu_data.h b/runtime/src/iree/schemas/cpu_data.h
index 46df305..0677606 100644
--- a/runtime/src/iree/schemas/cpu_data.h
+++ b/runtime/src/iree/schemas/cpu_data.h
@@ -70,7 +70,8 @@
IREE_CPU_FEATURE_BIT_NAME(arch, field_index, bit_name) = 1ull << bit_pos,
#include "iree/schemas/cpu_feature_bits.inl"
#undef IREE_CPU_FEATURE_BIT
-
};
+#undef IREE_CPU_FEATURE_BIT_NAME
+
#endif // IREE_SCHEMAS_CPU_DATA_H_
diff --git a/tests/e2e/matmul/BUILD.bazel b/tests/e2e/matmul/BUILD.bazel
index 392d1bc..2cb329f 100644
--- a/tests/e2e/matmul/BUILD.bazel
+++ b/tests/e2e/matmul/BUILD.bazel
@@ -160,6 +160,31 @@
"f32",
]]
+# Test x86_64+ukernel, mmt4d, with target CPU features variants relevant to each
+# lhs_rhs_type.
+[iree_generated_trace_runner_test(
+ name = "e2e_matmul_mmt4d_%s_small_ukernel_x86" % lhs_rhs_type,
+ compiler_flags = [
+ "--iree-llvmcpu-enable-microkernels",
+ "--iree-flow-enable-data-tiling",
+ ],
+ generator = ":generate_e2e_matmul_tests",
+ generator_args = [
+ "--lhs_rhs_type=%s" % lhs_rhs_type,
+ "--shapes=small",
+ ],
+ tags = [
+ "hostonly",
+ ],
+ target_backends_and_drivers = [
+ ("llvm-cpu", "local-task"),
+ ],
+ trace_runner = "//tools:iree-e2e-matmul-test",
+) for lhs_rhs_type in [
+ "i8",
+ "f32",
+]]
+
[iree_generated_trace_runner_test(
name = "e2e_matmul_direct_f32_gpu_large_%s" % compilation_info,
generator = ":generate_e2e_matmul_tests",
diff --git a/tests/e2e/matmul/CMakeLists.txt b/tests/e2e/matmul/CMakeLists.txt
index 6445221..357dfae 100644
--- a/tests/e2e/matmul/CMakeLists.txt
+++ b/tests/e2e/matmul/CMakeLists.txt
@@ -250,6 +250,48 @@
iree_generated_trace_runner_test(
NAME
+ e2e_matmul_mmt4d_i8_small_ukernel_x86
+ GENERATOR
+ "generate_e2e_matmul_tests.py"
+ GENERATOR_ARGS
+ "--lhs_rhs_type=i8"
+ "--shapes=small"
+ TRACE_RUNNER
+ iree-e2e-matmul-test
+ TARGET_BACKENDS
+ "llvm-cpu"
+ DRIVERS
+ "local-task"
+ COMPILER_FLAGS
+ "--iree-llvmcpu-enable-microkernels"
+ "--iree-flow-enable-data-tiling"
+ LABELS
+ "hostonly"
+)
+
+iree_generated_trace_runner_test(
+ NAME
+ e2e_matmul_mmt4d_f32_small_ukernel_x86
+ GENERATOR
+ "generate_e2e_matmul_tests.py"
+ GENERATOR_ARGS
+ "--lhs_rhs_type=f32"
+ "--shapes=small"
+ TRACE_RUNNER
+ iree-e2e-matmul-test
+ TARGET_BACKENDS
+ "llvm-cpu"
+ DRIVERS
+ "local-task"
+ COMPILER_FLAGS
+ "--iree-llvmcpu-enable-microkernels"
+ "--iree-flow-enable-data-tiling"
+ LABELS
+ "hostonly"
+)
+
+iree_generated_trace_runner_test(
+ NAME
e2e_matmul_direct_f32_gpu_large_LLVMGPUMatmulSimt
GENERATOR
"generate_e2e_matmul_tests.py"