// RUN: iree-compile %s \
// RUN: --iree-hal-executable-object-search-path=$IREE_BINARY_DIR | \
// RUN: iree-run-module \
// RUN: --device=local-sync \
// RUN: --module=- \
// RUN: --function=mixed_invocation \
// RUN: --input=8xf32=2 \
// RUN: --input=8xf32=4 | \
// RUN: FileCheck %s

// This example demonstrates authoring and dispatching retargetable executables
// from the IREE `hal` dialect layer. This allows target-specific code to be
// written - including unique calls for each target - as the executable
// variants are manually specified. The example_stream.mlir example shows how,
// where possible, executable variant generation can be left to the compiler.
//
// Enabling this at the HAL layer allows the codegen backends translating
// executable variants to make local decisions about which external calls to
// make and where the objects providing those functions come from. Since
// objects can be embedded in the IR it's possible for the backends to even
// generate them on-demand for embedding (such as precompiling/JITing).

// The configuration used for executable compilation.
// This lets the compiler and runtime know the format and requirements of the
// executable binaries produced. Multiple variants with differing formats and
// compilation options (architectures, etc) can be embedded for runtime
// selection. By fully specifying the targets here we can target multiple
// architectures, and it's always possible to embed these attributes instead
// of relying on the coarse command-line compiler flags that only set single
// targets.
//
// To avoid too much boilerplate this example only shows a single target. See
// example_stream.mlir for an example with multi-targeting as there's less
// boilerplate required at that level.
#x86_64_target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
native_vector_size = 32 : index,
target_triple = "x86_64-none-elf"
}>

// The target devices that the program will run on.
// These can come from compiler flags, and multiple targets can be supported.
// It's possible, for example, to target multiple devices in the same
// compiled binary (CPU + Vulkan, etc).
#cpu_target = #hal.device.target<"local", [
#x86_64_target
]> : !hal.device
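
// Pipeline layouts describing the push constants and bindings each exported
// function requires; these can be thought of as the function signatures.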
#pipeline_layout_0 = #hal.pipeline.layout<constants = 1, bindings = [
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer>
]>
#pipeline_layout_1 = #hal.pipeline.layout<constants = 1, bindings = [
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer>
]>

module @example attributes {hal.device.targets = [#cpu_target]} {
// Executable containing exported shims and calls to external functions.
// Each executable can contain multiple exported functions and variants for
// different architectures or even devices. It's also possible to mix
// hand-authored functions with code-generated ones - even for the same
// function - such that code generation is used as a fallback when the
// hand-authored kernels aren't supported at runtime.
hal.executable private @executable {
// Variant linking in an x86-64 object file containing external functions.
hal.executable.variant public @x86_64 target(#x86_64_target) objects([
// Object files linked into the executable.
// These object files are linked into the dynamic library and must meet
// the requirements for embedded ELF linkage (no TLS, no globals, no
// syscalls, no libc, etc).
#hal.executable.object<{
// This references a file path on disk, but the data could also be embedded
// in order to make the MLIR file hermetic/portable across compilation
// pipelines. In the future we'll likely use MLIR's external resource
// functionality for this. Allowing the objects to be embedded supports JIT
// scenarios where a layer higher or lower in the stack may emit the objects
// to link in as part of the overall compilation.
path = "samples/custom_dispatch/cpu/embedded/functions_x86_64.o"
}>
]) {
// TODO(benvanik): demonstrate hal.executable.constant.block for
// specialization via host logic and hal.executable.constant.load for
// referencing them in the shims.
// Exported shim function calling the C `simple_mul_workgroup` function.
// The ordinal must be assigned by the user and unique for the executable.
// The layout defines the required bindings and push constants and can be
// thought of as the function signature.
hal.executable.export public @simple_mul ordinal(0) layout(#pipeline_layout_0) {
^bb0(%device: !hal.device, %workload: index):
// This host function is used to compute the XYZ workgroup count
// dispatched at runtime. It can query the %device for capabilities
// and limits (last-level cache sizes, etc). The other arguments are the
// values passed in the dispatch operation (usually things like root
// output op tensor dimensions and other abstract values).
%x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
%c1 = arith.constant 1 : index
hal.return %x, %c1, %c1 : index, index, index
}
// Similar to the above but in-place by using a read/write binding.
hal.executable.export public @simple_mul_inplace ordinal(1) layout(#pipeline_layout_1) {
^bb0(%device: !hal.device, %workload: index):
%x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
%c1 = arith.constant 1 : index
hal.return %x, %c1, %c1 : index, index, index
}
// On the CPU side we use shims here to marshal across the ABI. This allows
// us to hide the implementation details of how the runtime calls into
// functions and to call out to C functions that don't need to link against
// the runtime. We could probably come up with ways of automating this, but
// that's mostly left as an exercise to the frontends producing this IR for
// input to the IREE compiler, as each may have its own quirks.
builtin.module {
// External function declaration using a user-chosen calling convention.
// NOTE: MLIR->LLVM conversion expands each memref to a tuple and
// there's currently no way to change that behavior.
// Each memref becomes:
// (%base_ptr: !llvm.ptr<f32>, %aligned_ptr: !llvm.ptr<f32>,
// %offset: i64, %size: i64, %stride: i64)
// That results in the following llvm.func:
// (!llvm.ptr<f32>, !llvm.ptr<f32>, i64, i64, i64, // binding0
// !llvm.ptr<f32>, !llvm.ptr<f32>, i64, i64, i64, // binding1
// !llvm.ptr<f32>, !llvm.ptr<f32>, i64, i64, i64, // binding2
// i64, // dim
// i64) // tid
// And required external C function:
// (float*, float*, size_t, size_t, size_t,
// float*, float*, size_t, size_t, size_t,
// float*, float*, size_t, size_t, size_t,
// size_t,
// size_t)
// This is not a good state to be in as we can't then map to external
// functions that have signatures we don't want to change. Please file
// upstream MLIR bugs about this behavior and the ability to just pass
// bare pointers if you care!
//
// NOTE: index will convert to i32 when targeting an ABI with 32-bit
// pointers and i64 otherwise. Use size_t on the C side to allow the
// same source code to work when compiled in either mode.
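//
// For illustration, a matching C implementation might look like the sketch
// below. This is a hedged sketch rather than the exact source: the names are
// hypothetical and the base/offset/size/stride descriptor fields are accepted
// only to satisfy the expanded ABI (the offsets are zero in this example).
//
//  void simple_mul_workgroup(
//      const float* binding0_base, const float* binding0,
//      size_t binding0_offset, size_t binding0_size, size_t binding0_stride,
//      const float* binding1_base, const float* binding1,
//      size_t binding1_offset, size_t binding1_size, size_t binding1_stride,
//      float* binding2_base, float* binding2,
//      size_t binding2_offset, size_t binding2_size, size_t binding2_stride,
//      size_t dim, size_t tid) {
//    // Each workgroup multiplies a 64-element slice beginning at tid (the
//    // workgroup's base index computed in the exported shim above).
//    size_t end = tid + 64 < dim ? tid + 64 : dim;
//    for (size_t i = tid; i < end; ++i) {
//      binding2[i] = binding0[i] * binding1[i];
//    }
//  }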
func.func private @simple_mul_workgroup(%binding0: memref<?xf32>, %binding1: memref<?xf32>, %binding2: memref<?xf32>, %dim: index, %tid: index) attributes {
// Ensures that we try to statically link this external function and
// pull it in from the object file.
hal.import.static
}
// IREE exported function using a HAL interface.
// At this layer of the stack all operands have been converted into
// constants and bindings have been specified.
func.func @simple_mul() {
%c0 = arith.constant 0 : index
// Push constants representing primitive operands can be loaded here.
%dim_i32 = hal.interface.constant.load layout(#pipeline_layout_0) ordinal(0) : i32
%dim = arith.index_castui %dim_i32 : i32 to index
// This function is invoked once per workgroup so determine where this
// particular workgroup is in the grid. In this example we use a
// workgroup size of 64x1x1 (which is exceedingly small for CPUs but
// useful for demonstration).
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%tid = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
// Bindings are accessed by reference.
%binding0 = hal.interface.binding.subspan layout(#pipeline_layout_0) binding(0) alignment(64) offset(%c0) : memref<?xf32>{%dim}
%binding1 = hal.interface.binding.subspan layout(#pipeline_layout_0) binding(1) alignment(64) offset(%c0) : memref<?xf32>{%dim}
%binding2 = hal.interface.binding.subspan layout(#pipeline_layout_0) binding(2) alignment(64) offset(%c0) : memref<?xf32>{%dim}
// Call the externally defined C function with an (almost) plain C
// calling convention (see above for details about the mess memrefs
// turn into).
//
// TODO: there are ways of accessing CPU information here such as
// active architecture and feature bits but it is not yet exposed to
// the HAL level.
func.call @simple_mul_workgroup(%binding0, %binding1, %binding2, %dim, %tid) : (memref<?xf32>, memref<?xf32>, memref<?xf32>, index, index) -> ()
// NOTE: this is code generated as normal - other MLIR ops can be used
// here for looping/control flow, vector operations, linalg, etc.
// This simple sample is just calling out to the external function but
// microkernels fused with other code are possible.
return
}
func.func private @simple_mul_inplace_workgroup(%binding0: memref<?xf32>, %binding1: memref<?xf32>, %dim: index, %tid: index) attributes {
hal.import.static
}
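// As above, a matching C implementation would follow the same expanded-ABI
// pattern, just with two memref descriptors; a hypothetical signature:
//  void simple_mul_inplace_workgroup(
//      const float* binding0_base, const float* binding0,
//      size_t binding0_offset, size_t binding0_size, size_t binding0_stride,
//      float* binding1_base, float* binding1,
//      size_t binding1_offset, size_t binding1_size, size_t binding1_stride,
//      size_t dim, size_t tid);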
func.func @simple_mul_inplace() {
%c0 = arith.constant 0 : index
%dim_i32 = hal.interface.constant.load layout(#pipeline_layout_1) ordinal(0) : i32
%dim = arith.index_castui %dim_i32 : i32 to index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%tid = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
// Same as above but note that we're treating %binding1 as read/write.
%binding0 = hal.interface.binding.subspan layout(#pipeline_layout_1) binding(0) alignment(64) offset(%c0) : memref<?xf32>{%dim}
%binding1 = hal.interface.binding.subspan layout(#pipeline_layout_1) binding(1) alignment(64) offset(%c0) : memref<?xf32>{%dim}
func.call @simple_mul_inplace_workgroup(%binding0, %binding1, %dim, %tid) : (memref<?xf32>, memref<?xf32>, index, index) -> ()
return
}
}
} // hal.executable.variant
} // hal.executable

// Function demonstrating a few hand-authored dispatches mixed with codegen.
// Invoke with:
// --device=local-sync
// --function=mixed_invocation
// --input=8xf32=2
// --input=8xf32=4
// CHECK-LABEL: EXEC @mixed_invocation
func.func @mixed_invocation(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
// The only externally available metadata in the dispatch are the values
// passed in as operands. Here we pass in the dynamic dimension.
//
// HACK: for hand-authored kernels all primitive values passed in need to
// be i32 or a bit-castable type. This is because ABI packing of other types
// happens inside of the PackDispatchOperandsPass that is currently not
// usable with external functions as it changes the ABI. In the future we
// can better define the ABI such that it's possible to match the compiler
// expectations around padding/alignment. For now users must do the packing
// themselves (splitting i64 into i32+i32, etc).
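//
// For example, a hypothetical i64 operand would be split by the caller into
// two i32 values (low and high halves) and reassembled inside the C kernel
// along the lines of:
//  uint64_t value = ((uint64_t)value_hi << 32) | (uint32_t)value_lo;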
%c0 = arith.constant 0 : index
%dim = tensor.dim %arg0, %c0 : tensor<?xf32>
%dim_i32 = arith.index_cast %dim : index to i32
// Dispatch a basic `ret = lhs * rhs` using an external function.
%0 = flow.dispatch @executable::@x86_64::@simple_mul[%dim](%dim_i32, %arg0, %arg1) {
// HACK: keep the executable live through DCE. Only required when
// using the automatic variant selection.
// TODO(benvanik): automatically add this when required.
hal.executable.ref = [@executable]
} : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> tensor<?xf32>{%dim}
// Code gen some other ops - these will interleave with the hand-authored
// ones but naturally won't be able to fuse with them.
%1 = arith.addf %0, %arg1 : tensor<?xf32>
// Dispatch an in-place `rhs *= lhs` using an external function.
// This form (@executable::@variant::@export) specifically chooses a variant
// instead of relying on automatic selection. This can be used by frontends
// to allow user-controlled overrides of the dispatches, custom selection
// logic based on runtime parameters, etc. In general, though, the above
// automatic selection should be used.
%2 = flow.dispatch @executable::@x86_64::@simple_mul_inplace[%dim](%dim_i32, %0, %1) : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> %1{%dim}
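// With --input=8xf32=2 and --input=8xf32=4 the math works out as:
// %0 = 2 * 4 = 8, %1 = 8 + 4 = 12, and %2 = 8 * 12 = 96 per element.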
// CHECK: 8xf32=96 96 96 96 96 96 96 96
return %2 : tensor<?xf32>
}
} // module