// RUN: iree-compile %s \
// RUN: --iree-hal-executable-object-search-path=$IREE_BINARY_DIR | \
// RUN: iree-run-module \
// RUN: --device=local-sync \
// RUN: --module=- \
// RUN: --function=mixed_invocation \
// RUN: --input=8xf32=2 \
// RUN: --input=8xf32=4 | \
// RUN: FileCheck %s
// This example demonstrates authoring and dispatching retargetable executables
// from the IREE `hal` dialect layer. Because the executable variants are
// manually specified, target-specific code can be written, including unique
// calls for each target. The example_stream.mlir example shows how, where
// possible, executable variant generation can be left to the compiler.
//
// Enabling this at the HAL layer allows the codegen backends translating
// executable variants to make local decisions about which external calls to
// make and where the objects that provide those functions come from. Since
// objects can be embedded in the IR, it's possible for the backends even to
// generate them on-demand for embedding (such as precompiling/JITing).
// The configuration used for executable compilation.
// This lets the compiler and runtime know the format and requirements of the
// executable binaries produced. Multiple variants with differing formats and
// compilation options (architectures, etc.) can be embedded for runtime
// selection. By fully specifying the targets here we can target multiple
// architectures, and it's always possible to embed these instead of relying
// on the coarse command-line compiler flags that only set a single target.
//
// To avoid too much boilerplate this example only shows a single target. See
// example_stream.mlir for an example with multi-targeting as there's less
// boilerplate required at that level.
#x86_64_target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
native_vector_size = 32 : index,
target_triple = "x86_64-none-elf"
}>
// The target devices that the program will run on.
// These can come from compiler flags, and multiple targets can be supported.
// It's possible, for example, to support targeting multiple devices in the
// same compiled binary (CPU + Vulkan, etc); a multi-target sketch follows
// the definition below.
#cpu_target = #hal.device.target<"local", [
#x86_64_target
]> : !hal.device
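// As a hypothetical sketch (not used in this file), multiple executable
// targets could be embedded under one device for runtime selection, e.g.
// assuming an #arm_64_target had been defined like #x86_64_target above:
//   #cpu_target = #hal.device.target<"local", [
//     #x86_64_target,
//     #arm_64_target
//   ]> : !hal.device
// Multiple devices could similarly be listed in the module's
// hal.device.targets attribute below.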
#pipeline_layout_0 = #hal.pipeline.layout<constants = 1, bindings = [
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer>
]>
#pipeline_layout_1 = #hal.pipeline.layout<constants = 1, bindings = [
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer>
]>
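// For reference, #pipeline_layout_0 matches the @simple_mul dispatch below:
// one i32 push constant (the dynamic dimension) plus two read-only inputs
// and one written output. #pipeline_layout_1 instead folds the output into
// a read/write binding for the in-place variant.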
module @example attributes {hal.device.targets = [#cpu_target]} {
// Executable containing exported shims and calls to external functions.
// Each executable can contain multiple exported functions and variants for
// different architectures or even devices. It's also possible to mix hand-
// authored functions with code-generated ones, even for the same function,
// such that code generation is used as a fallback when the hand-authored
// kernels aren't supported at runtime.
hal.executable private @executable {
// Variant linking in an x86-64 object file containing external functions.
hal.executable.variant public @x86_64 target(#x86_64_target) objects([
// Object files linked into the executable.
// These object files are linked into the dynamic library and must meet
// the requirements for embedded ELF linkage (no TLS, no globals, no
// syscalls, no libc, etc).
#hal.executable.object<{
// This references a file path on disk, but the data could also be
// embedded in order to make the MLIR file hermetic/portable across
// compilation pipelines. In the future we'll likely use MLIR's
// external resource functionality for this. Allowing the objects to
// be embedded supports JIT scenarios where some layer higher or lower
// may be emitting the objects to link in as part of the overall
// compilation.
path = "samples/custom_dispatch/cpu/embedded/functions_x86_64.o"
}>
]) {
// TODO(benvanik): demonstrate hal.executable.constant.block for
// specialization via host logic and hal.executable.constant.load for
// referencing them in the shims.
// Exported shim function calling the C `simple_mul_workgroup` function.
// The ordinal must be assigned by the user and unique for the executable.
// The layout defines the required bindings and push constants and can be
// thought of as the function signature.
hal.executable.export public @simple_mul ordinal(0) layout(#pipeline_layout_0) count(%device: !hal.device, %workload: index) -> (index, index, index) {
// This host function is used to compute the XYZ workgroup count
// dispatched at runtime. It can query the %device for capabilities
// and limits (last-level cache sizes, etc). The other arguments are the
// values passed in the dispatch operation (usually things like root
// output op tensor dimensions and other abstract values).
%x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
%c1 = arith.constant 1 : index
hal.return %x, %c1, %c1 : index, index, index
}
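// For example, with the 8-element inputs from the RUN lines above the
// workload is 8 and ceildiv(8, 64) = 1, so a single 1x1x1 workgroup grid
// is dispatched.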
// Similar to the above but in-place by using a read/write binding.
hal.executable.export public @simple_mul_inplace ordinal(1) layout(#pipeline_layout_1) count(%device: !hal.device, %workload: index) -> (index, index, index) {
%x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
%c1 = arith.constant 1 : index
hal.return %x, %c1, %c1 : index, index, index
}
// On the CPU side we use shims here to marshal across the ABI. This
// allows us to hide the implementation details of how the runtime calls
// into functions and to call out to C functions that don't need to link
// against the runtime. We could probably come up with ways of automating
// this, but that's mostly left as an exercise to the frontends that may
// be producing this IR for input to the IREE compiler, as each may have
// its own quirks.
builtin.module {
// External function declaration using a user-chosen calling convention.
// Using llvm.bareptr=true and iree_codegen.extract_strided_metadata to get
// a simplified signature with just base pointers and offsets.
// This results in a clean C function signature:
// (float*, size_t, float*, size_t, float*, size_t, size_t, size_t)
func.func private @simple_mul_workgroup(
%binding0: memref<f32>, %binding0_offset: index,
%binding1: memref<f32>, %binding1_offset: index,
%binding2: memref<f32>, %binding2_offset: index,
%dim: index, %tid: index) attributes {
// Ensures that we try to statically link this external function and
// pull it in from the object file.
hal.import.static,
llvm.bareptr = true
}
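// A minimal sketch of what the C side of this declaration could look like,
// matching the signature above. The names and body here are illustrative
// only; the actual implementation lives in the separately compiled object
// file referenced earlier.
//
//   void simple_mul_workgroup(float* binding0, size_t binding0_offset,
//                             float* binding1, size_t binding1_offset,
//                             float* binding2, size_t binding2_offset,
//                             size_t dim, size_t tid) {
//     size_t end = tid + 64;     // this workgroup covers 64 elements
//     if (end > dim) end = dim;  // clamp the final partial workgroup
//     for (size_t i = tid; i < end; ++i) {
//       binding2[binding2_offset + i] =
//           binding0[binding0_offset + i] * binding1[binding1_offset + i];
//     }
//   }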
// IREE exported function using a HAL interface.
// At this layer of the stack all operands have been converted into
// constants and bindings have been specified.
func.func @simple_mul() {
%c0 = arith.constant 0 : index
// Push constants representing primitive operands can be loaded here.
%dim_i32 = hal.interface.constant.load layout(#pipeline_layout_0) ordinal(0) : i32
%dim = arith.index_castui %dim_i32 : i32 to index
// This function is invoked once per workgroup so determine where this
// particular workgroup is in the grid. In this example we use a
// workgroup size of 64x1x1 (which is exceedingly small for CPUs but
// useful for demonstration).
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%tid = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
// Bindings are accessed by reference.
%binding0 = hal.interface.binding.subspan layout(#pipeline_layout_0) binding(0) alignment(64) offset(%c0) : memref<?xf32>{%dim}
%binding1 = hal.interface.binding.subspan layout(#pipeline_layout_0) binding(1) alignment(64) offset(%c0) : memref<?xf32>{%dim}
%binding2 = hal.interface.binding.subspan layout(#pipeline_layout_0) binding(2) alignment(64) offset(%c0) : memref<?xf32>{%dim}
// Extract base pointers and offsets from the bindings.
// This preserves the SSA values through buffer aliasing optimizations.
%base0, %offset0, %sizes0, %strides0 = iree_codegen.extract_strided_metadata %binding0
: memref<?xf32> -> memref<f32>, index, index, index
%base1, %offset1, %sizes1, %strides1 = iree_codegen.extract_strided_metadata %binding1
: memref<?xf32> -> memref<f32>, index, index, index
%base2, %offset2, %sizes2, %strides2 = iree_codegen.extract_strided_metadata %binding2
: memref<?xf32> -> memref<f32>, index, index, index
// Call the externally defined C function with a simplified calling
// convention using bareptr (base pointer + offset per buffer).
func.call @simple_mul_workgroup(%base0, %offset0, %base1, %offset1, %base2, %offset2, %dim, %tid)
: (memref<f32>, index, memref<f32>, index, memref<f32>, index, index, index) -> ()
// NOTE: this is code generated as normal - other MLIR ops can be used
// here for looping/control flow, vector operations, linalg, etc.
// This simple sample is just calling out to the external function but
// microkernels fused with other code are possible.
return
}
func.func private @simple_mul_inplace_workgroup(
%binding0: memref<f32>, %binding0_offset: index,
%binding1: memref<f32>, %binding1_offset: index,
%dim: index, %tid: index) attributes {
hal.import.static,
llvm.bareptr = true
}
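// The C side of the in-place variant could mirror the sketch above with one
// fewer binding, its loop body multiplying in place (illustrative only):
//   binding1[binding1_offset + i] *= binding0[binding0_offset + i];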
func.func @simple_mul_inplace() {
%c0 = arith.constant 0 : index
%dim_i32 = hal.interface.constant.load layout(#pipeline_layout_1) ordinal(0) : i32
%dim = arith.index_castui %dim_i32 : i32 to index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%tid = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
// Same as above but note that we're treating %binding1 as read/write.
%binding0 = hal.interface.binding.subspan layout(#pipeline_layout_1) binding(0) alignment(64) offset(%c0) : memref<?xf32>{%dim}
%binding1 = hal.interface.binding.subspan layout(#pipeline_layout_1) binding(1) alignment(64) offset(%c0) : memref<?xf32>{%dim}
// Extract base pointers and offsets from the bindings.
%base0, %offset0, %sizes0, %strides0 = iree_codegen.extract_strided_metadata %binding0
: memref<?xf32> -> memref<f32>, index, index, index
%base1, %offset1, %sizes1, %strides1 = iree_codegen.extract_strided_metadata %binding1
: memref<?xf32> -> memref<f32>, index, index, index
func.call @simple_mul_inplace_workgroup(%base0, %offset0, %base1, %offset1, %dim, %tid)
: (memref<f32>, index, memref<f32>, index, index, index) -> ()
return
}
}
} // hal.executable.variant
} // hal.executable
// Function demonstrating a few hand-authored dispatches mixed with codegen.
// Invoke with:
// --device=local-sync
// --function=mixed_invocation
// --input=8xf32=2
// --input=8xf32=4
// CHECK-LABEL: EXEC @mixed_invocation
func.func @mixed_invocation(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
// The only externally available metadata in the dispatch is the set of
// values passed in as operands. Here we pass in the dynamic dimension.
//
// HACK: for hand-authored kernels all primitive values passed in need to
// be i32 or a bit-castable type. This is because ABI packing of other
// types happens inside the PackDispatchOperandsPass, which is currently
// not usable with external functions as it changes the ABI. In the future
// we can better define the ABI such that it's possible to match the
// compiler expectations around padding/alignment. For now users must do
// the packing themselves (splitting i64 into i32+i32, etc), as sketched
// below.
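//
// As a hypothetical sketch (no i64 operands actually appear in this
// example), an i64 value could be split into two i32 words before dispatch:
//   %c32 = arith.constant 32 : i64
//   %lo = arith.trunci %value_i64 : i64 to i32
//   %shifted = arith.shrui %value_i64, %c32 : i64
//   %hi = arith.trunci %shifted : i64 to i32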
%c0 = arith.constant 0 : index
%dim = tensor.dim %arg0, %c0 : tensor<?xf32>
%dim_i32 = arith.index_cast %dim : index to i32
// Dispatch a basic `ret = lhs * rhs` using an external function.
%0 = flow.dispatch @executable::@simple_mul[%dim](%dim_i32, %arg0, %arg1) {
// HACK: keep the executable live through DCE. Only required when
// using the automatic variant selection.
// TODO(benvanik): automatically add this when required.
hal.executable.ref = [@executable]
} : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> tensor<?xf32>{%dim}
// Code generated ops like this will interleave with the hand-authored
// dispatches but naturally won't be able to fuse with them.
%1 = arith.addf %0, %arg1 : tensor<?xf32>
// Dispatch an in-place `rhs *= lhs` using an external function.
// This form (@executable::@variant::@export) specifically chooses a variant
// instead of relying on automatic selection. This can be used by frontends
// to allow user-controlled overrides of the dispatches, custom selection
// logic based on runtime parameters, etc. In general, though, the above
// automatic selection should be used.
%2 = flow.dispatch @executable::@x86_64::@simple_mul_inplace[%dim](%dim_i32, %0, %1) : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> %1{%dim}
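// With the RUN-line inputs (lhs = 2, rhs = 4): @simple_mul yields 2*4 = 8,
// the code generated addf yields 8+4 = 12, and the in-place
// @simple_mul_inplace yields 8*12 = 96, matching the CHECK below.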
// CHECK: 8xf32=96 96 96 96 96 96 96 96
return %2 : tensor<?xf32>
}
} // module