// RUN: iree-compile %s \
// RUN:     --iree-hal-executable-object-search-path=$IREE_BINARY_DIR | \
// RUN: iree-run-module \
// RUN:     --device=local-sync \
// RUN:     --module=- \
// RUN:     --function=mixed_invocation \
// RUN:     --input=8xf32=2 \
// RUN:     --input=8xf32=4 | \
// RUN: FileCheck %s

// This example demonstrates authoring and dispatching retargetable executables
// from the IREE `hal` dialect layer. Because the executable variants are
// manually specified, target-specific code can be written - including unique
// calls for each target. The example_stream.mlir example shows how, where
// possible, executable variant generation can instead be left to the compiler.
//
// Working at the HAL layer lets the codegen backends that translate executable
// variants make local decisions about which external calls to make and where
// the objects providing those functions come from. Since objects can be
// embedded in the IR, it's even possible for the backends to generate them
// on demand for embedding (such as precompiling/JITing).

// The configuration used for executable compilation.
// This lets the compiler and runtime know the format and requirements of the
// executable binaries produced. Multiple variants with differing formats and
// compilation options (architectures, etc.) can be embedded for runtime
// selection. Fully specifying the targets here makes it possible to target
// multiple architectures, and targets can always be embedded like this instead
// of relying on the coarse command-line compiler flags that only set a single
// target.
//
// To avoid too much boilerplate this example only shows a single target. See
// example_stream.mlir for an example with multi-targeting, as there's less
// boilerplate required at that level.
#x86_64_target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
  data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
  native_vector_size = 32 : index,
  target_triple = "x86_64-none-elf"
}>

// The target devices that the program will run on.
// These can come from compiler flags, and multiple targets can be supported.
// It's possible, for example, to target multiple devices in the same compiled
// binary (CPU + Vulkan, etc.).
#cpu_target = #hal.device.target<"local", [
  #x86_64_target
]> : !hal.device

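// Layout for the out-of-place dispatch: one i32 push constant (the dynamic
// dimension) plus two read-only input bindings and a writable output binding.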
#pipeline_layout_0 = #hal.pipeline.layout<constants = 1, bindings = [
  #hal.pipeline.binding<storage_buffer, ReadOnly>,
  #hal.pipeline.binding<storage_buffer, ReadOnly>,
  #hal.pipeline.binding<storage_buffer>
]>

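// Layout for the in-place dispatch: a single i32 push constant (the dynamic
// dimension), one read-only input binding, and one read/write binding used as
// both input and output.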
#pipeline_layout_1 = #hal.pipeline.layout<constants = 1, bindings = [
  #hal.pipeline.binding<storage_buffer, ReadOnly>,
  #hal.pipeline.binding<storage_buffer>
]>

module @example attributes {hal.device.targets = [#cpu_target]} {

  // Executable containing exported shims and calls to external functions.
  // Each executable can contain multiple exported functions and variants for
  // different architectures or even devices. It's also possible to mix hand-
  // authored functions with code-generated ones - even for the same function -
  // so that code generation serves as a fallback when the hand-authored
  // kernels aren't supported at runtime.
  hal.executable private @executable {

    // Variant linking in an x86-64 object file containing external functions.
    hal.executable.variant public @x86_64 target(#x86_64_target) objects([
      // Object files linked into the executable.
      // These object files are linked into the dynamic library and must meet
      // the requirements for embedded ELF linkage (no TLS, no globals, no
      // syscalls, no libc, etc).
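      //
      // As a rough sketch (illustrative only; the exact flags depend on the
      // toolchain and the sample's build rules), an object meeting these
      // constraints could be produced from a freestanding C source file,
      // here assumed to be functions.c, with something like:
      //   clang -target x86_64-none-elf -ffreestanding -fno-builtin \
      //       -fvisibility=hidden -O2 -c functions.c -o functions_x86_64.o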
      #hal.executable.object<{
        // Referencing a file path on disk but could also have the data
        // embedded in order to make the MLIR file hermetic/portable across
        // compilation pipelines. In the future we'll likely use MLIR's
        // external resource functionality for this. By allowing for the
        // objects to be embedded we can support JIT scenarios where some
        // layer higher or lower may be emitting the objects to link in as
        // part of the overall compilation.
        path = "samples/custom_dispatch/cpu/embedded/functions_x86_64.o"
      }>
    ]) {

      // TODO(benvanik): demonstrate hal.executable.constant.block for
      // specialization via host logic and hal.executable.constant.load for
      // referencing them in the shims.

      // Exported shim function calling the C `simple_mul_workgroup` function.
      // The ordinal must be assigned by the user and unique for the executable.
      // The layout defines the required bindings and push constants and can be
      // thought of as the function signature.
      hal.executable.export public @simple_mul ordinal(0) layout(#pipeline_layout_0) {
      ^bb0(%device: !hal.device, %workload: index):
        // This host function is used to compute the XYZ workgroup count
        // dispatched at runtime. It can query the %device for capabilities
        // and limits (last-level cache sizes, etc). The other arguments are the
        // values passed in the dispatch operation (usually things like root
        // output op tensor dimensions and other abstract values).
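        // For example, with the 8-element workload used by the RUN command at
        // the top of this file, this computes ceildiv(8, 64) = 1, so a single
        // workgroup is dispatched along X.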
        %x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
        %c1 = arith.constant 1 : index
        hal.return %x, %c1, %c1 : index, index, index
      }

      // Similar to the above but in-place by using a read/write binding.
      hal.executable.export public @simple_mul_inplace ordinal(1) layout(#pipeline_layout_1) {
      ^bb0(%device: !hal.device, %workload: index):
        %x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
        %c1 = arith.constant 1 : index
        hal.return %x, %c1, %c1 : index, index, index
      }

      // On the CPU side we use shims here to marshal across the ABI. This
      // hides the implementation details of how the runtime calls into
      // functions and lets us call out to C functions that don't need to link
      // against the runtime. We could probably come up with ways of automating
      // this, but that's mostly left as an exercise to the frontends producing
      // this IR for the IREE compiler, as each may have its own quirks.
      builtin.module {
        // External function declaration using a user-chosen calling convention.
        // NOTE: MLIR->LLVM conversion expands each memref to a tuple and
        // there's currently no way to change that behavior.
        // Each memref becomes:
        //   (%base_ptr: !llvm.ptr<f32>, %aligned_ptr: !llvm.ptr<f32>,
        //    %offset: i64, %size: i64, %stride: i64)
        // That results in the following llvm.func:
        //   (!llvm.ptr<f32>, !llvm.ptr<f32>, i64, i64, i64,  // binding0
        //    !llvm.ptr<f32>, !llvm.ptr<f32>, i64, i64, i64,  // binding1
        //    !llvm.ptr<f32>, !llvm.ptr<f32>, i64, i64, i64,  // binding2
        //    i64,                                            // dim
        //    i64)                                            // tid
        // And required external C function:
        //   (float*, float*, size_t, size_t, size_t,
        //    float*, float*, size_t, size_t, size_t,
        //    float*, float*, size_t, size_t, size_t,
        //    size_t,
        //    size_t)
        // This is not a good state to be in as we can't then map to external
        // functions that have signatures we don't want to change. Please file
        // upstream MLIR bugs about this behavior and the ability to just pass
        // bare pointers if you care!
        //
        // NOTE: index will convert to i32 when targeting an ABI with 32-bit
        // pointers and i64 otherwise. Use size_t on the C side to allow the
        // same source code to work when compiled in either mode.
        func.func private @simple_mul_workgroup(%binding0: memref<?xf32>, %binding1: memref<?xf32>, %binding2: memref<?xf32>, %dim: index, %tid: index) attributes {
          // Ensures that we try to statically link this external function and
          // pull it in from the object file.
          hal.import.static
        }
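        // For illustration only: a C implementation matching the expanded ABI
        // documented above might look roughly like the following. Parameter
        // names and the loop body are assumptions for this sketch; see the
        // source used to build functions_x86_64.o for the actual kernel.
        //
        //   void simple_mul_workgroup(
        //       const float* binding0_base, const float* binding0, size_t offset0,
        //       size_t size0, size_t stride0,
        //       const float* binding1_base, const float* binding1, size_t offset1,
        //       size_t size1, size_t stride1,
        //       float* binding2_base, float* binding2, size_t offset2,
        //       size_t size2, size_t stride2,
        //       size_t dim, size_t tid) {
        //     size_t end = tid + 64 < dim ? tid + 64 : dim;
        //     for (size_t i = tid; i < end; ++i) {
        //       binding2[i] = binding0[i] * binding1[i];
        //     }
        //   }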

        // IREE exported function using a HAL interface.
        // At this layer of the stack all primitive operands have been converted
        // into constants and the buffer bindings have been specified.
        func.func @simple_mul() {
          %c0 = arith.constant 0 : index

          // Push constants representing primitive operands can be loaded here.
          %dim_i32 = hal.interface.constant.load layout(#pipeline_layout_0) ordinal(0) : i32
          %dim = arith.index_castui %dim_i32 : i32 to index

          // This function is invoked once per workgroup, so determine where
          // this particular workgroup is in the grid. In this example we use a
          // workgroup size of 64x1x1 (which is exceedingly small for CPUs but
          // useful for demonstration).
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %tid = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
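          // With the 64x1x1 workgroup size, %tid is the base element index for
          // this workgroup: workgroup 0 starts at element 0, workgroup 1 at
          // element 64, and so on.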

          // Bindings are accessed by reference.
          %binding0 = hal.interface.binding.subspan layout(#pipeline_layout_0) binding(0) alignment(64) offset(%c0) : memref<?xf32>{%dim}
          %binding1 = hal.interface.binding.subspan layout(#pipeline_layout_0) binding(1) alignment(64) offset(%c0) : memref<?xf32>{%dim}
          %binding2 = hal.interface.binding.subspan layout(#pipeline_layout_0) binding(2) alignment(64) offset(%c0) : memref<?xf32>{%dim}

          // Call the externally defined C function with an (almost) plain C
          // calling convention (see above for details about the mess memrefs
          // turn into).
          //
          // TODO: there are ways of accessing CPU information here such as
          // active architecture and feature bits but it is not yet exposed to
          // the HAL level.
          func.call @simple_mul_workgroup(%binding0, %binding1, %binding2, %dim, %tid) : (memref<?xf32>, memref<?xf32>, memref<?xf32>, index, index) -> ()

          // NOTE: this is code generated as normal - other MLIR ops can be used
          // here for looping/control flow, vector operations, linalg, etc.
          // This simple sample is just calling out to the external function but
          // microkernels fused with other code are possible.

          return
        }

        func.func private @simple_mul_inplace_workgroup(%binding0: memref<?xf32>, %binding1: memref<?xf32>, %dim: index, %tid: index) attributes {
          hal.import.static
        }
        func.func @simple_mul_inplace() {
          %c0 = arith.constant 0 : index

          %dim_i32 = hal.interface.constant.load layout(#pipeline_layout_1) ordinal(0) : i32
          %dim = arith.index_castui %dim_i32 : i32 to index

          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %tid = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]

          // Same as above but note that we're treating %binding1 as read/write.
          %binding0 = hal.interface.binding.subspan layout(#pipeline_layout_1) binding(0) alignment(64) offset(%c0) : memref<?xf32>{%dim}
          %binding1 = hal.interface.binding.subspan layout(#pipeline_layout_1) binding(1) alignment(64) offset(%c0) : memref<?xf32>{%dim}

          func.call @simple_mul_inplace_workgroup(%binding0, %binding1, %dim, %tid) : (memref<?xf32>, memref<?xf32>, index, index) -> ()

          return
        }
      }

    } // hal.executable.variant

  } // hal.executable

  // Function demonstrating a few hand-authored dispatches mixed with codegen.
  // Invoke with:
  //   --device=local-sync
  //   --function=mixed_invocation
  //   --input=8xf32=2
  //   --input=8xf32=4
  // CHECK-LABEL: EXEC @mixed_invocation
  func.func @mixed_invocation(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
    // The only externally available metadata in the dispatch is the set of
    // values passed in as operands. Here we pass in the dynamic dimension.
    //
    // HACK: for hand-authored kernels all primitive values passed in need to
    // be i32 or a bit-castable type. This is because ABI packing of other
    // types happens inside the PackDispatchOperandsPass, which is currently
    // not usable with external functions as it changes the ABI. In the future
    // we can better define the ABI such that it's possible to match the
    // compiler expectations around padding/alignment. For now users must do
    // the packing themselves (splitting i64 into i32+i32, etc), as sketched
    // below.
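    // Illustrative only (this example's operand already fits in i32): splitting
    // a hypothetical i64 operand %value_i64 into two i32 words could look like:
    //   %c32 = arith.constant 32 : i64
    //   %lo = arith.trunci %value_i64 : i64 to i32
    //   %hi_shifted = arith.shrui %value_i64, %c32 : i64
    //   %hi = arith.trunci %hi_shifted : i64 to i32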
    %c0 = arith.constant 0 : index
    %dim = tensor.dim %arg0, %c0 : tensor<?xf32>
    %dim_i32 = arith.index_cast %dim : index to i32

    // Dispatch a basic `ret = lhs * rhs` using an external function.
    // This form (@executable::@export) relies on automatic variant selection.
    %0 = flow.dispatch @executable::@simple_mul[%dim](%dim_i32, %arg0, %arg1) {
      // HACK: keep the executable live through DCE. Only required when
      // using the automatic variant selection.
      // TODO(benvanik): automatically add this when required.
      hal.executable.ref = [@executable]
    } : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> tensor<?xf32>{%dim}

    // Code gen some other ops - these will interleave with the hand-authored
    // ones but naturally won't be able to fuse with them.
    %1 = arith.addf %0, %arg1 : tensor<?xf32>

    // Dispatch an in-place `rhs *= lhs` using an external function.
    // This form (@executable::@variant::@export) specifically chooses a variant
    // instead of relying on automatic selection. This can be used by frontends
    // to allow user-controlled overrides of the dispatches, custom selection
    // logic based on runtime parameters, etc. In general, though, the above
    // automatic selection should be used.
    %2 = flow.dispatch @executable::@x86_64::@simple_mul_inplace[%dim](%dim_i32, %0, %1) : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> %1{%dim}

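    // With the 8xf32=2 and 8xf32=4 inputs from the RUN command: the first
    // dispatch produces 2*4=8, the addf produces 8+4=12, and the in-place
    // dispatch produces 8*12=96, matching the CHECK below.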
    // CHECK: 8xf32=96 96 96 96 96 96 96 96
    return %2 : tensor<?xf32>
  }

} // module