// RUN: iree-compile %s \
// RUN: --iree-hal-executable-object-search-path=$IREE_BINARY_DIR | \
// RUN: iree-run-module \
// RUN: --device=local-sync \
// RUN: --module=- \
// RUN: --function=mixed_invocation \
// RUN: --input=8xf32=2 \
// RUN: --input=8xf32=4 | \
// RUN: FileCheck %s
// This example demonstrates authoring and dispatching retargetable executables
// from the IREE `hal` dialect layer. Because the executable variants are
// manually specified, target-specific code can be written, including unique
// calls for each target. The example_stream.mlir example shows how, where
// possible, executable variant generation can be left to the compiler.
//
// Enabling this at the HAL layer allows the codegen backends translating
// executable variants to make local decisions about which external calls to
// make and where the objects that provide those functions come from. Since
// objects can be embedded in the IR, it's possible for the backends even to
// generate them on-demand for embedding (such as precompiling/JITing).
// The configuration used for executable compilation.
// This lets the compiler and runtime know the format and requirements of the
// executable binaries produced. Multiple variants with differing formats and
// compilation options (architectures, etc.) can be embedded for runtime
// selection. By fully specifying the targets here we can target multiple
// architectures, and it's always possible to embed these instead of relying
// on the coarse command-line compiler flags that only set a single target.
//
// To avoid too much boilerplate this example only shows a single target. See
// example_stream.mlir for an example with multi-targeting as there's less
// boilerplate required at that level.
#x86_64_target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
native_vector_size = 32 : index,
target_triple = "x86_64-none-elf"
}>
// The target devices that the program will run on.
// These can come from compiler flags, and multiple targets can be supported.
// It's possible, for example, to support targeting multiple devices in the
// same compiled binary (CPU + Vulkan, etc); a multi-target sketch follows
// the definition below.
#cpu_target = #hal.device.target<"local", [
#x86_64_target
]> : !hal.device
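// As a hypothetical sketch (not used in this file), multiple executable
// targets could be embedded under one device for runtime selection, e.g.
// assuming an #arm_64_target had been defined like #x86_64_target above:
//   #cpu_target = #hal.device.target<"local", [
//     #x86_64_target,
//     #arm_64_target
//   ]> : !hal.device
// Multiple devices could similarly be listed in the module's
// hal.device.targets attribute below.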
#pipeline_layout_0 = #hal.pipeline.layout<constants = 1, bindings = [
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer>
]>
#pipeline_layout_1 = #hal.pipeline.layout<constants = 1, bindings = [
#hal.pipeline.binding<storage_buffer, ReadOnly>,
#hal.pipeline.binding<storage_buffer>
]>
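// For reference, #pipeline_layout_0 matches the @simple_mul dispatch below:
// one i32 push constant (the dynamic dimension) plus two read-only inputs
// and one written output. #pipeline_layout_1 instead folds the output into
// a read/write binding for the in-place variant.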
module @example attributes {hal.device.targets = [#cpu_target]} {
// Executable containing exported shims and calls to external functions.
// Each executable can contain multiple exported functions and variants for
// different architectures or even devices. It's also possible to mix hand-
// authored functions with code-generated ones, even for the same function,
// such that code generation is used as a fallback when the hand-authored
// kernels aren't supported at runtime.
hal.executable private @executable {
// Variant linking in an x86-64 object file containing external functions.
hal.executable.variant public @x86_64 target(#x86_64_target) objects([
// Object files linked into the executable.
// These object files are linked into the dynamic library and must meet
// the requirements for embedded ELF linkage (no TLS, no globals, no
// syscalls, no libc, etc).
#hal.executable.object<{
// This references a file path on disk, but the data could also be
// embedded in order to make the MLIR file hermetic/portable across
// compilation pipelines. In the future we'll likely use MLIR's
// external resource functionality for this. Allowing the objects to
// be embedded supports JIT scenarios where some layer higher or lower
// may be emitting the objects to link in as part of the overall
// compilation.
path = "samples/custom_dispatch/cpu/embedded/functions_x86_64.o"
}>
]) {
// TODO(benvanik): demonstrate hal.executable.constant.block for
// specialization via host logic and hal.executable.constant.load for
// referencing them in the shims.
// Exported shim function calling the C `simple_mul_workgroup` function.
// The ordinal must be assigned by the user and unique for the executable.
// The layout defines the required bindings and push constants and can be
// thought of as the function signature.
hal.executable.export public @simple_mul ordinal(0) layout(#pipeline_layout_0) count(%device: !hal.device, %workload: index) -> (index, index, index) {
// This host function is used to compute the XYZ workgroup count
// dispatched at runtime. It can query the %device for capabilities
// and limits (last-level cache sizes, etc). The other arguments are the
// values passed in the dispatch operation (usually things like root
// output op tensor dimensions and other abstract values).
%x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
%c1 = arith.constant 1 : index
hal.return %x, %c1, %c1 : index, index, index
}
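// For example, with the 8-element inputs from the RUN lines above the
// workload is 8 and ceildiv(8, 64) = 1, so a single 1x1x1 workgroup grid
// is dispatched.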
// Similar to the above but in-place by using a read/write binding.
hal.executable.export public @simple_mul_inplace ordinal(1) layout(#pipeline_layout_1) count(%device: !hal.device, %workload: index) -> (index, index, index) {
%x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
%c1 = arith.constant 1 : index
hal.return %x, %c1, %c1 : index, index, index
}
// On the CPU side we use shims here to marshal across the ABI. This
// allows us to hide the implementation details of how the runtime calls
// into functions and to call out to C functions that don't need to link
// against the runtime. We could probably come up with ways of automating
// this, but that's mostly left as an exercise to the frontends that may
// be producing this IR for input to the IREE compiler, as each may have
// its own quirks.
builtin.module {
// External function declaration using a user-chosen calling convention.
// Using llvm.bareptr=true and iree_codegen.extract_strided_metadata to get
// a simplified signature with just base pointers and offsets.
// This results in a clean C function signature:
// (float*, size_t, float*, size_t, float*, size_t, size_t, size_t)
func.func private @simple_mul_workgroup(
%binding0: memref<f32>, %binding0_offset: index,
%binding1: memref<f32>, %binding1_offset: index,
%binding2: memref<f32>, %binding2_offset: index,
%dim: index, %tid: index) attributes {
// Ensures that we try to statically link this external function and
// pull it in from the object file.
hal.import.static,
llvm.bareptr = true
}
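// A minimal sketch of what the C side of this declaration could look like,
// matching the signature above. The names and body here are illustrative
// only; the actual implementation lives in the separately compiled object
// file referenced earlier.
//
//   void simple_mul_workgroup(float* binding0, size_t binding0_offset,
//                             float* binding1, size_t binding1_offset,
//                             float* binding2, size_t binding2_offset,
//                             size_t dim, size_t tid) {
//     size_t end = tid + 64;     // this workgroup covers 64 elements
//     if (end > dim) end = dim;  // clamp the final partial workgroup
//     for (size_t i = tid; i < end; ++i) {
//       binding2[binding2_offset + i] =
//           binding0[binding0_offset + i] * binding1[binding1_offset + i];
//     }
//   }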
// IREE exported function using a HAL interface.
// At this layer of the stack all operands have been converted into
// constants and bindings have been specified.
func.func @simple_mul() {
%c0 = arith.constant 0 : index
// Push constants representing primitive operands can be loaded here.
%dim_i32 = hal.interface.constant.load layout(#pipeline_layout_0) ordinal(0) : i32
%dim = arith.index_castui %dim_i32 : i32 to index
// This function is invoked once per workgroup so determine where this
// particular workgroup is in the grid. In this example we use a
// workgroup size of 64x1x1 (which is exceedingly small for CPUs but
// useful for demonstration).
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%tid = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
// Bindings are accessed by reference.
%binding0 = hal.interface.binding.subspan layout(#pipeline_layout_0) binding(0) alignment(64) offset(%c0) : memref<?xf32>{%dim}
%binding1 = hal.interface.binding.subspan layout(#pipeline_layout_0) binding(1) alignment(64) offset(%c0) : memref<?xf32>{%dim}
%binding2 = hal.interface.binding.subspan layout(#pipeline_layout_0) binding(2) alignment(64) offset(%c0) : memref<?xf32>{%dim}
// Extract base pointers and offsets from the bindings.
// This preserves the SSA values through buffer aliasing optimizations.
%base0, %offset0, %sizes0, %strides0 = iree_codegen.extract_strided_metadata %binding0
: memref<?xf32> -> memref<f32>, index, index, index
%base1, %offset1, %sizes1, %strides1 = iree_codegen.extract_strided_metadata %binding1
: memref<?xf32> -> memref<f32>, index, index, index
%base2, %offset2, %sizes2, %strides2 = iree_codegen.extract_strided_metadata %binding2
: memref<?xf32> -> memref<f32>, index, index, index
// Call the externally defined C function with a simplified calling
// convention using bareptr (base pointer + offset per buffer).
func.call @simple_mul_workgroup(%base0, %offset0, %base1, %offset1, %base2, %offset2, %dim, %tid)
: (memref<f32>, index, memref<f32>, index, memref<f32>, index, index, index) -> ()
// NOTE: this is code generated as normal - other MLIR ops can be used
// here for looping/control flow, vector operations, linalg, etc.
// This simple sample is just calling out to the external function but
// microkernels fused with other code are possible.
return
}
func.func private @simple_mul_inplace_workgroup(
%binding0: memref<f32>, %binding0_offset: index,
%binding1: memref<f32>, %binding1_offset: index,
%dim: index, %tid: index) attributes {
hal.import.static,
llvm.bareptr = true
}
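// The C side of the in-place variant could mirror the sketch above with one
// fewer binding, its loop body multiplying in place (illustrative only):
//   binding1[binding1_offset + i] *= binding0[binding0_offset + i];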
func.func @simple_mul_inplace() {
%c0 = arith.constant 0 : index
%dim_i32 = hal.interface.constant.load layout(#pipeline_layout_1) ordinal(0) : i32
%dim = arith.index_castui %dim_i32 : i32 to index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%tid = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
// Same as above but note that we're treating %binding1 as read/write.
%binding0 = hal.interface.binding.subspan layout(#pipeline_layout_1) binding(0) alignment(64) offset(%c0) : memref<?xf32>{%dim}
%binding1 = hal.interface.binding.subspan layout(#pipeline_layout_1) binding(1) alignment(64) offset(%c0) : memref<?xf32>{%dim}
// Extract base pointers and offsets from the bindings.
%base0, %offset0, %sizes0, %strides0 = iree_codegen.extract_strided_metadata %binding0
: memref<?xf32> -> memref<f32>, index, index, index
%base1, %offset1, %sizes1, %strides1 = iree_codegen.extract_strided_metadata %binding1
: memref<?xf32> -> memref<f32>, index, index, index
func.call @simple_mul_inplace_workgroup(%base0, %offset0, %base1, %offset1, %dim, %tid)
: (memref<f32>, index, memref<f32>, index, index, index) -> ()
return
}
}
} // hal.executable.variant
} // hal.executable
// Function demonstrating a few hand-authored dispatches mixed with codegen.
// Invoke with:
// --device=local-sync
// --function=mixed_invocation
// --input=8xf32=2
// --input=8xf32=4
// CHECK-LABEL: EXEC @mixed_invocation
func.func @mixed_invocation(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
// The only externally available metadata in the dispatch is the set of
// values passed in as operands. Here we pass in the dynamic dimension.
//
// HACK: for hand-authored kernels all primitive values passed in need to
// be i32 or a bit-castable type. This is because ABI packing of other
// types happens inside the PackDispatchOperandsPass, which is currently
// not usable with external functions as it changes the ABI. In the future
// we can better define the ABI such that it's possible to match the
// compiler expectations around padding/alignment. For now users must do
// the packing themselves (splitting i64 into i32+i32, etc), as sketched
// below.
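//
// As a hypothetical sketch (no i64 operands actually appear in this
// example), an i64 value could be split into two i32 words before dispatch:
//   %c32 = arith.constant 32 : i64
//   %lo = arith.trunci %value_i64 : i64 to i32
//   %shifted = arith.shrui %value_i64, %c32 : i64
//   %hi = arith.trunci %shifted : i64 to i32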
%c0 = arith.constant 0 : index
%dim = tensor.dim %arg0, %c0 : tensor<?xf32>
%dim_i32 = arith.index_cast %dim : index to i32
// Dispatch a basic `ret = lhs * rhs` using an external function.
%0 = flow.dispatch @executable::@simple_mul[%dim](%dim_i32, %arg0, %arg1) {
// HACK: keep the executable live through DCE. Only required when
// using the automatic variant selection.
// TODO(benvanik): automatically add this when required.
hal.executable.ref = [@executable]
} : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> tensor<?xf32>{%dim}
// Code generated ops like this will interleave with the hand-authored
// dispatches but naturally won't be able to fuse with them.
%1 = arith.addf %0, %arg1 : tensor<?xf32>
// Dispatch an in-place `rhs *= lhs` using an external function.
// This form (@executable::@variant::@export) specifically chooses a variant
// instead of relying on automatic selection. This can be used by frontends
// to allow user-controlled overrides of the dispatches, custom selection
// logic based on runtime parameters, etc. In general, though, the above
// automatic selection should be used.
%2 = flow.dispatch @executable::@x86_64::@simple_mul_inplace[%dim](%dim_i32, %0, %1) : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> %1{%dim}
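// With the RUN-line inputs (lhs = 2, rhs = 4): @simple_mul yields 2*4 = 8,
// the code generated addf yields 8+4 = 12, and the in-place
// @simple_mul_inplace yields 8*12 = 96, matching the CHECK below.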
// CHECK: 8xf32=96 96 96 96 96 96 96 96
return %2 : tensor<?xf32>
}
} // module