samples/custom_dispatch/cpu/embedded/example_hal.mlir - 3p/openxla/iree - Git at Google

 // RUN: iree-compile %s \
 // RUN:     --iree-hal-executable-object-search-path=$IREE_BINARY_DIR | \
 // RUN: iree-run-module \
 // RUN:     --device=local-sync \
 // RUN:     --module=- \
 // RUN:     --function=mixed_invocation \
 // RUN:     --input=8xf32=2 \
 // RUN:     --input=8xf32=4 | \
 // RUN: FileCheck %s

 // This example demonstrates authoring and dispatching retargetable executables
 // from the IREE `hal` dialect layer. This allows for target-specific code to
 // be written - including unique calls for each target - as the executable
 // variants are manually specified. The example_stream.mlir example shows
 // how, where possible, executable variant generation can be left to the
 // compiler.
 //
 // Enabling this at the HAL layer allows for codegen backends translating
 // executable variants to make local decisions about which external calls to
 // make and where the objects come from to provide those functions. Since
 // objects can be embedded in the IR it's possible for the backends to even
 // generate them on-demand for embedding (such as precompiling/JITing).

 // The configuration used for executable compilation.
 // This lets the compiler and runtime know the format and requirements of the
 // executable binaries produced and multiple variants with differing formats
 // and compilation options (architectures, etc) can be embedded for runtime
 // selection. By fully specifying the targets here we can target multiple
 // architectures and it's always possible to embed these instead of using the
 // coarse command line compiler flags that only set single targets.
 //
 // To avoid too much boilerplate this example only shows a single target. See
 // example_stream.mlir for an example with multi-targeting as there's less
 // boilerplate required at that level.
 // Executable target for the `llvm-cpu` backend emitting the
 // `embedded-elf-x86_64` format; the dictionary carries the LLVM-style triple
 // and data layout plus the native vector size used for this target.
 #x86_64_target = #hal.executable.target<
   "llvm-cpu", "embedded-elf-x86_64", {
     target_triple = "x86_64-none-elf",
     native_vector_size = 32 : index,
     data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   }
 >

 // The target devices that the program will run on.
 // These can come from compiler flags and multiple targets can be supported.
 // It's possible, for example, to support targeting multiple devices in the same
 // compiled binary (CPU + Vulkan, etc).
 // Device target for `llvm-cpu`; its only executable target is the x86-64
 // configuration referenced by alias.
 #cpu_target = #hal.device.target<"llvm-cpu", {executable_targets = [#x86_64_target]}>

 module @example attributes {hal.device.targets = [#cpu_target]} {

   // Executable containing exported shims and calls to external functions.
   // Each executable can contain multiple exported functions and variants for
   // different architectures or even devices. It's also possible to mix hand-
   // authored functions with code generated ones even for the same functions
   // such that code generation is used as a fallback when the hand-authored
   // kernels aren't supported at runtime.
   hal.executable private @executable {

     // Variant linking in an x86-64 object file containing external functions.
     hal.executable.variant public @x86_64, target = #x86_64_target, objects = [
       // Object files linked into the executable.
       // These object files are linked into the dynamic library and must meet
       // the requirements for embedded ELF linkage (no TLS, no globals, no
       // syscalls, no libc, etc).
       #hal.executable.object<{
         // Referencing a file path on disk but could also have the data
         // embedded in order to make the MLIR file hermetic/portable across
         // compilation pipelines. In the future we'll likely use MLIR's
         // external resource functionality for this. By allowing for the
         // objects to be embedded we can support JIT scenarios where some
         // layer higher or lower may be emitting the objects to link in as
         // part of the overall compilation.
         path = "samples/custom_dispatch/cpu/embedded/functions_x86_64.o"
       }>
     ] {

       // TODO(benvanik): demonstrate hal.executable.constant.block for
       // specialization via host logic and hal.executable.constant.load for
       // referencing them in the shims.

       // Exported shim function calling the C `simple_mul_workgroup` function.
       // The ordinal must be assigned by the user and unique for the executable.
       // The layout defines the required bindings and push constants and can be
       // thought of as the function signature.
       hal.executable.export public @simple_mul ordinal(0) layout(
           #hal.pipeline.layout<push_constants = 1, sets = [
             <0, bindings = [
               <0, storage_buffer, ReadOnly>,
               <1, storage_buffer, ReadOnly>,
               <2, storage_buffer>
             ]>
           ]>
       ) {
       ^bb0(%device: !hal.device, %workload: index):
         // Workgroup-count region, evaluated on the host at dispatch time to
         // produce the X/Y/Z grid. %device is available for querying
         // capabilities and limits (unused here); the remaining arguments are
         // the workload values supplied by the dispatch op.
         //
         // The grid is ceil(workload / 64) workgroups along X and a single
         // workgroup along both Y and Z.
         %one = arith.constant 1 : index
         %groups_x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
         hal.return %groups_x, %one, %one : index, index, index
       }

       // Similar to the above but in-place by using a read/write binding.
       hal.executable.export public @simple_mul_inplace ordinal(1) layout(
           #hal.pipeline.layout<push_constants = 1, sets = [
             <0, bindings = [
               <0, storage_buffer, ReadOnly>,
               <1, storage_buffer>
             ]>
           ]>
       ) {
       ^bb0(%device: !hal.device, %workload: index):
         // Same grid computation as the out-of-place export:
         // ceil(workload / 64) workgroups along X, one along Y and Z.
         %one = arith.constant 1 : index
         %groups_x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
         hal.return %groups_x, %one, %one : index, index, index
       }

       // On the CPU side we use shims here to marshal across the ABI. This
       // allows us to hide the implementation details of how the runtime calls
       // into functions and call out to C functions that don't need to link
       // against the runtime. We could probably come up with ways of automating
       // this but that's mostly left as an exercise to the frontends that may be
       // producing this IR for input to the IREE compiler as each may have its
       // own quirks.
       builtin.module {
         // External function declaration using a user-chosen calling convention.
         // NOTE: MLIR->LLVM conversion expands each memref to a tuple and
         // there's currently no way to change that behavior.
         // Each memref becomes:
         // (%base_ptr: !llvm.ptr<f32>, %aligned_ptr: !llvm.ptr<f32>,
         //  %offset: i64, %size: i64, %stride: i64)
         // That results in the following llvm.func:
         // (!llvm.ptr<f32>, !llvm.ptr<f32>, i64, i64, i64,  // binding0
         //  !llvm.ptr<f32>, !llvm.ptr<f32>, i64, i64, i64,  // binding1
         //  !llvm.ptr<f32>, !llvm.ptr<f32>, i64, i64, i64,  // binding2
         //  i64,                                            // dim
         //  i64)                                            // tid
         // And required external C function:
         // (float*, float*, size_t, size_t, size_t,
         //  float*, float*, size_t, size_t, size_t,
         //  float*, float*, size_t, size_t, size_t,
         //  size_t,
         //  size_t)
         // This is not a good state to be in as we can't then map to external
         // functions that have signatures we don't want to change. Please file
         // upstream MLIR bugs about this behavior and the ability to just pass
         // bare pointers if you care!
         //
         // NOTE: index will convert to i32 when targeting an ABI with 32-bit
         // pointers and i64 otherwise. Use size_t on the C side to allow the
         // same source code to work when compiled in either mode.
         func.func private @simple_mul_workgroup(
             %binding0: memref<?xf32>,
             %binding1: memref<?xf32>,
             %binding2: memref<?xf32>,
             %dim: index,
             %tid: index
         ) attributes {
           // Requests static linking of this external function so that its
           // definition is pulled in from the object file.
           hal.import.static
         }

         // IREE exported function using a HAL interface.
         // At this layer of the stack all operands have been converted into
         // constants and bindings have been specified.
         func.func @simple_mul() {
           // This body runs once per workgroup. The X workgroup id is scaled
           // by 64 (the 64x1x1 workgroup size this example uses, which is very
           // small for CPUs but convenient for demonstration) and the result
           // is handed to the external function as its `tid` argument.
           %group_x = hal.interface.workgroup.id[0] : index
           %scaled_id = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%group_x]

           // Primitive operands arrive as push constants. Constant 0 is an
           // i32 that is widened (unsigned) to index and used both as the
           // dynamic extent of each binding and as the `dim` call argument.
           %count_i32 = hal.interface.constant.load[0] : i32
           %count = arith.index_castui %count_i32 : i32 to index

           // Bindings are accessed by reference; all three live in set 0 and
           // are taken at offset 0.
           %no_offset = arith.constant 0 : index
           %buf0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%no_offset) : memref<?xf32>{%count}
           %buf1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%no_offset) : memref<?xf32>{%count}
           %buf2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%no_offset) : memref<?xf32>{%count}

           // Hand off to the externally defined C function. The calling
           // convention is almost plain C; each memref is expanded into
           // several arguments as described on the declaration.
           //
           // TODO: CPU information such as the active architecture and
           // feature bits is not yet exposed at the HAL level.
           //
           // NOTE: the rest of this function is code generated as usual, so
           // other MLIR ops (control flow, vector ops, linalg, etc) could be
           // mixed in here to fuse a microkernel with surrounding code.
           func.call @simple_mul_workgroup(%buf0, %buf1, %buf2, %count, %scaled_id) : (memref<?xf32>, memref<?xf32>, memref<?xf32>, index, index) -> ()

           return
         }

         func.func private @simple_mul_inplace_workgroup(
             %binding0: memref<?xf32>,
             %binding1: memref<?xf32>,
             %dim: index,
             %tid: index
         ) attributes {
           // External declaration marked for static linking, like the
           // out-of-place variant but with two bindings instead of three.
           hal.import.static
         }
         func.func @simple_mul_inplace() {
           // Per-workgroup value passed as `tid`: the X workgroup id times 64.
           %group_x = hal.interface.workgroup.id[0] : index
           %scaled_id = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%group_x]

           // Push constant 0 carries the dynamic extent as an i32.
           %count_i32 = hal.interface.constant.load[0] : i32
           %count = arith.index_castui %count_i32 : i32 to index

           // Two bindings only: binding 0 is declared ReadOnly in the pipeline
           // layout while binding 1 is treated as read/write.
           %no_offset = arith.constant 0 : index
           %buf0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%no_offset) : memref<?xf32>{%count}
           %buf1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%no_offset) : memref<?xf32>{%count}

           func.call @simple_mul_inplace_workgroup(%buf0, %buf1, %count, %scaled_id) : (memref<?xf32>, memref<?xf32>, index, index) -> ()

           return
         }
       }

     }  // hal.executable.variant

   }  // hal.executable

   // Function demonstrating a few hand-authored dispatches mixed with codegen.
   // Invoke with:
   //  --device=local-sync
   //  --function=mixed_invocation
   //  --input=8xf32=2
   //  --input=8xf32=4
   // CHECK-LABEL: EXEC @mixed_invocation
   func.func @mixed_invocation(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
     // A dispatch only sees the values passed to it as operands, so the
     // dynamic extent of %arg0 is queried here and forwarded explicitly.
     //
     // HACK: primitive operands of hand-authored kernels must be i32 or a
     // bit-castable type. Packing of other types is done by
     // PackDispatchOperandsPass, which changes the ABI and so cannot currently
     // be used with external functions. Until the ABI is better defined, users
     // pack wider values themselves (e.g. splitting an i64 into two i32s).
     %axis0 = arith.constant 0 : index
     %len = tensor.dim %arg0, %axis0 : tensor<?xf32>
     %len_i32 = arith.index_cast %len : index to i32

     // First hand-authored dispatch: an out-of-place `ret = lhs * rhs` through
     // the external function, with %len as the workload.
     %product = flow.dispatch @executable::@x86_64::@simple_mul[%len](%len_i32, %arg0, %arg1) {
       // HACK: keeps the executable live through DCE. Described as only being
       // required with automatic variant selection.
       // NOTE(review): this dispatch names the @x86_64 variant explicitly -
       // confirm whether the attribute is still needed here.
       // TODO(benvanik): automatically add this when required.
       hal.executable.ref = [@executable],
       // Bindings are normally inferred as part of the ABI. They are spelled
       // out here to show how tensor operands/results can be mapped onto
       // pipeline layout (set, binding) pairs by hand, which is what sparse
       // bindings or multiple descriptor sets would require.
       hal.interface.bindings = [
         #hal.interface.binding<0, 0>,
         #hal.interface.binding<0, 1>,
         #hal.interface.binding<0, 2>
       ]
     } : (i32, tensor<?xf32>{%len}, tensor<?xf32>{%len}) -> tensor<?xf32>{%len}

     // An op left to normal code generation. It interleaves with the
     // hand-authored dispatches but cannot be fused with them.
     %sum = arith.addf %product, %arg1 : tensor<?xf32>

     // Second hand-authored dispatch: an in-place `rhs *= lhs` whose result is
     // tied to the %sum operand. Like the first dispatch it uses the
     // @executable::@variant::@export form, which pins a specific variant
     // rather than relying on automatic selection. Frontends can use that for
     // user-controlled overrides or custom selection logic based on runtime
     // parameters, though automatic selection is generally preferable.
     //
     // hal.interface.bindings is omitted so the bindings are inferred. That
     // only works when the named variant defines a pipeline layout or when all
     // variants share the same pipeline layout.
     %result = flow.dispatch @executable::@x86_64::@simple_mul_inplace[%len](%len_i32, %product, %sum) : (i32, tensor<?xf32>{%len}, tensor<?xf32>{%len}) -> %sum{%len}

     // CHECK: 8xf32=96 96 96 96 96 96 96 96
     return %result : tensor<?xf32>
   }

 }  // module
	// RUN: iree-compile %s \
	// RUN:     --iree-hal-executable-object-search-path=$IREE_BINARY_DIR | \
	// RUN: iree-run-module \
	// RUN:     --device=local-sync \
	// RUN:     --module=- \
	// RUN:     --function=mixed_invocation \
	// RUN:     --input=8xf32=2 \
	// RUN:     --input=8xf32=4 | \
	// RUN: FileCheck %s

	// This example demonstrates authoring and dispatching retargetable executables
	// from the IREE `hal` dialect layer. This allows for target-specific code to
	// be written - including unique calls for each target - as the executable
	// variants are manually specified. The example_stream.mlir example shows
	// how, where possible, executable variant generation can be left to the
	// compiler.
	//
	// Enabling this at the HAL layer allows for codegen backends translating
	// executable variants to make local decisions about which external calls to
	// make and where the objects come from to provide those functions. Since
	// objects can be embedded in the IR it's possible for the backends to even
	// generate them on-demand for embedding (such as precompiling/JITing).

	// The configuration used for executable compilation.
	// This lets the compiler and runtime know the format and requirements of the
	// executable binaries produced and multiple variants with differing formats
	// and compilation options (architectures, etc) can be embedded for runtime
	// selection. By fully specifying the targets here we can target multiple
	// architectures and it's always possible to embed these instead of using the
	// coarse command line compiler flags that only set single targets.
	//
	// To avoid too much boilerplate this example only shows a single target. See
	// example_stream.mlir for an example with multi-targeting as there's less
	// boilerplate required at that level.
	// Executable target for the `llvm-cpu` backend emitting the
	// `embedded-elf-x86_64` format; the dictionary carries the LLVM-style triple
	// and data layout plus the native vector size used for this target.
	#x86_64_target = #hal.executable.target<
	  "llvm-cpu", "embedded-elf-x86_64", {
	    target_triple = "x86_64-none-elf",
	    native_vector_size = 32 : index,
	    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
	  }
	>

	// The target devices that the program will run on.
	// These can come from compiler flags and multiple targets can be supported.
	// It's possible, for example, to support targeting multiple devices in the same
	// compiled binary (CPU + Vulkan, etc).
	// Device target for `llvm-cpu`; its only executable target is the x86-64
	// configuration referenced by alias.
	#cpu_target = #hal.device.target<"llvm-cpu", {executable_targets = [#x86_64_target]}>

	module @example attributes {hal.device.targets = [#cpu_target]} {

	// Executable containing exported shims and calls to external functions.
	// Each executable can contain multiple exported functions and variants for
	// different architectures or even devices. It's also possible to mix hand-
	// authored functions with code generated ones even for the same functions
	// such that code generation is used as a fallback when the hand-authored
	// kernels aren't supported at runtime.
	hal.executable private @executable {

	// Variant linking in an x86-64 object file containing external functions.
	hal.executable.variant public @x86_64, target = #x86_64_target, objects = [
	// Object files linked into the executable.
	// These object files are linked into the dynamic library and must meet
	// the requirements for embedded ELF linkage (no TLS, no globals, no
	// syscalls, no libc, etc).
	#hal.executable.object<{
	// Referencing a file path on disk but could also have the data
	// embedded in order to make the MLIR file hermetic/portable across
	// compilation pipelines. In the future we'll likely use MLIR's
	// external resource functionality for this. By allowing for the
	// objects to be embedded we can support JIT scenarios where some
	// layer higher or lower may be emitting the objects to link in as
	// part of the overall compilation.
	path = "samples/custom_dispatch/cpu/embedded/functions_x86_64.o"
	}>
	] {

	// TODO(benvanik): demonstrate hal.executable.constant.block for
	// specialization via host logic and hal.executable.constant.load for
	// referencing them in the shims.

	// Exported shim function calling the C `simple_mul_workgroup` function.
	// The ordinal must be assigned by the user and unique for the executable.
	// The layout defines the required bindings and push constants and can be
	// thought of as the function signature.
	hal.executable.export public @simple_mul ordinal(0) layout(
	    #hal.pipeline.layout<push_constants = 1, sets = [
	      <0, bindings = [
	        <0, storage_buffer, ReadOnly>,
	        <1, storage_buffer, ReadOnly>,
	        <2, storage_buffer>
	      ]>
	    ]>
	) {
	^bb0(%device: !hal.device, %workload: index):
	  // Workgroup-count region, evaluated on the host at dispatch time to
	  // produce the X/Y/Z grid. %device is available for querying
	  // capabilities and limits (unused here); the remaining arguments are
	  // the workload values supplied by the dispatch op.
	  //
	  // The grid is ceil(workload / 64) workgroups along X and a single
	  // workgroup along both Y and Z.
	  %one = arith.constant 1 : index
	  %groups_x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
	  hal.return %groups_x, %one, %one : index, index, index
	}

	// Similar to the above but in-place by using a read/write binding.
	hal.executable.export public @simple_mul_inplace ordinal(1) layout(
	    #hal.pipeline.layout<push_constants = 1, sets = [
	      <0, bindings = [
	        <0, storage_buffer, ReadOnly>,
	        <1, storage_buffer>
	      ]>
	    ]>
	) {
	^bb0(%device: !hal.device, %workload: index):
	  // Same grid computation as the out-of-place export:
	  // ceil(workload / 64) workgroups along X, one along Y and Z.
	  %one = arith.constant 1 : index
	  %groups_x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
	  hal.return %groups_x, %one, %one : index, index, index
	}

	// On the CPU side we use shims here to marshal across the ABI. This
	// allows us to hide the implementation details of how the runtime calls
	// into functions and call out to C functions that don't need to link
	// against the runtime. We could probably come up with ways of automating
	// this but that's mostly left as an exercise to the frontends that may be
	// producing this IR for input to the IREE compiler as each may have its
	// own quirks.
	builtin.module {
	// External function declaration using a user-chosen calling convention.
	// NOTE: MLIR->LLVM conversion expands each memref to a tuple and
	// there's currently no way to change that behavior.
	// Each memref becomes:
	// (%base_ptr: !llvm.ptr<f32>, %aligned_ptr: !llvm.ptr<f32>,
	// %offset: i64, %size: i64, %stride: i64)
	// That results in the following llvm.func:
	// (!llvm.ptr<f32>, !llvm.ptr<f32>, i64, i64, i64, // binding0
	// !llvm.ptr<f32>, !llvm.ptr<f32>, i64, i64, i64, // binding1
	// !llvm.ptr<f32>, !llvm.ptr<f32>, i64, i64, i64, // binding2
	// i64, // dim
	// i64) // tid
	// And required external C function:
	// (float*, float*, size_t, size_t, size_t,
	//  float*, float*, size_t, size_t, size_t,
	//  float*, float*, size_t, size_t, size_t,
	//  size_t,
	//  size_t)
	// This is not a good state to be in as we can't then map to external
	// functions that have signatures we don't want to change. Please file
	// upstream MLIR bugs about this behavior and the ability to just pass
	// bare pointers if you care!
	//
	// NOTE: index will convert to i32 when targeting an ABI with 32-bit
	// pointers and i64 otherwise. Use size_t on the C side to allow the
	// same source code to work when compiled in either mode.
	func.func private @simple_mul_workgroup(
	    %binding0: memref<?xf32>,
	    %binding1: memref<?xf32>,
	    %binding2: memref<?xf32>,
	    %dim: index,
	    %tid: index
	) attributes {
	  // Requests static linking of this external function so that its
	  // definition is pulled in from the object file.
	  hal.import.static
	}

	// IREE exported function using a HAL interface.
	// At this layer of the stack all operands have been converted into
	// constants and bindings have been specified.
	func.func @simple_mul() {
	  // This body runs once per workgroup. The X workgroup id is scaled
	  // by 64 (the 64x1x1 workgroup size this example uses, which is very
	  // small for CPUs but convenient for demonstration) and the result
	  // is handed to the external function as its `tid` argument.
	  %group_x = hal.interface.workgroup.id[0] : index
	  %scaled_id = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%group_x]

	  // Primitive operands arrive as push constants. Constant 0 is an
	  // i32 that is widened (unsigned) to index and used both as the
	  // dynamic extent of each binding and as the `dim` call argument.
	  %count_i32 = hal.interface.constant.load[0] : i32
	  %count = arith.index_castui %count_i32 : i32 to index

	  // Bindings are accessed by reference; all three live in set 0 and
	  // are taken at offset 0.
	  %no_offset = arith.constant 0 : index
	  %buf0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%no_offset) : memref<?xf32>{%count}
	  %buf1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%no_offset) : memref<?xf32>{%count}
	  %buf2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%no_offset) : memref<?xf32>{%count}

	  // Hand off to the externally defined C function. The calling
	  // convention is almost plain C; each memref is expanded into
	  // several arguments as described on the declaration.
	  //
	  // TODO: CPU information such as the active architecture and
	  // feature bits is not yet exposed at the HAL level.
	  //
	  // NOTE: the rest of this function is code generated as usual, so
	  // other MLIR ops (control flow, vector ops, linalg, etc) could be
	  // mixed in here to fuse a microkernel with surrounding code.
	  func.call @simple_mul_workgroup(%buf0, %buf1, %buf2, %count, %scaled_id) : (memref<?xf32>, memref<?xf32>, memref<?xf32>, index, index) -> ()

	  return
	}

	func.func private @simple_mul_inplace_workgroup(
	    %binding0: memref<?xf32>,
	    %binding1: memref<?xf32>,
	    %dim: index,
	    %tid: index
	) attributes {
	  // External declaration marked for static linking, like the
	  // out-of-place variant but with two bindings instead of three.
	  hal.import.static
	}
	func.func @simple_mul_inplace() {
	  // Per-workgroup value passed as `tid`: the X workgroup id times 64.
	  %group_x = hal.interface.workgroup.id[0] : index
	  %scaled_id = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%group_x]

	  // Push constant 0 carries the dynamic extent as an i32.
	  %count_i32 = hal.interface.constant.load[0] : i32
	  %count = arith.index_castui %count_i32 : i32 to index

	  // Two bindings only: binding 0 is declared ReadOnly in the pipeline
	  // layout while binding 1 is treated as read/write.
	  %no_offset = arith.constant 0 : index
	  %buf0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%no_offset) : memref<?xf32>{%count}
	  %buf1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%no_offset) : memref<?xf32>{%count}

	  func.call @simple_mul_inplace_workgroup(%buf0, %buf1, %count, %scaled_id) : (memref<?xf32>, memref<?xf32>, index, index) -> ()

	  return
	}
	}

	} // hal.executable.variant

	} // hal.executable

	// Function demonstrating a few hand-authored dispatches mixed with codegen.
	// Invoke with:
	// --device=local-sync
	// --function=mixed_invocation
	// --input=8xf32=2
	// --input=8xf32=4
	// CHECK-LABEL: EXEC @mixed_invocation
	func.func @mixed_invocation(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
	  // A dispatch only sees the values passed to it as operands, so the
	  // dynamic extent of %arg0 is queried here and forwarded explicitly.
	  //
	  // HACK: primitive operands of hand-authored kernels must be i32 or a
	  // bit-castable type. Packing of other types is done by
	  // PackDispatchOperandsPass, which changes the ABI and so cannot currently
	  // be used with external functions. Until the ABI is better defined, users
	  // pack wider values themselves (e.g. splitting an i64 into two i32s).
	  %axis0 = arith.constant 0 : index
	  %len = tensor.dim %arg0, %axis0 : tensor<?xf32>
	  %len_i32 = arith.index_cast %len : index to i32

	  // First hand-authored dispatch: an out-of-place `ret = lhs * rhs` through
	  // the external function, with %len as the workload.
	  %product = flow.dispatch @executable::@x86_64::@simple_mul[%len](%len_i32, %arg0, %arg1) {
	    // HACK: keeps the executable live through DCE. Described as only being
	    // required with automatic variant selection.
	    // NOTE(review): this dispatch names the @x86_64 variant explicitly -
	    // confirm whether the attribute is still needed here.
	    // TODO(benvanik): automatically add this when required.
	    hal.executable.ref = [@executable],
	    // Bindings are normally inferred as part of the ABI. They are spelled
	    // out here to show how tensor operands/results can be mapped onto
	    // pipeline layout (set, binding) pairs by hand, which is what sparse
	    // bindings or multiple descriptor sets would require.
	    hal.interface.bindings = [
	      #hal.interface.binding<0, 0>,
	      #hal.interface.binding<0, 1>,
	      #hal.interface.binding<0, 2>
	    ]
	  } : (i32, tensor<?xf32>{%len}, tensor<?xf32>{%len}) -> tensor<?xf32>{%len}

	  // An op left to normal code generation. It interleaves with the
	  // hand-authored dispatches but cannot be fused with them.
	  %sum = arith.addf %product, %arg1 : tensor<?xf32>

	  // Second hand-authored dispatch: an in-place `rhs *= lhs` whose result is
	  // tied to the %sum operand. Like the first dispatch it uses the
	  // @executable::@variant::@export form, which pins a specific variant
	  // rather than relying on automatic selection. Frontends can use that for
	  // user-controlled overrides or custom selection logic based on runtime
	  // parameters, though automatic selection is generally preferable.
	  //
	  // hal.interface.bindings is omitted so the bindings are inferred. That
	  // only works when the named variant defines a pipeline layout or when all
	  // variants share the same pipeline layout.
	  %result = flow.dispatch @executable::@x86_64::@simple_mul_inplace[%len](%len_i32, %product, %sum) : (i32, tensor<?xf32>{%len}, tensor<?xf32>{%len}) -> %sum{%len}

	  // CHECK: 8xf32=96 96 96 96 96 96 96 96
	  return %result : tensor<?xf32>
	}

	} // module