samples/custom_dispatch/vulkan/shaders/example.mlir - 3p/openxla/iree - Git at Google

 // RUN: iree-compile %s \
 // RUN:     --iree-hal-executable-object-search-path=$IREE_BINARY_DIR | \
 // RUN: iree-run-module \
 // RUN:     --device=vulkan \
 // RUN:     --module=- \
 // RUN:     --function=mixed_invocation \
 // RUN:     --input=8xf32=2 \
 // RUN:     --input=8xf32=4 | \
 // RUN: FileCheck %s

 // The configuration used for executable compilation.
 // This lets the compiler and runtime know the format and requirements of the
 // executable binaries produced and multiple variants with differing formats
 // and compilation options (architectures, etc) can be embedded for runtime
 // selection.
 #spirv_target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
   spirv.target_env = #spirv.target_env<
     #spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>,
     #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64>
   >
 }>

 // The target devices that the program will run on.
 // These can come from compiler flags and multiple targets can be supported
 // It's possible, for example, to support targeting multiple devices in the same
 // compiled binary.
 #vulkan_target = #hal.device.target<"vulkan", {
   executable_targets = [#spirv_target]
 }>

 module @example attributes {hal.device.targets = [#vulkan_target]} {

   // Executable containing hand-authored shaders.
   // Though each executable can contain multiple exported functions SPIR-V lacks
   // decent linking support and we model them separately here. In future
   // iterations of this functionality we will allow multiple .spv object files
   // to be linked into a single executable so that we can eliminate some binary
   // size overheads and improve runtime shader compilation performance.
   //
   // TODO(#7824): make export ordinals map to objects so that we can perform
   // our own linking (until some future Vulkan extension is added for pipeline
   // libraries).
   hal.executable.source private @simple_mul attributes {
     // Object files linked into the executable.
     // Certain backends (today) support either wholesale definition or linking
     // of partial objects for imports used by generated code. Each compilation
     // target can have its own unique set of objects to link in and the target
     // keys can be generic. This allows for an object file to be linked in based
     // only on the target triple while allowing for more specialized ones
     // requiring certain CPU features to be only included when building those.
     objects = #hal.executable.objects<{
       #spirv_target = [
         #hal.executable.object<{
           // Referencing a file path on disk but could also have the data
           // embedded in order to make the MLIR file hermetic/portable across
           // compilation pipelines. In the future we'll likely use MLIR's
           // external resource functionality for this. By allowing for the
           // objects to be embedded we can support JIT scenarios where some
           // layer higher or lower may be emitting the objects to link in as
           // part of the overall compilation.
           path = "samples/custom_dispatch/vulkan/shaders/simple_mul.spv"
         }>
       ]
     }>
   } {

     // TODO(benvanik): demonstrate hal.executable.constant.block for
     // specialization via host logic. These map to specialization constants.

     // Exported function with the C name `simple_mul`.
     // The ordinal must be assigned by the user and unique for the executable.
     // The layout defines the required bindings and push constants and can be
     // thought of as the function signature.
     hal.executable.export public @main ordinal(0)
         layout(#hal.pipeline.layout<push_constants = 1, sets = [
           <0, bindings = [
               <0, storage_buffer, ReadOnly>,
               <1, storage_buffer, ReadOnly>,
               <2, storage_buffer>
           ]>
         ]>) {
     ^bb0(%device: !hal.device, %workload: index):
       // This host function is used to compute the XYZ workgroup count
       // dispatched at runtime. It can query the %device for capabilities
       // and limits (shared memory size, etc). The other arguments are the
       // values passed in the dispatch operation (usually things like root
       // output op tensor dimensions and other abstract values).
       %x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
       %c1 = arith.constant 1 : index
       hal.return %x, %c1, %c1 : index, index, index
     }

   }  // hal.executable

   hal.executable.source private @simple_mul_inplace attributes {
     objects = #hal.executable.objects<{
       #spirv_target = [
         #hal.executable.object<{
           path = "samples/custom_dispatch/vulkan/shaders/simple_mul_inplace.spv"
         }>
       ]
     }>
   } {
     // Similar to the above but in-place by using a read/write binding.
     hal.executable.export public @main ordinal(0)
         layout(#hal.pipeline.layout<push_constants = 1, sets = [
           <0, bindings = [
               <0, storage_buffer, ReadOnly>,
               <1, storage_buffer>
           ]>
         ]>) {
     ^bb0(%device: !hal.device, %workload: index):
       %x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
       %c1 = arith.constant 1 : index
       hal.return %x, %c1, %c1 : index, index, index
     }
   }  // hal.executable

   // Function demonstrating a few hand-authored dispatches mixed with codegen.
   // Invoke with:
   //  --device=vulkan
   //  --function=mixed_invocation
   //  --input=8xf32=2
   //  --input=8xf32=4
   // CHECK-LABEL: EXEC @mixed_invocation
   func.func @mixed_invocation(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
     // HACK: for hand-authored shaders all primitive values passed in need to
     // be i32 or a bit-castable type. This is because ABI packing of other types
     // happens inside of the PackDispatchOperandsPass that is currently not
     // usable with external functions as it changes the ABI. In the future we
     // can better define the ABI such that it's possible to match the compiler
     // expectations around padding/alignment. For now users must do the packing
     // themselves (splitting i64 into i32+i32, etc).
     %c0 = arith.constant 0 : index
     %dim = tensor.dim %arg0, %c0 : tensor<?xf32>
     %dim_i32 = arith.index_cast %dim : index to i32

     // Dispatch a basic `ret = lhs * rhs` shader.
     %0 = flow.dispatch @simple_mul::@main[%dim](%dim_i32, %arg0, %arg1) {
       // Bindings are automatically inferred when possible as part of the ABI
       // but can be overridden if the user wants to use features such as sparse
       // bindings or multiple descriptor sets. To do so the
       // `hal.interface.bindings` attribute can be added to a dispatch op as
       // follows mapping tensor operands/results to the pipeline layout
       // sets/bindings:
       hal.interface.bindings = [
         #hal.interface.binding<0, 0>,
         #hal.interface.binding<0, 1>,
         #hal.interface.binding<0, 2>
       ]
     } : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> tensor<?xf32>{%dim}

     // Code gen some other ops - these will interleave with the hand-authored
     // ones but naturally won't be able to fuse with them.
     %1 = arith.addf %0, %arg1 : tensor<?xf32>

     // Dispatch an in-place `rhs *= lhs` shader.
     //
     // Note that we don't declare the hal.interface.bindings and let them be
     // inferred - this only works when either specifying the variant that has
     // a pipeline layout defined or all variants have the same pipeline layouts.
     %2 = flow.dispatch @simple_mul_inplace::@main[%dim](%dim_i32, %0, %1) : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> %1{%dim}

     // CHECK: 8xf32=96 96 96 96 96 96 96 96
     return %2 : tensor<?xf32>
   }

 }  // module
	// RUN: iree-compile %s \
	// RUN: --iree-hal-executable-object-search-path=$IREE_BINARY_DIR \| \
	// RUN: iree-run-module \
	// RUN: --device=vulkan \
	// RUN: --module=- \
	// RUN: --function=mixed_invocation \
	// RUN: --input=8xf32=2 \
	// RUN: --input=8xf32=4 \| \
	// RUN: FileCheck %s

	// The configuration used for executable compilation.
	// This lets the compiler and runtime know the format and requirements of the
	// executable binaries produced and multiple variants with differing formats
	// and compilation options (architectures, etc) can be embedded for runtime
	// selection.
	#spirv_target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
	spirv.target_env = #spirv.target_env<
	#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>,
	#spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64>
	>
	}>

	// The target devices that the program will run on.
	// These can come from compiler flags and multiple targets can be supported
	// It's possible, for example, to support targeting multiple devices in the same
	// compiled binary.
	#vulkan_target = #hal.device.target<"vulkan", {
	executable_targets = [#spirv_target]
	}>

	module @example attributes {hal.device.targets = [#vulkan_target]} {

	// Executable containing hand-authored shaders.
	// Though each executable can contain multiple exported functions SPIR-V lacks
	// decent linking support and we model them separately here. In future
	// iterations of this functionality we will allow multiple .spv object files
	// to be linked into a single executable so that we can eliminate some binary
	// size overheads and improve runtime shader compilation performance.
	//
	// TODO(#7824): make export ordinals map to objects so that we can perform
	// our own linking (until some future Vulkan extension is added for pipeline
	// libraries).
	hal.executable.source private @simple_mul attributes {
	// Object files linked into the executable.
	// Certain backends (today) support either wholesale definition or linking
	// of partial objects for imports used by generated code. Each compilation
	// target can have its own unique set of objects to link in and the target
	// keys can be generic. This allows for an object file to be linked in based
	// only on the target triple while allowing for more specialized ones
	// requiring certain CPU features to be only included when building those.
	objects = #hal.executable.objects<{
	#spirv_target = [
	#hal.executable.object<{
	// Referencing a file path on disk but could also have the data
	// embedded in order to make the MLIR file hermetic/portable across
	// compilation pipelines. In the future we'll likely use MLIR's
	// external resource functionality for this. By allowing for the
	// objects to be embedded we can support JIT scenarios where some
	// layer higher or lower may be emitting the objects to link in as
	// part of the overall compilation.
	path = "samples/custom_dispatch/vulkan/shaders/simple_mul.spv"
	}>
	]
	}>
	} {

	// TODO(benvanik): demonstrate hal.executable.constant.block for
	// specialization via host logic. These map to specialization constants.

	// Exported function with the C name `simple_mul`.
	// The ordinal must be assigned by the user and unique for the executable.
	// The layout defines the required bindings and push constants and can be
	// thought of as the function signature.
	hal.executable.export public @main ordinal(0)
	layout(#hal.pipeline.layout<push_constants = 1, sets = [
	<0, bindings = [
	<0, storage_buffer, ReadOnly>,
	<1, storage_buffer, ReadOnly>,
	<2, storage_buffer>
	]>
	]>) {
	^bb0(%device: !hal.device, %workload: index):
	// This host function is used to compute the XYZ workgroup count
	// dispatched at runtime. It can query the %device for capabilities
	// and limits (shared memory size, etc). The other arguments are the
	// values passed in the dispatch operation (usually things like root
	// output op tensor dimensions and other abstract values).
	%x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
	%c1 = arith.constant 1 : index
	hal.return %x, %c1, %c1 : index, index, index
	}

	} // hal.executable

	hal.executable.source private @simple_mul_inplace attributes {
	objects = #hal.executable.objects<{
	#spirv_target = [
	#hal.executable.object<{
	path = "samples/custom_dispatch/vulkan/shaders/simple_mul_inplace.spv"
	}>
	]
	}>
	} {
	// Similar to the above but in-place by using a read/write binding.
	hal.executable.export public @main ordinal(0)
	layout(#hal.pipeline.layout<push_constants = 1, sets = [
	<0, bindings = [
	<0, storage_buffer, ReadOnly>,
	<1, storage_buffer>
	]>
	]>) {
	^bb0(%device: !hal.device, %workload: index):
	%x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
	%c1 = arith.constant 1 : index
	hal.return %x, %c1, %c1 : index, index, index
	}
	} // hal.executable

	// Function demonstrating a few hand-authored dispatches mixed with codegen.
	// Invoke with:
	// --device=vulkan
	// --function=mixed_invocation
	// --input=8xf32=2
	// --input=8xf32=4
	// CHECK-LABEL: EXEC @mixed_invocation
	func.func @mixed_invocation(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
	// HACK: for hand-authored shaders all primitive values passed in need to
	// be i32 or a bit-castable type. This is because ABI packing of other types
	// happens inside of the PackDispatchOperandsPass that is currently not
	// usable with external functions as it changes the ABI. In the future we
	// can better define the ABI such that it's possible to match the compiler
	// expectations around padding/alignment. For now users must do the packing
	// themselves (splitting i64 into i32+i32, etc).
	%c0 = arith.constant 0 : index
	%dim = tensor.dim %arg0, %c0 : tensor<?xf32>
	%dim_i32 = arith.index_cast %dim : index to i32

	// Dispatch a basic `ret = lhs * rhs` shader.
	%0 = flow.dispatch @simple_mul::@main[%dim](%dim_i32, %arg0, %arg1) {
	// Bindings are automatically inferred when possible as part of the ABI
	// but can be overridden if the user wants to use features such as sparse
	// bindings or multiple descriptor sets. To do so the
	// `hal.interface.bindings` attribute can be added to a dispatch op as
	// follows mapping tensor operands/results to the pipeline layout
	// sets/bindings:
	hal.interface.bindings = [
	#hal.interface.binding<0, 0>,
	#hal.interface.binding<0, 1>,
	#hal.interface.binding<0, 2>
	]
	} : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> tensor<?xf32>{%dim}

	// Code gen some other ops - these will interleave with the hand-authored
	// ones but naturally won't be able to fuse with them.
	%1 = arith.addf %0, %arg1 : tensor<?xf32>

	// Dispatch an in-place `rhs *= lhs` shader.
	//
	// Note that we don't declare the hal.interface.bindings and let them be
	// inferred - this only works when either specifying the variant that has
	// a pipeline layout defined or all variants have the same pipeline layouts.
	%2 = flow.dispatch @simple_mul_inplace::@main[%dim](%dim_i32, %0, %1) : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> %1{%dim}

	// CHECK: 8xf32=96 96 96 96 96 96 96 96
	return %2 : tensor<?xf32>
	}

	} // module