// RUN: iree-compile %s \
// RUN:     --iree-hal-executable-object-search-path=$IREE_BINARY_DIR | \
// RUN: iree-run-module \
// RUN:     --device=cuda \
// RUN:     --module=- \
// RUN:     --function=mixed_invocation \
// RUN:     --input=8xf32=2 \
// RUN:     --input=8xf32=4 | \
// RUN: FileCheck %s
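
// Note: the kernels_sm_*.ptx files referenced below are assumed to be built
// ahead of time from hand-authored CUDA source (e.g. a kernels.cu next to
// this file; the source file name and build setup here are assumptions):
//   nvcc --ptx -arch=sm_52 kernels.cu -o kernels_sm_52.ptx
//   nvcc --ptx -arch=sm_80 kernels.cu -o kernels_sm_80.ptx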

// The configurations used for executable compilation.
// This lets the compiler and runtime know the format and requirements of the
// executable binaries produced, and multiple variants with differing formats
// and compilation options (architectures, etc.) can be embedded for runtime
// selection.
#nvptx_sm_52_target = #hal.executable.target<"cuda", "cuda-nvptx-fb", {
  target_arch = "sm_52"
}>
#nvptx_sm_80_target = #hal.executable.target<"cuda", "cuda-nvptx-fb", {
  target_arch = "sm_80"
}>

// The target devices that the program will run on.
// These can come from compiler flags, and multiple targets can be supported;
// it's possible, for example, to target multiple devices in the same
// compiled binary.
#cuda_target = #hal.device.target<"cuda", [
  #nvptx_sm_52_target,
  #nvptx_sm_80_target
]> : !hal.device

module @example attributes {hal.device.targets = [#cuda_target]} {

  // Executable containing hand-authored kernels.
  // Each executable can contain multiple exported functions and variants for
  // different architectures or even devices. It's also possible to mix
  // hand-authored functions with code-generated ones, even for the same
  // function, so that code generation is used as a fallback when the
  // hand-authored kernels aren't supported at runtime.
  hal.executable.source private @executable attributes {
    // Object files linked into the executable per-target.
    // Certain backends (today) support either wholesale definition or linking
    // of partial objects for imports used by generated code. Each compilation
    // target can have its own unique set of objects to link in, and the
    // target keys can be generic. This allows an object file to be linked in
    // based only on the target triple, while more specialized objects
    // requiring certain CPU features can be included only when building for
    // those.
    objects = #hal.executable.objects<{
      #nvptx_sm_52_target = [
        #hal.executable.object<{
          // Referencing a file path on disk, though the data could also be
          // embedded in order to make the MLIR file hermetic/portable across
          // compilation pipelines. In the future we'll likely use MLIR's
          // external resource functionality for this. By allowing the objects
          // to be embedded we can support JIT scenarios where some layer
          // higher or lower may be emitting the objects to link in as part
          // of the overall compilation.
          path = "samples/custom_dispatch/cuda/kernels/kernels_sm_52.ptx"
        }>
      ],
      #nvptx_sm_80_target = [
        #hal.executable.object<{
          path = "samples/custom_dispatch/cuda/kernels/kernels_sm_80.ptx"
        }>
      ]
    }>
  } {

    // TODO(benvanik): demonstrate hal.executable.constant.block for
    // specialization via host logic. Maps to a read-only buffer passed into
    // kernels. CUDA doesn't yet have these wired up.

    // Exported function with the C name `simple_mul`.
    // The ordinal must be assigned by the user and must be unique within the
    // executable. The layout defines the required bindings and push constants
    // and can be thought of as the function signature.
    hal.executable.export public @simple_mul ordinal(0)
        layout(#hal.pipeline.layout<constants = 1, bindings = [
          #hal.pipeline.binding<storage_buffer, ReadOnly>,
          #hal.pipeline.binding<storage_buffer, ReadOnly>,
          #hal.pipeline.binding<storage_buffer>
        ]>) attributes {
          // Certain backends (like CUDA) require a workgroup size (aka block
          // size) to be defined ahead of time.
          workgroup_size = [64 : index, 1 : index, 1 : index]
        } {
    ^bb0(%device: !hal.device, %workload: index):
      // This host function is used to compute the XYZ workgroup count
      // dispatched at runtime. It can query the %device for capabilities
      // and limits (shared memory size, etc.). The other arguments are the
      // values passed in the dispatch operation (usually things like root
      // output op tensor dimensions and other abstract values).
      %x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
      %c1 = arith.constant 1 : index
      hal.return %x, %c1, %c1 : index, index, index
    }
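
    // For reference, a matching hand-authored kernel might look like the
    // following CUDA C (a sketch only; the parameter names are assumptions,
    // and the parameter order - bindings first in declaration order, then
    // i32 push constants - is assumed from the layout above):
    //
    //   extern "C" __global__ void simple_mul(
    //       const float* __restrict__ lhs,  // binding 0 (ReadOnly)
    //       const float* __restrict__ rhs,  // binding 1 (ReadOnly)
    //       float* __restrict__ out,        // binding 2
    //       int dim) {                      // push constant 0
    //     int tid = blockIdx.x * blockDim.x + threadIdx.x;
    //     if (tid < dim) out[tid] = lhs[tid] * rhs[tid];
    //   }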

    // Similar to the above but in-place by using a read/write binding.
    hal.executable.export public @simple_mul_inplace ordinal(1)
        layout(#hal.pipeline.layout<constants = 1, bindings = [
          #hal.pipeline.binding<storage_buffer, ReadOnly>,
          #hal.pipeline.binding<storage_buffer>
        ]>) attributes {
          workgroup_size = [64 : index, 1 : index, 1 : index]
        } {
    ^bb0(%device: !hal.device, %workload: index):
      %x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
      %c1 = arith.constant 1 : index
      hal.return %x, %c1, %c1 : index, index, index
    }
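
    // Under the same assumptions as the sketch above, the in-place variant
    // would take one read-only input and one buffer that is both read and
    // written:
    //
    //   extern "C" __global__ void simple_mul_inplace(
    //       const float* __restrict__ lhs, float* __restrict__ rhs_out,
    //       int dim) {
    //     int tid = blockIdx.x * blockDim.x + threadIdx.x;
    //     if (tid < dim) rhs_out[tid] *= lhs[tid];
    //   }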

  }  // hal.executable.source

  // Function demonstrating a few hand-authored dispatches mixed with codegen.
  // Invoke with:
  //  --device=cuda
  //  --function=mixed_invocation
  //  --input=8xf32=2
  //  --input=8xf32=4
  // CHECK-LABEL: EXEC @mixed_invocation
  func.func @mixed_invocation(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
    // HACK: for hand-authored kernels, all primitive values passed in need to
    // be i32 or a bit-castable type. This is because ABI packing of other
    // types happens inside the PackDispatchOperandsPass, which is currently
    // not usable with external functions as it changes the ABI. In the future
    // we can better define the ABI such that it's possible to match the
    // compiler's expectations around padding/alignment. For now users must do
    // the packing themselves (splitting an i64 into i32+i32, etc.), as
    // sketched below.
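    // For example (a sketch, not part of this sample), an i64 value %v would
    // be split into two i32 halves before being passed to the dispatch:
    //   %lo = arith.trunci %v : i64 to i32
    //   %c32 = arith.constant 32 : i64
    //   %hi64 = arith.shrui %v, %c32 : i64
    //   %hi = arith.trunci %hi64 : i64 to i32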
    %c0 = arith.constant 0 : index
    %dim = tensor.dim %arg0, %c0 : tensor<?xf32>
    %dim_i32 = arith.index_cast %dim : index to i32

    // Dispatch a basic `ret = lhs * rhs` kernel.
    %0 = flow.dispatch @executable::@simple_mul[%dim](%dim_i32, %arg0, %arg1) : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> tensor<?xf32>{%dim}
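    // Note that the `[%dim]` workload is what reaches the export's workgroup
    // count region (^bb0 above), while `tensor<?xf32>{%dim}` binds each
    // tensor's dynamic dimension.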

    // Code-generate some other ops - these will interleave with the
    // hand-authored ones but naturally won't be able to fuse with them.
    %1 = arith.addf %0, %arg1 : tensor<?xf32>

    // Dispatch an in-place `rhs *= lhs` kernel.
    %2 = flow.dispatch @executable::@simple_mul_inplace[%dim](%dim_i32, %0, %1) : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> %1{%dim}

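    // Note: `-> %1{%dim}` ties the result to the %1 operand so the dispatch
    // writes in-place. With the example inputs of 2 and 4: simple_mul gives
    // 2*4 = 8, addf gives 8+4 = 12, and simple_mul_inplace gives 8*12 = 96,
    // matching the CHECK below.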
    // CHECK: 8xf32=96 96 96 96 96 96 96 96
    return %2 : tensor<?xf32>
  }

}  // module