samples/custom_dispatch/cpu/plugin/standalone_example.mlir - 3p/openxla/iree - Git at Google

 // This example demonstrates calling dynamically imported functions in the
 // runtime. Alternatively the functions can be embedded into the compiled IREE
 // programs for hermetic deployment (see custom_dispatch/cpu/embedded/).

 // NOTE: this file is identical to system_example.mlir besides the lit config
 // controlling the iree-run-module flag.
 // TODO(benvanik): find a way to share the files (environment variables saying
 // what types to run, etc).

 // RUN: iree-compile --iree-hal-target-backends=llvm-cpu %s | \
 // RUN: iree-run-module \
 // RUN:     --device=local-sync \
 // RUN:     --executable_plugin=$IREE_BINARY_DIR/samples/custom_dispatch/cpu/plugin/standalone_plugin.sos \
 // RUN:     --module=- \
 // RUN:     --function=mixed_invocation \
 // RUN:     --input=8xf32=2 \
 // RUN:     --input=8xf32=4 | \
 // RUN: FileCheck %s --check-prefix=CHECK-STANDALONE

 // CHECK-STANDALONE: EXEC @mixed_invocation
 // CHECK-STANDALONE: 8xf32=12 12 12 12 12 12 12 12

 module @example {

   // Executable containing exported shims and calls to external functions.
   // Each executable can contain multiple exported functions and variants for
   // different architectures or even devices. It's also possible to mix hand-
   // authored functions with code generated ones even for the same functions
   // such that code generation is used as a fallback when the hand-authored
   // kernels aren't supported at runtime.
   stream.executable private @executable {
     // Export entry for @simple_mul. The region computes the XYZ workgroup
     // count used when the export is dispatched. Only the values passed in the
     // dispatch operation are in scope here; in this file that is the single
     // %workload index.
     stream.executable.export public @simple_mul workgroups(%workload: index) -> (index, index, index) {
       // One workgroup per 64-element tile along X (rounded up so a trailing
       // partial tile still gets a workgroup); Y and Z are fixed at 1.
       %x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
       %c1 = arith.constant 1 : index
       stream.return %x, %c1, %c1 : index, index, index
     }

     builtin.module {
       // Declaration of the external function called from @simple_mul below.
       // The calling convention is user-chosen: for each of the three bindings
       // a rank-0 memref base and its offset are passed, followed by the
       // dynamic dimension (%dim) and the first element index of the tile the
       // call should process (%tid).
       func.func private @simple_mul_workgroup(
             %binding0: memref<f32>,
             %binding0_offset : index,
             %binding1: memref<f32>,
             %binding1_offset : index,
             %binding2: memref<f32>,
             %binding2_offset : index,
             %dim: index, %tid: index) attributes {
         // Extra fields requested on the parameters struct: which processor
         // is executing the call and its data fields as defined by
         // runtime/src/iree/schemas/cpu_data.h.
         hal.import.fields = ["processor_id", "processor_data"],
         // NOTE(review): presumably makes the memref<f32> arguments lower to
         // bare pointers instead of full memref descriptors - confirm against
         // the plugin's parameter struct.
         llvm.bareptr = true
       }

       // IREE exported function using stream bindings and operands.
       // Compiler passes are able to optimize across this interface and
       // deduplicate bindings/operands, convert/pack operands, and inline
       // constant operands.
       //
       // Operands:
       //   %binding0, %binding1, %binding2: buffers viewed below as 1-D f32
       //       memrefs of %dim elements.
       //   %dim: dynamic element count shared by all three bindings.
       func.func @simple_mul(
           %binding0: !stream.binding,
           %binding1: !stream.binding,
           %binding2: !stream.binding,
           %dim: index) {
         %c0 = arith.constant 0 : index

         // This function is invoked once per workgroup, so determine which
         // tile this invocation owns. The workgroup count above is computed
         // with 64 elements per workgroup (exceedingly small for CPUs but
         // useful for demonstration), so the first element of the tile is
         // workgroup_id_x * 64.
         %workgroup_id_x = stream.dispatch.workgroup.id[0] : index
         %tid = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]

         // Bindings are accessed by reference; each is viewed from byte
         // offset 0 as a dynamically sized 1-D f32 memref.
         %memref0 = stream.binding.subspan %binding0[%c0] : !stream.binding -> memref<?xf32>{%dim}
         %memref1 = stream.binding.subspan %binding1[%c0] : !stream.binding -> memref<?xf32>{%dim}
         %memref2 = stream.binding.subspan %binding2[%c0] : !stream.binding -> memref<?xf32>{%dim}

         // The default `memref` lowering carries additional fields that are
         // not always required. Only the base and offset of each memref are
         // needed here; the size and stride results are intentionally unused.
         %base0, %offset0, %size0, %stride0 = memref.extract_strided_metadata %memref0
             : memref<?xf32> -> memref<f32>, index, index, index
         %base1, %offset1, %size1, %stride1 = memref.extract_strided_metadata %memref1
             : memref<?xf32> -> memref<f32>, index, index, index
         %base2, %offset2, %size2, %stride2 = memref.extract_strided_metadata %memref2
             : memref<?xf32> -> memref<f32>, index, index, index

         // Call the externally defined function; it is fetched at runtime
         // from the plugin binary. The trailing operand is %tid (the tile's
         // first element index), matching the `%tid` parameter of the
         // declaration above. Passing the raw workgroup id would only be
         // equivalent for workgroup 0 and would make every other workgroup
         // start at the wrong element.
         func.call @simple_mul_workgroup(
             %base0, %offset0, %base1, %offset1, %base2, %offset2, %dim, %tid)
             : (memref<f32>, index, memref<f32>, index, memref<f32>, index, index, index) -> ()

         // NOTE: this is code generated as normal - other MLIR ops can be used
         // here for looping/control flow, vector operations, linalg, etc.
         // This simple sample only calls out to the external function, but
         // microkernels fused with other code are possible.

         return
       }
     }
   }

   // Function demonstrating executable plugins and mixing plugins and codegen.
   // Invoke with:
   //  --device=local-sync
   //  --executable_plugin=standalone_plugin.sos
   //  --function=mixed_invocation
   //  --input=8xf32=2
   //  --input=8xf32=4
   func.func @mixed_invocation(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
     // Operands are the only metadata visible to the dispatch from outside,
     // so read the dynamic extent of %arg0 along dimension 0 and forward it.
     %axis0 = arith.constant 0 : index
     %n = tensor.dim %arg0, %axis0 : tensor<?xf32>

     // `ret = lhs * rhs` through the externally implemented export. Using the
     // @executable::@export form lets variant selection happen automatically
     // when multi-targeting. %n serves as the workload, as the dynamic size of
     // every tensor, and as the trailing index operand.
     %product = flow.dispatch @executable::@simple_mul[%n](%arg0, %arg1, %n)
         : (tensor<?xf32>{%n}, tensor<?xf32>{%n}, index) -> tensor<?xf32>{%n}

     // A regular code-generated op: it interleaves with the hand-authored
     // dispatch above but cannot be fused with it.
     %sum = arith.addf %product, %arg1 : tensor<?xf32>

     return %sum : tensor<?xf32>
   }

 }  // module
	// This example demonstrates calling dynamically imported functions in the
	// runtime. Alternatively the functions can be embedded into the compiled IREE
	// programs for hermetic deployment (see custom_dispatch/cpu/embedded/).

	// NOTE: this file is identical to system_example.mlir besides the lit config
	// controlling the iree-run-module flag.
	// TODO(benvanik): find a way to share the files (environment variables saying
	// what types to run, etc).

	// RUN: iree-compile --iree-hal-target-backends=llvm-cpu %s | \
	// RUN: iree-run-module \
	// RUN: --device=local-sync \
	// RUN: --executable_plugin=$IREE_BINARY_DIR/samples/custom_dispatch/cpu/plugin/standalone_plugin.sos \
	// RUN: --module=- \
	// RUN: --function=mixed_invocation \
	// RUN: --input=8xf32=2 \
	// RUN: --input=8xf32=4 | \
	// RUN: FileCheck %s --check-prefix=CHECK-STANDALONE

	// CHECK-STANDALONE: EXEC @mixed_invocation
	// CHECK-STANDALONE: 8xf32=12 12 12 12 12 12 12 12

	module @example {

	// Executable containing exported shims and calls to external functions.
	// Each executable can contain multiple exported functions and variants for
	// different architectures or even devices. It's also possible to mix hand-
	// authored functions with code generated ones even for the same functions
	// such that code generation is used as a fallback when the hand-authored
	// kernels aren't supported at runtime.
	stream.executable private @executable {
	  // Export entry for @simple_mul. The region computes the XYZ workgroup
	  // count used when the export is dispatched. Only the values passed in the
	  // dispatch operation are in scope here; in this file that is the single
	  // %workload index.
	  stream.executable.export public @simple_mul workgroups(%workload: index) -> (index, index, index) {
	    // One workgroup per 64-element tile along X (rounded up so a trailing
	    // partial tile still gets a workgroup); Y and Z are fixed at 1.
	    %x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
	    %c1 = arith.constant 1 : index
	    stream.return %x, %c1, %c1 : index, index, index
	  }

	  builtin.module {
	    // Declaration of the external function called from @simple_mul below.
	    // The calling convention is user-chosen: for each of the three bindings
	    // a rank-0 memref base and its offset are passed, followed by the
	    // dynamic dimension (%dim) and the first element index of the tile the
	    // call should process (%tid).
	    func.func private @simple_mul_workgroup(
	          %binding0: memref<f32>,
	          %binding0_offset : index,
	          %binding1: memref<f32>,
	          %binding1_offset : index,
	          %binding2: memref<f32>,
	          %binding2_offset : index,
	          %dim: index, %tid: index) attributes {
	      // Extra fields requested on the parameters struct: which processor
	      // is executing the call and its data fields as defined by
	      // runtime/src/iree/schemas/cpu_data.h.
	      hal.import.fields = ["processor_id", "processor_data"],
	      // NOTE(review): presumably makes the memref<f32> arguments lower to
	      // bare pointers instead of full memref descriptors - confirm against
	      // the plugin's parameter struct.
	      llvm.bareptr = true
	    }

	    // IREE exported function using stream bindings and operands.
	    // Compiler passes are able to optimize across this interface and
	    // deduplicate bindings/operands, convert/pack operands, and inline
	    // constant operands.
	    //
	    // Operands:
	    //   %binding0, %binding1, %binding2: buffers viewed below as 1-D f32
	    //       memrefs of %dim elements.
	    //   %dim: dynamic element count shared by all three bindings.
	    func.func @simple_mul(
	        %binding0: !stream.binding,
	        %binding1: !stream.binding,
	        %binding2: !stream.binding,
	        %dim: index) {
	      %c0 = arith.constant 0 : index

	      // This function is invoked once per workgroup, so determine which
	      // tile this invocation owns. The workgroup count above is computed
	      // with 64 elements per workgroup (exceedingly small for CPUs but
	      // useful for demonstration), so the first element of the tile is
	      // workgroup_id_x * 64.
	      %workgroup_id_x = stream.dispatch.workgroup.id[0] : index
	      %tid = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]

	      // Bindings are accessed by reference; each is viewed from byte
	      // offset 0 as a dynamically sized 1-D f32 memref.
	      %memref0 = stream.binding.subspan %binding0[%c0] : !stream.binding -> memref<?xf32>{%dim}
	      %memref1 = stream.binding.subspan %binding1[%c0] : !stream.binding -> memref<?xf32>{%dim}
	      %memref2 = stream.binding.subspan %binding2[%c0] : !stream.binding -> memref<?xf32>{%dim}

	      // The default `memref` lowering carries additional fields that are
	      // not always required. Only the base and offset of each memref are
	      // needed here; the size and stride results are intentionally unused.
	      %base0, %offset0, %size0, %stride0 = memref.extract_strided_metadata %memref0
	          : memref<?xf32> -> memref<f32>, index, index, index
	      %base1, %offset1, %size1, %stride1 = memref.extract_strided_metadata %memref1
	          : memref<?xf32> -> memref<f32>, index, index, index
	      %base2, %offset2, %size2, %stride2 = memref.extract_strided_metadata %memref2
	          : memref<?xf32> -> memref<f32>, index, index, index

	      // Call the externally defined function; it is fetched at runtime
	      // from the plugin binary. The trailing operand is %tid (the tile's
	      // first element index), matching the `%tid` parameter of the
	      // declaration above. Passing the raw workgroup id would only be
	      // equivalent for workgroup 0 and would make every other workgroup
	      // start at the wrong element.
	      func.call @simple_mul_workgroup(
	          %base0, %offset0, %base1, %offset1, %base2, %offset2, %dim, %tid)
	          : (memref<f32>, index, memref<f32>, index, memref<f32>, index, index, index) -> ()

	      // NOTE: this is code generated as normal - other MLIR ops can be used
	      // here for looping/control flow, vector operations, linalg, etc.
	      // This simple sample only calls out to the external function, but
	      // microkernels fused with other code are possible.

	      return
	    }
	  }
	}

	// Function demonstrating executable plugins and mixing plugins and codegen.
	// Invoke with:
	// --device=local-sync
	// --executable_plugin=standalone_plugin.sos
	// --function=mixed_invocation
	// --input=8xf32=2
	// --input=8xf32=4
	func.func @mixed_invocation(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
	  // Operands are the only metadata visible to the dispatch from outside,
	  // so read the dynamic extent of %arg0 along dimension 0 and forward it.
	  %axis0 = arith.constant 0 : index
	  %n = tensor.dim %arg0, %axis0 : tensor<?xf32>

	  // `ret = lhs * rhs` through the externally implemented export. Using the
	  // @executable::@export form lets variant selection happen automatically
	  // when multi-targeting. %n serves as the workload, as the dynamic size of
	  // every tensor, and as the trailing index operand.
	  %product = flow.dispatch @executable::@simple_mul[%n](%arg0, %arg1, %n)
	      : (tensor<?xf32>{%n}, tensor<?xf32>{%n}, index) -> tensor<?xf32>{%n}

	  // A regular code-generated op: it interleaves with the hand-authored
	  // dispatch above but cannot be fused with it.
	  %sum = arith.addf %product, %arg1 : tensor<?xf32>

	  return %sum : tensor<?xf32>
	}

	} // module