blob: 281ebda250edeff9ff0b1856a04f03d7752f9e7e [file] [log] [blame]
// This example demonstrates calling dynamically imported functions in the
// runtime through the use of ukernels. This is calling the same function
// as `system_example.mlir`, but using the `iree_codegen.ukernel.generic`.
// This is an example of how ukernels can be called from code generated
// by IREE.
// RUN: iree-compile --iree-hal-target-backends=llvm-cpu %s | \
// RUN: iree-run-module \
// RUN: --device=local-sync \
// RUN: --executable_plugin=$IREE_BINARY_DIR/samples/custom_dispatch/cpu/plugin/system_plugin$IREE_DYLIB_EXT \
// RUN: --function=ukernel_example \
// RUN: --module=- \
// RUN: --input=8xf32=2 \
// RUN: --input=8xf32=4 | \
// RUN: FileCheck %s --check-prefix=CHECK-SYSTEM
// CHECK-SYSTEM: EXEC @ukernel_example
// simple_mul_workgroup
// CHECK-SYSTEM-DAG: mul[0:0](2 * 4 = 8)
// CHECK-SYSTEM-DAG: mul[0:1](2 * 4 = 8)
// CHECK-SYSTEM-DAG: mul[0:2](2 * 4 = 8)
// CHECK-SYSTEM-DAG: mul[0:3](2 * 4 = 8)
// CHECK-SYSTEM-DAG: mul[1:0](2 * 4 = 8)
// CHECK-SYSTEM-DAG: mul[1:1](2 * 4 = 8)
// CHECK-SYSTEM-DAG: mul[1:2](2 * 4 = 8)
// CHECK-SYSTEM-DAG: mul[1:3](2 * 4 = 8)
// CHECK-SYSTEM: 8xf32=8 8 8 8 8 8 8 8
// Multiplies two dynamically-sized 1-D f32 tensors element-wise by dispatching
// the external `simple_mul_workgroup` ukernel across 2 workgroups.
func.func @ukernel_example(%arg0 : tensor<?xf32>, %arg1 : tensor<?xf32>) -> tensor<?xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%d0 = tensor.dim %arg0, %c0 : tensor<?xf32>
%dest = tensor.empty(%d0) : tensor<?xf32>
// Create a dispatch that operates on `2` threads. Set the `workload` of
// `flow.dispatch.region` to capture the values needed to specify the number
// of threads to use in the `count` region.
%0 = flow.dispatch.region[%c2] -> (tensor<?xf32>{%d0}) {
%id = flow.dispatch.workgroup.id[0] : index
%count = flow.dispatch.workgroup.count[0] : index
// Each thread has to perform a slice of the computation.
// Tile size is ceil(d0 / count) so the whole extent is covered.
%tilesize = affine.apply affine_map<()[s0, s1] -> (s0 ceildiv s1)>()[%d0, %count]
// Compute the offset and size of this thread's slice. The `affine.min`
// clamps the last tile so it does not read past the end of the tensor.
%offset = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%id, %tilesize]
%size = affine.min affine_map<(d0)[s0, s1] -> (s1 - d0, s0)>(%offset)[%tilesize, %d0]
// Extract slices of the inputs and outputs.
%1 = tensor.extract_slice %arg0[%offset] [%size] [1] : tensor<?xf32> to tensor<?xf32>
%2 = tensor.extract_slice %arg1[%offset] [%size] [1] : tensor<?xf32> to tensor<?xf32>
%3 = tensor.extract_slice %dest[%offset] [%size] [1] : tensor<?xf32> to tensor<?xf32>
// Invoke the ukernel, passing the slice size and workgroup id as extra
// scalar operands.
%4 = iree_codegen.ukernel.generic "simple_mul_workgroup"
ins(%1, %2 : tensor<?xf32>, tensor<?xf32>)
outs(%3 : tensor<?xf32>)
(%size, %id : index, index)
// We can include some additional fields on the parameters struct as
// needed. Here we request which processor is executing the call and
// its data fields as defined by runtime/src/iree/schemas/cpu_data.h.
fn_def_attrs {hal.import.fields = ["processor_id", "processor_data"]}
// Set the operation to not incorporate any strides. The implementation
// expects no stride arguments.
strided_outer_dims(0) -> tensor<?xf32>
// Insert the computed slice back into the destination tensor at the
// right position.
%5 = tensor.insert_slice %4 into %dest[%offset] [%size] [1] : tensor<?xf32> into tensor<?xf32>
flow.return %5 : tensor<?xf32>
} count(%b0 : index) -> (index, index, index) {
// Specify the number of threads to use. `%b0` represents the value
// captured as workload (within `[` `]` in the `flow.dispatch.region` above).
// Use that to derive the number of threads to use along `x`, `y` and `z`.
flow.return %b0, %c1, %c1 : index, index, index
}
return %0 : tensor<?xf32>
}