| func.func @ukernel_example() { |
| %s0 = arith.constant dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]> : tensor<10xf32> |
| %s1 = arith.constant dense<[0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0]> : tensor<10xf32> |
| %arg0 = util.optimization_barrier %s0 : tensor<10xf32> |
| %arg1 = util.optimization_barrier %s1 : tensor<10xf32> |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %c3 = arith.constant 3 : index |
| %dest = tensor.empty() : tensor<10xf32> |
| // Create a dispatch that uses a workgroup size of 4. |
| // `flow.dispatch.region` to capture the values needed to specify the number |
| // of threads to use in the `count` region. |
| %0 = flow.dispatch.region[] -> (tensor<10xf32>) |
| // Use `#iree_codegen.export_config` to specify control over the execution. Currently |
| // the workgroup size/block size. |
| // Note: The name "iree_codegen.export_config" is also important for it to be |
| // propagated through the compiler. |
| attributes {iree_codegen.export_config = #iree_codegen.export_config<workgroup_size = [4]>} { |
| %id = flow.dispatch.workgroup.id[0] : index |
| %count = flow.dispatch.workgroup.count[0] : index |
| |
| // Compute the offset and size of the slice |
| %offset = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%id] |
| %size = affine.min affine_map<(d0)[] -> (4, 10 - d0)>(%offset)[] |
| |
| // Extract slices of the inputs and outputs. |
| %1 = tensor.extract_slice %arg0[%offset] [%size] [1] : tensor<10xf32> to tensor<?xf32> |
| %2 = tensor.extract_slice %arg1[%offset] [%size] [1] : tensor<10xf32> to tensor<?xf32> |
| %3 = tensor.extract_slice %dest[%offset] [%size] [1] : tensor<10xf32> to tensor<?xf32> |
| |
| // Invoke the ukernel. |
| %4 = iree_codegen.ukernel.generic "simple_mul_workgroup" |
| ins(%1, %2 : tensor<?xf32>, tensor<?xf32>) |
| outs(%3 : tensor<?xf32>) |
| (%size : index) |
| // Set the operation to not incorporate any strides. The implementation |
| // expects no stride arguments. |
| strided_dims([[], [], []]) -> tensor<?xf32> |
| |
| // Insert the result back into the result at the right position. |
| %5 = tensor.insert_slice %4 into %dest[%offset] [%size] [1] : tensor<?xf32> into tensor<10xf32> |
| flow.return %5 : tensor<10xf32> |
| } count() -> (index, index, index) { |
| flow.return %c3, %c1, %c1 : index, index, index |
| } |
| check.expect_almost_eq_const(%0, dense<[0.0, 2.0, 8.0, 18.0, 32.0, 50.0, 72.0, 98.0, 128.0, 162.0]> : tensor<10xf32>) : tensor<10xf32> |
| return |
| } |