Ben Vanik | dc6f0cd | 2023-11-20 21:36:58 -0800 | [diff] [blame] | 1 | // RUN: (iree-compile --iree-execution-model=async-external --iree-hal-target-backends=vmvx %p/module_a.mlir -o=%t.module_a.vmfb && \ |
| 2 | // RUN: iree-compile --iree-execution-model=async-external --iree-hal-target-backends=vmvx %p/module_b.mlir -o=%t.module_b.vmfb && \ |
| 3 | // RUN: iree-compile --iree-execution-model=async-external --iree-hal-target-backends=vmvx %s | \ |
Ben Vanik | c05323f | 2024-05-21 09:40:28 -0700 | [diff] [blame] | 4 | // RUN: iree-run-module \ |
| 5 | // RUN: --device=local-task \ |
Ben Vanik | dc6f0cd | 2023-11-20 21:36:58 -0800 | [diff] [blame] | 6 | // RUN: --module=%t.module_a.vmfb \ |
| 7 | // RUN: --module=%t.module_b.vmfb \ |
| 8 | // RUN: --module=- --function=run \ |
| 9 | // RUN: --input=4096xf32=-2.0 \ |
| 10 | // RUN: --expected_output=4096xf32=4.0) | \ |
| 11 | // RUN: FileCheck %s |
| 12 | // CHECK: [SUCCESS] |
| 13 | |
| 14 | // Functions declared in external modules - note `module_name.func_name`. |
| 15 | // `abs` will allocate transient memory to pass back the result. |
| 16 | // `mul` will use the provided output memory to produce the result in-place. |
| 17 | // Note that though the returned SSA tensor value shares its storage with the |
| 18 | // `%output` arg the returned value *must* be used to reference the produced |
| 19 | // version of its contents. |
| 20 | // |
| 21 | // In this asynchronous example both functions follow the "coarse-fences" ABI |
| 22 | // model where the compiler inserts a wait and signal fence pair on each call. |
| 23 | // To enable this the modules must be compiled with the |
| 24 | // `--iree-execution-model=async-external` and the external declarations must |
| 25 | // be annotated with the `iree.abi.model` attribute so that the compiler knows |
| 26 | // the calls have the fences. Note that it's possible to have any combination of |
| 27 | // asynchronous and synchronous modules and calls in the same program. |
// External declaration of `abs`, resolved at runtime from `module_a`.
// The callee allocates transient memory for the tensor it returns. The
// `iree.abi.model` attribute tells the compiler that calls to this function use
// the "coarse-fences" ABI (a wait/signal fence pair on each call).
func.func private @module_a.abs(%input: tensor<4096xf32>) -> tensor<4096xf32>
    attributes {iree.abi.model = "coarse-fences"}
// External declaration of `mul`, resolved at runtime from `module_b`.
// `%output` is annotated with `iree.abi.output = 0`, tying it to result #0 so
// the callee writes its result into the caller-provided storage in-place.
// Callers must still use the returned SSA value to refer to the produced
// contents. Calls use the "coarse-fences" ABI (a wait/signal fence pair).
func.func private @module_b.mul(
    %lhs: tensor<4096xf32>,
    %rhs: tensor<4096xf32>,
    %output: tensor<4096xf32> {iree.abi.output = 0 : index}
) -> tensor<4096xf32> attributes {iree.abi.model = "coarse-fences"}
| 34 | |
| 35 | // Top-level pipeline invoked by the command line tool. |
| 36 | // Since this is compiled with `--iree-execution-model=async-external` this |
| 37 | // export will have a wait and signal fence pair that allows the hosting |
| 38 | // application to execute the entire pipeline asynchronously. |
func.func @run(%input: tensor<4096xf32>) -> tensor<4096xf32> {
  // Step 1: call into `module_a` to produce a transient result tensor.
  // The call is asynchronous, so the result is not necessarily ready when
  // control returns here; it is forwarded together with its fence to the
  // consumer call below.
  %abs = func.call @module_a.abs(%input)
      : (tensor<4096xf32>) -> tensor<4096xf32>

  // Step 2: allocate the storage the next call will write into. This is not
  // strictly needed here and is functionally equivalent to `abs` allocating
  // its own transient memory, but it demonstrates how in-place operations can
  // be performed across module boundaries. The allocation is asynchronous too
  // and is passed to the consumer with a fence indicating when it is ready.
  %out_storage = tensor.empty() : tensor<4096xf32>

  // Step 3: call into `module_b`, producing the output in `%out_storage`.
  // The inputs and the output storage are passed with their respective fences,
  // with no guarantee that any of them are available when the call is made.
  // The `mul` implementation chains its work on those fences and only signals
  // its own fence once all transitive dependencies and its own execution have
  // completed.
  %product = func.call @module_b.mul(%abs, %abs, %out_storage)
      : (tensor<4096xf32>, tensor<4096xf32>, tensor<4096xf32>)
      -> tensor<4096xf32>

  // Return the value produced by `mul`, not `%out_storage`. The two alias the
  // same storage, but only `%product` carries the fence that indicates when
  // the computed contents are available; using `%out_storage` would merely
  // wait for the storage to be allocated, not for `mul` to have populated it.
  func.return %product : tensor<4096xf32>
}