blob: 367635963a56fc95df16bb0428cdac79ea17e180 [file] [log] [blame]
// RUN: (iree-compile --iree-execution-model=async-external --iree-hal-target-backends=vmvx %p/module_a.mlir -o=%t.module_a.vmfb && \
// RUN: iree-compile --iree-execution-model=async-external --iree-hal-target-backends=vmvx %p/module_b.mlir -o=%t.module_b.vmfb && \
// RUN: iree-compile --iree-execution-model=async-external --iree-hal-target-backends=vmvx %s | \
// RUN: iree-run-module \
// RUN: --device=local-task \
// RUN: --module=%t.module_a.vmfb \
// RUN: --module=%t.module_b.vmfb \
// RUN: --module=- --function=run \
// RUN: --input=4096xf32=-2.0 \
// RUN: --expected_output=4096xf32=4.0) | \
// RUN: FileCheck %s
// CHECK: [SUCCESS]
13
// Functions declared in external modules - note `module_name.func_name`.
// `abs` will allocate transient memory to pass back the result.
// `mul` will use the provided output memory to produce the result in-place.
// Note that though the returned SSA tensor value shares its storage with the
// `%output` arg the returned value *must* be used to reference the produced
// version of its contents.
//
// In this asynchronous example both functions follow the "coarse-fences" ABI
// model where the compiler inserts a wait and signal fence pair on each call.
// To enable this the modules must be compiled with the
// `--iree-execution-model=async-external` flag and the external declarations
// must be annotated with the `iree.abi.model` attribute so that the compiler
// knows the calls have the fences. Note that it's possible to have any
// combination of asynchronous and synchronous modules and calls in the same
// program.
// External declaration of `abs` from `module_a`. The "coarse-fences" ABI model
// marks calls to it as carrying a wait/signal fence pair.
func.func private @module_a.abs(%input: tensor<4096xf32>) -> tensor<4096xf32> attributes {iree.abi.model = "coarse-fences"}
// External declaration of `mul` from `module_b`. The `%output` argument is
// annotated with `iree.abi.output` so the callee produces its result in the
// caller-provided storage; the "coarse-fences" ABI model marks calls to it as
// carrying a wait/signal fence pair.
func.func private @module_b.mul(
    %lhs: tensor<4096xf32>,
    %rhs: tensor<4096xf32>,
    %output: tensor<4096xf32> {iree.abi.output = 0 : index}
) -> tensor<4096xf32> attributes {iree.abi.model = "coarse-fences"}
34
// Top-level pipeline invoked by the command line tool.
// Since this is compiled with `--iree-execution-model=async-external`, this
// export will have a wait and signal fence pair that allows the hosting
// application to execute the entire pipeline asynchronously.
func.func @run(%input: tensor<4096xf32>) -> tensor<4096xf32> {
  // Step 1: call into `module_a` to get a transient result tensor.
  // The call is asynchronous, so the value is not ready when control returns
  // here; it is handed to the consumer call below together with its fence.
  %abs = func.call @module_a.abs(%input) : (tensor<4096xf32>) -> tensor<4096xf32>

  // Step 2: allocate the storage the next call will write into. This is not
  // strictly needed here and is functionally equivalent to `abs` allocating
  // its own transient memory, but it demonstrates how in-place operations can
  // be performed across module boundaries. The allocation is asynchronous too
  // and is passed to the consumer call with a fence indicating when it is
  // ready.
  %out_storage = tensor.empty() : tensor<4096xf32>

  // Step 3: call into `module_b`, which produces its output in `%out_storage`.
  // Inputs and output storage are passed with their respective fences, with no
  // guarantee that any of them are available when the call is made. The `mul`
  // implementation chains its work on those fences and only signals its own
  // fence once all transitive dependencies and its own execution have
  // completed.
  %product = func.call @module_b.mul(%abs, %abs, %out_storage) : (tensor<4096xf32>, tensor<4096xf32>, tensor<4096xf32>) -> tensor<4096xf32>

  // Return the value produced by `mul`, not `%out_storage`. `%product` aliases
  // `%out_storage` but carries the fence indicating when the computed contents
  // are available; returning `%out_storage` would only wait for the storage to
  // be allocated and not for `mul` to have populated it.
  func.return %product : tensor<4096xf32>
}