// RUN: iree-compile --split-input-file --iree-hal-target-backends=vmvx \
// RUN:   --output-format=vm-bytecode \
// RUN:   --iree-vm-bytecode-module-output-format=flatbuffer-text %s \
// RUN:   --mlir-print-ir-after=iree-vm-ordinal-allocation 2>&1 | FileCheck %s

// This file has a few test programs that show how to mix `flow` dispatches into
// those created by `linalg` dispatch region formation: the idea is to use any
// normal IREE input (mhlo/tosa/linalg/etc) on tensors and then also include
// `flow.dispatch` ops calling `stream.executable`s. `flow.executable`s could be
// used too but currently have some ergonomics issues that need to be resolved;
// the improved version of `flow.dispatch` (and `flow.dispatch.workgroups`) will
// be made part of the public `iree` dialect, at which point this file will
// change to use it. The `flow`/`stream` dialects are generally not considered
// stable.
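
// For reference, the same multiply can be written as a plain tensor op and left
// to IREE's dispatch region formation to outline into a dispatch automatically.
// A minimal sketch (hypothetical, not part of this test):
//
//   func.func @mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
//     %0 = arith.mulf %arg0, %arg1 : tensor<4xf32>
//     return %0 : tensor<4xf32>
//   }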

// A simple element-wise multiply of two static tensors:
//   %ret0 = %arg0 * %arg1
//
// The host code performs the dispatch with a workload of 4x1x1 - how many
// workgroups the work gets distributed across is left to the HAL backend to
// decide based on the target device and how the work is tiled.
//
// The device code in the stream.executable is tiled - but does not need to be:
// the only thing we care about at this level is the bindings and any operands
// that may need to be passed from host->device (see the sketch below).
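
// A minimal sketch of how the tiled body distributes work, assuming a cyclic
// distribution with a tile size of 1 (the flow.dispatch.workgroup.* ops are
// real; the loop structure is illustrative only):
//
//   %id_x = flow.dispatch.workgroup.id[0] : index
//   %count_x = flow.dispatch.workgroup.count[0] : index
//   scf.for %i = %id_x to %c4 step %count_x {  // %c4 = tensor size
//     // load element %i from each input, multiply, store to the output
//   }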

// CHECK-LABEL: vm.module public @e2e
module @e2e {
// CHECK: vm.rodata private @executable_0_vmvx_bytecode_fb
stream.executable private @executable_0 {
  stream.executable.export public @dispatch workgroups(%arg0: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
    stream.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %ret0: !stream.binding) {
      %c0 = arith.constant 0 : index
      %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
      %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
      %2 = stream.binding.subspan %ret0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
      %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
      %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
      %5 = tensor.empty() : tensor<4xf32>
      %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
      ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
        %10 = arith.mulf %arg4, %arg5 : f32
        linalg.yield %10 : f32
      } -> tensor<4xf32>
      flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
      return
    }
  }
}
// CHECK: vm.func private @simple_mul
func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
  %c4 = arith.constant 4 : index
  // CHECK: vm.call.variadic @hal.command_buffer.dispatch
  %ret0 = flow.dispatch @executable_0::@dispatch[%c4](%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
  return %ret0 : tensor<4xf32>
}
} // module
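
// A module like the one above can be exercised from the command line; a sketch
// assuming current tool flag names (file names here are hypothetical):
//
//   iree-compile --iree-hal-target-backends=vmvx e2e.mlir -o e2e.vmfb
//   iree-run-module --device=local-task --module=e2e.vmfb --function=simple_mul \
//       --input="4xf32=1 2 3 4" --input="4xf32=10 20 30 40"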

// -----

// The same element-wise multiply but now in-place:
//   %arg0 = %arg0 * %arg1
//
// In-place operations can often introduce false dependencies between dispatches
// and should be avoided at this level in most cases. There's currently no cost
// model for making dispatches in-place, but it's something that would happen in
// the stream dialect after scheduling: two dispatches known to not be running
// concurrently and operating on the same resources could be made in-place.
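
// Note the tied result on the flow.dispatch below: `-> %arg0` declares that the
// dispatch writes its result into the storage of %arg0 instead of allocating a
// new tensor, which is what makes it in-place.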

// CHECK-LABEL: vm.module public @inplace
module @inplace {
// CHECK: vm.rodata private @executable_1_vmvx_bytecode_fb
stream.executable private @executable_1 {
  stream.executable.export public @dispatch workgroups(%arg0: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
    stream.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding) {
      %c0 = arith.constant 0 : index
      %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<4xf32>>
      %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
      %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<4xf32>> -> tensor<4xf32>
      %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
      %5 = tensor.empty() : tensor<4xf32>
      %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
      ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
        %10 = arith.mulf %arg4, %arg5 : f32
        linalg.yield %10 : f32
      } -> tensor<4xf32>
      flow.dispatch.tensor.store %6, %0, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<readwrite:tensor<4xf32>>
      return
    }
  }
}
// CHECK: vm.func private @simple_mul_inplace
func.func @simple_mul_inplace(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
  %c4 = arith.constant 4 : index
  // CHECK: vm.call.variadic @hal.command_buffer.dispatch
  %ret0 = flow.dispatch @executable_1::@dispatch[%c4](%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> %arg0
  return %ret0 : tensor<4xf32>
}
} // module

// -----

// The same element-wise multiply but now with dynamic shapes:
//   %ret0 = %arg0 * %arg1
//
// This shows how the shape dimensions are captured by the dispatch so that the
// host knows the shapes of the tensors, and how the dimensions are passed as
// operands to the executable for association. Once the host/device split is
// performed, that association lets tensor.dim ops in the device code query the
// dynamic dimensions without inserting new host->device transfers. Because of
// this explicit association the order of the dispatch operands doesn't matter:
// walking the SSA use-def chain up to the stream.binding.subspan resolves them
// directly (see the sketch below).
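
// For example, a tensor.dim on a loaded tensor inside @dispatch (a hypothetical
// sketch) folds to the %arg0_dim0 operand rather than a device-side query:
//
//   %dim = tensor.dim %3, %c0 : tensor<?xf32>  // resolves to %arg0_dim0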

// CHECK-LABEL: vm.module public @dynamic
module @dynamic {
// CHECK: vm.rodata private @executable_2_vmvx_bytecode_fb
stream.executable private @executable_2 {
  stream.executable.export public @dispatch workgroups(%arg0: index) -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
    stream.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @dispatch(%arg0: !stream.binding, %arg0_dim0: index, %arg1: !stream.binding, %arg1_dim0: index, %ret0: !stream.binding) {
      %c0 = arith.constant 0 : index
      %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg0_dim0}
      %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg1_dim0}
      %2 = stream.binding.subspan %ret0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%arg0_dim0}
      %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [%arg0_dim0], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg0_dim0} -> tensor<?xf32>
      %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [%arg1_dim0], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg1_dim0} -> tensor<?xf32>
      %5 = tensor.empty(%arg0_dim0) : tensor<?xf32>
      %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<?xf32>, tensor<?xf32>) outs(%5 : tensor<?xf32>) attrs = {name = "mul.1"} {
      ^bb0(%arg6: f32, %arg7: f32, %arg8: f32):
        %10 = arith.mulf %arg6, %arg7 : f32
        linalg.yield %10 : f32
      } -> tensor<?xf32>
      flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [%arg0_dim0], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%arg0_dim0}
      return
    }
  }
}
// CHECK: vm.func private @simple_mul_dynamic
func.func @simple_mul_dynamic(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
  %c0 = arith.constant 0 : index
  // CHECK: vm.call @hal.buffer_view.dim
  %arg0_dim0 = tensor.dim %arg0, %c0 : tensor<?xf32>
  // CHECK: vm.call @hal.buffer_view.dim
  %arg1_dim0 = tensor.dim %arg1, %c0 : tensor<?xf32>
  // CHECK: vm.call.variadic @hal.command_buffer.dispatch
  %ret0 = flow.dispatch @executable_2::@dispatch[%arg0_dim0](%arg0, %arg0_dim0, %arg1, %arg1_dim0) : (tensor<?xf32>{%arg0_dim0}, index, tensor<?xf32>{%arg1_dim0}, index) -> tensor<?xf32>{%arg0_dim0}
  return %ret0 : tensor<?xf32>
}
} // module

// -----

// This shows the same element-wise multiply but without the first level of
// tiling. It will execute in a single workgroup regardless of tensor size
// (though here it's 4 so it wouldn't be distributed anyway): the export below
// returns a constant 1x1x1 workgroup count instead of deriving one from the
// workload.

// CHECK-LABEL: vm.module public @untiled
module @untiled {
// CHECK: vm.rodata private @executable_3_vmvx_bytecode_fb
stream.executable private @executable_3 {
  stream.executable.export public @dispatch workgroups(%arg0: index) -> (index, index, index) {
    %c1 = arith.constant 1 : index
    stream.return %c1, %c1, %c1 : index, index, index
  }
  builtin.module {
    func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %ret0: !stream.binding) {
      %c0 = arith.constant 0 : index
      %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
      %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
      %2 = stream.binding.subspan %ret0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
      %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
      %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
      %5 = tensor.empty() : tensor<4xf32>
      %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) {
      ^bb0(%lhs: f32, %rhs: f32, %out: f32):
        %7 = arith.mulf %lhs, %rhs : f32
        linalg.yield %7 : f32
      } -> tensor<4xf32>
      flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
      return
    }
  }
}
// CHECK: vm.func private @simple_mul_untiled
func.func @simple_mul_untiled(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
  %c1 = arith.constant 1 : index
  %ret0 = flow.dispatch @executable_3::@dispatch[%c1](%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
  return %ret0 : tensor<4xf32>
}
} // module