// RUN: iree-compile --split-input-file --iree-hal-target-backends=vmvx \
// RUN:   --output-format=vm-bytecode \
// RUN:   --iree-vm-bytecode-module-output-format=flatbuffer-text %s \
// RUN:   --mlir-print-ir-after=iree-vm-ordinal-allocation 2>&1 | FileCheck %s

// This file has a few test programs that show how to mix `flow` dispatches into
// those created by the `linalg` dispatch region formation: the idea is to use
// any normal IREE input (mhlo/tosa/linalg/etc) on tensors and then also include
// `flow.dispatch` ops calling `stream.executable`s. `flow.executable`s could be
// used too but currently have some ergonomics issues that need to be resolved;
// the improved version of `flow.dispatch` (and `flow.dispatch.workgroups`) will
// be made part of the public `iree` dialect at which time this file will change
// to using that. The `flow`/`stream` dialects are generally not considered
// stable.

// A simple element-wise multiply of two static tensors:
//   %ret0 = %arg0 * %arg1
//
// The host code performs the dispatch with a workload of 4x1x1 - how many
// workgroups that gets distributed across is left to the HAL backend to decide
// based on the target device and how the work is tiled.
//
// The device code in the stream.executable is tiled - but does not need to be:
// the only thing we care about at this level is the bindings and any operands
// that may need to be passed from host->device.

// CHECK-LABEL: vm.module public @e2e
module @e2e {
  // CHECK: vm.rodata private @executable_0_vmvx_bytecode_fb
  stream.executable private @executable_0 {
    stream.executable.export public @dispatch workgroups(%arg0: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %ret0: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
        %2 = stream.binding.subspan %ret0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
        %5 = tensor.empty() : tensor<4xf32>
        %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %10 = arith.mulf %arg4, %arg5 : f32
          linalg.yield %10 : f32
        } -> tensor<4xf32>
        flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
        return
      }
    }
  }
  // CHECK: vm.func private @simple_mul
  func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
    %c4 = arith.constant 4 : index
    // CHECK: vm.call.variadic @hal.command_buffer.dispatch
    %ret0 = flow.dispatch @executable_0::@dispatch[%c4](%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
    return %ret0 : tensor<4xf32>
  }
}  // module

// -----

// The same element-wise multiply but now in-place:
//   %arg0 = %arg0 * %arg1
//
// In-place operations can often introduce false dependencies between dispatches
// and should be avoided at this level in most cases - there's currently no cost
// model for making dispatches into in-place operations but it's something that
// would happen in the stream dialect after scheduling: two dispatches known to
// not be running concurrently and operating on the same resources could be made
// in-place.

// CHECK-LABEL: vm.module public @inplace
module @inplace {
  // CHECK: vm.rodata private @executable_1_vmvx_bytecode_fb
  stream.executable private @executable_1 {
    stream.executable.export public @dispatch workgroups(%arg0: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding) {
        %c0 = arith.constant 0 : index
        // %arg0 is both consumed and produced: note the readwrite access below.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<4xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<4xf32>> -> tensor<4xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
        %5 = tensor.empty() : tensor<4xf32>
        %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %10 = arith.mulf %arg4, %arg5 : f32
          linalg.yield %10 : f32
        } -> tensor<4xf32>
        flow.dispatch.tensor.store %6, %0, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<readwrite:tensor<4xf32>>
        return
      }
    }
  }
  // CHECK: vm.func private @simple_mul_inplace
  func.func @simple_mul_inplace(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
    %c4 = arith.constant 4 : index
    // CHECK: vm.call.variadic @hal.command_buffer.dispatch
    // Tied result (`-> %arg0`): the dispatch writes its result into %arg0.
    %ret0 = flow.dispatch @executable_1::@dispatch[%c4](%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> %arg0
    return %ret0 : tensor<4xf32>
  }
}  // module

// -----

// The same element-wise multiply but now with dynamic shapes:
//   %ret0 = %arg0 * %arg1
//
// This shows how the shape dimensions are captured by the dispatch so that the
// host knows the shapes of the tensors and how the dimensions are passed as
// operands to the executable for association. Once we perform the host/device
// split the association allows tensor.dim ops in the device code to query the
// dynamic dimensions without needing to insert new host -> device transfers.
// Note that because of this explicit association the order of the dispatch
// operands doesn't matter as walking the SSA use-def chain up to the
// stream.binding.subspan allows them to be resolved directly.

// CHECK-LABEL: vm.module public @dynamic
module @dynamic {
  // CHECK: vm.rodata private @executable_2_vmvx_bytecode_fb
  stream.executable private @executable_2 {
    stream.executable.export public @dispatch workgroups(%arg0: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dynamic dims are passed as index operands alongside their bindings so
      // the device code can resolve them without host->device transfers.
      func.func @dispatch(%arg0: !stream.binding, %arg0_dim0: index, %arg1: !stream.binding, %arg1_dim0: index, %ret0: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg0_dim0}
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg1_dim0}
        %2 = stream.binding.subspan %ret0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%arg0_dim0}
        %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [%arg0_dim0], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg0_dim0} -> tensor<?xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [%arg1_dim0], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg1_dim0} -> tensor<?xf32>
        %5 = tensor.empty(%arg0_dim0) : tensor<?xf32>
        %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<?xf32>, tensor<?xf32>) outs(%5 : tensor<?xf32>) attrs = {name = "mul.1"} {
        ^bb0(%arg6: f32, %arg7: f32, %arg8: f32):
          %10 = arith.mulf %arg6, %arg7 : f32
          linalg.yield %10 : f32
        } -> tensor<?xf32>
        flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [%arg0_dim0], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%arg0_dim0}
        return
      }
    }
  }
  // CHECK: vm.func private @simple_mul_dynamic
  func.func @simple_mul_dynamic(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
    %c0 = arith.constant 0 : index
    // CHECK: vm.call @hal.buffer_view.dim
    %arg0_dim0 = tensor.dim %arg0, %c0 : tensor<?xf32>
    // CHECK: vm.call @hal.buffer_view.dim
    %arg1_dim0 = tensor.dim %arg1, %c0 : tensor<?xf32>
    // CHECK: vm.call.variadic @hal.command_buffer.dispatch
    %ret0 = flow.dispatch @executable_2::@dispatch[%arg0_dim0](%arg0, %arg0_dim0, %arg1, %arg1_dim0) : (tensor<?xf32>{%arg0_dim0}, index, tensor<?xf32>{%arg1_dim0}, index) -> tensor<?xf32>{%arg0_dim0}
    return %ret0 : tensor<?xf32>
  }
}  // module

// -----

// This shows the same element-wise multiply but without the first level of
// tiling. This will execute in a single workgroup regardless of tensor size
// (though here it's 4 so it wouldn't be distributed anyway).

// CHECK-LABEL: vm.module public @untiled
module @untiled {
  // CHECK: vm.rodata private @executable_3_vmvx_bytecode_fb
  stream.executable private @executable_3 {
    // Untiled: always a single 1x1x1 workgroup regardless of the workload.
    stream.executable.export public @dispatch workgroups(%arg0: index) -> (index, index, index) {
      %c1 = arith.constant 1 : index
      stream.return %c1, %c1, %c1 : index, index, index
    }
    builtin.module {
      func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %ret0: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
        %2 = stream.binding.subspan %ret0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
        %5 = tensor.empty() : tensor<4xf32>
        %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) {
        ^bb0(%lhs: f32, %rhs: f32, %out: f32):
          %7 = arith.mulf %lhs, %rhs : f32
          linalg.yield %7 : f32
        } -> tensor<4xf32>
        flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
        return
      }
    }
  }
  // CHECK: vm.func private @simple_mul_untiled
  func.func @simple_mul_untiled(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
    %c1 = arith.constant 1 : index
    %ret0 = flow.dispatch @executable_3::@dispatch[%c1](%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
    return %ret0 : tensor<4xf32>
  }
}  // module