| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 1 | !in_tensor_t = tensor<8x64xf32> |
| 2 | !out_tensor_t = tensor<8xf32> |
| 3 | |
| 4 | func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { |
| 5 | %cst = arith.constant -0.000000e+00 : f32 |
| 6 | |
| 7 | %0 = tensor.empty() : !out_tensor_t |
| 8 | %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t |
| 9 | %5 = linalg.generic { |
| 10 | indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, |
| 11 | affine_map<(d0, d1) -> (d0)>], |
| 12 | iterator_types = ["parallel", "reduction"]} |
| 13 | ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) { |
| 14 | ^bb0(%arg3: f32, %arg4: f32): |
| 15 | %4 = arith.addf %arg3, %arg4 : f32 |
| 16 | linalg.yield %4 : f32 |
| 17 | } -> !out_tensor_t |
| 18 | |
| 19 | %6 = tensor.empty() : !out_tensor_t |
| 20 | %7 = linalg.generic { |
| 21 | indexing_maps = [affine_map<(d0) -> (d0)>, |
| 22 | affine_map<(d0) -> (d0)>], |
| 23 | iterator_types = ["parallel"]} |
| Ben Vanik | f65c5cb | 2023-02-01 11:02:10 -0800 | [diff] [blame] | 24 | ins(%5 : !out_tensor_t) outs(%6 : !out_tensor_t) { |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 25 | ^bb0(%arg3: f32, %arg4: f32): |
| 26 | %4 = math.sqrt %arg3 : f32 |
| 27 | linalg.yield %4 : f32 |
| 28 | } -> !out_tensor_t |
| 29 | return %7 : !out_tensor_t |
| 30 | } |
| 31 | |
| 32 | // RUN: iree-opt %s --iree-hal-target-backends=cuda \ |
| 33 | // RUN: --iree-abi-transformation-pipeline \ |
| 34 | // RUN: --iree-flow-transformation-pipeline \ |
| 35 | // RUN: --iree-stream-transformation-pipeline \ |
| 36 | // RUN: --iree-hal-configuration-pipeline | \ |
| 37 | // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-lower-executable-target)))' \ |
| Nicolas Vasilache | fafde87 | 2023-01-26 22:25:37 +0100 | [diff] [blame] | 38 | // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 39 | // RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/reduction_eltwise_codegen_spec.mlir | \ |
| 40 | // RUN: FileCheck %s --check-prefix=CHECK |
| 41 | |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 42 | // RUN: iree-compile %s --iree-hal-target-backends=cuda \ |
| Nicolas Vasilache | fafde87 | 2023-01-26 22:25:37 +0100 | [diff] [blame] | 43 | // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 44 | // RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/reduction_eltwise_codegen_spec.mlir | \ |
| Ben Vanik | f65c5cb | 2023-02-01 11:02:10 -0800 | [diff] [blame] | 45 | // RUN: iree-run-module --function=reduce --device=cuda --input="8x64xf32=1" |\ |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 46 | // RUN: FileCheck %s --check-prefix=EXEC |
| 47 | |
| 48 | /// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit only works for exactly this reduction atm. |
| Nicolas Vasilache | fafde87 | 2023-01-26 22:25:37 +0100 | [diff] [blame] | 49 | // RUN: iree-compile %s --iree-hal-target-backends=cuda | \ |
| Ben Vanik | f65c5cb | 2023-02-01 11:02:10 -0800 | [diff] [blame] | 50 | // RUN: iree-run-module --function=reduce --device=cuda --input="8x64xf32=1" |\ |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 51 | // RUN: FileCheck %s --check-prefix=EXEC |
| 52 | |
| 53 | // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index |
| 54 | // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 55 | // CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index |
| MaheshRavishankar | 837151d | 2023-01-20 15:03:01 -0800 | [diff] [blame] | 56 | // CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x2xf32, #gpu.address_space<workgroup>> |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 57 | // CHECK-DAG: %[[TIDX:.]] = gpu.thread_id x |
| 58 | // CHECK-DAG: %[[TIDY:.]] = gpu.thread_id y |
| Han-Chung Wang | badd598 | 2023-01-10 22:27:15 -0800 | [diff] [blame] | 59 | // CHECK-DAG: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 60 | |
| 61 | // Distributed reduction: everyone loads then 5 xor + addf expected |
| Thomas | 190885e | 2023-01-20 17:59:18 -0800 | [diff] [blame] | 62 | // CHECK: vector.transfer_read %{{.*}}[%[[workgroup_id_x]], %[[TIDY]], %[[TIDX]]] |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 63 | // CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf |
| 64 | |
| 65 | // CHECK: %[[RES:.*]] = arith.addf %{{.*}} |
| 66 | |
| 67 | // CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<f32> |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 68 | // CHECK: scf.if %[[CONDXIS0]] |
| Thomas | 190885e | 2023-01-20 17:59:18 -0800 | [diff] [blame] | 69 | // CHECK: vector.transfer_write %[[RES_VEC]], %[[SHMEM_ALLOC]][%[[C0]], %[[TIDY]]] |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 70 | // CHECK: gpu.barrier |
| 71 | |
| 72 | // Last part is not distributed atm and is only ran by threadIdx.x == 0 and threadIdx.y == 0. |
| 73 | // It should contain the fused elementwise operation. |
| 74 | // CHECK: %[[CONDYIS0:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index |
| 75 | // TODO: cond eq 0 and cond ult 1 do not CSE atm. |
| 76 | // CHECK: %[[CONXANDYARE0:.*]] = arith.andi %{{.*}}, %[[CONDYIS0]] : i1 |
| 77 | // CHECK: scf.if %[[CONXANDYARE0]] { |
| 78 | // CHECK: vector.transfer_read |
| 79 | // CHECK: vector.reduction <add> |
| 80 | // CHECK: math.sqrt |
| 81 | // CHECK: vector.transfer_write |
| 82 | // CHECK: gpu.barrier |
| MaheshRavishankar | 837151d | 2023-01-20 15:03:01 -0800 | [diff] [blame] | 83 | // CHECK: memref.dealloc %[[SHMEM_ALLOC]] : memref<1x2xf32, #gpu.address_space<workgroup>> |
| Nicolas Vasilache | 0573f4f | 2022-12-06 22:03:15 +0100 | [diff] [blame] | 84 | |
| 85 | |
| 86 | // EXEC: result[0]: hal.buffer_view |
| 87 | // EXEC-NEXT: 8xf32=8 8 8 8 8 8 8 8 |