// Type aliases for the reduction input (8x64) and its per-row result (8).
!in_tensor_t = tensor<8x64xf32>
!out_tensor_t = tensor<8xf32>

// Row-wise sum-reduction of an 8x64 tensor followed by an elementwise sqrt:
//   out[d0] = sqrt(sum_{d1} arg[d0, d1])
// The trailing elementwise linalg.generic exists so codegen can fuse it into
// the reduction (see the CHECK lines below).
func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) {
  // -0.0 is the identity for f32 addition (preserves the sign of -0.0 inputs).
  %cst = arith.constant -0.000000e+00 : f32

  %0 = tensor.empty() : !out_tensor_t
  %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t
  // Sum over d1 (the "reduction" iterator); d0 stays parallel.
  %5 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                     affine_map<(d0, d1) -> (d0)>],
    iterator_types = ["parallel", "reduction"]}
    ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) {
      ^bb0(%arg3: f32, %arg4: f32):
        %4 = arith.addf %arg3, %arg4 : f32
        linalg.yield %4 : f32
    } -> !out_tensor_t

  // Elementwise sqrt of the 8-element reduction result.
  %6 = tensor.empty() : !out_tensor_t
  %7 = linalg.generic {
    indexing_maps = [affine_map<(d0) -> (d0)>,
                     affine_map<(d0) -> (d0)>],
    iterator_types = ["parallel"]}
    ins(%5 : !out_tensor_t) outs(%6 : !out_tensor_t) {
      ^bb0(%arg3: f32, %arg4: f32):
        %4 = math.sqrt %arg3 : f32
        linalg.yield %4 : f32
    } -> !out_tensor_t
  return %7 : !out_tensor_t
}
31
// RUN: iree-opt %s --iree-hal-target-backends=cuda \
// RUN: --iree-abi-transformation-pipeline \
// RUN: --iree-flow-transformation-pipeline \
// RUN: --iree-stream-transformation-pipeline \
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-lower-executable-target)))' \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
// RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/reduction_eltwise_codegen_spec.mlir | \
// RUN: FileCheck %s --check-prefix=CHECK

// RUN: iree-compile %s --iree-hal-target-backends=cuda \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
// RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/reduction_eltwise_codegen_spec.mlir | \
// RUN: iree-run-module --function=reduce --device=cuda --input="8x64xf32=1" |\
// RUN: FileCheck %s --check-prefix=EXEC

/// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit only works for exactly this reduction atm.
// RUN: iree-compile %s --iree-hal-target-backends=cuda | \
// RUN: iree-run-module --function=reduce --device=cuda --input="8x64xf32=1" |\
// RUN: FileCheck %s --check-prefix=EXEC
52
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index
  // CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x2xf32, #gpu.address_space<workgroup>>
  // CHECK-DAG: %[[TIDX:.]] = gpu.thread_id x
  // CHECK-DAG: %[[TIDY:.]] = gpu.thread_id y
  // CHECK-DAG: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index

  // Distributed reduction: everyone loads then 5 xor + addf expected
  // CHECK: vector.transfer_read %{{.*}}[%[[workgroup_id_x]], %[[TIDY]], %[[TIDX]]]
  // CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf

  // CHECK: %[[RES:.*]] = arith.addf %{{.*}}

  // CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<f32>
  // CHECK: scf.if %[[CONDXIS0]]
  // CHECK: vector.transfer_write %[[RES_VEC]], %[[SHMEM_ALLOC]][%[[C0]], %[[TIDY]]]
  // CHECK: gpu.barrier

  // Last part is not distributed atm and is only run by threadIdx.x == 0 and threadIdx.y == 0.
  // It should contain the fused elementwise operation.
  // CHECK: %[[CONDYIS0:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index
  // TODO: cond eq 0 and cond ult 1 do not CSE atm.
  // CHECK: %[[CONXANDYARE0:.*]] = arith.andi %{{.*}}, %[[CONDYIS0]] : i1
  // CHECK: scf.if %[[CONXANDYARE0]] {
  // CHECK: vector.transfer_read
  // CHECK: vector.reduction <add>
  // CHECK: math.sqrt
  // CHECK: vector.transfer_write
  // CHECK: gpu.barrier
  // CHECK: memref.dealloc %[[SHMEM_ALLOC]] : memref<1x2xf32, #gpu.address_space<workgroup>>


// EXEC: result[0]: hal.buffer_view
// EXEC-NEXT: 8xf32=8 8 8 8 8 8 8 8