blob: eabf1a0cf7398b92f801e1727ac55616d0d7955e [file] [log] [blame]
// Type aliases shared by the function below.
// Input: a rank-2 f32 tensor with 8 rows and 64 columns.
!in_tensor_t = tensor<8x64xf32>
// Output: a rank-1 f32 tensor with 8 elements (one per input row).
!out_tensor_t = tensor<8xf32>
// Elementwise-then-reduce test kernel.
//
// Stage 1 maps every input element x to (x + x) + (x + x).
// Stage 2 sums the stage-1 result along the second dimension, yielding one
// value per row.
func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) {
  // Seed for the additive reduction. Adding -0.0 leaves any f32 value
  // unchanged, including +0.0.
  %neg_zero = arith.constant -0.000000e+00 : f32

  // Reduction accumulator: an 8-element tensor filled with the seed value.
  %acc_empty = tensor.empty() : !out_tensor_t
  %acc_init = linalg.fill ins(%neg_zero : f32) outs(%acc_empty : !out_tensor_t) -> !out_tensor_t

  // Destination tensor for the elementwise stage.
  %scaled_empty = tensor.empty() : !in_tensor_t

  // Stage 1: fully parallel elementwise computation. The output block
  // argument (%unused_out) is not read; only the input element is used.
  %scaled = linalg.generic {
      indexing_maps = [affine_map<(i, j) -> (i, j)>,
                       affine_map<(i, j) -> (i, j)>],
      iterator_types = ["parallel", "parallel"]}
      ins(%arg : !in_tensor_t) outs(%scaled_empty : !in_tensor_t) {
    ^bb0(%elem: f32, %unused_out: f32):
      %twice = arith.addf %elem, %elem : f32
      %quad = arith.addf %twice, %twice : f32
      linalg.yield %quad : f32
  } -> !in_tensor_t

  // Stage 2: row-wise sum. Dimension i is parallel, dimension j is the
  // reduction dimension and is dropped from the output indexing map.
  %row_sums = linalg.generic {
      indexing_maps = [affine_map<(i, j) -> (i, j)>,
                       affine_map<(i, j) -> (i)>],
      iterator_types = ["parallel", "reduction"]}
      ins(%scaled : !in_tensor_t) outs(%acc_init : !out_tensor_t) {
    ^bb0(%elem: f32, %acc: f32):
      %sum = arith.addf %elem, %acc : f32
      linalg.yield %sum : f32
  } -> !out_tensor_t

  return %row_sums : !out_tensor_t
}
// RUN: iree-compile %s --iree-hal-target-backends=cuda | \
// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
// RUN: FileCheck %s --check-prefix=EXEC
// EXEC: result[0]: hal.buffer_view
// EXEC-NEXT: 8xf32=256 256 256 256 256 256 256 256