// blob: cdc24b6d3be1fac3504959d999f12cf73b795162 [file] [log] [blame]
// Row-wise sum of a 128x384 f32 tensor filled with 1.0.
// d0 is the parallel (row) dimension and d1 the reduction dimension, so each
// of the 128 outputs accumulates 384 ones and must compare exactly to 384.0.
func.func @reduction_aligned() {
  %zero = arith.constant 0.0 : f32
  %input = util.unfoldable_constant dense<1.0> : tensor<128x384xf32>
  // Zero-initialised accumulator, one element per row.
  %empty = tensor.empty() : tensor<128xf32>
  %acc_init = linalg.fill ins(%zero : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
  %sum = linalg.generic {
      indexing_maps = [
        affine_map<(d0, d1) -> (d0, d1)>,
        affine_map<(d0, d1) -> (d0)>
      ],
      iterator_types = ["parallel", "reduction"]
    } ins(%input : tensor<128x384xf32>) outs(%acc_init : tensor<128xf32>) {
  ^bb0(%elem: f32, %acc: f32):
    %next = arith.addf %elem, %acc : f32
    linalg.yield %next : f32
  } -> tensor<128xf32>
  check.expect_eq_const(%sum, dense<384.0> : tensor<128xf32>) : tensor<128xf32>
  return
}
// Row-wise sum of a 129x384 f32 tensor filled with 1.0; every one of the 129
// outputs must compare exactly to 384.0.
// NOTE(review): "unaligned" presumably refers to the 129-row parallel
// dimension not being a multiple of the tile/workgroup size — confirm.
func.func @reduction_unaligned() {
  %zero = arith.constant 0.0 : f32
  %input = util.unfoldable_constant dense<1.0> : tensor<129x384xf32>
  // Zero-initialised accumulator, one element per row.
  %empty = tensor.empty() : tensor<129xf32>
  %acc_init = linalg.fill ins(%zero : f32) outs(%empty : tensor<129xf32>) -> tensor<129xf32>
  %sum = linalg.generic {
      indexing_maps = [
        affine_map<(d0, d1) -> (d0, d1)>,
        affine_map<(d0, d1) -> (d0)>
      ],
      iterator_types = ["parallel", "reduction"]
    } ins(%input : tensor<129x384xf32>) outs(%acc_init : tensor<129xf32>) {
  ^bb0(%elem: f32, %acc: f32):
    %next = arith.addf %elem, %acc : f32
    linalg.yield %next : f32
  } -> tensor<129xf32>
  check.expect_eq_const(%sum, dense<384.0> : tensor<129xf32>) : tensor<129xf32>
  return
}
// The reduction dimension (40960) is larger than the maximum number of
// threads per workgroup on a GPU.
// Sums 40960 copies of 0.001 along d1 for each of the 2 rows and compares the
// result to 40.96 with the approximate (almost-equal) check.
func.func @reduction_aligned_larger() {
  %zero = arith.constant 0.0 : f32
  %input = util.unfoldable_constant dense<0.001> : tensor<2x40960xf32>
  // Zero-initialised accumulator, one element per row.
  %empty = tensor.empty() : tensor<2xf32>
  %acc_init = linalg.fill ins(%zero : f32) outs(%empty : tensor<2xf32>) -> tensor<2xf32>
  %sum = linalg.generic {
      indexing_maps = [
        affine_map<(d0, d1) -> (d0, d1)>,
        affine_map<(d0, d1) -> (d0)>
      ],
      iterator_types = ["parallel", "reduction"]
    } ins(%input : tensor<2x40960xf32>) outs(%acc_init : tensor<2xf32>) {
  ^bb0(%elem: f32, %acc: f32):
    %next = arith.addf %elem, %acc : f32
    linalg.yield %next : f32
  } -> tensor<2xf32>
  check.expect_almost_eq_const(%sum, dense<40.96> : tensor<2xf32>) : tensor<2xf32>
  return
}
// Half-precision (f16) variant: sums 4096 copies of 0.001 along d1 for each of
// the 2 rows, accumulating in f16, and compares the result to 4.096 with the
// approximate (almost-equal) check.
func.func @half_reduction_aligned() {
  %zero = arith.constant 0.0 : f16
  %input = util.unfoldable_constant dense<0.001> : tensor<2x4096xf16>
  // Zero-initialised accumulator, one element per row.
  %empty = tensor.empty() : tensor<2xf16>
  %acc_init = linalg.fill ins(%zero : f16) outs(%empty : tensor<2xf16>) -> tensor<2xf16>
  %sum = linalg.generic {
      indexing_maps = [
        affine_map<(d0, d1) -> (d0, d1)>,
        affine_map<(d0, d1) -> (d0)>
      ],
      iterator_types = ["parallel", "reduction"]
    } ins(%input : tensor<2x4096xf16>) outs(%acc_init : tensor<2xf16>) {
  ^bb0(%elem: f16, %acc: f16):
    %next = arith.addf %elem, %acc : f16
    linalg.yield %next : f16
  } -> tensor<2xf16>
  check.expect_almost_eq_const(%sum, dense<4.096> : tensor<2xf16>) : tensor<2xf16>
  return
}
// 8-bit integer (i8) variant: sums 128 ones along d1 for each of the 128 rows
// using arith.addi on i8 and expects every output to equal dense<128>.
// NOTE(review): 128 is outside the signed i8 range, so this appears to rely
// on wrap-around addition and on dense<128> denoting the same 8-bit pattern
// for signless i8 — confirm this is intended.
func.func @quarter_reduction_aligned_smaller() {
  %zero = arith.constant 0 : i8
  %input = util.unfoldable_constant dense<1> : tensor<128x128xi8>
  // Zero-initialised accumulator, one element per row.
  %empty = tensor.empty() : tensor<128xi8>
  %acc_init = linalg.fill ins(%zero : i8) outs(%empty : tensor<128xi8>) -> tensor<128xi8>
  %sum = linalg.generic {
      indexing_maps = [
        affine_map<(d0, d1) -> (d0, d1)>,
        affine_map<(d0, d1) -> (d0)>
      ],
      iterator_types = ["parallel", "reduction"]
    } ins(%input : tensor<128x128xi8>) outs(%acc_init : tensor<128xi8>) {
  ^bb0(%elem: i8, %acc: i8):
    %next = arith.addi %elem, %acc : i8
    linalg.yield %next : i8
  } -> tensor<128xi8>
  check.expect_eq_const(%sum, dense<128> : tensor<128xi8>) : tensor<128xi8>
  return
}