|  | // Row-wise sum over a 128x384 f32 tensor of ones; every one of the 128 |
|  | // outputs is expected to equal 384.0 exactly. |
|  | func.func @reduction_aligned() { |
|  | %input = util.unfoldable_constant dense<1.0> : tensor<128x384xf32> |
|  | // Zero-filled accumulator that seeds the reduction. |
|  | %zero = arith.constant 0.0 : f32 |
|  | %empty = tensor.empty() : tensor<128xf32> |
|  | %acc_init = linalg.fill ins(%zero : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32> |
|  | // d0 is the parallel (kept) dimension, d1 is reduced away. |
|  | %sum = linalg.generic { |
|  | indexing_maps = [ |
|  | affine_map<(d0, d1) -> (d0, d1)>, |
|  | affine_map<(d0, d1) -> (d0)> |
|  | ], |
|  | iterator_types = ["parallel", "reduction"] |
|  | } ins(%input : tensor<128x384xf32>) outs(%acc_init : tensor<128xf32>) { |
|  | ^bb0(%elem: f32, %acc: f32): |
|  | %next = arith.addf %elem, %acc : f32 |
|  | linalg.yield %next : f32 |
|  | } -> tensor<128xf32> |
|  | check.expect_eq_const(%sum, dense<384.0> : tensor<128xf32>) : tensor<128xf32> |
|  | return |
|  | } |
|  |  | 
|  | // Same row-wise sum as the 128-row test, but with 129 rows; each output is |
|  | // expected to equal 384.0 exactly. |
|  | // NOTE(review): "unaligned" presumably means 129 is not a multiple of the |
|  | // tile/workgroup size used by the backend - confirm. |
|  | func.func @reduction_unaligned() { |
|  | %input = util.unfoldable_constant dense<1.0> : tensor<129x384xf32> |
|  | // Zero-filled accumulator that seeds the reduction. |
|  | %zero = arith.constant 0.0 : f32 |
|  | %empty = tensor.empty() : tensor<129xf32> |
|  | %acc_init = linalg.fill ins(%zero : f32) outs(%empty : tensor<129xf32>) -> tensor<129xf32> |
|  | // d0 is the parallel (kept) dimension, d1 is reduced away. |
|  | %sum = linalg.generic { |
|  | indexing_maps = [ |
|  | affine_map<(d0, d1) -> (d0, d1)>, |
|  | affine_map<(d0, d1) -> (d0)> |
|  | ], |
|  | iterator_types = ["parallel", "reduction"] |
|  | } ins(%input : tensor<129x384xf32>) outs(%acc_init : tensor<129xf32>) { |
|  | ^bb0(%elem: f32, %acc: f32): |
|  | %next = arith.addf %elem, %acc : f32 |
|  | linalg.yield %next : f32 |
|  | } -> tensor<129xf32> |
|  | check.expect_eq_const(%sum, dense<384.0> : tensor<129xf32>) : tensor<129xf32> |
|  | return |
|  | } |
|  |  | 
|  | // The reduction dimension (40960) is larger than the max number of threads |
|  | // per group on a GPU. |
|  | // Sums 40960 copies of 0.001 per row; the result is compared approximately |
|  | // against 40.96 because the f32 accumulation is not exact. |
|  | func.func @reduction_aligned_larger() { |
|  | %input = util.unfoldable_constant dense<0.001> : tensor<2x40960xf32> |
|  | // Zero-filled accumulator that seeds the reduction. |
|  | %zero = arith.constant 0.0 : f32 |
|  | %empty = tensor.empty() : tensor<2xf32> |
|  | %acc_init = linalg.fill ins(%zero : f32) outs(%empty : tensor<2xf32>) -> tensor<2xf32> |
|  | // d0 is the parallel (kept) dimension, d1 is reduced away. |
|  | %sum = linalg.generic { |
|  | indexing_maps = [ |
|  | affine_map<(d0, d1) -> (d0, d1)>, |
|  | affine_map<(d0, d1) -> (d0)> |
|  | ], |
|  | iterator_types = ["parallel", "reduction"] |
|  | } ins(%input : tensor<2x40960xf32>) outs(%acc_init : tensor<2xf32>) { |
|  | ^bb0(%elem: f32, %acc: f32): |
|  | %next = arith.addf %elem, %acc : f32 |
|  | linalg.yield %next : f32 |
|  | } -> tensor<2xf32> |
|  | check.expect_almost_eq_const(%sum, dense<40.96> : tensor<2xf32>) : tensor<2xf32> |
|  | return |
|  | } |
|  |  | 
|  | // f16 variant: sums 4096 copies of 0.001 along d1 for each of the 2 rows and |
|  | // compares the result approximately against 4.096. |
|  | func.func @half_reduction_aligned() { |
|  | %input = util.unfoldable_constant dense<0.001> : tensor<2x4096xf16> |
|  | // Zero-filled f16 accumulator that seeds the reduction. |
|  | %zero = arith.constant 0.0 : f16 |
|  | %empty = tensor.empty() : tensor<2xf16> |
|  | %acc_init = linalg.fill ins(%zero : f16) outs(%empty : tensor<2xf16>) -> tensor<2xf16> |
|  | // d0 is the parallel (kept) dimension, d1 is reduced away. |
|  | %sum = linalg.generic { |
|  | indexing_maps = [ |
|  | affine_map<(d0, d1) -> (d0, d1)>, |
|  | affine_map<(d0, d1) -> (d0)> |
|  | ], |
|  | iterator_types = ["parallel", "reduction"] |
|  | } ins(%input : tensor<2x4096xf16>) outs(%acc_init : tensor<2xf16>) { |
|  | ^bb0(%elem: f16, %acc: f16): |
|  | %next = arith.addf %elem, %acc : f16 |
|  | linalg.yield %next : f16 |
|  | } -> tensor<2xf16> |
|  | check.expect_almost_eq_const(%sum, dense<4.096> : tensor<2xf16>) : tensor<2xf16> |
|  | return |
|  | } |
|  |  | 
|  | // i8 variant: integer sum of 128 ones along d1 for each of the 128 rows, |
|  | // compared exactly against dense<128>. |
|  | // NOTE(review): accumulating 128 ones in 8 bits yields the bit pattern 0x80; |
|  | // dense<128> on a signless i8 presumably denotes that same pattern - confirm. |
|  | func.func @quarter_reduction_aligned_smaller() { |
|  | %input = util.unfoldable_constant dense<1> : tensor<128x128xi8> |
|  | // Zero-filled i8 accumulator that seeds the reduction. |
|  | %zero = arith.constant 0 : i8 |
|  | %empty = tensor.empty() : tensor<128xi8> |
|  | %acc_init = linalg.fill ins(%zero : i8) outs(%empty : tensor<128xi8>) -> tensor<128xi8> |
|  | // d0 is the parallel (kept) dimension, d1 is reduced away. |
|  | %sum = linalg.generic { |
|  | indexing_maps = [ |
|  | affine_map<(d0, d1) -> (d0, d1)>, |
|  | affine_map<(d0, d1) -> (d0)> |
|  | ], |
|  | iterator_types = ["parallel", "reduction"] |
|  | } ins(%input : tensor<128x128xi8>) outs(%acc_init : tensor<128xi8>) { |
|  | ^bb0(%elem: i8, %acc: i8): |
|  | %next = arith.addi %elem, %acc : i8 |
|  | linalg.yield %next : i8 |
|  | } -> tensor<128xi8> |
|  | check.expect_eq_const(%sum, dense<128> : tensor<128xi8>) : tensor<128xi8> |
|  | return |
|  | } |