| func.func @reduction_aligned() { | 
 |   %in = util.unfoldable_constant dense<1.0> : tensor<128x384xf32> | 
 |   %cst = arith.constant 0.0 : f32 | 
 |   %init = tensor.empty() : tensor<128xf32> | 
 |   %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<128xf32>) -> tensor<128xf32> | 
 |   %result = linalg.generic {indexing_maps = [ | 
 |     affine_map<(d0, d1) -> (d0, d1)>,affine_map<(d0, d1) -> (d0)>], | 
 |     iterator_types = ["parallel", "reduction"]} | 
 |     ins(%in : tensor<128x384xf32>) outs(%fill : tensor<128xf32>) { | 
 |     ^bb0(%arg3: f32, %arg4: f32):  // no predecessors | 
 |       %2 = arith.addf %arg3, %arg4 : f32 | 
 |       linalg.yield %2 : f32 | 
 |     } -> tensor<128xf32> | 
 |   check.expect_eq_const(%result, dense<384.0> : tensor<128xf32>) : tensor<128xf32> | 
 |   return | 
 | } | 
 |  | 
 | func.func @reduction_unaligned() { | 
 |   %in = util.unfoldable_constant dense<1.0> : tensor<129x384xf32> | 
 |   %cst = arith.constant 0.0 : f32 | 
 |   %init = tensor.empty() : tensor<129xf32> | 
 |   %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<129xf32>) -> tensor<129xf32> | 
 |   %result = linalg.generic {indexing_maps = [ | 
 |     affine_map<(d0, d1) -> (d0, d1)>,affine_map<(d0, d1) -> (d0)>], | 
 |     iterator_types = ["parallel", "reduction"]} | 
 |     ins(%in : tensor<129x384xf32>) outs(%fill : tensor<129xf32>) { | 
 |     ^bb0(%arg3: f32, %arg4: f32):  // no predecessors | 
 |       %2 = arith.addf %arg3, %arg4 : f32 | 
 |       linalg.yield %2 : f32 | 
 |     } -> tensor<129xf32> | 
 |   check.expect_eq_const(%result, dense<384.0> : tensor<129xf32>) : tensor<129xf32> | 
 |   return | 
 | } | 
 |  | 
 | // Reduction dimension larger than the max number of threads per group on a gpu. | 
 | func.func @reduction_aligned_larger() { | 
 |   %in = util.unfoldable_constant dense<0.001> : tensor<2x40960xf32> | 
 |   %cst = arith.constant 0.0 : f32 | 
 |   %init = tensor.empty() : tensor<2xf32> | 
 |   %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<2xf32>) -> tensor<2xf32> | 
 |   %result = linalg.generic {indexing_maps = [ | 
 |     affine_map<(d0, d1) -> (d0, d1)>,affine_map<(d0, d1) -> (d0)>], | 
 |     iterator_types = ["parallel", "reduction"]} | 
 |     ins(%in : tensor<2x40960xf32>) outs(%fill : tensor<2xf32>) { | 
 |     ^bb0(%arg3: f32, %arg4: f32):  // no predecessors | 
 |       %2 = arith.addf %arg3, %arg4 : f32 | 
 |       linalg.yield %2 : f32 | 
 |     } -> tensor<2xf32> | 
 |   check.expect_almost_eq_const(%result, dense<40.96> : tensor<2xf32>) : tensor<2xf32> | 
 |   return | 
 | } | 
 |  | 
 | func.func @half_reduction_aligned() { | 
 |   %in = util.unfoldable_constant dense<0.001> : tensor<2x4096xf16> | 
 |   %cst = arith.constant 0.0 : f16 | 
 |   %init = tensor.empty() : tensor<2xf16> | 
 |   %fill = linalg.fill ins(%cst : f16) outs(%init : tensor<2xf16>) -> tensor<2xf16> | 
 |   %result = linalg.generic {indexing_maps = [ | 
 |     affine_map<(d0, d1) -> (d0, d1)>,affine_map<(d0, d1) -> (d0)>], | 
 |     iterator_types = ["parallel", "reduction"]} | 
 |     ins(%in : tensor<2x4096xf16>) outs(%fill : tensor<2xf16>) { | 
 |     ^bb0(%arg3: f16, %arg4: f16):  // no predecessors | 
 |       %2 = arith.addf %arg3, %arg4 : f16 | 
 |       linalg.yield %2 : f16 | 
 |     } -> tensor<2xf16> | 
 |   check.expect_almost_eq_const(%result, dense<4.096> : tensor<2xf16>) : tensor<2xf16> | 
 |   return | 
 | } | 
 |  | 
 | func.func @quarter_reduction_aligned_smaller() { | 
 |   %in = util.unfoldable_constant dense<1> : tensor<128x128xi8> | 
 |   %cst = arith.constant 0 : i8 | 
 |   %init = tensor.empty() : tensor<128xi8> | 
 |   %fill = linalg.fill ins(%cst : i8) outs(%init : tensor<128xi8>) -> tensor<128xi8> | 
 |   %result = linalg.generic {indexing_maps = [ | 
 |     affine_map<(d0, d1) -> (d0, d1)>,affine_map<(d0, d1) -> (d0)>], | 
 |     iterator_types = ["parallel", "reduction"]} | 
 |     ins(%in : tensor<128x128xi8>) outs(%fill : tensor<128xi8>) { | 
 |     ^bb0(%arg3: i8, %arg4: i8):  // no predecessors | 
 |       %2 = arith.addi %arg3, %arg4 : i8 | 
 |       linalg.yield %2 : i8 | 
 |     } -> tensor<128xi8> | 
 |   check.expect_eq_const(%result, dense<128> : tensor<128xi8>) : tensor<128xi8> | 
 |   return | 
 | } |