// tests/e2e/regression/large_reduction.mlir - 3p/openxla/iree - Git at Google

 // Sums a 128x384 f32 tensor of ones along its second dimension and checks
 // every one of the 128 results against 384.0 with check.expect_eq_const.
 func.func @reduction_aligned() {
   %zero = arith.constant 0.0 : f32
   %input = util.unfoldable_constant dense<1.0> : tensor<128x384xf32>
   %empty = tensor.empty() : tensor<128xf32>
   // Start every accumulator at zero before reducing.
   %acc_init = linalg.fill ins(%zero : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
   %row_sums = linalg.generic {
       indexing_maps = [affine_map<(i, j) -> (i, j)>,
                        affine_map<(i, j) -> (i)>],
       iterator_types = ["parallel", "reduction"]}
       ins(%input : tensor<128x384xf32>)
       outs(%acc_init : tensor<128xf32>) {
     ^bb0(%elem: f32, %acc: f32):
       %next = arith.addf %elem, %acc : f32
       linalg.yield %next : f32
   } -> tensor<128xf32>
   check.expect_eq_const(%row_sums, dense<384.0> : tensor<128xf32>) : tensor<128xf32>
   return
 }

 // Same reduction as the 128-row case but with 129 rows: sums a 129x384 f32
 // tensor of ones along its second dimension and checks each result against
 // 384.0 with check.expect_eq_const.
 func.func @reduction_unaligned() {
   %zero = arith.constant 0.0 : f32
   %input = util.unfoldable_constant dense<1.0> : tensor<129x384xf32>
   %empty = tensor.empty() : tensor<129xf32>
   // Start every accumulator at zero before reducing.
   %acc_init = linalg.fill ins(%zero : f32) outs(%empty : tensor<129xf32>) -> tensor<129xf32>
   %row_sums = linalg.generic {
       indexing_maps = [affine_map<(i, j) -> (i, j)>,
                        affine_map<(i, j) -> (i)>],
       iterator_types = ["parallel", "reduction"]}
       ins(%input : tensor<129x384xf32>)
       outs(%acc_init : tensor<129xf32>) {
     ^bb0(%elem: f32, %acc: f32):
       %next = arith.addf %elem, %acc : f32
       linalg.yield %next : f32
   } -> tensor<129xf32>
   check.expect_eq_const(%row_sums, dense<384.0> : tensor<129xf32>) : tensor<129xf32>
   return
 }

 // The reduction dimension (40960) is larger than the maximum number of
 // threads per group on a GPU. Sums a 2x40960 f32 tensor filled with 0.001
 // along its second dimension and checks each result against 40.96 with
 // check.expect_almost_eq_const.
 func.func @reduction_aligned_larger() {
   %zero = arith.constant 0.0 : f32
   %input = util.unfoldable_constant dense<0.001> : tensor<2x40960xf32>
   %empty = tensor.empty() : tensor<2xf32>
   // Start both accumulators at zero before reducing.
   %acc_init = linalg.fill ins(%zero : f32) outs(%empty : tensor<2xf32>) -> tensor<2xf32>
   %row_sums = linalg.generic {
       indexing_maps = [affine_map<(i, j) -> (i, j)>,
                        affine_map<(i, j) -> (i)>],
       iterator_types = ["parallel", "reduction"]}
       ins(%input : tensor<2x40960xf32>)
       outs(%acc_init : tensor<2xf32>) {
     ^bb0(%elem: f32, %acc: f32):
       %next = arith.addf %elem, %acc : f32
       linalg.yield %next : f32
   } -> tensor<2xf32>
   check.expect_almost_eq_const(%row_sums, dense<40.96> : tensor<2xf32>) : tensor<2xf32>
   return
 }

 // f16 variant: sums a 2x4096 f16 tensor filled with 0.001 along its second
 // dimension and checks each result against 4.096 with
 // check.expect_almost_eq_const.
 func.func @half_reduction_aligned() {
   %zero = arith.constant 0.0 : f16
   %input = util.unfoldable_constant dense<0.001> : tensor<2x4096xf16>
   %empty = tensor.empty() : tensor<2xf16>
   // Start both accumulators at zero before reducing.
   %acc_init = linalg.fill ins(%zero : f16) outs(%empty : tensor<2xf16>) -> tensor<2xf16>
   %row_sums = linalg.generic {
       indexing_maps = [affine_map<(i, j) -> (i, j)>,
                        affine_map<(i, j) -> (i)>],
       iterator_types = ["parallel", "reduction"]}
       ins(%input : tensor<2x4096xf16>)
       outs(%acc_init : tensor<2xf16>) {
     ^bb0(%elem: f16, %acc: f16):
       %next = arith.addf %elem, %acc : f16
       linalg.yield %next : f16
   } -> tensor<2xf16>
   check.expect_almost_eq_const(%row_sums, dense<4.096> : tensor<2xf16>) : tensor<2xf16>
   return
 }

 // i8 variant: sums a 128x128 i8 tensor of ones along its second dimension
 // with arith.addi and checks each result against 128 with
 // check.expect_eq_const.
 // Note: 128 lies outside the signed 8-bit range; as a signless i8 the
 // expected value is the bit pattern 0x80.
 func.func @quarter_reduction_aligned_smaller() {
   %zero = arith.constant 0 : i8
   %input = util.unfoldable_constant dense<1> : tensor<128x128xi8>
   %empty = tensor.empty() : tensor<128xi8>
   // Start every accumulator at zero before reducing.
   %acc_init = linalg.fill ins(%zero : i8) outs(%empty : tensor<128xi8>) -> tensor<128xi8>
   %row_sums = linalg.generic {
       indexing_maps = [affine_map<(i, j) -> (i, j)>,
                        affine_map<(i, j) -> (i)>],
       iterator_types = ["parallel", "reduction"]}
       ins(%input : tensor<128x128xi8>)
       outs(%acc_init : tensor<128xi8>) {
     ^bb0(%elem: i8, %acc: i8):
       %next = arith.addi %elem, %acc : i8
       linalg.yield %next : i8
   } -> tensor<128xi8>
   check.expect_eq_const(%row_sums, dense<128> : tensor<128xi8>) : tensor<128xi8>
   return
 }
	// Reduces a 128x384 f32 tensor of ones over its second dimension by
	// addition; check.expect_eq_const compares all 128 sums with 384.0.
	func.func @reduction_aligned() {
	  %c0 = arith.constant 0.0 : f32
	  %src = util.unfoldable_constant dense<1.0> : tensor<128x384xf32>
	  %dst = tensor.empty() : tensor<128xf32>
	  // Zero-initialize the accumulator tensor.
	  %zeroed = linalg.fill ins(%c0 : f32) outs(%dst : tensor<128xf32>) -> tensor<128xf32>
	  %reduced = linalg.generic {
	      indexing_maps = [
	        affine_map<(row, col) -> (row, col)>,
	        affine_map<(row, col) -> (row)>
	      ],
	      iterator_types = ["parallel", "reduction"]
	    }
	    ins(%src : tensor<128x384xf32>)
	    outs(%zeroed : tensor<128xf32>) {
	  ^bb0(%value: f32, %partial: f32):
	    %total = arith.addf %value, %partial : f32
	    linalg.yield %total : f32
	  } -> tensor<128xf32>
	  check.expect_eq_const(%reduced, dense<384.0> : tensor<128xf32>) : tensor<128xf32>
	  return
	}

	// Reduces a 129x384 f32 tensor of ones over its second dimension by
	// addition; check.expect_eq_const compares all 129 sums with 384.0.
	func.func @reduction_unaligned() {
	  %c0 = arith.constant 0.0 : f32
	  %src = util.unfoldable_constant dense<1.0> : tensor<129x384xf32>
	  %dst = tensor.empty() : tensor<129xf32>
	  // Zero-initialize the accumulator tensor.
	  %zeroed = linalg.fill ins(%c0 : f32) outs(%dst : tensor<129xf32>) -> tensor<129xf32>
	  %reduced = linalg.generic {
	      indexing_maps = [
	        affine_map<(row, col) -> (row, col)>,
	        affine_map<(row, col) -> (row)>
	      ],
	      iterator_types = ["parallel", "reduction"]
	    }
	    ins(%src : tensor<129x384xf32>)
	    outs(%zeroed : tensor<129xf32>) {
	  ^bb0(%value: f32, %partial: f32):
	    %total = arith.addf %value, %partial : f32
	    linalg.yield %total : f32
	  } -> tensor<129xf32>
	  check.expect_eq_const(%reduced, dense<384.0> : tensor<129xf32>) : tensor<129xf32>
	  return
	}

	// Reduction dimension (40960) larger than the max number of threads per
	// group on a GPU. Reduces a 2x40960 f32 tensor filled with 0.001 over its
	// second dimension; check.expect_almost_eq_const compares both sums with
	// 40.96.
	func.func @reduction_aligned_larger() {
	  %c0 = arith.constant 0.0 : f32
	  %src = util.unfoldable_constant dense<0.001> : tensor<2x40960xf32>
	  %dst = tensor.empty() : tensor<2xf32>
	  // Zero-initialize the accumulator tensor.
	  %zeroed = linalg.fill ins(%c0 : f32) outs(%dst : tensor<2xf32>) -> tensor<2xf32>
	  %reduced = linalg.generic {
	      indexing_maps = [
	        affine_map<(row, col) -> (row, col)>,
	        affine_map<(row, col) -> (row)>
	      ],
	      iterator_types = ["parallel", "reduction"]
	    }
	    ins(%src : tensor<2x40960xf32>)
	    outs(%zeroed : tensor<2xf32>) {
	  ^bb0(%value: f32, %partial: f32):
	    %total = arith.addf %value, %partial : f32
	    linalg.yield %total : f32
	  } -> tensor<2xf32>
	  check.expect_almost_eq_const(%reduced, dense<40.96> : tensor<2xf32>) : tensor<2xf32>
	  return
	}

	// f16 case: reduces a 2x4096 f16 tensor filled with 0.001 over its second
	// dimension; check.expect_almost_eq_const compares both sums with 4.096.
	func.func @half_reduction_aligned() {
	  %c0 = arith.constant 0.0 : f16
	  %src = util.unfoldable_constant dense<0.001> : tensor<2x4096xf16>
	  %dst = tensor.empty() : tensor<2xf16>
	  // Zero-initialize the accumulator tensor.
	  %zeroed = linalg.fill ins(%c0 : f16) outs(%dst : tensor<2xf16>) -> tensor<2xf16>
	  %reduced = linalg.generic {
	      indexing_maps = [
	        affine_map<(row, col) -> (row, col)>,
	        affine_map<(row, col) -> (row)>
	      ],
	      iterator_types = ["parallel", "reduction"]
	    }
	    ins(%src : tensor<2x4096xf16>)
	    outs(%zeroed : tensor<2xf16>) {
	  ^bb0(%value: f16, %partial: f16):
	    %total = arith.addf %value, %partial : f16
	    linalg.yield %total : f16
	  } -> tensor<2xf16>
	  check.expect_almost_eq_const(%reduced, dense<4.096> : tensor<2xf16>) : tensor<2xf16>
	  return
	}

	// i8 case: reduces a 128x128 i8 tensor of ones over its second dimension
	// with arith.addi; check.expect_eq_const compares all 128 sums with 128.
	// Note: 128 is outside the signed 8-bit range; as a signless i8 the
	// expected value is the bit pattern 0x80.
	func.func @quarter_reduction_aligned_smaller() {
	  %c0 = arith.constant 0 : i8
	  %src = util.unfoldable_constant dense<1> : tensor<128x128xi8>
	  %dst = tensor.empty() : tensor<128xi8>
	  // Zero-initialize the accumulator tensor.
	  %zeroed = linalg.fill ins(%c0 : i8) outs(%dst : tensor<128xi8>) -> tensor<128xi8>
	  %reduced = linalg.generic {
	      indexing_maps = [
	        affine_map<(row, col) -> (row, col)>,
	        affine_map<(row, col) -> (row)>
	      ],
	      iterator_types = ["parallel", "reduction"]
	    }
	    ins(%src : tensor<128x128xi8>)
	    outs(%zeroed : tensor<128xi8>) {
	  ^bb0(%value: i8, %partial: i8):
	    %total = arith.addi %value, %partial : i8
	    linalg.yield %total : i8
	  } -> tensor<128xi8>
	  check.expect_eq_const(%reduced, dense<128> : tensor<128xi8>) : tensor<128xi8>
	  return
	}