llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/fuse-operands.mlir - 3p/openxla/iree - Git at Google

 // RUN: iree-dialects-opt %s  --transform-dialect-interpreter --split-input-file | FileCheck %s
 // TODO(#11765): Fix and re-enable this.
 // REQUIRES: dont-run

 // #map0: 64 ceildiv s0. Applied to the chunk-size argument below to produce
 // the scf.forall upper bound.
 #map0 = affine_map<()[s0] -> (64 ceildiv s0)>
 // #map1: d0 * s0. Applied to the loop index and chunk size to produce the
 // slice offset.
 #map1 = affine_map<(d0)[s0] -> (d0 * s0)>
 // #map2: two results, (64 - d0 * s0) and s0. Fed to affine.min below to
 // produce the slice size.
 #map2 = affine_map<(d0)[s0] -> (-(d0 * s0) + 64, s0)>

 // Static-shape case. A linalg.elemwise_unary nested in an scf.forall reads
 // slices of two linalg.generic results that are defined outside the loop.
 // The transform script at the end of this module applies fuse_producers to
 // operands 0 and 1 of the elemwise op. The FileCheck directives expect, after
 // the scf.forall line, a slice of each function argument followed by a
 // linalg.generic writing into that slice, and finally the elemwise op
 // consuming both generic results.
 module {
   // CHECK-LABEL: func.func @fuse_static
   //  CHECK-SAME:   %[[CHUNK_SIZE:[0-9a-z]+]]: index
   //  CHECK-SAME:   %[[IN:[0-9a-z]+]]: tensor<64xf32>
   //  CHECK-SAME:   %[[OUT:[0-9a-z]+]]: tensor<64xf32>
   func.func @fuse_static(%arg0: index, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<64xf32> {
     %cst = arith.constant 4.200000e+01 : f32
     // NOTE(review): %cst2 is never used; both generics below yield %cst.
     // Confirm whether the second generic was meant to yield %cst2.
     %cst2 = arith.constant 4.300000e+01 : f32
     // Producer of elemwise operand 0: yields %cst for every element of %arg1.
     %0 = linalg.generic
         {indexing_maps = [affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]}
         outs(%arg1 : tensor<64xf32>) {
     ^bb0(%arg3: f32):
       linalg.yield %cst : f32
     } -> tensor<64xf32>
     // Producer of elemwise operand 1: yields %cst for every element of %arg2.
     %1 = linalg.generic
         {indexing_maps = [affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]}
         outs(%arg2 : tensor<64xf32>) {
     ^bb0(%arg3: f32):
       linalg.yield %cst : f32
     } -> tensor<64xf32>

     // Loop upper bound: 64 ceildiv %arg0.
     %2 = affine.apply #map0()[%arg0]
     // CHECK: scf.forall
     %3 = scf.forall (%arg3) in (%2) shared_outs(%O = %arg2) -> (tensor<64xf32>) {
       // CHECK:    %[[OFFSET:.*]] = affine.apply
       // CHECK:    %[[SIZE:.*]] = affine.min
       // Slice offset %4 = %arg3 * %arg0; slice size %5 = min(64 - %arg3 * %arg0, %arg0).
       %4 = affine.apply #map1(%arg3)[%arg0]
       %5 = affine.min #map2(%arg3)[%arg0]
       // Slice of the first producer's result (becomes the elemwise ins operand).
       %6 = tensor.extract_slice %0[%4] [%5] [1] : tensor<64xf32> to tensor<?xf32>

       // CHECK:    %[[T0:.*]] = tensor.extract_slice %[[IN]][%[[OFFSET]]] [%[[SIZE]]] [{{.*}}]
       // CHECK:    %[[T1:.*]] = linalg.generic {{.*}} outs(%[[T0]]
       // CHECK:    %[[T2:.*]] = tensor.extract_slice %[[OUT]][%[[OFFSET]]] [%[[SIZE]]] [{{.*}}]
       // CHECK:    %[[T3:.*]] = linalg.generic {{.*}} outs(%[[T2]]
       // Slice of the second producer's result (becomes the elemwise outs operand).
       %7 = tensor.extract_slice %1[%4] [%5] [1] : tensor<64xf32> to tensor<?xf32>

       // CHECK:    %[[T4:.*]] = linalg.elemwise_unary ins(%[[T1]] {{.*}} outs(%[[T3]]
       %8 = linalg.elemwise_unary ins(%6 : tensor<?xf32>) outs(%7 : tensor<?xf32>) -> tensor<?xf32>
       // Write the per-iteration result back into the shared output at the same offset/size.
       scf.forall.in_parallel {
         tensor.parallel_insert_slice %8 into %O[%4] [%5] [1] : tensor<?xf32> into tensor<64xf32>
       }
     }
     func.return %3 : tensor<64xf32>
   }

   // Transform script that drives the test.
   transform.with_pdl_patterns {
   ^bb0(%arg0: !pdl.operation):
     // Matches any linalg.elemwise_unary op, with any operands and result types.
     pdl.pattern @match_elemwise : benefit(1) {
       %0 = operands
       %1 = types
       %2 = operation "linalg.elemwise_unary"(%0 : !pdl.range<value>)  -> (%1 : !pdl.range<type>)
       rewrite %2 with "transform.dialect"
     }
     // Matches any scf.forall op.
     // NOTE(review): this pattern is not referenced by the sequence below.
     pdl.pattern @match_in_parallel : benefit(1) {
       %0 = operands
       %1 = types
       %2 = operation "scf.forall"(%0 : !pdl.range<value>)  -> (%1 : !pdl.range<type>)
       rewrite %2 with "transform.dialect"
     }
     transform.sequence %arg0 failures(propagate) {
     ^bb1(%arg1: !pdl.operation):
       // Select the elemwise op, then request fusion of the producers of its
       // operands 0 (ins) and 1 (outs).
       %0 = pdl_match @match_elemwise in %arg1 : (!pdl.operation) -> !pdl.operation
       %1, %fusedOps:2 = fuse_producers %0 {operands_to_fuse=[0, 1]}
     }
   }
 }

 // -----

 // #map0: s0 ceildiv s1. Applied below with s0 = tensor dimension and
 // s1 = chunk size to produce the scf.forall upper bound.
 #map0 = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
 // #map1: d0 * s0. Applied to the loop index and chunk size to produce the
 // slice offset.
 #map1 = affine_map<(d0)[s0] -> (d0 * s0)>
 // #map2: two results, (s0 - d0 * s1) and s1. Fed to affine.min below to
 // produce the slice size.
 #map2 = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)>

 // Dynamic-shape case. A linalg.elemwise_unary nested in an scf.forall reads
 // a slice of one linalg.generic result defined outside the loop; its outs
 // operand is a slice taken directly from %arg2. The transform script at the
 // end of this module applies fuse_producers to operand 0 only. The FileCheck
 // directives expect, after the scf.forall line, a slice of the first tensor
 // argument, a linalg.generic writing into it, and the elemwise op consuming
 // that generic's result.
 module {
   // CHECK-LABEL: func.func @fuse_dynamic
   //  CHECK-SAME:   %[[CHUNK_SIZE:[0-9a-z]+]]: index
   //  CHECK-SAME:   %[[IN:[0-9a-z]+]]: tensor<?xf32>
   //  CHECK-SAME:   %[[OUT:[0-9a-z]+]]: tensor<?xf32>
   func.func @fuse_dynamic(%arg0: index, %arg1: tensor<?xf32>, %arg2: tensor<?xf32>) -> tensor<?xf32> {
     %cst = arith.constant 4.200000e+01 : f32
     %c0 = arith.constant 0 : index
     // Producer of elemwise operand 0: yields %cst for every element of %arg1.
     %0 = linalg.generic
         {indexing_maps = [affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]}
         outs(%arg1 : tensor<?xf32>) {
     ^bb0(%arg3: f32):
       linalg.yield %cst : f32
     } -> tensor<?xf32>
     // TODO: Choosing %arg2 here complicates the size computation.
     // The extent is read from %arg1; the loop upper bound is %d0 ceildiv %arg0.
     %d0 = tensor.dim %arg1, %c0 : tensor<?xf32>
     %1 = affine.apply #map0()[%d0, %arg0]
     // CHECK: scf.forall
     %2 = scf.forall (%arg3) in (%1) shared_outs(%O = %arg2) -> (tensor<?xf32>) {
       // CHECK:    %[[OFFSET:.*]] = affine.apply
       // CHECK:    %[[SIZE:.*]] = affine.min
       // Slice offset %3 = %arg3 * %arg0; slice size %4 = min(%d0 - %arg3 * %arg0, %arg0).
       %3 = affine.apply #map1(%arg3)[%arg0]
       %4 = affine.min #map2(%arg3)[%d0, %arg0]
       // Slice taken directly from the %arg2 function argument (elemwise outs operand).
       %5 = tensor.extract_slice %arg2[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>

       // CHECK:    %[[T0:.*]] = tensor.extract_slice %[[IN]][%[[OFFSET]]] [%[[SIZE]]] [{{.*}}]
       // CHECK:    %[[T1:.*]] = linalg.generic {{.*}} outs(%[[T0]]
       // Slice of the producer's result (elemwise ins operand, the one being fused).
       %6 = tensor.extract_slice %0[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>

       // CHECK:    %[[T2:.*]] = linalg.elemwise_unary ins(%[[T1]]
       %7 = linalg.elemwise_unary ins(%6 : tensor<?xf32>) outs(%5 : tensor<?xf32>) -> tensor<?xf32>
       // Write the per-iteration result back into the shared output at the same offset/size.
       scf.forall.in_parallel {
         tensor.parallel_insert_slice %7 into %O[%3] [%4] [1] : tensor<?xf32> into tensor<?xf32>
       }
     }
     func.return %2 : tensor<?xf32>
   }

   // Transform script that drives the test.
   transform.with_pdl_patterns {
   ^bb0(%arg0: !pdl.operation):
     // Matches any linalg.elemwise_unary op, with any operands and result types.
     pdl.pattern @match_elemwise : benefit(1) {
       %0 = operands
       %1 = types
       %2 = operation "linalg.elemwise_unary"(%0 : !pdl.range<value>)  -> (%1 : !pdl.range<type>)
       rewrite %2 with "transform.dialect"
     }
     // Matches any scf.forall op.
     // NOTE(review): this pattern is not referenced by the sequence below.
     pdl.pattern @match_in_parallel : benefit(1) {
       %0 = operands
       %1 = types
       %2 = operation "scf.forall"(%0 : !pdl.range<value>)  -> (%1 : !pdl.range<type>)
       rewrite %2 with "transform.dialect"
     }
     transform.sequence %arg0 failures(propagate) {
     ^bb1(%arg1: !pdl.operation):
       // Select the elemwise op, then request fusion of the producer of its
       // operand 0 (ins) only.
       %0 = pdl_match @match_elemwise in %arg1 : (!pdl.operation) -> !pdl.operation
       %1, %fusedOps = fuse_producers %0 {operands_to_fuse=[0]}
     }
   }
 }
	// RUN: iree-dialects-opt %s --transform-dialect-interpreter --split-input-file | FileCheck %s
	// TODO(#11765): Fix and re-enable this.
	// REQUIRES: dont-run

	// #map0: 64 ceildiv s0. Applied to the chunk-size argument below to produce
	// the scf.forall upper bound.
	#map0 = affine_map<()[s0] -> (64 ceildiv s0)>
	// #map1: d0 * s0. Applied to the loop index and chunk size to produce the
	// slice offset.
	#map1 = affine_map<(d0)[s0] -> (d0 * s0)>
	// #map2: two results, (64 - d0 * s0) and s0. Fed to affine.min below to
	// produce the slice size.
	#map2 = affine_map<(d0)[s0] -> (-(d0 * s0) + 64, s0)>

	// Static-shape case. A linalg.elemwise_unary nested in an scf.forall reads
	// slices of two linalg.generic results that are defined outside the loop.
	// The transform script at the end of this module applies fuse_producers to
	// operands 0 and 1 of the elemwise op. The FileCheck directives expect, after
	// the scf.forall line, a slice of each function argument followed by a
	// linalg.generic writing into that slice, and finally the elemwise op
	// consuming both generic results.
	module {
	  // CHECK-LABEL: func.func @fuse_static
	  //  CHECK-SAME:   %[[CHUNK_SIZE:[0-9a-z]+]]: index
	  //  CHECK-SAME:   %[[IN:[0-9a-z]+]]: tensor<64xf32>
	  //  CHECK-SAME:   %[[OUT:[0-9a-z]+]]: tensor<64xf32>
	  func.func @fuse_static(%arg0: index, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<64xf32> {
	    %cst = arith.constant 4.200000e+01 : f32
	    // NOTE(review): %cst2 is never used; both generics below yield %cst.
	    // Confirm whether the second generic was meant to yield %cst2.
	    %cst2 = arith.constant 4.300000e+01 : f32
	    // Producer of elemwise operand 0: yields %cst for every element of %arg1.
	    %0 = linalg.generic
	        {indexing_maps = [affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]}
	        outs(%arg1 : tensor<64xf32>) {
	    ^bb0(%arg3: f32):
	      linalg.yield %cst : f32
	    } -> tensor<64xf32>
	    // Producer of elemwise operand 1: yields %cst for every element of %arg2.
	    %1 = linalg.generic
	        {indexing_maps = [affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]}
	        outs(%arg2 : tensor<64xf32>) {
	    ^bb0(%arg3: f32):
	      linalg.yield %cst : f32
	    } -> tensor<64xf32>

	    // Loop upper bound: 64 ceildiv %arg0.
	    %2 = affine.apply #map0()[%arg0]
	    // CHECK: scf.forall
	    %3 = scf.forall (%arg3) in (%2) shared_outs(%O = %arg2) -> (tensor<64xf32>) {
	      // CHECK:    %[[OFFSET:.*]] = affine.apply
	      // CHECK:    %[[SIZE:.*]] = affine.min
	      // Slice offset %4 = %arg3 * %arg0; slice size %5 = min(64 - %arg3 * %arg0, %arg0).
	      %4 = affine.apply #map1(%arg3)[%arg0]
	      %5 = affine.min #map2(%arg3)[%arg0]
	      // Slice of the first producer's result (becomes the elemwise ins operand).
	      %6 = tensor.extract_slice %0[%4] [%5] [1] : tensor<64xf32> to tensor<?xf32>

	      // The wildcards below must be `.*`: a bare `.` matches exactly one
	      // character and cannot span an SSA name or an attribute dictionary.
	      // CHECK:    %[[T0:.*]] = tensor.extract_slice %[[IN]][%[[OFFSET]]] [%[[SIZE]]] [{{.*}}]
	      // CHECK:    %[[T1:.*]] = linalg.generic {{.*}} outs(%[[T0]]
	      // CHECK:    %[[T2:.*]] = tensor.extract_slice %[[OUT]][%[[OFFSET]]] [%[[SIZE]]] [{{.*}}]
	      // CHECK:    %[[T3:.*]] = linalg.generic {{.*}} outs(%[[T2]]
	      // Slice of the second producer's result (becomes the elemwise outs operand).
	      %7 = tensor.extract_slice %1[%4] [%5] [1] : tensor<64xf32> to tensor<?xf32>

	      // CHECK:    %[[T4:.*]] = linalg.elemwise_unary ins(%[[T1]] {{.*}} outs(%[[T3]]
	      %8 = linalg.elemwise_unary ins(%6 : tensor<?xf32>) outs(%7 : tensor<?xf32>) -> tensor<?xf32>
	      // Write the per-iteration result back into the shared output at the same offset/size.
	      scf.forall.in_parallel {
	        tensor.parallel_insert_slice %8 into %O[%4] [%5] [1] : tensor<?xf32> into tensor<64xf32>
	      }
	    }
	    func.return %3 : tensor<64xf32>
	  }

	  // Transform script that drives the test.
	  transform.with_pdl_patterns {
	  ^bb0(%arg0: !pdl.operation):
	    // Matches any linalg.elemwise_unary op, with any operands and result types.
	    pdl.pattern @match_elemwise : benefit(1) {
	      %0 = operands
	      %1 = types
	      %2 = operation "linalg.elemwise_unary"(%0 : !pdl.range<value>) -> (%1 : !pdl.range<type>)
	      rewrite %2 with "transform.dialect"
	    }
	    // Matches any scf.forall op.
	    // NOTE(review): this pattern is not referenced by the sequence below.
	    pdl.pattern @match_in_parallel : benefit(1) {
	      %0 = operands
	      %1 = types
	      %2 = operation "scf.forall"(%0 : !pdl.range<value>) -> (%1 : !pdl.range<type>)
	      rewrite %2 with "transform.dialect"
	    }
	    transform.sequence %arg0 failures(propagate) {
	    ^bb1(%arg1: !pdl.operation):
	      // Select the elemwise op, then request fusion of the producers of its
	      // operands 0 (ins) and 1 (outs).
	      %0 = pdl_match @match_elemwise in %arg1 : (!pdl.operation) -> !pdl.operation
	      %1, %fusedOps:2 = fuse_producers %0 {operands_to_fuse=[0, 1]}
	    }
	  }
	}

	// -----

	// #map0: s0 ceildiv s1. Applied below with s0 = tensor dimension and
	// s1 = chunk size to produce the scf.forall upper bound.
	#map0 = affine_map<()[s0, s1] -> (s0 ceildiv s1)>
	// #map1: d0 * s0. Applied to the loop index and chunk size to produce the
	// slice offset.
	#map1 = affine_map<(d0)[s0] -> (d0 * s0)>
	// #map2: two results, (s0 - d0 * s1) and s1. Fed to affine.min below to
	// produce the slice size.
	#map2 = affine_map<(d0)[s0, s1] -> (-(d0 * s1) + s0, s1)>

	// Dynamic-shape case. A linalg.elemwise_unary nested in an scf.forall reads
	// a slice of one linalg.generic result defined outside the loop; its outs
	// operand is a slice taken directly from %arg2. The transform script at the
	// end of this module applies fuse_producers to operand 0 only. The FileCheck
	// directives expect, after the scf.forall line, a slice of the first tensor
	// argument, a linalg.generic writing into it, and the elemwise op consuming
	// that generic's result.
	module {
	  // CHECK-LABEL: func.func @fuse_dynamic
	  //  CHECK-SAME:   %[[CHUNK_SIZE:[0-9a-z]+]]: index
	  //  CHECK-SAME:   %[[IN:[0-9a-z]+]]: tensor<?xf32>
	  //  CHECK-SAME:   %[[OUT:[0-9a-z]+]]: tensor<?xf32>
	  func.func @fuse_dynamic(%arg0: index, %arg1: tensor<?xf32>, %arg2: tensor<?xf32>) -> tensor<?xf32> {
	    %cst = arith.constant 4.200000e+01 : f32
	    %c0 = arith.constant 0 : index
	    // Producer of elemwise operand 0: yields %cst for every element of %arg1.
	    %0 = linalg.generic
	        {indexing_maps = [affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]}
	        outs(%arg1 : tensor<?xf32>) {
	    ^bb0(%arg3: f32):
	      linalg.yield %cst : f32
	    } -> tensor<?xf32>
	    // TODO: Choosing %arg2 here complicates the size computation.
	    // The extent is read from %arg1; the loop upper bound is %d0 ceildiv %arg0.
	    %d0 = tensor.dim %arg1, %c0 : tensor<?xf32>
	    %1 = affine.apply #map0()[%d0, %arg0]
	    // CHECK: scf.forall
	    %2 = scf.forall (%arg3) in (%1) shared_outs(%O = %arg2) -> (tensor<?xf32>) {
	      // CHECK:    %[[OFFSET:.*]] = affine.apply
	      // CHECK:    %[[SIZE:.*]] = affine.min
	      // Slice offset %3 = %arg3 * %arg0; slice size %4 = min(%d0 - %arg3 * %arg0, %arg0).
	      %3 = affine.apply #map1(%arg3)[%arg0]
	      %4 = affine.min #map2(%arg3)[%d0, %arg0]
	      // Slice taken directly from the %arg2 function argument (elemwise outs operand).
	      %5 = tensor.extract_slice %arg2[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>

	      // The wildcards below must be `.*`: a bare `.` matches exactly one
	      // character and cannot span an SSA name or an attribute dictionary.
	      // CHECK:    %[[T0:.*]] = tensor.extract_slice %[[IN]][%[[OFFSET]]] [%[[SIZE]]] [{{.*}}]
	      // CHECK:    %[[T1:.*]] = linalg.generic {{.*}} outs(%[[T0]]
	      // Slice of the producer's result (elemwise ins operand, the one being fused).
	      %6 = tensor.extract_slice %0[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>

	      // CHECK:    %[[T2:.*]] = linalg.elemwise_unary ins(%[[T1]]
	      %7 = linalg.elemwise_unary ins(%6 : tensor<?xf32>) outs(%5 : tensor<?xf32>) -> tensor<?xf32>
	      // Write the per-iteration result back into the shared output at the same offset/size.
	      scf.forall.in_parallel {
	        tensor.parallel_insert_slice %7 into %O[%3] [%4] [1] : tensor<?xf32> into tensor<?xf32>
	      }
	    }
	    func.return %2 : tensor<?xf32>
	  }

	  // Transform script that drives the test.
	  transform.with_pdl_patterns {
	  ^bb0(%arg0: !pdl.operation):
	    // Matches any linalg.elemwise_unary op, with any operands and result types.
	    pdl.pattern @match_elemwise : benefit(1) {
	      %0 = operands
	      %1 = types
	      %2 = operation "linalg.elemwise_unary"(%0 : !pdl.range<value>) -> (%1 : !pdl.range<type>)
	      rewrite %2 with "transform.dialect"
	    }
	    // Matches any scf.forall op.
	    // NOTE(review): this pattern is not referenced by the sequence below.
	    pdl.pattern @match_in_parallel : benefit(1) {
	      %0 = operands
	      %1 = types
	      %2 = operation "scf.forall"(%0 : !pdl.range<value>) -> (%1 : !pdl.range<type>)
	      rewrite %2 with "transform.dialect"
	    }
	    transform.sequence %arg0 failures(propagate) {
	    ^bb1(%arg1: !pdl.operation):
	      // Select the elemwise op, then request fusion of the producer of its
	      // operand 0 (ins) only.
	      %0 = pdl_match @match_elemwise in %arg1 : (!pdl.operation) -> !pdl.operation
	      %1, %fusedOps = fuse_producers %0 {operands_to_fuse=[0]}
	    }
	  }
	}