[LinalgExt][NFC] Split tiling tests into tiling and distribution tests. (#15903)

This is a step towards retiring the LinalgExt tiling patterns. The upstream
methods model distribution through scf.forall ops and tiling through
scf.for ops. Splitting the tests allows us to track the two separately.
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/distribution.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/distribution.mlir
new file mode 100644
index 0000000..6203e18
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/distribution.mlir
@@ -0,0 +1,158 @@
+// RUN: iree-dialects-opt --iree-linalg-ext-tile --split-input-file -cse %s | FileCheck  %s
+
+func.func @scatter_tiling_distribution(
+    %original: tensor<?x?xf32>, %indices: tensor<?x1xi32>,
+    %update : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = iree_linalg_ext.scatter
+    {__internal_linalg_transform__ = "distribute_input"}
+    dimension_map = [0]
+    unique_indices(true)
+    ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//   CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 10)>
+//   CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//       CHECK: func.func @scatter_tiling_distribution(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: tensor<?x1xi32>
+//  CHECK-SAME:   %[[UPDATES:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[TILESIZE:.+]] = arith.constant 10 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[UPDATES]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[UPDATES]], %[[C1]]
+//   CHECK-DAG:   %[[ID:.+]] = iree_input.dispatch.workgroup.id[0]
+//   CHECK-DAG:   %[[COUNT:.+]] = iree_input.dispatch.workgroup.count[0]
+//   CHECK-DAG:   %[[OFFSET:.+]] = affine.apply #[[MAP0]]()[%[[ID]]]
+//   CHECK-DAG:   %[[STEP:.+]] = affine.apply #[[MAP0]]()[%[[COUNT]]]
+//       CHECK:   %[[RESULT:.+]] = scf.for %[[IV:.+]] = %[[OFFSET]] to %[[D0]] step %[[STEP]]
+//  CHECK-SAME:       iter_args(%[[INIT:.+]] = %[[ORIGINAL]])
+//       CHECK:     %[[USED_TILESIZE:.+]] = affine.min #[[MAP1]](%[[IV]])[%[[TILESIZE]], %[[D0]]]
+//       CHECK:     %[[UPDATE_SLICE:.+]] = tensor.extract_slice %[[UPDATES]][%[[IV]], 0]
+//  CHECK-SAME:         [%[[USED_TILESIZE]], %[[D1]]]
+//       CHECK:     %[[INDEX_SLICE:.+]] = tensor.extract_slice %[[INDICES]][%[[IV]], 0]
+//  CHECK-SAME:         [%[[USED_TILESIZE]], 1]
+//       CHECK:     %[[D2:.+]] = tensor.dim %[[ORIGINAL]], %[[C0]]
+//       CHECK:     %[[ORIGINAL_SLICE:.+]] = tensor.extract_slice %[[ORIGINAL]][0, 0]
+//  CHECK-SAME:         [%[[D2]], %[[D1]]]
+//       CHECK:     %[[SCATTER_TILE:.+]] = iree_linalg_ext.scatter
+//  CHECK-SAME:        __internal_linalg_transform__ = "distribute_output"
+//  CHECK-SAME:        unique_indices(true)
+//  CHECK-SAME:        ins(%[[UPDATE_SLICE]], %[[INDEX_SLICE]]
+//  CHECK-SAME:        outs(%[[ORIGINAL_SLICE]]
+//       CHECK:     %[[YIELD:.+]] = tensor.insert_slice %[[SCATTER_TILE]] into %[[INIT]][0, 0]
+//  CHECK-SAME:        [%[[D2]], %[[D1]]]
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func.func @sort_3d_multi_result_distribute(
+  %arg0: tensor<?x?x?xi32>, %arg1 : tensor<?x?x?xf32>)
+  -> (tensor<?x?x?xi32>, tensor<?x?x?xf32>) {
+  %0, %1 = iree_linalg_ext.sort
+      {__internal_linalg_transform__ = "distribute_input"}
+      dimension(1)
+      outs(%arg0, %arg1 : tensor<?x?x?xi32>, tensor<?x?x?xf32>) {
+      ^bb0(%arg2: i32, %arg3: i32, %arg4 : f32, %arg5 : f32):  // no predecessors
+        %2 = arith.cmpf ogt, %arg4, %arg5 : f32
+        iree_linalg_ext.yield %2 : i1
+      } -> tensor<?x?x?xi32>, tensor<?x?x?xf32>
+  return %0, %1 : tensor<?x?x?xi32>, tensor<?x?x?xf32>
+}
+//   CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 10)>
+//   CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//   CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0] -> (s0 * 30)>
+//   CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0, s1] -> (30, -d0 + s1)>
+//       CHECK: func.func @sort_3d_multi_result_distribute(
+//  CHECK-SAME:   %[[OPERAND1:[a-zA-Z0-9_]+]]: tensor<?x?x?xi32>
+//  CHECK-SAME:   %[[OPERAND2:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
+//   CHECK-DAG:   %[[TILESIZE1:.+]] = arith.constant 10 : index
+//   CHECK-DAG:   %[[TILESIZE2:.+]] = arith.constant 30 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[OPERAND1]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[OPERAND1]], %[[C1]]
+//   CHECK-DAG:   %[[D2:.+]] = tensor.dim %[[OPERAND1]], %[[C2]]
+//   CHECK-DAG:   %[[IDX:.+]] = iree_input.dispatch.workgroup.id[0]
+//   CHECK-DAG:   %[[COUNTX:.+]] = iree_input.dispatch.workgroup.count[0]
+//   CHECK-DAG:   %[[IDY:.+]] = iree_input.dispatch.workgroup.id[1]
+//   CHECK-DAG:   %[[COUNTY:.+]] = iree_input.dispatch.workgroup.count[1]
+//   CHECK-DAG:   %[[OFFSETY:.+]] = affine.apply #[[MAP0]]()[%[[IDY]]]
+//   CHECK-DAG:   %[[STEPY:.+]] = affine.apply #[[MAP0]]()[%[[COUNTY]]]
+//       CHECK:   %[[RESULT:.+]]:2 = scf.for %[[IV0:.+]] = %[[OFFSETY]] to %[[D0]] step %[[STEPY]]
+//  CHECK-SAME:       iter_args(%[[INIT1:.+]] = %[[OPERAND1]], %[[INIT2:.+]] = %[[OPERAND2]])
+//   CHECK-DAG:     %[[USED_TILESIZE1:.+]] = affine.min #[[MAP1]](%[[IV0]])[%[[TILESIZE1]], %[[D0]]]
+//   CHECK-DAG:     %[[OFFSETX:.+]] = affine.apply #[[MAP2]]()[%[[IDX]]]
+//   CHECK-DAG:     %[[STEPX:.+]] = affine.apply #[[MAP2]]()[%[[COUNTX]]]
+//       CHECK:     %[[RESULT_INNER:.+]]:2 = scf.for %[[IV1:.+]] = %[[OFFSETX]] to %[[D2]] step %[[STEPX]]
+//  CHECK-SAME:         iter_args(%[[INIT3:.+]] = %[[INIT1]], %[[INIT4:.+]] = %[[INIT2]])
+//   CHECK-DAG:       %[[USED_TILESIZE2:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[TILESIZE2]], %[[D2]]]
+//       CHECK:       %[[OPERAND1_SLICE:.+]] = tensor.extract_slice %[[OPERAND1]][%[[IV0]], 0, %[[IV1]]]
+//  CHECK-SAME:           [%[[USED_TILESIZE1]], %[[D1]], %[[USED_TILESIZE2]]]
+//       CHECK:       %[[OPERAND2_SLICE:.+]] = tensor.extract_slice %[[OPERAND2]][%[[IV0]], 0, %[[IV1]]]
+//  CHECK-SAME:           [%[[USED_TILESIZE1]], %[[D1]], %[[USED_TILESIZE2]]]
+//       CHECK:       %[[SORT_SLICE:.+]]:2 = iree_linalg_ext.sort
+//  CHECK-SAME:           __internal_linalg_transform__ = "distribute_output"
+//  CHECK-SAME:           outs(%[[OPERAND1_SLICE]], %[[OPERAND2_SLICE]]
+//       CHECK:       %[[YIELD1:.+]] = tensor.insert_slice %[[SORT_SLICE]]#0
+//  CHECK-SAME:           into %[[INIT3]][%[[IV0]], 0, %[[IV1]]]
+//       CHECK:       %[[YIELD2:.+]] = tensor.insert_slice %[[SORT_SLICE]]#1
+//  CHECK-SAME:           into %[[INIT4]][%[[IV0]], 0, %[[IV1]]]
+//       CHECK:       scf.yield %[[YIELD1]], %[[YIELD2]]
+//       CHECK:     scf.yield %[[RESULT_INNER]]#0, %[[RESULT_INNER]]#1
+//       CHECK:   return %[[RESULT]]#0, %[[RESULT]]#1
+
+// -----
+
+func.func @sort_3d_multi_result_distribute_memref(
+  %arg0: memref<?x?x?xi32>, %arg1 : memref<?x?x?xf32>) {
+  iree_linalg_ext.sort
+      {__internal_linalg_transform__ = "distribute_input"}
+      dimension(1)
+      outs(%arg0, %arg1 : memref<?x?x?xi32>, memref<?x?x?xf32>) {
+      ^bb0(%arg2: i32, %arg3: i32, %arg4 : f32, %arg5 : f32):  // no predecessors
+        %0 = arith.cmpf ogt, %arg4, %arg5 : f32
+        iree_linalg_ext.yield %0 : i1
+      }
+  return
+}
+//   CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 10)>
+//   CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//   CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0] -> (s0 * 30)>
+//   CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0, s1] -> (30, -d0 + s1)>
+//       CHECK: func.func @sort_3d_multi_result_distribute_memref(
+//  CHECK-SAME:   %[[OPERAND1:[a-zA-Z0-9_]+]]: memref<?x?x?xi32>
+//  CHECK-SAME:   %[[OPERAND2:[a-zA-Z0-9_]+]]: memref<?x?x?xf32>
+//   CHECK-DAG:   %[[TILESIZE1:.+]] = arith.constant 10 : index
+//   CHECK-DAG:   %[[TILESIZE2:.+]] = arith.constant 30 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[D0:.+]] = memref.dim %[[OPERAND1]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = memref.dim %[[OPERAND1]], %[[C1]]
+//   CHECK-DAG:   %[[D2:.+]] = memref.dim %[[OPERAND1]], %[[C2]]
+//   CHECK-DAG:   %[[IDX:.+]] = iree_input.dispatch.workgroup.id[0]
+//   CHECK-DAG:   %[[COUNTX:.+]] = iree_input.dispatch.workgroup.count[0]
+//   CHECK-DAG:   %[[IDY:.+]] = iree_input.dispatch.workgroup.id[1]
+//   CHECK-DAG:   %[[COUNTY:.+]] = iree_input.dispatch.workgroup.count[1]
+//   CHECK-DAG:   %[[OFFSETY:.+]] = affine.apply #[[MAP0]]()[%[[IDY]]]
+//   CHECK-DAG:   %[[STEPY:.+]] = affine.apply #[[MAP0]]()[%[[COUNTY]]]
+//       CHECK:   scf.for %[[IV0:.+]] = %[[OFFSETY]] to %[[D0]] step %[[STEPY]]
+//   CHECK-DAG:     %[[USED_TILESIZE1:.+]] = affine.min #[[MAP1]](%[[IV0]])[%[[TILESIZE1]], %[[D0]]]
+//   CHECK-DAG:     %[[OFFSETX:.+]] = affine.apply #[[MAP2]]()[%[[IDX]]]
+//   CHECK-DAG:     %[[STEPX:.+]] = affine.apply #[[MAP2]]()[%[[COUNTX]]]
+//       CHECK:     scf.for %[[IV1:.+]] = %[[OFFSETX]] to %[[D2]] step %[[STEPX]]
+//   CHECK-DAG:       %[[USED_TILESIZE2:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[TILESIZE2]], %[[D2]]]
+//       CHECK:       %[[OPERAND1_SLICE:.+]] = memref.subview %[[OPERAND1]][%[[IV0]], 0, %[[IV1]]]
+//  CHECK-SAME:           [%[[USED_TILESIZE1]], %[[D1]], %[[USED_TILESIZE2]]]
+//       CHECK:       %[[OPERAND2_SLICE:.+]] = memref.subview %[[OPERAND2]][%[[IV0]], 0, %[[IV1]]]
+//  CHECK-SAME:           [%[[USED_TILESIZE1]], %[[D1]], %[[USED_TILESIZE2]]]
+//       CHECK:       iree_linalg_ext.sort
+//  CHECK-SAME:           __internal_linalg_transform__ = "distribute_output"
+//  CHECK-SAME:           outs(%[[OPERAND1_SLICE]], %[[OPERAND2_SLICE]]
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/tiling.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/tiling.mlir
index f86e5a3..0803b0c 100644
--- a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/tiling.mlir
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/tiling.mlir
@@ -99,57 +99,6 @@
 
 // -----
 
-func.func @scatter_tiling_distribution(
-    %original: tensor<?x?xf32>, %indices: tensor<?x1xi32>,
-    %update : tensor<?x?xf32>) -> tensor<?x?xf32> {
-  %0 = iree_linalg_ext.scatter
-    {__internal_linalg_transform__ = "distribute_input"}
-    dimension_map = [0]
-    unique_indices(true)
-    ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
-    outs(%original : tensor<?x?xf32>) {
-    ^bb0(%arg1: f32, %arg2: f32):
-      %1 = arith.addf %arg1, %arg2 : f32
-      iree_linalg_ext.yield %1 : f32
-    } -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
-}
-//   CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 10)>
-//   CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
-//       CHECK: func.func @scatter_tiling_distribution(
-//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: tensor<?x1xi32>
-//  CHECK-SAME:   %[[UPDATES:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
-//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-//   CHECK-DAG:   %[[TILESIZE:.+]] = arith.constant 10 : index
-//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[UPDATES]], %[[C0]]
-//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[UPDATES]], %[[C1]]
-//   CHECK-DAG:   %[[ID:.+]] = iree_input.dispatch.workgroup.id[0]
-//   CHECK-DAG:   %[[COUNT:.+]] = iree_input.dispatch.workgroup.count[0]
-//   CHECK-DAG:   %[[OFFSET:.+]] = affine.apply #[[MAP0]]()[%[[ID]]]
-//   CHECK-DAG:   %[[STEP:.+]] = affine.apply #[[MAP0]]()[%[[COUNT]]]
-//       CHECK:   %[[RESULT:.+]] = scf.for %[[IV:.+]] = %[[OFFSET]] to %[[D0]] step %[[STEP]]
-//  CHECK-SAME:       iter_args(%[[INIT:.+]] = %[[ORIGINAL]])
-//       CHECK:     %[[USED_TILESIZE:.+]] = affine.min #[[MAP1]](%[[IV]])[%[[TILESIZE]], %[[D0]]]
-//       CHECK:     %[[UPDATE_SLICE:.+]] = tensor.extract_slice %[[UPDATES]][%[[IV]], 0]
-//  CHECK-SAME:         [%[[USED_TILESIZE]], %[[D1]]]
-//       CHECK:     %[[INDEX_SLICE:.+]] = tensor.extract_slice %[[INDICES]][%[[IV]], 0]
-//  CHECK-SAME:         [%[[USED_TILESIZE]], 1]
-//       CHECK:     %[[D2:.+]] = tensor.dim %[[ORIGINAL]], %[[C0]]
-//       CHECK:     %[[ORIGINAL_SLICE:.+]] = tensor.extract_slice %[[ORIGINAL]][0, 0]
-//  CHECK-SAME:         [%[[D2]], %[[D1]]]
-//       CHECK:     %[[SCATTER_TILE:.+]] = iree_linalg_ext.scatter
-//  CHECK-SAME:        __internal_linalg_transform__ = "distribute_output"
-//  CHECK-SAME:        unique_indices(true)
-//  CHECK-SAME:        ins(%[[UPDATE_SLICE]], %[[INDEX_SLICE]]
-//  CHECK-SAME:        outs(%[[ORIGINAL_SLICE]]
-//       CHECK:     %[[YIELD:.+]] = tensor.insert_slice %[[SCATTER_TILE]] into %[[INIT]][0, 0]
-//  CHECK-SAME:        [%[[D2]], %[[D1]]]
-//       CHECK:   return %[[RESULT]]
-
-// -----
-
 func.func @scatter_no_tiling(
     %original: tensor<?x?xf32>, %indices: tensor<?x1xi32>,
     %update : tensor<?x?xf32>) -> tensor<?x?xf32> {
@@ -407,114 +356,6 @@
 
 // -----
 
-func.func @sort_3d_multi_result_distribute(
-  %arg0: tensor<?x?x?xi32>, %arg1 : tensor<?x?x?xf32>)
-  -> (tensor<?x?x?xi32>, tensor<?x?x?xf32>) {
-  %0, %1 = iree_linalg_ext.sort
-      {__internal_linalg_transform__ = "distribute_input"}
-      dimension(1)
-      outs(%arg0, %arg1 : tensor<?x?x?xi32>, tensor<?x?x?xf32>) {
-      ^bb0(%arg2: i32, %arg3: i32, %arg4 : f32, %arg5 : f32):  // no predecessors
-        %2 = arith.cmpf ogt, %arg4, %arg5 : f32
-        iree_linalg_ext.yield %2 : i1
-      } -> tensor<?x?x?xi32>, tensor<?x?x?xf32>
-  return %0, %1 : tensor<?x?x?xi32>, tensor<?x?x?xf32>
-}
-//   CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 10)>
-//   CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
-//   CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0] -> (s0 * 30)>
-//   CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0, s1] -> (30, -d0 + s1)>
-//       CHECK: func.func @sort_3d_multi_result_distribute(
-//  CHECK-SAME:   %[[OPERAND1:[a-zA-Z0-9_]+]]: tensor<?x?x?xi32>
-//  CHECK-SAME:   %[[OPERAND2:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
-//   CHECK-DAG:   %[[TILESIZE1:.+]] = arith.constant 10 : index
-//   CHECK-DAG:   %[[TILESIZE2:.+]] = arith.constant 30 : index
-//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-//   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
-//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[OPERAND1]], %[[C0]]
-//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[OPERAND1]], %[[C1]]
-//   CHECK-DAG:   %[[D2:.+]] = tensor.dim %[[OPERAND1]], %[[C2]]
-//   CHECK-DAG:   %[[IDX:.+]] = iree_input.dispatch.workgroup.id[0]
-//   CHECK-DAG:   %[[COUNTX:.+]] = iree_input.dispatch.workgroup.count[0]
-//   CHECK-DAG:   %[[IDY:.+]] = iree_input.dispatch.workgroup.id[1]
-//   CHECK-DAG:   %[[COUNTY:.+]] = iree_input.dispatch.workgroup.count[1]
-//   CHECK-DAG:   %[[OFFSETY:.+]] = affine.apply #[[MAP0]]()[%[[IDY]]]
-//   CHECK-DAG:   %[[STEPY:.+]] = affine.apply #[[MAP0]]()[%[[COUNTY]]]
-//       CHECK:   %[[RESULT:.+]]:2 = scf.for %[[IV0:.+]] = %[[OFFSETY]] to %[[D0]] step %[[STEPY]]
-//  CHECK-SAME:       iter_args(%[[INIT1:.+]] = %[[OPERAND1]], %[[INIT2:.+]] = %[[OPERAND2]])
-//   CHECK-DAG:     %[[USED_TILESIZE1:.+]] = affine.min #[[MAP1]](%[[IV0]])[%[[TILESIZE1]], %[[D0]]]
-//   CHECK-DAG:     %[[OFFSETX:.+]] = affine.apply #[[MAP2]]()[%[[IDX]]]
-//   CHECK-DAG:     %[[STEPX:.+]] = affine.apply #[[MAP2]]()[%[[COUNTX]]]
-//       CHECK:     %[[RESULT_INNER:.+]]:2 = scf.for %[[IV1:.+]] = %[[OFFSETX]] to %[[D2]] step %[[STEPX]]
-//  CHECK-SAME:         iter_args(%[[INIT3:.+]] = %[[INIT1]], %[[INIT4:.+]] = %[[INIT2]])
-//   CHECK-DAG:       %[[USED_TILESIZE2:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[TILESIZE2]], %[[D2]]]
-//       CHECK:       %[[OPERAND1_SLICE:.+]] = tensor.extract_slice %[[OPERAND1]][%[[IV0]], 0, %[[IV1]]]
-//  CHECK-SAME:           [%[[USED_TILESIZE1]], %[[D1]], %[[USED_TILESIZE2]]]
-//       CHECK:       %[[OPERAND2_SLICE:.+]] = tensor.extract_slice %[[OPERAND2]][%[[IV0]], 0, %[[IV1]]]
-//  CHECK-SAME:           [%[[USED_TILESIZE1]], %[[D1]], %[[USED_TILESIZE2]]]
-//       CHECK:       %[[SORT_SLICE:.+]]:2 = iree_linalg_ext.sort
-//  CHECK-SAME:           __internal_linalg_transform__ = "distribute_output"
-//  CHECK-SAME:           outs(%[[OPERAND1_SLICE]], %[[OPERAND2_SLICE]]
-//       CHECK:       %[[YIELD1:.+]] = tensor.insert_slice %[[SORT_SLICE]]#0
-//  CHECK-SAME:           into %[[INIT3]][%[[IV0]], 0, %[[IV1]]]
-//       CHECK:       %[[YIELD2:.+]] = tensor.insert_slice %[[SORT_SLICE]]#1
-//  CHECK-SAME:           into %[[INIT4]][%[[IV0]], 0, %[[IV1]]]
-//       CHECK:       scf.yield %[[YIELD1]], %[[YIELD2]]
-//       CHECK:     scf.yield %[[RESULT_INNER]]#0, %[[RESULT_INNER]]#1
-//       CHECK:   return %[[RESULT]]#0, %[[RESULT]]#1
-
-// -----
-
-func.func @sort_3d_multi_result_distribute_memref(
-  %arg0: memref<?x?x?xi32>, %arg1 : memref<?x?x?xf32>) {
-  iree_linalg_ext.sort
-      {__internal_linalg_transform__ = "distribute_input"}
-      dimension(1)
-      outs(%arg0, %arg1 : memref<?x?x?xi32>, memref<?x?x?xf32>) {
-      ^bb0(%arg2: i32, %arg3: i32, %arg4 : f32, %arg5 : f32):  // no predecessors
-        %0 = arith.cmpf ogt, %arg4, %arg5 : f32
-        iree_linalg_ext.yield %0 : i1
-      }
-  return
-}
-//   CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 10)>
-//   CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
-//   CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0] -> (s0 * 30)>
-//   CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0, s1] -> (30, -d0 + s1)>
-//       CHECK: func.func @sort_3d_multi_result_distribute_memref(
-//  CHECK-SAME:   %[[OPERAND1:[a-zA-Z0-9_]+]]: memref<?x?x?xi32>
-//  CHECK-SAME:   %[[OPERAND2:[a-zA-Z0-9_]+]]: memref<?x?x?xf32>
-//   CHECK-DAG:   %[[TILESIZE1:.+]] = arith.constant 10 : index
-//   CHECK-DAG:   %[[TILESIZE2:.+]] = arith.constant 30 : index
-//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-//   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
-//   CHECK-DAG:   %[[D0:.+]] = memref.dim %[[OPERAND1]], %[[C0]]
-//   CHECK-DAG:   %[[D1:.+]] = memref.dim %[[OPERAND1]], %[[C1]]
-//   CHECK-DAG:   %[[D2:.+]] = memref.dim %[[OPERAND1]], %[[C2]]
-//   CHECK-DAG:   %[[IDX:.+]] = iree_input.dispatch.workgroup.id[0]
-//   CHECK-DAG:   %[[COUNTX:.+]] = iree_input.dispatch.workgroup.count[0]
-//   CHECK-DAG:   %[[IDY:.+]] = iree_input.dispatch.workgroup.id[1]
-//   CHECK-DAG:   %[[COUNTY:.+]] = iree_input.dispatch.workgroup.count[1]
-//   CHECK-DAG:   %[[OFFSETY:.+]] = affine.apply #[[MAP0]]()[%[[IDY]]]
-//   CHECK-DAG:   %[[STEPY:.+]] = affine.apply #[[MAP0]]()[%[[COUNTY]]]
-//       CHECK:   scf.for %[[IV0:.+]] = %[[OFFSETY]] to %[[D0]] step %[[STEPY]]
-//   CHECK-DAG:     %[[USED_TILESIZE1:.+]] = affine.min #[[MAP1]](%[[IV0]])[%[[TILESIZE1]], %[[D0]]]
-//   CHECK-DAG:     %[[OFFSETX:.+]] = affine.apply #[[MAP2]]()[%[[IDX]]]
-//   CHECK-DAG:     %[[STEPX:.+]] = affine.apply #[[MAP2]]()[%[[COUNTX]]]
-//       CHECK:     scf.for %[[IV1:.+]] = %[[OFFSETX]] to %[[D2]] step %[[STEPX]]
-//   CHECK-DAG:       %[[USED_TILESIZE2:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[TILESIZE2]], %[[D2]]]
-//       CHECK:       %[[OPERAND1_SLICE:.+]] = memref.subview %[[OPERAND1]][%[[IV0]], 0, %[[IV1]]]
-//  CHECK-SAME:           [%[[USED_TILESIZE1]], %[[D1]], %[[USED_TILESIZE2]]]
-//       CHECK:       %[[OPERAND2_SLICE:.+]] = memref.subview %[[OPERAND2]][%[[IV0]], 0, %[[IV1]]]
-//  CHECK-SAME:           [%[[USED_TILESIZE1]], %[[D1]], %[[USED_TILESIZE2]]]
-//       CHECK:       iree_linalg_ext.sort
-//  CHECK-SAME:           __internal_linalg_transform__ = "distribute_output"
-//  CHECK-SAME:           outs(%[[OPERAND1_SLICE]], %[[OPERAND2_SLICE]]
-
-// -----
-
 func.func @fft_1d_stage_5(%arg0: tensor<1024xf32>, %arg1: tensor<1024xf32>,
     %arg2: tensor<16xf32>, %arg3: tensor<16xf32>) -> (tensor<1024xf32>, tensor<1024xf32>) {
   %cst1 = arith.constant 5 : index