Sandbox integrate (#8581)

* Bring LinalgTransform dialect from the sandbox to iree-dialects.

Temporarily name the dialect "iree_linalg_transform" instead of "linalg_transform" to avoid name conflicts and ease the transition.
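
For example, an op the sandbox spells "linalg_transform.sequence" is spelled "iree_linalg_transform.sequence" here (op name illustrative).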

* Add LinalgTransform Python bindings

Temporarily name the dialect "iree_linalg_transform" instead of "linalg_transform" to avoid name conflicts and ease the transition.

* [NFC] Add the MLIR clang-format and format iree-dialects

* Update to sandbox 77ca66e88d130b195b2eac169f17b95305a98577.

* Move Dialect tests to a location consistent with core MLIR

* Update sandbox to 3738d5792a3da6f03628c4375183cb39e3a82d51

* Format

* Drop spurious dependency

* clang-format

* Build fixes

* Move include/Transforms -> include/iree-dialects/Transforms

* Disable pytype on _iree_linalg_transforms_ops_ext.py

* clang-format

* More BUILD fixes

* Fix unit test
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/canonicalize.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/canonicalize.mlir
new file mode 100644
index 0000000..b8434d2
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/canonicalize.mlir
@@ -0,0 +1,42 @@
+// RUN: iree-dialects-opt -canonicalize -split-input-file %s | FileCheck %s
+
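+// These tests exercise canonicalization patterns on iree_linalg_ext ops:
+// folding tensor.cast producers into ops that can absorb the static type, and
+// folding constant operands into the static positions of
+// iree_linalg_ext.parallel_insert_slice.
+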
+// CHECK-LABEL: func @tensor.cast(
+func @tensor.cast(%arg0: tensor<3x5xi32>) -> tensor<3x5xi32> {
+  %init = linalg.init_tensor [3, 5] : tensor<3x5xi32>
+
+  %casted_arg0 = tensor.cast %arg0 : tensor<3x5xi32> to tensor<?x?xi32>
+  %casted_init = tensor.cast %init : tensor<3x5xi32> to tensor<?x?xi32>
+
+// CHECK:      iree_linalg_ext.reverse
+// CHECK-SAME:   ins(%{{[a-zA-Z0-9]*}} : tensor<3x5xi32>)
+// CHECK-SAME:  outs(%{{[a-zA-Z0-9]*}} : tensor<3x5xi32>)
+  %0 = iree_linalg_ext.reverse
+         dimensions(dense<0> : tensor<1xi64>)
+         ins(%casted_arg0 : tensor<?x?xi32>)
+         outs(%casted_init : tensor<?x?xi32>) : tensor<?x?xi32>
+
+  %1 = tensor.cast %0 : tensor<?x?xi32> to tensor<3x5xi32>
+
+  return %1: tensor<3x5xi32>
+}
+
+// CHECK-LABEL: func @canonicalize_insert_slice_indices(
+//  CHECK-SAME:     %[[arg0:.*]]: tensor<?x?xf32>, %[[arg1:.*]]: tensor<?x?xf32>,
+//  CHECK-SAME:     %[[idx:.*]]: index
+func @canonicalize_insert_slice_indices(
+    %arg0 : tensor<?x?xf32>, %arg1: tensor<?x?xf32>,
+    %idx : index) -> tensor<?x?xf32>
+{
+  %cst = arith.constant 4.200000e+01 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  %2 = iree_linalg_ext.in_parallel %idx  -> (tensor<?x?xf32>) {
+    ^bb0(%arg3: index):  // no predecessors
+      iree_linalg_ext.perform_concurrently {
+        // CHECK: iree_linalg_ext.parallel_insert_slice %[[arg0]] into %[[arg1]][%[[idx]], 0] [1, 5] [1, 1]
+        iree_linalg_ext.parallel_insert_slice %arg0 into %arg1[%idx, %c0] [%c1, 5] [%c1, %c1] : tensor<?x?xf32> into tensor<?x?xf32>
+      }
+  }
+  return %2 : tensor<?x?xf32>
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/convert_to_loops.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/convert_to_loops.mlir
new file mode 100644
index 0000000..f4871a8
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/convert_to_loops.mlir
@@ -0,0 +1,634 @@
+// RUN: iree-dialects-opt -split-input-file -iree-linalg-ext-to-loops %s | FileCheck %s
+
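+// These tests check the -iree-linalg-ext-to-loops lowering of iree_linalg_ext
+// ops (sort, scatter, fft, reverse, scan) to scf.for loop nests over memrefs.
+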
+func @sort_1d(%arg0: memref<128xi32>) {
+  iree_linalg_ext.sort dimension(0)
+    outs(%arg0 : memref<128xi32>) {
+  ^bb0(%arg2: i32, %arg3: i32):  // no predecessors
+    %0 = arith.cmpi sgt, %arg2, %arg3 : i32
+    iree_linalg_ext.yield %0 : i1
+  }
+  return
+}
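+// The sort lowers to a bubble-sort-style nest: the outer loop runs one pass
+// per element, the inner loop walks adjacent pairs, applies the comparator
+// region, and swaps the pair when the comparator yields false.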
+// CHECK-LABEL: func @sort_1d
+// CHECK-SAME:    %[[BUF:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C128:.+]] = arith.constant 128 : index
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C127:.+]] = arith.constant 127 : index
+// CHECK:         scf.for %[[ARG1:.+]] = %[[C0]] to %[[C128]] step %[[C1]]
+// CHECK:           scf.for %[[ARG2:.+]] = %[[C0]] to %[[C127]] step %[[C1]]
+// CHECK:             %[[T1:.+]] = arith.addi %[[ARG2]], %[[C1]] : index
+// CHECK:             %[[V1:.+]] = memref.load %[[BUF]][%[[ARG2]]]
+// CHECK:             %[[V2:.+]] = memref.load %[[BUF]][%[[T1]]]
+// CHECK:             %[[COND:.+]] = arith.cmpi sgt, %[[V1]], %[[V2]] : i32
+// CHECK:             scf.if %[[COND]] {
+// CHECK:             } else {
+// CHECK:               %[[T2:.+]] = arith.addi %[[ARG2]], %[[C1]] : index
+// CHECK:               memref.store %[[V2]], %[[BUF]][%[[ARG2]]]
+// CHECK:               memref.store %[[V1]], %[[BUF]][%[[T2]]]
+// CHECK:             }
+
+// -----
+
+func @sort_2d(%arg0: memref<16x32xi32>) {
+  iree_linalg_ext.sort dimension(0)
+    outs(%arg0 : memref<16x32xi32>) {
+  ^bb0(%arg2: i32, %arg3: i32):  // no predecessors
+    %0 = arith.cmpi sgt, %arg2, %arg3 : i32
+    iree_linalg_ext.yield %0 : i1
+  }
+  return
+}
+// CHECK-LABEL: func @sort_2d
+// CHECK-SAME:    %[[BUF:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG:     %[[C32:.+]] = arith.constant 32 : index
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C15:.+]] = arith.constant 15 : index
+// CHECK:         scf.for %[[ARG1:.+]] = %[[C0]] to %[[C16]] step %[[C1]]
+// CHECK:           scf.for %[[ARG2:.+]] = %[[C0]] to %[[C32]] step %[[C1]]
+// CHECK:             scf.for %[[ARG3:.+]] = %[[C0]] to %[[C15]] step %[[C1]]
+// CHECK:               %[[T1:.+]] = arith.addi %[[ARG3]], %[[C1]] : index
+// CHECK:               %[[V1:.+]] = memref.load %[[BUF]][%[[ARG3]], %[[ARG2]]]
+// CHECK:               %[[V2:.+]] = memref.load %[[BUF]][%[[T1]], %[[ARG2]]]
+// CHECK:               %[[COND:.+]] = arith.cmpi sgt, %[[V1]], %[[V2]] : i32
+// CHECK:               scf.if %[[COND]] {
+// CHECK:               } else {
+// CHECK:                 %[[T2:.+]] = arith.addi %[[ARG3]], %[[C1]] : index
+// CHECK:                 memref.store %[[V2]], %[[BUF]][%[[ARG3]], %[[ARG2]]]
+// CHECK:                 memref.store %[[V1]], %[[BUF]][%[[T2]], %[[ARG2]]]
+// CHECK:               }
+
+// -----
+
+func @sort_multi(%arg0: memref<128xf32>, %arg1: memref<128xi32>) {
+  iree_linalg_ext.sort
+    dimension(0)
+    outs(%arg0, %arg1 : memref<128xf32>, memref<128xi32>) {
+  ^bb0(%arg2: f32, %arg3: f32, %arg4: i32, %arg5: i32):  // no predecessors
+    %0 = arith.cmpf ogt, %arg2, %arg3 : f32
+    iree_linalg_ext.yield %0 : i1
+  }
+  return
+}
+// CHECK-LABEL: func @sort_multi
+// CHECK-SAME:    %[[BUF1:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[BUF2:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C128:.+]] = arith.constant 128 : index
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C127:.+]] = arith.constant 127 : index
+// CHECK:         scf.for %[[ARG1:.+]] = %[[C0]] to %[[C128]] step %[[C1]]
+// CHECK:           scf.for %[[ARG2:.+]] = %[[C0]] to %[[C127]] step %[[C1]]
+// CHECK:             %[[T1:.+]] = arith.addi %[[ARG2]], %[[C1]] : index
+// CHECK:             %[[V1:.+]] = memref.load %[[BUF1]][%[[ARG2]]]
+// CHECK:             %[[V2:.+]] = memref.load %[[BUF1]][%[[T1]]]
+// CHECK:             %[[V3:.+]] = memref.load %[[BUF2]][%[[ARG2]]]
+// CHECK:             %[[V4:.+]] = memref.load %[[BUF2]][%[[T1]]]
+// CHECK:             %[[COND:.+]] = arith.cmpf ogt, %[[V1]], %[[V2]] : f32
+// CHECK:             scf.if %[[COND]] {
+// CHECK:             } else {
+// CHECK:               %[[T2:.+]] = arith.addi %[[ARG2]], %[[C1]] : index
+// CHECK:               memref.store %[[V2]], %[[BUF1]][%[[ARG2]]]
+// CHECK:               memref.store %[[V1]], %[[BUF1]][%[[T2]]]
+// CHECK:               memref.store %[[V4]], %[[BUF2]][%[[ARG2]]]
+// CHECK:               memref.store %[[V3]], %[[BUF2]][%[[T2]]]
+// CHECK:             }
+
+// -----
+
+func @scatter_update_scalar_1D(
+    %original: memref<8xi32>, %indices: memref<3x1xi32>,
+    %updates: memref<3xi32>) {
+  iree_linalg_ext.scatter unique_indices(true)
+    ins(%updates, %indices : memref<3xi32>, memref<3x1xi32>)
+    outs(%original : memref<8xi32>)  {
+  ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
+    iree_linalg_ext.yield %arg0 : i32
+  }
+  return
+}
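+// Scatter lowers to one loop over the updates: each iteration loads the
+// update value and its target index, casts the index from i32 to index, and
+// applies the combiner region (here a plain overwrite) at that position of
+// the original buffer.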
+// CHECK-LABEL: func @scatter_update_scalar_1D
+// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C3:.+]] = arith.constant 3 : index
+// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {
+// CHECK:           %[[T1:.+]] = memref.load %[[UPDATES]][%[[I]]] : memref<3xi32>
+// CHECK:           %[[T2:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]] : memref<3x1xi32>
+// CHECK:           %[[IDX:.+]] = arith.index_cast %[[T2]] : i32 to index
+// CHECK:           memref.store %[[T1]], %[[ORIGINAL]][%[[IDX]]]
+
+// -----
+
+func @scatter_add_scalar_2D(
+    %original: memref<4x3xi32>, %indices: memref<3x2xi32>,
+    %updates: memref<3xi32>) {
+  iree_linalg_ext.scatter unique_indices(true)
+    ins(%updates, %indices : memref<3xi32>, memref<3x2xi32>)
+    outs(%original : memref<4x3xi32>)  {
+  ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
+    %0 = arith.addi %arg1, %arg0 : i32
+    iree_linalg_ext.yield %0 : i32
+  }
+  return
+}
+// CHECK-LABEL: func @scatter_add_scalar_2D
+// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C3:.+]] = arith.constant 3 : index
+// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {
+// CHECK:           %[[T1:.+]] = memref.load %[[UPDATES]][%[[I]]] : memref<3xi32>
+// CHECK:           %[[T2:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]] : memref<3x2xi32>
+// CHECK:           %[[IDX1:.+]] = arith.index_cast %[[T2]] : i32 to index
+// CHECK:           %[[T3:.+]] = memref.load %[[INDICES]][%[[I]], %[[C1]]] : memref<3x2xi32>
+// CHECK:           %[[IDX2:.+]] = arith.index_cast %[[T3]] : i32 to index
+// CHECK:           %[[ORI:.+]] = memref.load %[[ORIGINAL]][%[[IDX1]], %[[IDX2]]] : memref<4x3xi32>
+// CHECK:           %[[ADD:.+]] = arith.addi %[[ORI]], %[[T1]] : i32
+// CHECK:           memref.store %[[ADD]], %[[ORIGINAL]][%[[IDX1]], %[[IDX2]]]
+
+// -----
+
+func @scatter_update_slice_2D(
+    %original: memref<4x3xi32>, %indices: memref<2x1xi32>,
+    %updates: memref<2x3xi32>) {
+  iree_linalg_ext.scatter unique_indices(true)
+    ins(%updates, %indices : memref<2x3xi32>, memref<2x1xi32>)
+    outs(%original : memref<4x3xi32>)  {
+  ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
+    iree_linalg_ext.yield %arg0 : i32
+  }
+  return
+}
+// CHECK:       func @scatter_update_slice_2D
+// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C3:.+]] = arith.constant 3 : index
+// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[C2]] step %[[C1]] {
+// CHECK:           scf.for %[[J:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {
+// CHECK:             %[[UPDATE:.+]] = memref.load %[[UPDATES]][%[[I]], %[[J]]]
+// CHECK:             %[[INDEX:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]]
+// CHECK:             %[[LOC:.+]] = arith.index_cast %[[INDEX]] : i32 to index
+// CHECK:             memref.store %[[UPDATE]], %[[ORIGINAL]][%[[LOC]], %[[J]]]
+// CHECK:           }
+// CHECK:         }
+
+// -----
+
+func @scatter_add_scalar_1D(
+    %original: memref<8xi32>, %indices: memref<3x1xi32>,
+    %updates: memref<3xi32>) {
+  iree_linalg_ext.scatter unique_indices(true)
+    ins(%updates, %indices : memref<3xi32>, memref<3x1xi32>)
+    outs(%original : memref<8xi32>)  {
+  ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
+    %0 = arith.addi %arg1, %arg0 : i32
+    iree_linalg_ext.yield %0 : i32
+  }
+  return
+}
+// CHECK-LABEL: func @scatter_add_scalar_1D
+// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C3:.+]] = arith.constant 3 : index
+// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {
+// CHECK:           %[[T1:.+]] = memref.load %[[UPDATES]][%[[I]]] : memref<3xi32>
+// CHECK:           %[[T2:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]] : memref<3x1xi32>
+// CHECK:           %[[IDX:.+]] = arith.index_cast %[[T2]] : i32 to index
+// CHECK:           %[[ORI:.+]] = memref.load %[[ORIGINAL]][%[[IDX]]] : memref<8xi32>
+// CHECK:           %[[ADD:.+]] = arith.addi %[[ORI]], %[[T1]] : i32
+// CHECK:           memref.store %[[ADD]], %[[ORIGINAL]][%[[IDX]]]
+
+// -----
+
+func @scatter_add_slice_2D(
+    %original: memref<4x3xi32>, %indices: memref<2x1xi32>,
+    %updates: memref<2x3xi32>) {
+  iree_linalg_ext.scatter unique_indices(true)
+    ins(%updates, %indices : memref<2x3xi32>, memref<2x1xi32>)
+    outs(%original : memref<4x3xi32>)  {
+  ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
+    %0 = arith.addi %arg1, %arg0 : i32
+    iree_linalg_ext.yield %0 : i32
+  }
+  return
+}
+// CHECK:       func @scatter_add_slice_2D
+// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C3:.+]] = arith.constant 3 : index
+// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[C2]] step %[[C1]] {
+// CHECK:           scf.for %[[J:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {
+// CHECK:             %[[UPDATEVAL:.+]] = memref.load %[[UPDATES]][%[[I]], %[[J]]]
+// CHECK:             %[[INDEXVAL:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]]
+// CHECK:             %[[INDEX:.+]] = arith.index_cast %[[INDEXVAL]] : i32 to index
+// CHECK:             %[[ORIGINALVAL:.+]] = memref.load %[[ORIGINAL]][%[[INDEX]], %[[J]]]
+// CHECK:             %[[STOREVAL:.+]] = arith.addi %[[ORIGINALVAL]], %[[UPDATEVAL]]
+// CHECK:             memref.store %[[STOREVAL]], %[[ORIGINAL]][%[[INDEX]], %[[J]]]
+
+// -----
+
+func @scatter_update_scalar_dynamic_1D(
+    %original: memref<?xi32>, %indices: memref<?x1xi32>,
+    %updates: memref<?xi32>) {
+  iree_linalg_ext.scatter unique_indices(true)
+    ins(%updates, %indices : memref<?xi32>, memref<?x1xi32>)
+    outs(%original : memref<?xi32>)  {
+  ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
+    iree_linalg_ext.yield %arg0 : i32
+  }
+  return
+}
+// CHECK-LABEL: func @scatter_update_scalar_dynamic_1D
+// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[UB:.+]] = memref.dim %[[UPDATES]], %[[C0]] : memref<?xi32>
+// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[UB]] step %[[C1]] {
+// CHECK:           %[[T1:.+]] = memref.load %[[UPDATES]][%[[I]]] : memref<?xi32>
+// CHECK:           %[[T2:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]] : memref<?x1xi32>
+// CHECK:           %[[IDX:.+]] = arith.index_cast %[[T2]] : i32 to index
+// CHECK:           memref.store %[[T1]], %[[ORIGINAL]][%[[IDX]]]
+
+// -----
+
+func @scatter_add_scalar_dynamic_2D(
+    %original: memref<?x?xi32>, %indices: memref<?x2xi32>,
+    %updates: memref<?xi32>) {
+  iree_linalg_ext.scatter unique_indices(true)
+    ins(%updates, %indices : memref<?xi32>, memref<?x2xi32>)
+    outs(%original : memref<?x?xi32>)  {
+  ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
+    %0 = arith.addi %arg1, %arg0 : i32
+    iree_linalg_ext.yield %0 : i32
+  }
+  return
+}
+// CHECK-LABEL: func @scatter_add_scalar_dynamic_2D
+// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[UB:.+]] = memref.dim %[[UPDATES]], %[[C0]] : memref<?xi32>
+// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[UB]] step %[[C1]] {
+// CHECK:           %[[T1:.+]] = memref.load %[[UPDATES]][%[[I]]] : memref<?xi32>
+// CHECK:           %[[T2:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]] : memref<?x2xi32>
+// CHECK:           %[[IDX1:.+]] = arith.index_cast %[[T2]] : i32 to index
+// CHECK:           %[[T3:.+]] = memref.load %[[INDICES]][%[[I]], %[[C1]]] : memref<?x2xi32>
+// CHECK:           %[[IDX2:.+]] = arith.index_cast %[[T3]] : i32 to index
+// CHECK:           %[[ORI:.+]] = memref.load %[[ORIGINAL]][%[[IDX1]], %[[IDX2]]] : memref<?x?xi32>
+// CHECK:           %[[ADD:.+]] = arith.addi %[[ORI]], %[[T1]] : i32
+// CHECK:           memref.store %[[ADD]], %[[ORIGINAL]][%[[IDX1]], %[[IDX2]]]
+
+// -----
+
+func @scatter_update_slice_dynamic_2D(
+    %original: memref<?x?xi32>, %indices: memref<?x1xi32>,
+    %updates: memref<?x?xi32>) {
+  iree_linalg_ext.scatter unique_indices(true)
+    ins(%updates, %indices : memref<?x?xi32>, memref<?x1xi32>)
+    outs(%original : memref<?x?xi32>)  {
+  ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
+    iree_linalg_ext.yield %arg0 : i32
+  }
+  return
+}
+// CHECK:       func @scatter_update_slice_dynamic_2D
+// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[UB1:.+]] = memref.dim %[[UPDATES]], %[[C0]] : memref<?x?xi32>
+// CHECK-DAG:     %[[UB2:.+]] = memref.dim %[[UPDATES]], %[[C1]] : memref<?x?xi32>
+// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[UB1]] step %[[C1]] {
+// CHECK:           scf.for %[[J:.+]] = %[[C0]] to %[[UB2]] step %[[C1]] {
+// CHECK:             %[[UPDATEVAL:.+]] = memref.load %[[UPDATES]][%[[I]], %[[J]]]
+// CHECK:             %[[INDEXVAL:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]]
+// CHECK:             %[[INDEX:.+]] = arith.index_cast %[[INDEXVAL]] : i32 to index
+// CHECK:             memref.store %[[UPDATEVAL]], %[[ORIGINAL]][%[[INDEX]], %[[J]]]
+
+// -----
+
+func @scatter_partial_slices(%arg0: memref<2x64x12xf32>, %arg1: memref<2x3xi32>, %arg2: memref<2x1x12xf32>) {
+  iree_linalg_ext.scatter
+    unique_indices(true)
+    ins(%arg2, %arg1 : memref<2x1x12xf32>, memref<2x3xi32>)
+    outs(%arg0 : memref<2x64x12xf32>) {
+  ^bb0(%arg3: f32, %arg4: f32):
+    iree_linalg_ext.yield %arg4 : f32
+  }
+  return
+}
+
+// CHECK-LABEL: @scatter_partial_slices
+// CHECK-SAME:    %[[ARG0:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[ARG1:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[ARG2:[a-zA-Z0-9]+]]
+// CHECK-DAG: %[[C0:.+]] = arith.constant
+// CHECK-DAG: %[[C1:.+]] = arith.constant
+// CHECK-DAG: %[[C2:.+]] = arith.constant
+// CHECK-DAG: %[[C12:.+]] = arith.constant
+// CHECK:     scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C1]] {
+// CHECK-NEXT:   scf.for %[[ARG4:.+]] = %[[C0]] to %[[C1]] step %[[C1]] {
+// CHECK-NEXT:     scf.for %[[ARG5:.+]] = %[[C0]] to %[[C12]] step %[[C1]] {
+// CHECK-NEXT:       %[[LOAD0:.+]] = memref.load %[[ARG1]][%[[ARG3]], %[[C0]]] : memref<2x3xi32>
+// CHECK-NEXT:       %[[CAST0:.+]] = arith.index_cast %[[LOAD0]] : i32 to index
+// CHECK-NEXT:       %[[LOAD1:.+]] = memref.load %[[ARG1]][%[[ARG3]], %[[C1]]] : memref<2x3xi32>
+// CHECK-NEXT:       %[[CAST1:.+]] = arith.index_cast %[[LOAD1]] : i32 to index
+// CHECK-NEXT:       %[[ADD1:.+]] = arith.addi %[[CAST1]], %[[ARG4]] : index
+// CHECK-NEXT:       %[[LOAD2:.+]] = memref.load %[[ARG1]][%[[ARG3]], %[[C2]]] : memref<2x3xi32>
+// CHECK-NEXT:       %[[CAST2:.+]] = arith.index_cast %[[LOAD2]] : i32 to index
+// CHECK-NEXT:       %[[ADD2:.+]] = arith.addi %[[CAST2]], %[[ARG5]] : index
+// CHECK-NEXT:       %[[LOAD3:.+]] = memref.load %[[ARG0]][%[[CAST0]], %[[ADD1]], %[[ADD2]]] : memref<2x64x12xf32>
+// CHECK-NEXT:       memref.store %[[LOAD3]], %[[ARG0]][%[[CAST0]], %[[ADD1]], %[[ADD2]]] : memref<2x64x12xf32>
+
+// -----
+
+func @fft_1D(%real: memref<16xf32>, %imag: memref<16xf32>) {
+  %stage = arith.constant 1 : index
+  iree_linalg_ext.fft
+    ins(%stage: index)
+    outs(%real, %imag: memref<16xf32>, memref<16xf32>)
+  return
+}
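+// For an FFT stage s the block size is m = 2^s (here m = 2): the loop walks
+// the buffer in strides of m, takes subviews of the left and right halves of
+// each block, and applies the butterfly t = w * right; left' = left + t;
+// right' = left - t inside a linalg.generic.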
+// CHECK-DAG:   #[[MAP0:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
+// CHECK-DAG:   #[[MAP1:.+]] = affine_map<(d0) -> (d0)>
+// CHECK:       func @fft_1D
+// CHECK-SAME:    %[[REAL:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[IMAG:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG:     %[[COEFF:.+]] = arith.constant -3.14159274 : f32
+// CHECK:         scf.for %[[K:.+]] = %[[C0]] to %[[C16]] step %[[C2]]
+// CHECK-DAG:       %[[HM:.+]] = arith.shrsi %[[C2]], %[[C1]] : index
+// CHECK:           %[[L_REAL_SLICE:.+]] = memref.subview %[[REAL]][%[[K]]] [%[[HM]]] [1]
+// CHECK:           %[[L_IMAG_SLICE:.+]] = memref.subview %[[IMAG]][%[[K]]] [%[[HM]]] [1]
+// CHECK:           %[[R_OFFSET:.+]] = arith.addi %[[K]], %[[HM]] : index
+// CHECK:           %[[R_REAL_SLICE:.+]] = memref.subview %[[REAL]][%[[R_OFFSET]]] [%[[HM]]] [1]
+// CHECK:           %[[R_IMAG_SLICE:.+]] = memref.subview %[[IMAG]][%[[R_OFFSET]]] [%[[HM]]] [1]
+// CHECK:           linalg.generic
+// CHECK-SAME:        indexing_maps = [#[[MAP1]], #[[MAP1]], #[[MAP1]], #[[MAP1]]]
+// CHECK-SAME:        iterator_types = ["parallel"]
+// CHECK-SAME:        outs(%[[L_REAL_SLICE]], %[[L_IMAG_SLICE]], %[[R_REAL_SLICE]], %[[R_IMAG_SLICE]]
+// CHECK:           ^bb0(%[[L_REAL:.+]]: f32, %[[L_IMAG:.+]]: f32, %[[R_REAL:.+]]: f32, %[[R_IMAG:.+]]: f32)
+//
+//                    Compute exp coeff.
+// CHECK:             %[[J_IDX:.+]] = linalg.index 0 : index
+// CHECK:             %[[J_I32:.+]] = arith.index_cast %[[J_IDX]] : index to i32
+// CHECK:             %[[J_F32:.+]] = arith.sitofp %[[J_I32]] : i32 to f32
+// CHECK:             %[[EXP_COEF:.+]] = arith.mulf %[[J_F32]], %[[COEFF]] : f32
+// CHECK:             %[[W_REAL:.+]] = math.cos %[[EXP_COEF]]
+// CHECK:             %[[W_IMAG:.+]] = math.sin %[[EXP_COEF]]
+//
+//                    Compute "t = w * a[k + j + mh]" by expanding
+//                      (x + yi)(u + vi) = (xu - yv) + (xv + yu)i
+// CHECK-DAG:         %[[XU:.+]] = arith.mulf %[[W_REAL]], %[[R_REAL]]
+// CHECK-DAG:         %[[YV:.+]] = arith.mulf %[[W_IMAG]], %[[R_IMAG]]
+// CHECK-DAG:         %[[XV:.+]] = arith.mulf %[[W_REAL]], %[[R_IMAG]]
+// CHECK-DAG:         %[[YU:.+]] = arith.mulf %[[W_IMAG]], %[[R_REAL]]
+// CHECK:             %[[T_REAL:.+]] = arith.subf %[[XU]], %[[YV]]
+// CHECK:             %[[T_IMAG:.+]] = arith.addf %[[XV]], %[[YU]]
+//
+//                    Compute the results.
+//                      u = a[k + j];
+//                      a[k + j] = u + t;
+//                      a[k + j + mh] = u - t;
+// CHECK:             %[[RES1:.+]] = arith.addf %[[L_REAL]], %[[T_REAL]]
+// CHECK:             %[[RES2:.+]] = arith.addf %[[L_IMAG]], %[[T_IMAG]]
+// CHECK:             %[[RES3:.+]] = arith.subf %[[L_REAL]], %[[T_REAL]]
+// CHECK:             %[[RES4:.+]] = arith.subf %[[L_IMAG]], %[[T_IMAG]]
+// CHECK:             linalg.yield %[[RES1]], %[[RES2]], %[[RES3]], %[[RES4]]
+
+// -----
+
+func @fft_2D(%real: memref<?x16xf32>, %imag: memref<?x16xf32>) {
+  %stage = arith.constant 2 : index
+  iree_linalg_ext.fft
+    ins(%stage: index)
+    outs(%real, %imag: memref<?x16xf32>, memref<?x16xf32>)
+  return
+}
+// CHECK-DAG:   #[[MAP0:.+]] = affine_map<(d0, d1)[s0] -> (d0 * 16 + s0 + d1)>
+// CHECK-DAG:   #[[MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK:       func @fft_2D(
+// CHECK-SAME:    %[[REAL:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[IMAG:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG:     %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG:     %[[D0:.+]] = memref.dim %[[REAL]], %[[C0]] : memref<?x16xf32>
+// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[D0]] step %[[C1]]
+// CHECK:           scf.for %[[K:.+]] = %[[C0]] to %[[C16]] step %[[C4]]
+// CHECK-DAG:         %[[HM:.+]] = arith.shrsi %[[C4]], %[[C1]] : index
+// CHECK:             %[[L_REAL_SLICE:.+]] = memref.subview %[[REAL]][%[[I]], %[[K]]] [1, %[[HM]]] [1, 1]
+// CHECK:             %[[L_IMAG_SLICE:.+]] = memref.subview %[[IMAG]][%[[I]], %[[K]]] [1, %[[HM]]] [1, 1]
+// CHECK:             %[[R_OFFSET:.+]] = arith.addi %[[K]], %[[HM]] : index
+// CHECK:             %[[R_REAL_SLICE:.+]] = memref.subview %[[REAL]][%[[I]], %[[R_OFFSET]]] [1, %[[HM]]] [1, 1]
+// CHECK:             %[[R_IMAG_SLICE:.+]] = memref.subview %[[IMAG]][%[[I]], %[[R_OFFSET]]] [1, %[[HM]]] [1, 1]
+// CHECK:             linalg.generic
+// CHECK-SAME:          indexing_maps = [#[[MAP1]], #[[MAP1]], #[[MAP1]], #[[MAP1]]]
+// CHECK-SAME:          iterator_types = ["parallel", "parallel"]
+// CHECK-SAME:          outs(%[[L_REAL_SLICE]], %[[L_IMAG_SLICE]], %[[R_REAL_SLICE]], %[[R_IMAG_SLICE]]
+//
+//                    The computation is basically the same as above; only the
+//                    differing part is checked here.
+// CHECK:             %{{.+}} = linalg.index 1 : index
+
+// -----
+
+func @fft_2D_coef_buf(%real: memref<?x16xf32>, %imag: memref<?x16xf32>,
+                      %coef_real: memref<1xf32>, %coef_imag: memref<1xf32>) {
+  %stage = arith.constant 1 : index
+  iree_linalg_ext.fft
+    ins(%stage, %coef_real, %coef_imag: index, memref<1xf32>, memref<1xf32>)
+    outs(%real, %imag: memref<?x16xf32>, memref<?x16xf32>)
+  return
+}
+// CHECK-DAG:   #[[MAP0:.+]] = affine_map<(d0, d1)[s0] -> (d0 * 16 + s0 + d1)>
+// CHECK-DAG:   #[[MAP1:.+]] = affine_map<(d0, d1) -> (d1)>
+// CHECK-DAG:   #[[MAP2:.+]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK:       func @fft_2D_coef_buf
+// CHECK-SAME:    %[[REAL:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[IMAG:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[COEF_REAL:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[COEF_IMAG:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG:     %[[D0:.+]] = memref.dim %[[REAL]], %[[C0]] : memref<?x16xf32>
+// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[D0]] step %[[C1]]
+// CHECK:           scf.for %[[K:.+]] = %[[C0]] to %[[C16]] step %[[C2]]
+// CHECK-DAG:         %[[HM:.+]] = arith.shrsi %[[C2]], %[[C1]] : index
+// CHECK:             %[[L_REAL_SLICE:.+]] = memref.subview %[[REAL]][%[[I]], %[[K]]] [1, %[[HM]]] [1, 1]
+// CHECK:             %[[L_IMAG_SLICE:.+]] = memref.subview %[[IMAG]][%[[I]], %[[K]]] [1, %[[HM]]] [1, 1]
+// CHECK:             %[[R_OFFSET:.+]] = arith.addi %[[K]], %[[HM]] : index
+// CHECK:             %[[R_REAL_SLICE:.+]] = memref.subview %[[REAL]][%[[I]], %[[R_OFFSET]]] [1, %[[HM]]] [1, 1]
+// CHECK:             %[[R_IMAG_SLICE:.+]] = memref.subview %[[IMAG]][%[[I]], %[[R_OFFSET]]] [1, %[[HM]]] [1, 1]
+// CHECK:             linalg.generic
+// CHECK-SAME:          indexing_maps = [#[[MAP1]], #[[MAP1]], #[[MAP2]], #[[MAP2]], #[[MAP2]], #[[MAP2]]]
+// CHECK-SAME:          iterator_types = ["parallel", "parallel"]
+// CHECK-SAME:          ins(%[[COEF_REAL]], %[[COEF_IMAG]]
+// CHECK-SAME:          outs(%[[L_REAL_SLICE]], %[[L_IMAG_SLICE]], %[[R_REAL_SLICE]], %[[R_IMAG_SLICE]]
+// CHECK:             ^bb0(%[[W_REAL:.+]]: f32, %[[W_IMAG:.+]]: f32, %[[L_REAL:.+]]: f32, %[[L_IMAG:.+]]: f32, %[[R_REAL:.+]]: f32, %[[R_IMAG:.+]]: f32)
+//                      Compute "t = w * a[k + j + mh]" by expanding
+//                        (x + yi)(u + vi) = (xu - yv) + (xv + yu)i
+// CHECK-DAG:           %[[XU:.+]] = arith.mulf %[[W_REAL]], %[[R_REAL]]
+// CHECK-DAG:           %[[YV:.+]] = arith.mulf %[[W_IMAG]], %[[R_IMAG]]
+// CHECK-DAG:           %[[XV:.+]] = arith.mulf %[[W_REAL]], %[[R_IMAG]]
+// CHECK-DAG:           %[[YU:.+]] = arith.mulf %[[W_IMAG]], %[[R_REAL]]
+// CHECK:               %[[T_REAL:.+]] = arith.subf %[[XU]], %[[YV]]
+// CHECK:               %[[T_IMAG:.+]] = arith.addf %[[XV]], %[[YU]]
+//
+//                      Compute the results.
+//                        u = a[k + j];
+//                        a[k + j] = u + t;
+//                        a[k + j + mh] = u - t;
+// CHECK:               %[[RES1:.+]] = arith.addf %[[L_REAL]], %[[T_REAL]]
+// CHECK:               %[[RES2:.+]] = arith.addf %[[L_IMAG]], %[[T_IMAG]]
+// CHECK:               %[[RES3:.+]] = arith.subf %[[L_REAL]], %[[T_REAL]]
+// CHECK:               %[[RES4:.+]] = arith.subf %[[L_IMAG]], %[[T_IMAG]]
+// CHECK:               linalg.yield %[[RES1]], %[[RES2]], %[[RES3]], %[[RES4]]
+
+// -----
+
+func @reverse_dim_0(%arg0: memref<?x?xi32>, %arg1: memref<?x?xi32>) {
+  iree_linalg_ext.reverse
+    dimensions(dense<0> : tensor<1xi64>)
+    ins(%arg0 : memref<?x?xi32>)
+    outs(%arg1 : memref<?x?xi32>)
+  return
+}
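+// Reverse lowers to a loop nest that copies every element to its mirrored
+// position along the reversed dimension: out[size0 - 1 - i, j] = in[i, j].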
+// CHECK-LABEL: func @reverse_dim_0
+// CHECK-SAME:    %[[IN:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[OUT:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[D0:.+]] = memref.dim %[[IN]], %[[C0]] : memref<?x?xi32>
+// CHECK-DAG:     %[[D1:.+]] = memref.dim %[[IN]], %[[C1]] : memref<?x?xi32>
+// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[D0]] step %[[C1]]
+// CHECK:           scf.for %[[J:.+]] = %[[C0]] to %[[D1]] step %[[C1]]
+// CHECK:             %[[T0:.+]] = memref.dim %[[IN]], %[[C0]]
+// CHECK:             %[[T1:.+]] = arith.subi %[[T0]], %[[C1]] : index
+// CHECK:             %[[T2:.+]] = arith.subi %[[T1]], %[[I]] : index
+// CHECK:             %[[V0:.+]] = memref.load %[[IN]][%[[I]], %[[J]]]
+// CHECK:             memref.store %[[V0]], %[[OUT]][%[[T2]], %[[J]]] : memref<?x?xi32>
+
+// -----
+
+func @scan_1d_inclusive(%0: memref<128xi32>, %1: memref<128xi32>) {
+  %c0 = memref.alloc() : memref<i32>
+  iree_linalg_ext.scan dimension(0) inclusive(true)
+    ins(%0 : memref<128xi32>) outs(%1, %c0 : memref<128xi32>, memref<i32>) {
+    ^bb0(%arg0 : i32, %arg1 : i32):
+      %sum = arith.addi %arg0, %arg1 : i32
+      iree_linalg_ext.yield %sum : i32
+  }
+  return
+}
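+// Inclusive scan: the first element along the scanned dimension is copied
+// through unchanged; every later element combines the previous output with
+// the current input and also updates the running-total accumulator buffer.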
+// CHECK-LABEL: func @scan_1d_inclusive
+// CHECK-SAME:    %[[BUFI:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[BUFO:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C128:.+]] = arith.constant 128 : index
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[ACC:.+]] = memref.alloc() : memref<i32>
+// CHECK:         scf.for %[[ARG1:.+]] = %[[C0]] to %[[C128]] step %[[C1]]
+// CHECK:           %[[COND:.+]] = arith.cmpi eq, %[[ARG1]], %[[C0]] : index
+// CHECK:           scf.if %[[COND]] {
+// CHECK:             %[[V1:.+]] = memref.load %[[BUFI]][%[[ARG1]]]
+// CHECK:             memref.store %[[V1]], %[[BUFO]][%[[ARG1]]]
+// CHECK:           } else {
+// CHECK:             %[[T1:.+]] = arith.subi %[[ARG1]], %[[C1]] : index
+// CHECK:             %[[V2:.+]] = memref.load %[[BUFO]][%[[T1]]]
+// CHECK:             %[[V3:.+]] = memref.load %[[BUFI]][%[[ARG1]]]
+// CHECK:             %[[V4:.+]] = arith.addi %[[V2]], %[[V3]] : i32
+// CHECK:             memref.store %[[V4]], %[[BUFO]][%[[ARG1]]]
+// CHECK:             memref.store %[[V4]], %[[ACC]][]
+// CHECK:           }
+
+// -----
+
+func @scan_1d_exclusive(%0: memref<128xi32>, %1: memref<128xi32>) {
+  %c0 = memref.alloc() : memref<i32>
+  iree_linalg_ext.scan dimension(0) inclusive(false)
+    ins(%0 : memref<128xi32>) outs(%1, %c0 : memref<128xi32>, memref<i32>) {
+    ^bb0(%arg0 : i32, %arg1 : i32):
+      %sum = arith.addi %arg0, %arg1 : i32
+      iree_linalg_ext.yield %sum : i32
+  }
+  return
+}
+// CHECK-LABEL: func @scan_1d_exclusive
+// CHECK-SAME:    %[[BUFI:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[BUFO:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C128:.+]] = arith.constant 128 : index
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[ACC:.+]] = memref.alloc() : memref<i32>
+// CHECK:         scf.for %[[ARG1:.+]] = %[[C0]] to %[[C128]] step %[[C1]]
+// CHECK:           %[[COND:.+]] = arith.cmpi eq, %[[ARG1]], %[[C0]] : index
+// CHECK:           scf.if %[[COND]] {
+// CHECK:             %[[V0:.+]] = memref.load %[[ACC]][] : memref<i32>
+// CHECK:             memref.store %[[V0]], %[[BUFO]][%[[ARG1]]]
+// CHECK:           } else {
+// CHECK:             %[[T1:.+]] = arith.subi %[[ARG1]], %[[C1]] : index
+// CHECK:             %[[V2:.+]] = memref.load %[[BUFO]][%[[T1]]]
+// CHECK:             %[[V3:.+]] = memref.load %[[BUFI]][%[[T1]]]
+// CHECK:             %[[V4:.+]] = arith.addi %[[V2]], %[[V3]] : i32
+// CHECK:             memref.store %[[V4]], %[[BUFO]][%[[ARG1]]]
+// CHECK:             memref.store %[[V4]], %[[ACC]][]
+// CHECK:           }
+
+// -----
+
+func @scan_2d(%0: memref<16x32xi32>, %1: memref<16x32xi32>) {
+  %t0 = memref.alloc() : memref<32xi32>
+  iree_linalg_ext.scan dimension(0) inclusive(true)
+    ins(%0 : memref<16x32xi32>) outs(%1, %t0 : memref<16x32xi32>, memref<32xi32>) {
+    ^bb0(%arg0 : i32, %arg1 : i32):
+      %sum = arith.addi %arg0, %arg1 : i32
+      iree_linalg_ext.yield %sum : i32
+  }
+  return
+}
+// CHECK-LABEL: func @scan_2d
+// CHECK-SAME:    %[[BUFI:[a-zA-Z0-9]+]]
+// CHECK-SAME:    %[[BUFO:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C16:.+]] = arith.constant 16 : index
+// CHECK-DAG:     %[[C32:.+]] = arith.constant 32 : index
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[ACC:.+]] = memref.alloc() : memref<32xi32>
+// CHECK:         scf.for %[[ARG1:.+]] = %[[C0]] to %[[C16]] step %[[C1]]
+// CHECK:           scf.for %[[ARG2:.+]] = %[[C0]] to %[[C32]] step %[[C1]]
+// CHECK:             %[[COND:.+]] = arith.cmpi eq, %[[ARG1]], %[[C0]] : index
+// CHECK:             scf.if %[[COND]] {
+// CHECK:               %[[V1:.+]] = memref.load %[[BUFI]][%[[ARG1]], %[[ARG2]]]
+// CHECK:               memref.store %[[V1]], %[[BUFO]][%[[ARG1]], %[[ARG2]]]
+// CHECK:             } else {
+// CHECK:               %[[T1:.+]] = arith.subi %[[ARG1]], %[[C1]] : index
+// CHECK:               %[[V2:.+]] = memref.load %[[BUFO]][%[[T1]], %[[ARG2]]]
+// CHECK:               %[[V3:.+]] = memref.load %[[BUFI]][%[[ARG1]], %[[ARG2]]]
+// CHECK:               %[[V4:.+]] = arith.addi %[[V2]], %[[V3]] : i32
+// CHECK:               memref.store %[[V4]], %[[BUFO]][%[[ARG1]], %[[ARG2]]]
+// CHECK:               memref.store %[[V4]], %[[ACC]][%[[ARG2]]]
+// CHECK:             }
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/invalid.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/invalid.mlir
new file mode 100644
index 0000000..517e9c2
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/invalid.mlir
@@ -0,0 +1,448 @@
+// RUN: iree-dialects-opt -split-input-file -verify-diagnostics %s
+
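+// Each case below is expected to fail verification; the annotated error text
+// must match the diagnostic emitted by the op's verifier.
+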
+func @sort_invalid_dimension(%arg0: tensor<128xi32>) -> tensor<128xi32> {
+  // expected-error @+1 {{dimension must be within (0, 1]}}
+  %0 = iree_linalg_ext.sort dimension(1)
+    outs(%arg0 : tensor<128xi32>) {
+  ^bb0(%arg1: i32, %arg2: i32):  // no predecessors
+    %1 = arith.cmpi sgt, %arg1, %arg2 : i32
+    iree_linalg_ext.yield %1 : i1
+  } -> tensor<128xi32>
+  return %0 : tensor<128xi32>
+}
+
+// -----
+
+func @sort_mismatch_rank(%arg0: tensor<?x?xi32>, %arg1: tensor<?xf32>)
+    -> (tensor<?x?xi32>, tensor<?xf32>) {
+  // expected-error @+1 {{expected operand 1 to be rank 2, same as other operands}}
+  %0:2 = iree_linalg_ext.sort dimension(0)
+      outs(%arg0, %arg1 : tensor<?x?xi32>, tensor<?xf32>) {
+      ^bb0(%arg2: i32, %arg3: i32, %arg4 : f32, %arg5 : f32):  // no predecessors
+        %1 = arith.cmpf ogt, %arg4, %arg5 : f32
+        iree_linalg_ext.yield %1 : i1
+      } -> tensor<?x?xi32>, tensor<?xf32>
+  return %0#0, %0#1 : tensor<?x?xi32>, tensor<?xf32>
+}
+
+// -----
+
+func @sort_mismatch_shape(%arg0: tensor<?xi32>, %arg1: tensor<42xf32>)
+    -> (tensor<?xi32>, tensor<42xf32>) {
+  // expected-error @+1 {{expected operand 1 to have same shape as other operands}}
+  %0:2 = iree_linalg_ext.sort dimension(0)
+      outs(%arg0, %arg1 : tensor<?xi32>, tensor<42xf32>) {
+      ^bb0(%arg2: i32, %arg3: i32, %arg4 : f32, %arg5 : f32):  // no predecessors
+        %1 = arith.cmpf ogt, %arg4, %arg5 : f32
+        iree_linalg_ext.yield %1 : i1
+      } -> tensor<?xi32>, tensor<42xf32>
+  return %0#0, %0#1 : tensor<?xi32>, tensor<42xf32>
+}
+
+// -----
+
+func @scatter_mixed_tensor_memref(
+    %update : memref<?x?xf32>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  // expected-error @+1 {{expected inputs and outputs to be RankedTensorType or scalar}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+      ins(%update, %indices : memref<?x?xf32>, tensor<?x1xi32>)
+      outs(%original : tensor<?x?xf32>) {
+      ^bb0(%arg1: f32, %arg2: f32):
+        %1 = arith.addf %arg1, %arg2 : f32
+        iree_linalg_ext.yield %1 : f32
+      } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// -----
+
+func @scatter_mixed_tensor_memref(
+    %update : tensor<?x?xf32>, %indices : memref<?x1xi32>,
+    %original : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  // expected-error @+1 {{expected inputs and outputs to be RankedTensorType or scalar}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+      ins(%update, %indices : tensor<?x?xf32>, memref<?x1xi32>)
+      outs(%original : tensor<?x?xf32>) {
+      ^bb0(%arg1: f32, %arg2: f32):
+        %1 = arith.addf %arg1, %arg2 : f32
+        iree_linalg_ext.yield %1 : f32
+      } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// -----
+
+func @scatter_extra_outputs(
+    %update : tensor<?x?xf32>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xf32>) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+  // expected-error @+1 {{expected number of outputs to be same as the number of results}}
+  %0, %1 = iree_linalg_ext.scatter unique_indices(true)
+      ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
+      outs(%original : tensor<?x?xf32>) {
+      ^bb0(%arg1: f32, %arg2: f32):
+        %1 = arith.addf %arg1, %arg2 : f32
+        iree_linalg_ext.yield %1 : f32
+      } -> tensor<?x?xf32>, tensor<?x?xf32>
+  return %0, %1 : tensor<?x?xf32>, tensor<?x?xf32>
+}
+
+// -----
+
+func @scatter_mixed_tensor_memref(
+    %update : tensor<?x?xf32>, %indices : tensor<?x1xi32>,
+    %original : memref<?x?xf32>) -> tensor<?x?xf32> {
+  // expected-error @+1 {{expected inputs and outputs to be RankedTensorType or scalar}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+      ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
+      outs(%original : memref<?x?xf32>) {
+      ^bb0(%arg1: f32, %arg2: f32):
+        %1 = arith.addf %arg1, %arg2 : f32
+        iree_linalg_ext.yield %1 : f32
+      } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// -----
+
+func @scatter_output_type_mismatch(
+    %update : tensor<?x?xf32>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xf32>) -> tensor<4x?xf32> {
+  // expected-error @+1 {{expected type of `outs` operand #0 'tensor<?x?xf32>' to be same as result type 'tensor<4x?xf32>'}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+      ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
+      outs(%original : tensor<?x?xf32>) {
+      ^bb0(%arg1: f32, %arg2: f32):
+        %1 = arith.addf %arg1, %arg2 : f32
+        iree_linalg_ext.yield %1 : f32
+      } -> tensor<4x?xf32>
+  return %0 : tensor<4x?xf32>
+}
+
+// -----
+
+func @scatter_mixed_tensor_memref(
+    %update : memref<?x?xf32>, %indices : tensor<?x1xi32>,
+    %original : memref<?x?xf32>) {
+  // expected-error @+1 {{expected inputs and outputs to be MemRefType or scalar}}
+  iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : memref<?x?xf32>, tensor<?x1xi32>)
+    outs(%original : memref<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    }
+  return
+}
+
+// -----
+
+func @scatter_mixed_tensor_memref(
+    %update : memref<?x?xf32>, %indices : memref<?x1xi32>,
+    %original : tensor<?x?xf32>) {
+  // expected-error @+1 {{expected inputs and outputs to be MemRefType or scalar}}
+  iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : memref<?x?xf32>, memref<?x1xi32>)
+    outs(%original : tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    }
+  return
+}
+
+// -----
+
+func @scatter_dim_mismatch(
+    %update : tensor<?x?xf32>, %indices : tensor<48x1xi32>,
+    %original : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  // expected-error @+1 {{mismatch in shape of indices and update value at dim#0}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<?x?xf32>, tensor<48x1xi32>)
+    outs(%original : tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// -----
+
+func @scatter_dim_mismatch(
+    %update : tensor<64x?xf32>, %indices : tensor<48x1xi32>,
+    %original : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  // expected-error @+1 {{mismatch in shape of indices and update value at dim#0}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<64x?xf32>, tensor<48x1xi32>)
+    outs(%original : tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// -----
+
+func @scatter_dim_mismatch(
+    %update : tensor<?x?x?x?xf32>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  // expected-error @+1 {{op update value rank exceeds the rank of the original value}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<?x?x?x?xf32>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// -----
+
+func @scatter_dim_mismatch(
+    %update : tensor<?x4xf32>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  // expected-error @+1 {{mismatch in shape of update value dim#1 and original value at dim#1}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<?x4xf32>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// -----
+
+func @scatter_region_type_mismatch(
+    %update : tensor<?x?xi32>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xi32>) -> tensor<?x?xi32> {
+  // expected-error @+1 {{expected region to have scalar argument of integer or float types}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<?x?xi32>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xi32>) {
+    ^bb0(%arg1: index, %arg2: index):
+      %1 = arith.addi %arg1, %arg2 : index
+      %2 = arith.index_cast %1 : index to i32
+      iree_linalg_ext.yield %2 : i32
+    } -> tensor<?x?xi32>
+  return %0 : tensor<?x?xi32>
+}
+
+// -----
+
+func @scatter_region_type_mismatch(
+    %update : tensor<?x?xi32>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xi32>) -> tensor<?x?xi32> {
+  // expected-error @+1 {{mismatch in argument 0 of region 'i64' and element type of update value 'i32'}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<?x?xi32>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xi32>) {
+    ^bb0(%arg1: i64, %arg2: i32):
+      %1 = arith.trunci %arg1 : i64 to i32
+      %2 = arith.addi %1, %arg2 : i32
+      iree_linalg_ext.yield %2 : i32
+    } -> tensor<?x?xi32>
+  return %0 : tensor<?x?xi32>
+}
+
+// -----
+
+func @scatter_region_type_mismatch(
+    %update : tensor<?x?xi32>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xi32>) -> tensor<?x?xi32> {
+  // expected-error @+1 {{mismatch in argument 1 of region 'i64' and element type of original value 'i32'}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<?x?xi32>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xi32>) {
+    ^bb0(%arg1: i32, %arg2: i64):
+      %1 = arith.trunci %arg2 : i64 to i32
+      %2 = arith.addi %1, %arg1 : i32
+      iree_linalg_ext.yield %2 : i32
+    } -> tensor<?x?xi32>
+  return %0 : tensor<?x?xi32>
+}
+
+// -----
+
+func @scatter_region_type_mismatch(
+    %update : tensor<?x?xi32>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xi64>) -> tensor<?x?xi64> {
+  // expected-error @+1 {{mismatch in region argument types 'i32' and 'i64'}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<?x?xi32>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xi64>) {
+    ^bb0(%arg1: i32, %arg2: i64):
+      %1 = arith.extsi %arg1 : i32 to i64
+      %2 = arith.addi %1, %arg2 : i64
+      iree_linalg_ext.yield %2 : i64
+    } -> tensor<?x?xi64>
+  return %0 : tensor<?x?xi64>
+}
+
+// -----
+
+func @scatter_region_type_mismatch(
+    %update : tensor<?x?xi64>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xi64>) -> tensor<?x?xi64> {
+  // expected-error @+1 {{expected region to have two arguments}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<?x?xi64>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xi64>) {
+    ^bb0(%arg1: i64, %arg2: i64, %arg3 : i64):
+      %1 = arith.addi %arg1, %arg2 : i64
+      iree_linalg_ext.yield %1 : i64
+    } -> tensor<?x?xi64>
+  return %0 : tensor<?x?xi64>
+}
+
+// -----
+
+func @scatter_yield_mismatch(
+    %update : tensor<?x?xi64>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xi64>) -> tensor<?x?xi64> {
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<?x?xi64>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xi64>) {
+    ^bb0(%arg1: i64, %arg2: i64):
+      %1 = arith.addi %arg1, %arg2 : i64
+      %2 = arith.trunci %1 : i64 to i32
+      // expected-error @+1 {{mismatch in type of yielded value 'i32' and argument of the region 'i64'}}
+      iree_linalg_ext.yield %2 : i32
+    } -> tensor<?x?xi64>
+  return %0 : tensor<?x?xi64>
+}
+
+// -----
+
+func @scatter_yield_mismatch(
+    %update : tensor<?x?xi64>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xi64>) -> tensor<?x?xi64> {
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<?x?xi64>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xi64>) {
+    ^bb0(%arg1: i64, %arg2: i64):
+      %1 = arith.addi %arg1, %arg2 : i64
+      %2 = arith.trunci %1 : i64 to i32
+      // expected-error @+1 {{expected region to yield a single value}}
+      iree_linalg_ext.yield %1, %2 : i64, i32
+    } -> tensor<?x?xi64>
+  return %0 : tensor<?x?xi64>
+}
+
+// -----
+
+func @scatter_index_depth_dynamic(
+    %update : tensor<?x?xi64>, %indices : tensor<?x?xi32>,
+    %original : tensor<?x?xi64>) -> tensor<?x?xi64> {
+  // expected-error @+1 {{expected index depth is static}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<?x?xi64>, tensor<?x?xi32>)
+    outs(%original : tensor<?x?xi64>) {
+    ^bb0(%arg1: i64, %arg2: i64):
+      %1 = arith.addi %arg1, %arg2 : i64
+      %2 = arith.trunci %1 : i64 to i32
+      iree_linalg_ext.yield %1, %2 : i64, i32
+    } -> tensor<?x?xi64>
+  return %0 : tensor<?x?xi64>
+}
+
+// -----
+
+func @scatter_original_rank_mismatch(
+    %update : tensor<?xi64>, %indices : tensor<?x1xi32>,
+    %original : tensor<?x?xi64>) -> tensor<?x?xi64> {
+  // expected-error @+1 {{op index depth and update value does not cover rank of original value}}
+  %0 = iree_linalg_ext.scatter unique_indices(true)
+    ins(%update, %indices : tensor<?xi64>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xi64>) {
+    ^bb0(%arg1: i64, %arg2: i64):
+      %1 = arith.addi %arg1, %arg2 : i64
+      %2 = arith.trunci %1 : i64 to i32
+      iree_linalg_ext.yield %1, %2 : i64, i32
+    } -> tensor<?x?xi64>
+  return %0 : tensor<?x?xi64>
+}
+
+// -----
+
+func @reverse_diff_element_type(%arg0: tensor<3x5xi32>) -> tensor<3x5xf32> {
+  %init = linalg.init_tensor [3, 5] : tensor<3x5xf32>
+  // expected-error @+1 {{expected input/output element types to be identical}}
+  %0 = iree_linalg_ext.reverse
+         dimensions(dense<0> : tensor<1xi64>)
+         ins(%arg0 : tensor<3x5xi32>)
+         outs(%init : tensor<3x5xf32>) : tensor<3x5xf32>
+  return %0 : tensor<3x5xf32>
+}
+
+// -----
+
+func @reverse_diff_shape(%arg0: tensor<3x5xi32>) -> tensor<3x6xi32> {
+  %init = linalg.init_tensor [3, 6] : tensor<3x6xi32>
+  // expected-error @+1 {{incompatible input/output shapes}}
+  %0 = iree_linalg_ext.reverse
+         dimensions(dense<0> : tensor<1xi64>)
+         ins(%arg0 : tensor<3x5xi32>)
+         outs(%init : tensor<3x6xi32>) : tensor<3x6xi32>
+  return %0 : tensor<3x6xi32>
+}
+
+// -----
+
+func @reverse_dup_dims(%arg0: tensor<3x5xi32>) -> tensor<3x5xi32> {
+  %init = linalg.init_tensor [3, 5] : tensor<3x5xi32>
+  // expected-error @+1 {{expected dimensions numbers are all unique}}
+  %0 = iree_linalg_ext.reverse
+         dimensions(dense<[0, 0]> : tensor<2xi64>)
+         ins(%arg0 : tensor<3x5xi32>)
+         outs(%init : tensor<3x5xi32>) : tensor<3x5xi32>
+  return %0 : tensor<3x5xi32>
+}
+
+// -----
+
+func @not_enough_results() -> () {
+  %num_threads = arith.constant 100 : index
+  // expected-error@+1 {{'iree_linalg_ext.in_parallel' op produces 1 results, but its terminator yields 0 values}}
+  %result = iree_linalg_ext.in_parallel %num_threads -> tensor<100xf32> {
+    ^bb0(%thread_idx : index):
+      iree_linalg_ext.perform_concurrently {}
+  }
+}
+
+// -----
+
+func @too_many_results(%1 : tensor<1xf32>, %out : tensor<100xf32>) -> () {
+  %num_threads = arith.constant 100 : index
+  // expected-error@+1 {{'iree_linalg_ext.in_parallel' op produces 1 results, but its terminator yields 2 values}}
+  %result = iree_linalg_ext.in_parallel %num_threads -> tensor<100xf32> {
+    ^bb0(%thread_idx : index):
+      %0 = arith.constant 1 : index
+      iree_linalg_ext.perform_concurrently {
+        iree_linalg_ext.parallel_insert_slice %1 into %out[%thread_idx][%0][%0] :
+          tensor<1xf32> into tensor<100xf32>
+        iree_linalg_ext.parallel_insert_slice %1 into %out[%thread_idx][%0][%0] :
+          tensor<1xf32> into tensor<100xf32>
+      }
+  }
+}
+
+// -----
+
+func @type_mismatch(%1 : tensor<1xf32>, %out : tensor<200xf32>) -> () {
+  %num_threads = arith.constant 100 : index
+  // expected-error@+1 {{'iree_linalg_ext.in_parallel' op type mismatch between 0th result of in_parallel ('tensor<200xf32>') and 0th result yielded by its terminator ('tensor<100xf32>')}}
+  %result = iree_linalg_ext.in_parallel %num_threads -> tensor<100xf32> {
+    ^bb0(%thread_idx : index):
+      %0 = arith.constant 1 : index
+      iree_linalg_ext.perform_concurrently {
+        iree_linalg_ext.parallel_insert_slice %1 into %out[%thread_idx][%0][%0] :
+          tensor<1xf32> into tensor<200xf32>
+      }
+  }
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/pad_contraction_to_block_size.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/pad_contraction_to_block_size.mlir
new file mode 100644
index 0000000..385bff8
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/pad_contraction_to_block_size.mlir
@@ -0,0 +1,92 @@
+// RUN: iree-dialects-opt -pass-pipeline='iree-linalg-pad-contraction-to-block-size{rowAlignment=16 columnAlignment=32}' -split-input-file %s | FileCheck %s
+
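+// The pass pads each matmul operand with zeros so that row extents become
+// multiples of rowAlignment (16) and column extents multiples of
+// columnAlignment (32), runs the matmul on the padded tensors, and extracts
+// a slice of the original extents from the result.
+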
+// CHECK-LABEL: @pad_matmul_static
+// Full verification is done on this case. Others use reduced checks.
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_4:.*]] = tensor.pad %arg0 low[0, 0] high[6, 12]  {
+// CHECK:           ^bb0(%[[VAL_5:.*]]: index, %[[VAL_6:.*]]: index):
+// CHECK:             tensor.yield %[[VAL_3]] : f32
+// CHECK:           } : tensor<250x500xf32> to tensor<256x512xf32>
+// CHECK:           %[[VAL_7:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_8:.*]] = tensor.pad %arg1 low[0, 0] high[12, 4]  {
+// CHECK:           ^bb0(%[[VAL_9:.*]]: index, %[[VAL_10:.*]]: index):
+// CHECK:             tensor.yield %[[VAL_7]] : f32
+// CHECK:           } : tensor<500x1020xf32> to tensor<512x1024xf32>
+// CHECK:           %[[VAL_11:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_12:.*]] = tensor.pad %arg2 low[0, 0] high[6, 4]  {
+// CHECK:           ^bb0(%[[VAL_13:.*]]: index, %[[VAL_14:.*]]: index):
+// CHECK:             tensor.yield %[[VAL_11]] : f32
+// CHECK:           } : tensor<250x1020xf32> to tensor<256x1024xf32>
+// CHECK:           %[[VAL_15:.*]] = linalg.matmul ins(%[[VAL_16:.*]], %[[VAL_17:.*]] : tensor<256x512xf32>, tensor<512x1024xf32>) outs(%[[VAL_18:.*]] : tensor<256x1024xf32>) -> tensor<256x1024xf32>
+// CHECK:           %[[VAL_19:.*]] = tensor.extract_slice %[[VAL_15]][0, 0] [250, 1020] [1, 1] : tensor<256x1024xf32> to tensor<250x1020xf32>
+// CHECK:           return %[[VAL_19]] : tensor<250x1020xf32>
+func @pad_matmul_static(%arg0 : tensor<250x500xf32>, %arg1 : tensor<500x1020xf32>,
+        %arg2 : tensor<250x1020xf32>) -> tensor<250x1020xf32> {
+  %matmul = linalg.matmul
+      ins(%arg0, %arg1 : tensor<250x500xf32>, tensor<500x1020xf32>)
+      outs(%arg2 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
+  return %matmul : tensor<250x1020xf32>
+}
+
+// -----
+// CHECK-LABEL: @pad_matmul_noop
+// CHECK-NOT: pad_tensor
+// CHECK-NOT: extract_slice
+func @pad_matmul_noop(%arg0 : tensor<256x512xf32>, %arg1 : tensor<512x1024xf32>,
+        %arg2 : tensor<256x1024xf32>) -> tensor<256x1024xf32> {
+  %matmul = linalg.matmul
+      ins(%arg0, %arg1 : tensor<256x512xf32>, tensor<512x1024xf32>)
+      outs(%arg2 : tensor<256x1024xf32>) -> tensor<256x1024xf32>
+  return %matmul : tensor<256x1024xf32>
+}
+
+// -----
+// CHECK-LABEL: @pad_matmul_dynamic_row
+// Should trigger row alignment (16).
+// Pad LHS:
+// CHECK:           %[[LHS_DIM0:.*]] = arith.constant 0 : index
+// CHECK:           %[[LHS_DIM:.*]] = tensor.dim %arg0, %[[LHS_DIM0]] : tensor<?x512xf32>
+// CHECK:           %[[LHS_ALIGN:.*]] = arith.constant 16 : index
+// CHECK:           %[[LHS_DIM_ALIGNED:.*]] = iree_input.align %[[LHS_DIM]], %[[LHS_ALIGN]] : index
+// CHECK:           %[[LHS_ZERO:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[LHS_PADDED:.*]] = tensor.pad %arg0 low[0, 0] high{{\[}}%[[LHS_DIM_ALIGNED]], 0]   {
+// CHECK:           } : tensor<?x512xf32> to tensor<?x512xf32>
+// Pad Output:
+// CHECK:           %[[OUTPUT_PADDED:.*]] = tensor.pad %arg2 low[0, 0] high{{\[}}{{.*}}, 0]  {
+// CHECK:           } : tensor<?x1024xf32> to tensor<?x1024xf32>
+// Matmul:
+// CHECK:           %[[PADDED_RESULT:.*]] = linalg.matmul ins(%[[LHS_PADDED]], %arg1 : tensor<?x512xf32>, tensor<512x1024xf32>) outs(%[[OUTPUT_PADDED]] : tensor<?x1024xf32>) -> tensor<?x1024xf32>
+// CHECK:           %[[DIM0:.*]] = arith.constant 0 : index
+// CHECK:           %[[ORIG_DIM_VALUE:.*]] = tensor.dim %arg2, %[[DIM0]]
+// CHECK:           %[[RETURN:.*]] = tensor.extract_slice %[[PADDED_RESULT]][0, 0] {{\[}}%[[ORIG_DIM_VALUE]], 1024] [1, 1] : tensor<?x1024xf32> to tensor<?x1024xf32>
+// CHECK:           return %[[RETURN]] : tensor<?x1024xf32>
+func @pad_matmul_dynamic_row(%arg0 : tensor<?x512xf32>, %arg1 : tensor<512x1024xf32>,
+        %arg2 : tensor<?x1024xf32>) -> tensor<?x1024xf32> {
+  %matmul = linalg.matmul
+      ins(%arg0, %arg1 : tensor<?x512xf32>, tensor<512x1024xf32>)
+      outs(%arg2 : tensor<?x1024xf32>) -> tensor<?x1024xf32>
+  return %matmul : tensor<?x1024xf32>
+}
+
+// -----
+// CHECK-LABEL: @pad_matmul_dynamic_col
+// Should trigger column alignment (32).
+// Pad RHS:
+// CHECK:           %[[RHS_ALIGNMENT:.*]] = arith.constant 32 : index
+// CHECK:           %[[RHS_ALIGNED_DIM:.*]] = iree_input.align %{{.*}}, %[[RHS_ALIGNMENT]] : index
+// CHECK:           %[[RHS_PADDED:.*]] = tensor.pad %arg1 low[0, 0] high[0, %[[RHS_ALIGNED_DIM]]]  {
+// CHECK:           } : tensor<512x?xf32> to tensor<512x?xf32>
+// Pad Output:
+// CHECK:           %[[OUTPUT_ALIGNMENT:.*]] = arith.constant 32 : index
+// CHECK:           %[[OUTPUT_ALIGNED_DIM:.*]] = iree_input.align %{{.*}}, %[[OUTPUT_ALIGNMENT]] : index
+// CHECK:           %[[OUTPUT_PADDED:.*]] = tensor.pad %arg2 low[0, 0] high[0, %[[OUTPUT_ALIGNED_DIM]]]  {
+// CHECK:           } : tensor<256x?xf32> to tensor<256x?xf32>
+// Matmul:
+// CHECK:           %{{.*}} = linalg.matmul ins(%arg0, %[[RHS_PADDED]] : tensor<256x512xf32>, tensor<512x?xf32>) outs(%[[OUTPUT_PADDED]] : tensor<256x?xf32>) -> tensor<256x?xf32>
+func @pad_matmul_dynamic_col(%arg0 : tensor<256x512xf32>, %arg1 : tensor<512x?xf32>,
+        %arg2 : tensor<256x?xf32>) -> tensor<256x?xf32> {
+  %matmul = linalg.matmul
+      ins(%arg0, %arg1 : tensor<256x512xf32>, tensor<512x?xf32>)
+      outs(%arg2 : tensor<256x?xf32>) -> tensor<256x?xf32>
+  return %matmul : tensor<256x?xf32>
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/pad_tiling.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/pad_tiling.mlir
new file mode 100644
index 0000000..d4ad8f0
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/pad_tiling.mlir
@@ -0,0 +1,41 @@
+// RUN: iree-dialects-opt -iree-linalg-ext-tile -split-input-file %s | FileCheck %s
+// XFAIL: *
+// TODO: Re-enable when upstream tensor.pad op properly implements the tiling
+// interface.
+
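+// Tiling tensor.pad produces an scf.for nest in which each tile is computed
+// by an scf.if (either the padding value or a copy of the source region) and
+// then placed into the loop iter_arg with tensor.insert_slice.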
+func @pad_tensor(%arg0 : tensor<?x?xf32>, %arg1 : index, %arg2 : index,
+    %arg3 : index, %arg4 : index, %arg5 : f32) -> tensor<?x?xf32> {
+  %0 = tensor.pad %arg0 low[%arg1, %arg2] high[%arg3, %arg4] {
+    ^bb0(%arg6 : index, %arg7 : index):
+      tensor.yield %arg5 : f32
+  } {__internal_iree_linalg_transform__ = "tiling_input"}
+      :  tensor<?x?xf32> to tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0, s1, s2] -> (s2 + s0 + s1)>
+//      CHECK: func @pad_tensor
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:   %[[ARG4:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:   %[[ARG5:[a-zA-Z0-9]+]]: f32
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor
+//      CHECK:   %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
+//      CHECK:   %[[UBY:.+]] = affine.apply #[[MAP0]]()[%[[ARG1]], %[[ARG3]], %[[D0]]]
+//      CHECK:   %[[D1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
+//      CHECK:   %[[UBX:.+]] = affine.apply #[[MAP0]]()[%[[ARG2]], %[[ARG4]], %[[D1]]]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[UBY]] step %[[C10]]
+// CHECK-SAME:       iter_args(%[[ARG7:.+]] = %[[INIT]])
+//      CHECK:     %[[YIELD:.+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[UBX]] step %[[C20]]
+// CHECK-SAME:         iter_args(%[[ARG9:.+]] = %[[ARG7]])
+//      CHECK:       %[[PAD_TILE:.+]] = scf.if
+//      CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[PAD_TILE]] into %[[ARG9]]
+// CHECK-SAME:           [%[[IV0]], %[[IV1]]]
+//      CHECK:       scf.yield %[[INSERT]]
+//      CHECK:     scf.yield %[[YIELD]]
+//      CHECK:   return %[[RESULT]]
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/roundtrip.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/roundtrip.mlir
new file mode 100644
index 0000000..98b2c71
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/roundtrip.mlir
@@ -0,0 +1,596 @@
+// RUN: iree-dialects-opt -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: func @sort_tensor
+// CHECK:         iree_linalg_ext.sort
+// CHECK-SAME:      dimension(0)
+// CHECK-SAME:      outs({{.*}})
+// CHECK:           iree_linalg_ext.yield
+func @sort_tensor(%arg0: tensor<128xi32>) -> tensor<128xi32> {
+  %0 = iree_linalg_ext.sort
+    dimension(0)
+    outs(%arg0 : tensor<128xi32>) {
+  ^bb0(%arg1: i32, %arg2: i32):  // no predecessors
+    %1 = arith.cmpi sgt, %arg1, %arg2 : i32
+    iree_linalg_ext.yield %1 : i1
+  } -> tensor<128xi32>
+  return %0 : tensor<128xi32>
+}
+
+// -----
+
+// CHECK-LABEL: func @sort_memref
+// CHECK:         iree_linalg_ext.sort
+// CHECK-SAME:      dimension(0)
+// CHECK-SAME:      outs({{.*}})
+// CHECK:           iree_linalg_ext.yield
+func @sort_memref(%arg0: memref<128xi32>) {
+  iree_linalg_ext.sort dimension(0)
+    outs(%arg0 : memref<128xi32>) {
+  ^bb0(%arg1: i32, %arg2: i32):  // no predecessors
+    %0 = arith.cmpi sgt, %arg1, %arg2 : i32
+    iree_linalg_ext.yield %0 : i1
+  }
+  return
+}
+
+// -----
+
+func @sort_multi_result_tensor(
+    %arg0: tensor<?x?xi32>, %arg1: tensor<?x?xf32>)
+    -> (tensor<?x?xi32>, tensor<?x?xf32>) {
+  %0:2 = iree_linalg_ext.sort dimension(0)
+      outs(%arg0, %arg1 : tensor<?x?xi32>, tensor<?x?xf32>) {
+      ^bb0(%arg2: i32, %arg3: i32, %arg4 : f32, %arg5 : f32):  // no predecessors
+        %1 = arith.cmpf ogt, %arg4, %arg5 : f32
+        iree_linalg_ext.yield %1 : i1
+      } -> tensor<?x?xi32>, tensor<?x?xf32>
+  return %0#0, %0#1 : tensor<?x?xi32>, tensor<?x?xf32>
+}
+// CHECK-LABEL: func @sort_multi_result_tensor
+//  CHECK-SAME:   %[[ARG0:.+]]: tensor<?x?xi32>
+//  CHECK-SAME:   %[[ARG1:.+]]: tensor<?x?xf32>
+//       CHECK:   %[[RESULT:.+]]:2 = iree_linalg_ext.sort dimension(0)
+//  CHECK-SAME:      outs(%[[ARG0]], %[[ARG1]]
+//       CHECK:   return %[[RESULT]]#0, %[[RESULT]]#1
+
+// -----
+
+func @sort_multi_result_memref(
+    %arg0: memref<?x?xi32>, %arg1: memref<?x?xf32>) {
+  iree_linalg_ext.sort dimension(0)
+     outs(%arg0, %arg1 : memref<?x?xi32>, memref<?x?xf32>) {
+     ^bb0(%arg2: i32, %arg3: i32, %arg4 : f32, %arg5 : f32):  // no predecessors
+       %1 = arith.cmpf ogt, %arg4, %arg5 : f32
+       iree_linalg_ext.yield %1 : i1
+     }
+  return
+}
+// CHECK-LABEL: func @sort_multi_result_memref
+//  CHECK-SAME:   %[[ARG0:.+]]: memref<?x?xi32>
+//  CHECK-SAME:   %[[ARG1:.+]]: memref<?x?xf32>
+//       CHECK:   iree_linalg_ext.sort dimension(0)
+//  CHECK-SAME:      outs(%[[ARG0]], %[[ARG1]]
+
+// -----
+
+func @scatter_tensor_dynamic(
+    %original: tensor<?x?xf32>, %indices: tensor<?x1xi32>,
+    %update: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = iree_linalg_ext.scatter
+    unique_indices(true)
+    ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
+    outs(%original: tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+// CHECK-LABEL: func @scatter_tensor_dynamic(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: tensor<?x1xi32>
+//  CHECK-SAME:   %[[UPDATE:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//       CHECK:   %[[RESULT:.+]] = iree_linalg_ext.scatter
+//  CHECK-SAME:     unique_indices(true)
+//  CHECK-SAME:     ins(%[[UPDATE]], %[[INDICES]]
+//  CHECK-SAME:     outs(%[[ORIGINAL]]
+//       CHECK:     iree_linalg_ext.yield %{{.+}} : f32
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @scatter_repeated_tensor_dynamic(
+    %original: tensor<?x?xf32>, %indices: tensor<?x1xi32>,
+    %update: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = iree_linalg_ext.scatter
+    unique_indices(false)
+    ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
+    outs(%original: tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+// CHECK-LABEL: func @scatter_repeated_tensor_dynamic(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: tensor<?x1xi32>
+//  CHECK-SAME:   %[[UPDATE:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//       CHECK:   %[[RESULT:.+]] = iree_linalg_ext.scatter
+//  CHECK-SAME:     unique_indices(false)
+//  CHECK-SAME:     ins(%[[UPDATE]], %[[INDICES]]
+//  CHECK-SAME:     outs(%[[ORIGINAL]]
+//       CHECK:     iree_linalg_ext.yield %{{.+}} : f32
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @scatter_tensor_static(
+    %original: tensor<128x3xf32>, %indices: tensor<48x1xi32>,
+    %update: tensor<48x3xf32>) -> tensor<128x3xf32> {
+  %0 = iree_linalg_ext.scatter
+    unique_indices(true)
+    ins(%update, %indices : tensor<48x3xf32>, tensor<48x1xi32>)
+    outs(%original: tensor<128x3xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<128x3xf32>
+  return %0 : tensor<128x3xf32>
+}
+// CHECK-LABEL: func @scatter_tensor_static(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: tensor<128x3xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: tensor<48x1xi32>
+//  CHECK-SAME:   %[[UPDATE:[a-zA-Z0-9_]+]]: tensor<48x3xf32>
+//       CHECK:   %[[RESULT:.+]] = iree_linalg_ext.scatter
+//  CHECK-SAME:     unique_indices(true)
+//  CHECK-SAME:     ins(%[[UPDATE]], %[[INDICES]]
+//  CHECK-SAME:     outs(%[[ORIGINAL]]
+//       CHECK:     iree_linalg_ext.yield %{{.+}} : f32
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @scatter_tensor_multi_index_depth(
+    %original: tensor<1x128x3xf32>, %indices: tensor<48x2xi32>,
+    %update: tensor<48x3xf32>) -> tensor<1x128x3xf32> {
+  %0 = iree_linalg_ext.scatter
+    unique_indices(true)
+    ins(%update, %indices : tensor<48x3xf32>, tensor<48x2xi32>)
+    outs(%original: tensor<1x128x3xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<1x128x3xf32>
+  return %0 : tensor<1x128x3xf32>
+}
+// CHECK-LABEL: func @scatter_tensor_multi_index_depth(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: tensor<1x128x3xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: tensor<48x2xi32>
+//  CHECK-SAME:   %[[UPDATE:[a-zA-Z0-9_]+]]: tensor<48x3xf32>
+//       CHECK:   %[[RESULT:.+]] = iree_linalg_ext.scatter
+//  CHECK-SAME:     unique_indices(true)
+//  CHECK-SAME:     ins(%[[UPDATE]], %[[INDICES]]
+//  CHECK-SAME:     outs(%[[ORIGINAL]]
+//       CHECK:     iree_linalg_ext.yield %{{.+}} : f32
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @scatter_memref_dynamic(
+    %original: memref<?x?xf32>, %indices: memref<?x1xi32>,
+    %update: memref<?x?xf32>) {
+  iree_linalg_ext.scatter
+    unique_indices(true)
+    ins(%update, %indices : memref<?x?xf32>, memref<?x1xi32>)
+    outs(%original: memref<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    }
+  return
+}
+// CHECK-LABEL: func @scatter_memref_dynamic(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: memref<?x1xi32>
+//  CHECK-SAME:   %[[UPDATE:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+//       CHECK:   iree_linalg_ext.scatter
+//  CHECK-SAME:     unique_indices(true)
+//  CHECK-SAME:     ins(%[[UPDATE]], %[[INDICES]]
+//  CHECK-SAME:     outs(%[[ORIGINAL]]
+//       CHECK:     iree_linalg_ext.yield %{{.+}} : f32
+//       CHECK:   return
+
+// -----
+
+func @scatter_memref_static(
+    %original: memref<128x3xf32>, %indices: memref<48x1xi32>,
+    %update: memref<48x3xf32>) {
+  iree_linalg_ext.scatter
+    unique_indices(true)
+    ins(%update, %indices : memref<48x3xf32>, memref<48x1xi32>)
+    outs(%original: memref<128x3xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    }
+  return
+}
+// CHECK-LABEL: func @scatter_memref_static(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: memref<128x3xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: memref<48x1xi32>
+//  CHECK-SAME:   %[[UPDATE:[a-zA-Z0-9_]+]]: memref<48x3xf32>
+//       CHECK:   iree_linalg_ext.scatter
+//  CHECK-SAME:     unique_indices(true)
+//  CHECK-SAME:     ins(%[[UPDATE]], %[[INDICES]]
+//  CHECK-SAME:     outs(%[[ORIGINAL]]
+//       CHECK:     iree_linalg_ext.yield %{{.+}} : f32
+//       CHECK:   return
+
+// -----
+
+func @scatter_memref_multi_index_depth(
+    %original: memref<1x128x3xf32>, %indices: memref<48x2xi32>,
+    %update: memref<48x3xf32>) {
+  iree_linalg_ext.scatter
+    unique_indices(true)
+    ins(%update, %indices : memref<48x3xf32>, memref<48x2xi32>)
+    outs(%original: memref<1x128x3xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    }
+  return
+}
+// CHECK-LABEL: func @scatter_memref_multi_index_depth(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: memref<1x128x3xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: memref<48x2xi32>
+//  CHECK-SAME:   %[[UPDATE:[a-zA-Z0-9_]+]]: memref<48x3xf32>
+//       CHECK:   iree_linalg_ext.scatter
+//  CHECK-SAME:     unique_indices(true)
+//  CHECK-SAME:     ins(%[[UPDATE]], %[[INDICES]]
+//  CHECK-SAME:     outs(%[[ORIGINAL]]
+//       CHECK:     iree_linalg_ext.yield %{{.+}} : f32
+//       CHECK:   return
+
+// -----
+
+func @scatter_update_scalar_1D(
+    %original: tensor<8xi32>, %indices: tensor<3x1xi32>,
+    %updates: tensor<3xi32>) -> tensor<8xi32> {
+  %0 = iree_linalg_ext.scatter
+    unique_indices(true)
+    ins(%updates, %indices : tensor<3xi32>, tensor<3x1xi32>)
+    outs(%original : tensor<8xi32>)  {
+    ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
+      iree_linalg_ext.yield %arg0 : i32
+    } -> tensor<8xi32>
+  return %0 : tensor<8xi32>
+}
+// CHECK-LABEL: func @scatter_update_scalar_1D(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[UPDATE:[a-zA-Z0-9_]+]]
+//       CHECK:   %[[RESULT:.+]] = iree_linalg_ext.scatter
+//  CHECK-SAME:     unique_indices(true)
+//  CHECK-SAME:     ins(%[[UPDATE]], %[[INDICES]]
+//  CHECK-SAME:     outs(%[[ORIGINAL]]
+//       CHECK:     iree_linalg_ext.yield %{{.+}} : i32
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @scatter_update_scalar_2D(
+    %original: tensor<4x3xi32>, %indices: tensor<3x2xi32>,
+    %updates: tensor<3xi32>) -> tensor<4x3xi32> {
+  %0 = iree_linalg_ext.scatter
+    unique_indices(true)
+    ins(%updates, %indices : tensor<3xi32>, tensor<3x2xi32>)
+    outs(%original : tensor<4x3xi32>)  {
+    ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
+      iree_linalg_ext.yield %arg0 : i32
+    } -> tensor<4x3xi32>
+  return %0 : tensor<4x3xi32>
+}
+// CHECK-LABEL: func @scatter_update_scalar_2D(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[UPDATE:[a-zA-Z0-9_]+]]
+//       CHECK:   %[[RESULT:.+]] = iree_linalg_ext.scatter
+//  CHECK-SAME:     unique_indices(true)
+//  CHECK-SAME:     ins(%[[UPDATE]], %[[INDICES]]
+//  CHECK-SAME:     outs(%[[ORIGINAL]]
+//       CHECK:     iree_linalg_ext.yield %{{.+}} : i32
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @scatter_update_slice_2D(
+    %original: tensor<4x3xi32>, %indices: tensor<1x1xi32>,
+    %updates: tensor<1x3xi32>) -> tensor<4x3xi32> {
+  %0 = iree_linalg_ext.scatter
+    unique_indices(true)
+    ins(%updates, %indices : tensor<1x3xi32>, tensor<1x1xi32>)
+    outs(%original : tensor<4x3xi32>)  {
+    ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
+      iree_linalg_ext.yield %arg0 : i32
+    } -> tensor<4x3xi32>
+  return %0 : tensor<4x3xi32>
+}
+// CHECK-LABEL: func @scatter_update_slice_2D(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[UPDATE:[a-zA-Z0-9_]+]]
+//       CHECK:   %[[RESULT:.+]] = iree_linalg_ext.scatter
+//  CHECK-SAME:     unique_indices(true)
+//  CHECK-SAME:     ins(%[[UPDATE]], %[[INDICES]]
+//  CHECK-SAME:     outs(%[[ORIGINAL]]
+//       CHECK:     iree_linalg_ext.yield %{{.+}} : i32
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @fft_tensor(%arg0: tensor<1024xf32>, %arg1: tensor<1024xf32>)
+    -> (tensor<1024xf32>, tensor<1024xf32>) {
+  %cst1 = arith.constant 1 : index
+  %0:2 = iree_linalg_ext.fft
+    ins(%cst1: index)
+    outs(%arg0, %arg1: tensor<1024xf32>, tensor<1024xf32>)
+  : tensor<1024xf32>, tensor<1024xf32>
+  return %0#0, %0#1 : tensor<1024xf32>, tensor<1024xf32>
+}
+// CHECK-LABEL: func @fft_tensor(
+//  CHECK-SAME:   %[[REAL:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[IMAG:[a-zA-Z0-9_]+]]
+//       CHECK:   %[[CST:.+]] = arith.constant 1 : index
+//       CHECK:   %[[RES:.+]]:2 = iree_linalg_ext.fft
+//  CHECK-SAME:     ins(%[[CST]] : index)
+//  CHECK-SAME:    outs(%[[REAL]], %[[IMAG]] : tensor<1024xf32>, tensor<1024xf32>)
+//  CHECK-SAME:   : tensor<1024xf32>, tensor<1024xf32>
+//       CHECK:   return %[[RES]]#0, %[[RES]]#1
+
+// -----
+
+func @fft_memref(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  %cst1 = arith.constant 1 : index
+  iree_linalg_ext.fft
+    ins(%cst1: index)
+    outs(%arg0, %arg1: memref<1024xf32>, memref<1024xf32>)
+  return
+}
+// CHECK-LABEL: func @fft_memref(
+//  CHECK-SAME:   %[[REAL:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[IMAG:[a-zA-Z0-9_]+]]
+//       CHECK:   %[[CST:.+]] = arith.constant 1 : index
+//       CHECK:   iree_linalg_ext.fft
+//  CHECK-SAME:     ins(%[[CST]] : index)
+//  CHECK-SAME:    outs(%[[REAL]], %[[IMAG]] : memref<1024xf32>, memref<1024xf32>)
+//       CHECK:   return
+
+// -----
+
+func @fft_tensor_coef(%arg0: tensor<1024xf32>, %arg1: tensor<1024xf32>,
+    %arg2: tensor<1xf32>, %arg3: tensor<1xf32>) -> (tensor<1024xf32>, tensor<1024xf32>) {
+  %cst1 = arith.constant 1 : index
+  %0:2 = iree_linalg_ext.fft
+    ins(%cst1, %arg2, %arg3: index, tensor<1xf32>, tensor<1xf32>)
+    outs(%arg0, %arg1: tensor<1024xf32>, tensor<1024xf32>)
+  : tensor<1024xf32>, tensor<1024xf32>
+  return %0#0, %0#1 : tensor<1024xf32>, tensor<1024xf32>
+}
+// CHECK-LABEL: func @fft_tensor_coef(
+//  CHECK-SAME:   %[[REAL:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[IMAG:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[COEF_REAL:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[COEF_IMAG:[a-zA-Z0-9_]+]]
+//       CHECK:   %[[CST:.+]] = arith.constant 1 : index
+//       CHECK:   %[[RES:.+]]:2 = iree_linalg_ext.fft
+//  CHECK-SAME:     ins(%[[CST]], %[[COEF_REAL]], %[[COEF_IMAG]] : index, tensor<1xf32>, tensor<1xf32>)
+//  CHECK-SAME:    outs(%[[REAL]], %[[IMAG]] : tensor<1024xf32>, tensor<1024xf32>)
+//  CHECK-SAME:   : tensor<1024xf32>, tensor<1024xf32>
+//       CHECK:   return %[[RES]]#0, %[[RES]]#1
+
+// -----
+
+func @fft_memref_coef(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>,
+                 %arg2: memref<1xf32>, %arg3: memref<1xf32>) {
+  %cst1 = arith.constant 1 : index
+  iree_linalg_ext.fft
+    ins(%cst1, %arg2, %arg3: index, memref<1xf32>, memref<1xf32>)
+    outs(%arg0, %arg1: memref<1024xf32>, memref<1024xf32>)
+  return
+}
+// CHECK-LABEL: func @fft_memref_coef(
+//  CHECK-SAME:   %[[REAL:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[IMAG:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[COEF_REAL:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[COEF_IMAG:[a-zA-Z0-9_]+]]
+//       CHECK:   %[[CST:.+]] = arith.constant 1 : index
+//       CHECK:   iree_linalg_ext.fft
+//  CHECK-SAME:     ins(%[[CST]], %[[COEF_REAL]], %[[COEF_IMAG]] : index, memref<1xf32>, memref<1xf32>)
+//  CHECK-SAME:    outs(%[[REAL]], %[[IMAG]] : memref<1024xf32>, memref<1024xf32>)
+//       CHECK:   return
+
+// -----
+
+// The size of the coefficient tensor is 2^(stage-1).
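+// E.g. stage 5 below uses tensor<16xf32> coefficients: 2^(5-1) = 16.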
+func @fft_tensor_coef_stage_5(%arg0: tensor<1024xf32>, %arg1: tensor<1024xf32>,
+    %arg2: tensor<16xf32>, %arg3: tensor<16xf32>) -> (tensor<1024xf32>, tensor<1024xf32>) {
+  %cst1 = arith.constant 5 : index
+  %0:2 = iree_linalg_ext.fft
+    ins(%cst1, %arg2, %arg3: index, tensor<16xf32>, tensor<16xf32>)
+    outs(%arg0, %arg1: tensor<1024xf32>, tensor<1024xf32>)
+  : tensor<1024xf32>, tensor<1024xf32>
+  return %0#0, %0#1 : tensor<1024xf32>, tensor<1024xf32>
+}
+// CHECK-LABEL: func @fft_tensor_coef_stage_5(
+//  CHECK-SAME:   %[[REAL:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[IMAG:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[COEF_REAL:[a-zA-Z0-9_]+]]
+//  CHECK-SAME:   %[[COEF_IMAG:[a-zA-Z0-9_]+]]
+//       CHECK:   %[[CST:.+]] = arith.constant 5 : index
+//       CHECK:   %[[RES:.+]]:2 = iree_linalg_ext.fft
+//  CHECK-SAME:     ins(%[[CST]], %[[COEF_REAL]], %[[COEF_IMAG]] : index, tensor<16xf32>, tensor<16xf32>)
+//  CHECK-SAME:    outs(%[[REAL]], %[[IMAG]] : tensor<1024xf32>, tensor<1024xf32>)
+//  CHECK-SAME:   : tensor<1024xf32>, tensor<1024xf32>
+//       CHECK:   return %[[RES]]#0, %[[RES]]#1
+
+// -----
+
+func @reverse_tensor(%arg0: tensor<3x5xi32>) -> tensor<3x5xi32> {
+  %init = linalg.init_tensor [3, 5] : tensor<3x5xi32>
+  %0 = iree_linalg_ext.reverse
+         dimensions(dense<0> : tensor<1xi64>)
+         ins(%arg0 : tensor<3x5xi32>)
+         outs(%init : tensor<3x5xi32>) : tensor<3x5xi32>
+  return %0 : tensor<3x5xi32>
+}
+// CHECK-LABEL: func @reverse_tensor
+//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<3x5xi32>
+//       CHECK:   %[[INIT:.+]] = linalg.init_tensor [3, 5]
+//       CHECK:   %[[RESULT:.+]] = iree_linalg_ext.reverse
+//  CHECK-SAME:      dimensions(dense<0> : tensor<1xi64>)
+//  CHECK-SAME:      ins(%[[ARG0]]
+//  CHECK-SAME:      outs(%[[INIT]]
+
+// -----
+
+func @reverse_memref(%arg0: memref<3x5xi32>, %arg1: memref<3x5xi32>) {
+  iree_linalg_ext.reverse
+    dimensions(dense<0> : tensor<1xi64>)
+    ins(%arg0 : memref<3x5xi32>)
+    outs(%arg1 : memref<3x5xi32>)
+  return
+}
+// CHECK-LABEL: func @reverse_memref
+//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: memref<3x5xi32>
+//  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: memref<3x5xi32>
+//       CHECK:   iree_linalg_ext.reverse
+//  CHECK-SAME:      dimensions(dense<0> : tensor<1xi64>)
+//  CHECK-SAME:      ins(%[[ARG0]]
+//  CHECK-SAME:      outs(%[[ARG1]]
+
+// -----
+
+func @reverse_dynamic_tensor(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xi32>
+  %d1 = tensor.dim %arg0, %c1 : tensor<?x?xi32>
+  %init = linalg.init_tensor [%d0, %d1] : tensor<?x?xi32>
+  %0 = iree_linalg_ext.reverse
+         dimensions(dense<1> : tensor<1xi64>)
+         ins(%arg0 : tensor<?x?xi32>)
+         outs(%init : tensor<?x?xi32>) : tensor<?x?xi32>
+  return %0 : tensor<?x?xi32>
+}
+// CHECK-LABEL: func @reverse_dynamic_tensor
+//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xi32>
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
+//       CHECK:   %[[INIT:.+]] = linalg.init_tensor [%[[D0]], %[[D1]]]
+//       CHECK:   %[[RESULT:.+]] = iree_linalg_ext.reverse
+//  CHECK-SAME:      dimensions(dense<1> : tensor<1xi64>)
+//  CHECK-SAME:      ins(%[[ARG0]]
+//  CHECK-SAME:      outs(%[[INIT]]
+
+// -----
+
+func @reverse_static_dynamic_tensor(%arg0: tensor<3x5xi32>) -> tensor<?x?xi32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %d0 = tensor.dim %arg0, %c0 : tensor<3x5xi32>
+  %d1 = tensor.dim %arg0, %c1 : tensor<3x5xi32>
+  %init = linalg.init_tensor [%d0, %d1] : tensor<?x?xi32>
+  %0 = iree_linalg_ext.reverse
+         dimensions(dense<1> : tensor<1xi64>)
+         ins(%arg0 : tensor<3x5xi32>)
+         outs(%init : tensor<?x?xi32>) : tensor<?x?xi32>
+  return %0 : tensor<?x?xi32>
+}
+// CHECK-LABEL: func @reverse_static_dynamic_tensor
+//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<3x5xi32>
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
+//       CHECK:   %[[INIT:.+]] = linalg.init_tensor [%[[D0]], %[[D1]]]
+//       CHECK:   %[[RESULT:.+]] = iree_linalg_ext.reverse
+//  CHECK-SAME:      dimensions(dense<1> : tensor<1xi64>)
+//  CHECK-SAME:      ins(%[[ARG0]]
+//  CHECK-SAME:      outs(%[[INIT]]
+
+// -----
+
+func @reverse_multi_dims(%arg0: tensor<3x5xi32>) -> tensor<3x5xi32> {
+  %init = linalg.init_tensor [3, 5] : tensor<3x5xi32>
+  %0 = iree_linalg_ext.reverse
+         dimensions(dense<[0, 1]> : tensor<2xi64>)
+         ins(%arg0 : tensor<3x5xi32>)
+         outs(%init : tensor<3x5xi32>) : tensor<3x5xi32>
+  return %0 : tensor<3x5xi32>
+}
+// CHECK-LABEL: func @reverse_multi_dims
+//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<3x5xi32>
+//       CHECK:   %[[INIT:.+]] = linalg.init_tensor [3, 5]
+//       CHECK:   %[[RESULT:.+]] = iree_linalg_ext.reverse
+//  CHECK-SAME:      dimensions(dense<[0, 1]> : tensor<2xi64>)
+//  CHECK-SAME:      ins(%[[ARG0]]
+//  CHECK-SAME:      outs(%[[INIT]]
+
+// -----
+
+// CHECK-LABEL: func @static_tile
+func @static_tile(%chunk_size: index, %in: tensor<?xf32>, %out: tensor<?xf32>, %out2: tensor<?xf32>) -> (tensor<?xf32>) {
+
+  // CHECK: iree_linalg_ext.tile %{{.*}} outs(%{{.*}}: tensor<?xf32>, %{{.*}}: tensor<?xf32>)
+  // CHECK: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: tensor<?xf32>, %{{.*}}: tensor<?xf32>):
+  %0:2 = iree_linalg_ext.tile %chunk_size outs(%out: tensor<?xf32>, %out2: tensor<?xf32>)
+      -> (tensor<?xf32>, tensor<?xf32>) {
+    // TODO: one offset and one size per tensor?
+    // If not necessary in the dense strided-array world, what about the rest?
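+    // Block args: the offset and size of the current tile along the tiled
+    // dimension, followed by one output slice per result.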
+    ^bb0(%offset: index, %size: index, %st1: tensor<?xf32>, %st2: tensor<?xf32>):
+      // TODO: at the moment this is just 1-1: out-chunk-size -> in-size.
+      %1 = tensor.extract_slice %in[%offset][%size][1] : tensor<?xf32> to tensor<?xf32>
+      %3 = linalg.generic {
+           indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+           iterator_types = ["parallel"]}
+         ins(%1: tensor<?xf32>) outs(%st1: tensor<?xf32>) {
+         ^bb0(%a: f32, %b:f32):  // no predecessors
+           %f42 = arith.constant 42.0: f32
+           %tmp = arith.mulf %a, %f42: f32
+           linalg.yield %tmp: f32
+      } -> tensor<?xf32>
+      iree_linalg_ext.tile_yield %3, %st2: tensor<?xf32>, tensor<?xf32> // assumes the tiled dim is 0; results are stacked along it
+  }
+  return %0#0: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @simple_example
+func @simple_example(%in: tensor<100xf32>, %out: tensor<100xf32>) -> (tensor<100xf32>) {
+  %num_threads = arith.constant 100 : index
+  %result = iree_linalg_ext.in_parallel %num_threads -> tensor<100xf32> {
+    ^bb0(%thread_idx : index):
+      %c1 = arith.constant 1 : index
+      %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
+      iree_linalg_ext.perform_concurrently {
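+        // parallel_insert_slice takes tensor.insert_slice-style
+        // [offsets] [sizes] [strides]; each thread writes its one-element
+        // slice at offset %thread_idx with unit size and stride.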
+        iree_linalg_ext.parallel_insert_slice %1 into %out[%thread_idx][%c1][%c1] :
+          tensor<1xf32> into tensor<100xf32>
+      }
+  }
+  return %result : tensor<100xf32>
+}
+
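+// in_parallel with no results and an empty body: checks that the op
+// round-trips without an explicit terminator.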
+func @no_terminator() -> () {
+  %num_threads = arith.constant 100 : index
+  iree_linalg_ext.in_parallel %num_threads -> () {
+    ^bb0(%thread_idx : index):
+  }
+  return
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/tiling.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/tiling.mlir
new file mode 100644
index 0000000..ccdc7f8
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/tiling.mlir
@@ -0,0 +1,1352 @@
+// RUN: iree-dialects-opt -iree-linalg-ext-tile -split-input-file -verify-diagnostics %s | FileCheck %s
+
+func @scatter_tiling(
+    %original: tensor<?x?xf32>, %indices: tensor<?x1xi32>,
+    %update : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = iree_linalg_ext.scatter
+    {__internal_linalg_transform__ = "tiling_input"}
+    unique_indices(true)
+    ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
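+// The tile sizes (10 along dim 0, 20 along dim 1) are supplied by the test
+// pass, selected via the __internal_linalg_transform__ filter attribute.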
+//   CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//   CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//       CHECK: func @scatter_tiling(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: tensor<?x1xi32>
+//  CHECK-SAME:   %[[UPDATES:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//   CHECK-DAG:   %[[TILESIZEY:.+]] = arith.constant 10 : index
+//   CHECK-DAG:   %[[TILESIZEX:.+]] = arith.constant 20 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[UPDATES]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[UPDATES]], %[[C1]]
+//       CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:.+]] = %[[C0]] to %[[D0]] step %[[TILESIZEY]]
+//  CHECK-SAME:       iter_args(%[[INITY:.+]] = %[[ORIGINAL]])
+//   CHECK-DAG:     %[[USED_TILESIZEY:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[TILESIZEY]], %[[D0]]]
+//       CHECK:     %[[RESULT_INNER:.+]] = scf.for %[[IV1:.+]] = %[[C0]] to %[[D1]] step %[[TILESIZEX]]
+//  CHECK-SAME:         iter_args(%[[INITX:.+]] = %[[INITY]])
+//       CHECK:       %[[USED_TILESIZEX:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[TILESIZEX]], %[[D1]]]
+//       CHECK:       %[[UPDATE_SLICE:.+]] = tensor.extract_slice %[[UPDATES]][%[[IV0]], %[[IV1]]]
+//  CHECK-SAME:           [%[[USED_TILESIZEY]], %[[USED_TILESIZEX]]]
+//       CHECK:       %[[INDEX_SLICE:.+]] = tensor.extract_slice %[[INDICES]][%[[IV0]], 0]
+//  CHECK-SAME:           [%[[USED_TILESIZEY]], 1]
+//       CHECK:       %[[SCATTER_DIM:.+]] = tensor.dim %[[ORIGINAL]], %[[C0]]
+//       CHECK:       %[[ORIGINAL_SLICE:.+]] = tensor.extract_slice %[[INITX]][0, %[[IV1]]]
+//  CHECK-SAME:           [%[[SCATTER_DIM]], %[[USED_TILESIZEX]]]
+//       CHECK:       %[[SCATTER_TILE:.+]] = iree_linalg_ext.scatter
+//  CHECK-SAME:           __internal_linalg_transform__ = "tiling_output"
+//  CHECK-SAME:           unique_indices(true)
+//  CHECK-SAME:           ins(%[[UPDATE_SLICE]], %[[INDEX_SLICE]]
+//  CHECK-SAME:           outs(%[[ORIGINAL_SLICE]]
+//       CHECK:       %[[YIELD:.+]] = tensor.insert_slice %[[SCATTER_TILE]] into %[[INITX]][0, %[[IV1]]]
+//  CHECK-SAME:           [%[[SCATTER_DIM]], %[[USED_TILESIZEX]]]
+//       CHECK:       scf.yield %[[YIELD]]
+//       CHECK:     scf.yield %[[RESULT_INNER]]
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @scatter_tiling_memref(
+    %original: memref<?x?xf32>, %indices: memref<?x1xi32>,
+    %update : memref<?x?xf32>) {
+  iree_linalg_ext.scatter
+    {__internal_linalg_transform__ = "tiling_input"}
+    unique_indices(true)
+    ins(%update, %indices : memref<?x?xf32>, memref<?x1xi32>)
+    outs(%original : memref<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    }
+  return
+}
+//   CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//   CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//       CHECK: func @scatter_tiling_memref(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: memref<?x1xi32>
+//  CHECK-SAME:   %[[UPDATES:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+//   CHECK-DAG:   %[[TILESIZEY:.+]] = arith.constant 10 : index
+//   CHECK-DAG:   %[[TILESIZEX:.+]] = arith.constant 20 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[D0:.+]] = memref.dim %[[UPDATES]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = memref.dim %[[UPDATES]], %[[C1]]
+//       CHECK:   scf.for %[[IV0:.+]] = %[[C0]] to %[[D0]] step %[[TILESIZEY]]
+//   CHECK-DAG:     %[[USED_TILESIZEY:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[TILESIZEY]], %[[D0]]]
+//       CHECK:     scf.for %[[IV1:.+]] = %[[C0]] to %[[D1]] step %[[TILESIZEX]]
+//   CHECK-DAG:       %[[USED_TILESIZEX:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[TILESIZEX]], %[[D1]]]
+//       CHECK:       %[[UPDATE_SLICE:.+]] = memref.subview %[[UPDATES]][%[[IV0]], %[[IV1]]]
+//  CHECK-SAME:           [%[[USED_TILESIZEY]], %[[USED_TILESIZEX]]]
+//       CHECK:       %[[INDEX_SLICE:.+]] = memref.subview %[[INDICES]][%[[IV0]], 0]
+//  CHECK-SAME:           [%[[USED_TILESIZEY]], 1]
+//       CHECK:       %[[SCATTER_DIM:.+]] = memref.dim %[[ORIGINAL]], %[[C0]]
+//       CHECK:       %[[ORIGINAL_SLICE:.+]] = memref.subview %[[ORIGINAL]][0, %[[IV1]]
+//  CHECK-SAME:           [%[[SCATTER_DIM]], %[[USED_TILESIZEX]]]
+//       CHECK:       iree_linalg_ext.scatter
+//  CHECK-SAME:           __internal_linalg_transform__ = "tiling_output"
+//  CHECK-SAME:           unique_indices(true)
+//  CHECK-SAME:           ins(%[[UPDATE_SLICE]], %[[INDEX_SLICE]]
+//  CHECK-SAME:           outs(%[[ORIGINAL_SLICE]]
+
+// -----
+
+func @scatter_tiling_distribution(
+    %original: tensor<?x?xf32>, %indices: tensor<?x1xi32>,
+    %update : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = iree_linalg_ext.scatter
+    {__internal_linalg_transform__ = "distribute_input"}
+    unique_indices(true)
+    ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//   CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 10)>
+//   CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//       CHECK: func @scatter_tiling_distribution(
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: tensor<?x1xi32>
+//  CHECK-SAME:   %[[UPDATES:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[TILESIZE:.+]] = arith.constant 10 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[UPDATES]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[UPDATES]], %[[C1]]
+//   CHECK-DAG:   %[[ID:.+]] = iree_input.dispatch.workgroup.id[0]
+//   CHECK-DAG:   %[[COUNT:.+]] = iree_input.dispatch.workgroup.count[0]
+//   CHECK-DAG:   %[[OFFSET:.+]] = affine.apply #[[MAP0]]()[%[[ID]]]
+//   CHECK-DAG:   %[[STEP:.+]] = affine.apply #[[MAP0]]()[%[[COUNT]]]
+//       CHECK:   %[[RESULT:.+]] = scf.for %[[IV:.+]] = %[[OFFSET]] to %[[D0]] step %[[STEP]]
+//  CHECK-SAME:       iter_args(%[[INIT:.+]] = %[[ORIGINAL]])
+//       CHECK:     %[[USED_TILESIZE:.+]] = affine.min #[[MAP1]](%[[IV]])[%[[TILESIZE]], %[[D0]]]
+//       CHECK:     %[[UPDATE_SLICE:.+]] = tensor.extract_slice %[[UPDATES]][%[[IV]], 0]
+//  CHECK-SAME:         [%[[USED_TILESIZE]], %[[D1]]]
+//       CHECK:     %[[INDEX_SLICE:.+]] = tensor.extract_slice %[[INDICES]][%[[IV]], 0]
+//  CHECK-SAME:         [%[[USED_TILESIZE]], 1]
+//       CHECK:     %[[D2:.+]] = tensor.dim %[[ORIGINAL]], %[[C0]]
+//       CHECK:     %[[ORIGINAL_SLICE:.+]] = tensor.extract_slice %[[INIT]][0, 0]
+//  CHECK-SAME:         [%[[D2]], %[[D1]]]
+//       CHECK:     %[[SCATTER_TILE:.+]] = iree_linalg_ext.scatter
+//  CHECK-SAME:        __internal_linalg_transform__ = "distribute_output"
+//  CHECK-SAME:        unique_indices(true)
+//  CHECK-SAME:        ins(%[[UPDATE_SLICE]], %[[INDEX_SLICE]]
+//  CHECK-SAME:        outs(%[[ORIGINAL_SLICE]]
+//       CHECK:     %[[YIELD:.+]] = tensor.insert_slice %[[SCATTER_TILE]] into %[[INIT]][0, 0]
+//  CHECK-SAME:        [%[[D2]], %[[D1]]]
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @scatter_no_tiling(
+    %original: tensor<?x?xf32>, %indices: tensor<?x1xi32>,
+    %update : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = iree_linalg_ext.scatter
+    {__internal_linalg_transform__ = "no_tiling_input"}
+    unique_indices(true)
+    ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//       CHECK: func @scatter_no_tiling
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: tensor<?x1xi32>
+//  CHECK-SAME:   %[[UPDATES:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//       CHECK:   %[[RESULT:.+]] = iree_linalg_ext.scatter
+//  CHECK-SAME:       __internal_linalg_transform__ = "no_tiling_output"
+//  CHECK-SAME:       unique_indices(true)
+//  CHECK-SAME:       ins(%[[UPDATES]], %[[INDICES]]
+//  CHECK-SAME:       outs(%[[ORIGINAL]]
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @scatter_repeated_indices_tiling(
+    %original: tensor<?x?xf32>, %indices: tensor<?x1xi32>,
+    %update : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = iree_linalg_ext.scatter
+    {__internal_linalg_transform__ = "tiling_repeated_indices_scatter_input"}
+    unique_indices(false)
+    ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
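+// With repeated indices, the index (batch) dimension cannot be tiled, so only
+// the inner update dimension is tiled and every slice keeps the full batch
+// extent.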
+//   CHECK-DAG: #[[MAP:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//       CHECK: func @scatter_repeated_indices_tiling
+//  CHECK-SAME:   %[[ORIGINAL:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//  CHECK-SAME:   %[[INDICES:[a-zA-Z0-9_]+]]: tensor<?x1xi32>
+//  CHECK-SAME:   %[[UPDATES:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+//   CHECK-DAG:   %[[TILESIZE:.+]] = arith.constant 20 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[UPDATES]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[UPDATES]], %[[C1]]
+//       CHECK:   %[[RESULT:.+]] = scf.for %[[I:.+]] = %[[C0]] to %[[D1]] step %[[TILESIZE]]
+//  CHECK-SAME:       iter_args(%[[ITER:.+]] = %[[ORIGINAL]])
+//       CHECK:     %[[SZ:.+]] = affine.min #[[MAP]](%[[I]])[%[[TILESIZE]], %[[D1]]]
+//       CHECK:       %[[UPDATES_TILE:.+]] = tensor.extract_slice
+//  CHECK-SAME:         %[[UPDATES]][0, %[[I]]] [%[[D0]], %[[SZ]]] [1, 1]
+//       CHECK:       %[[INDICES_TILE:.+]] = tensor.extract_slice
+//  CHECK-SAME:         %[[INDICES]][0, 0] [%[[D0]], 1] [1, 1]
+//       CHECK:       %[[ORIGINAL_D0:.+]] = tensor.dim %[[ORIGINAL]], %[[C0]]
+//       CHECK:       %[[ORIGINAL_TILE:.+]] = tensor.extract_slice
+//  CHECK-SAME:         %[[ITER]][0, %[[I]]] [%[[ORIGINAL_D0]], %[[SZ]]] [1, 1]
+//       CHECK:       %[[SCATTER:.+]] = iree_linalg_ext.scatter
+//  CHECK-SAME:         __internal_linalg_transform__ = "tiling_repeated_indices_scatter_output"
+//  CHECK-SAME:         unique_indices(false)
+//  CHECK-SAME:         ins(%[[UPDATES_TILE]], %[[INDICES_TILE]]
+//  CHECK-SAME:         outs(%[[ORIGINAL_TILE]]
+//       CHECK:       %[[RES:.+]] = tensor.insert_slice %[[SCATTER]] into
+//  CHECK-SAME:         %[[ITER]][0, %[[I]]] [%[[ORIGINAL_D0]], %[[SZ]]] [1, 1]
+//       CHECK:       scf.yield %[[RES]]
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @scatter_repeated_indices_no_tiling(
+    %original: tensor<?x?xf32>, %indices: tensor<?x1xi32>,
+    %update : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  // expected-error @+1 {{unimplemented tiling of non-parallel loop iterator type}}
+  %0 = iree_linalg_ext.scatter
+    {__internal_linalg_transform__ = "tiling_input"}
+    unique_indices(false)
+    ins(%update, %indices : tensor<?x?xf32>, tensor<?x1xi32>)
+    outs(%original : tensor<?x?xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.addf %arg1, %arg2 : f32
+      iree_linalg_ext.yield %1 : f32
+    } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+// -----
+
+func @sort_1d(%arg0: tensor<?xi32>) -> tensor<?xi32> {
+  %0 = iree_linalg_ext.sort
+       {__internal_linalg_transform__ = "outer_reduce_input"}
+       dimension(0)
+       outs(%arg0 : tensor<?xi32>) {
+       ^bb0(%arg2: i32, %arg3: i32):  // no predecessors
+         %0 = arith.cmpi sgt, %arg2, %arg3 : i32
+         iree_linalg_ext.yield %0 : i1
+       } -> tensor<?xi32>
+  return %0 : tensor<?xi32>
+}
+//      CHECK: func @sort_1d(
+// CHECK-SAME:   %[[OPERAND:.+]]: tensor<?xi32>
+//      CHECK:   %[[RESULT:.+]] = iree_linalg_ext.sort
+// CHECK-SAME:       {__internal_linalg_transform__ = "outer_reduce_output"}
+// CHECK-SAME:       outs(%[[OPERAND]] :
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @sort_2d(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
+  %0 = iree_linalg_ext.sort
+       {__internal_linalg_transform__ = "inner_reduce_input"}
+       dimension(1)
+       outs(%arg0 : tensor<?x?xi32>) {
+       ^bb0(%arg2: i32, %arg3: i32):  // no predecessors
+         %0 = arith.cmpi sgt, %arg2, %arg3 : i32
+         iree_linalg_ext.yield %0 : i1
+       } -> tensor<?x?xi32>
+  return %0 : tensor<?x?xi32>
+}
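+// dimension(1) is the sort axis and cannot be tiled, so the generated loop
+// only tiles dim 0 (step 10).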
+//       CHECK: #[[MAP:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//       CHECK: func @sort_2d(
+//  CHECK-SAME:   %[[OPERAND:.+]]: tensor<?x?xi32>
+//   CHECK-DAG:   %[[TILESIZE:.+]] = arith.constant 10 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[OPERAND]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[OPERAND]], %[[C1]]
+//       CHECK:   %[[RESULT:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[D0]] step %[[TILESIZE]]
+//  CHECK-SAME:       iter_args(%[[INIT:.+]] = %[[OPERAND]])
+//   CHECK-DAG:     %[[USED_TILESIZE:.+]] = affine.min #[[MAP]](%[[IV]])[%[[TILESIZE]], %[[D0]]]
+//       CHECK:     %[[OPERAND_SLICE:.+]] = tensor.extract_slice %[[INIT]][%[[IV]], 0]
+//  CHECK-SAME:         [%[[USED_TILESIZE]], %[[D1]]]
+//       CHECK:     %[[SORT_TILE:.+]] = iree_linalg_ext.sort
+//  CHECK-SAME:         __internal_linalg_transform__ = "inner_reduce_output"
+//  CHECK-SAME:         outs(%[[OPERAND_SLICE]]
+//       CHECK:     %[[YIELD:.+]] = tensor.insert_slice %[[SORT_TILE]] into %[[INIT]][%[[IV]], 0]
+//  CHECK-SAME:         [%[[USED_TILESIZE]], %[[D1]]]
+//       CHECK:     scf.yield %[[YIELD]]
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @sort_2d_inner_parallel(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
+  %0 = iree_linalg_ext.sort
+       {__internal_linalg_transform__ = "outer_reduce_input"}
+       dimension(0)
+       outs(%arg0 : tensor<?x?xi32>) {
+       ^bb0(%arg2: i32, %arg3: i32):  // no predecessors
+         %0 = arith.cmpi sgt, %arg2, %arg3 : i32
+         iree_linalg_ext.yield %0 : i1
+       } -> tensor<?x?xi32>
+  return %0 : tensor<?x?xi32>
+}
+//       CHECK: #[[MAP:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//       CHECK: func @sort_2d_inner_parallel(
+//  CHECK-SAME:   %[[OPERAND:.+]]: tensor<?x?xi32>
+//   CHECK-DAG:   %[[TILESIZE:.+]] = arith.constant 20 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[OPERAND]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[OPERAND]], %[[C1]]
+//       CHECK:   %[[RESULT:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[D1]] step %[[TILESIZE]]
+//  CHECK-SAME:       iter_args(%[[INIT:.+]] = %[[OPERAND]])
+//   CHECK-DAG:     %[[USED_TILESIZE:.+]] = affine.min #[[MAP]](%[[IV]])[%[[TILESIZE]], %[[D1]]]
+//       CHECK:     %[[OPERAND_SLICE:.+]] = tensor.extract_slice %[[INIT]][0, %[[IV]]]
+//  CHECK-SAME:         [%[[D0]], %[[USED_TILESIZE]]]
+//       CHECK:     %[[SORT_TILE:.+]] = iree_linalg_ext.sort
+//  CHECK-SAME:         __internal_linalg_transform__ = "outer_reduce_output"
+//  CHECK-SAME:         outs(%[[OPERAND_SLICE]]
+//       CHECK:     %[[YIELD:.+]] = tensor.insert_slice %[[SORT_TILE]] into %[[INIT]][0, %[[IV]]]
+//  CHECK-SAME:         [%[[D0]], %[[USED_TILESIZE]]]
+//       CHECK:     scf.yield %[[YIELD]]
+//       CHECK:   return %[[RESULT]]
+
+// -----
+
+func @sort_2d_multi_result(
+    %arg0: tensor<?x?xi32>, %arg1: tensor<?x?xf32>)
+    -> (tensor<?x?xi32>, tensor<?x?xf32>) {
+  %0:2 = iree_linalg_ext.sort
+       {__internal_linalg_transform__ = "inner_reduce_input"}
+       dimension(1)
+       outs(%arg0, %arg1 : tensor<?x?xi32>, tensor<?x?xf32>) {
+       ^bb0(%arg2: i32, %arg3: i32, %arg4 : f32, %arg5 : f32):  // no predecessors
+         %1 = arith.cmpf ogt, %arg4, %arg5 : f32
+         iree_linalg_ext.yield %1 : i1
+       } -> tensor<?x?xi32>, tensor<?x?xf32>
+  return %0#0, %0#1 : tensor<?x?xi32>, tensor<?x?xf32>
+}
+//       CHECK: #[[MAP:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//       CHECK: func @sort_2d_multi_result(
+//  CHECK-SAME:   %[[OPERAND1:.+]]: tensor<?x?xi32>
+//  CHECK-SAME:   %[[OPERAND2:.+]]: tensor<?x?xf32>
+//   CHECK-DAG:   %[[TILESIZE:.+]] = arith.constant 10 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[OPERAND1]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[OPERAND1]], %[[C1]]
+//       CHECK:   %[[RESULT:.+]]:2 = scf.for %[[IV:.+]] = %[[C0]] to %[[D0]] step %[[TILESIZE]]
+//  CHECK-SAME:       iter_args(%[[INIT1:.+]] = %[[OPERAND1]], %[[INIT2:.+]] = %[[OPERAND2]])
+//   CHECK-DAG:     %[[USED_TILESIZE:.+]] = affine.min #[[MAP]](%[[IV]])[%[[TILESIZE]], %[[D0]]]
+//       CHECK:     %[[OPERAND1_SLICE:.+]] = tensor.extract_slice %[[INIT1]][%[[IV]], 0]
+//  CHECK-SAME:         [%[[USED_TILESIZE]], %[[D1]]]
+//       CHECK:     %[[OPERAND2_SLICE:.+]] = tensor.extract_slice %[[INIT2]][%[[IV]], 0]
+//  CHECK-SAME:         [%[[USED_TILESIZE]], %[[D1]]]
+//       CHECK:     %[[SORT_TILE:.+]]:2 = iree_linalg_ext.sort
+//  CHECK-SAME:         __internal_linalg_transform__ = "inner_reduce_output"
+//  CHECK-SAME:         outs(%[[OPERAND1_SLICE]], %[[OPERAND2_SLICE]]
+//       CHECK:     %[[YIELD1:.+]] = tensor.insert_slice %[[SORT_TILE]]#0 into %[[INIT1]][%[[IV]], 0]
+//  CHECK-SAME:         [%[[USED_TILESIZE]], %[[D1]]]
+//       CHECK:     %[[YIELD2:.+]] = tensor.insert_slice %[[SORT_TILE]]#1 into %[[INIT2]][%[[IV]], 0]
+//  CHECK-SAME:         [%[[USED_TILESIZE]], %[[D1]]]
+//       CHECK:     scf.yield %[[YIELD1]], %[[YIELD2]]
+//       CHECK:   return %[[RESULT]]#0, %[[RESULT]]#1
+
+// -----
+
+func @sort_2d_multi_result_memref(
+    %arg0: memref<?x?xi32>, %arg1: memref<?x?xf32>) {
+  iree_linalg_ext.sort
+     {__internal_linalg_transform__ = "outer_reduce_input"}
+     dimension(0)
+     outs(%arg0, %arg1 : memref<?x?xi32>, memref<?x?xf32>) {
+     ^bb0(%arg2: i32, %arg3: i32, %arg4 : f32, %arg5 : f32):  // no predecessors
+       %0 = arith.cmpf ogt, %arg4, %arg5 : f32
+       iree_linalg_ext.yield %0 : i1
+     }
+  return
+}
+//       CHECK: #[[MAP:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//       CHECK: func @sort_2d_multi_result_memref(
+//  CHECK-SAME:   %[[OPERAND1:.+]]: memref<?x?xi32>
+//  CHECK-SAME:   %[[OPERAND2:.+]]: memref<?x?xf32>
+//   CHECK-DAG:   %[[TILESIZE:.+]] = arith.constant 20 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[D0:.+]] = memref.dim %[[OPERAND1]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = memref.dim %[[OPERAND1]], %[[C1]]
+//       CHECK:   scf.for %[[IV:.+]] = %[[C0]] to %[[D1]] step %[[TILESIZE]]
+//   CHECK-DAG:     %[[USED_TILESIZE:.+]] = affine.min #[[MAP]](%[[IV]])[%[[TILESIZE]], %[[D1]]]
+//       CHECK:     %[[OPERAND1_SLICE:.+]] = memref.subview %[[OPERAND1]][0, %[[IV]]]
+//  CHECK-SAME:         [%[[D0]], %[[USED_TILESIZE]]]
+//       CHECK:     %[[OPERAND2_SLICE:.+]] = memref.subview %[[OPERAND2]][0, %[[IV]]]
+//  CHECK-SAME:         [%[[D0]], %[[USED_TILESIZE]]]
+//       CHECK:     iree_linalg_ext.sort
+//  CHECK-SAME:         __internal_linalg_transform__ = "outer_reduce_output"
+//  CHECK-SAME:         outs(%[[OPERAND1_SLICE]], %[[OPERAND2_SLICE]]
+
+// -----
+
+func @sort_3d_multi_result_distribute(
+  %arg0: tensor<?x?x?xi32>, %arg1 : tensor<?x?x?xf32>)
+  -> (tensor<?x?x?xi32>, tensor<?x?x?xf32>) {
+  %0, %1 = iree_linalg_ext.sort
+      {__internal_linalg_transform__ = "distribute_input"}
+      dimension(1)
+      outs(%arg0, %arg1 : tensor<?x?x?xi32>, tensor<?x?x?xf32>) {
+      ^bb0(%arg2: i32, %arg3: i32, %arg4 : f32, %arg5 : f32):  // no predecessors
+        %2 = arith.cmpf ogt, %arg4, %arg5 : f32
+        iree_linalg_ext.yield %2 : i1
+      } -> tensor<?x?x?xi32>, tensor<?x?x?xf32>
+  return %0, %1 : tensor<?x?x?xi32>, tensor<?x?x?xf32>
+}
+//   CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 10)>
+//   CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//   CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0] -> (s0 * 30)>
+//   CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0, s1] -> (30, -d0 + s1)>
+//       CHECK: func @sort_3d_multi_result_distribute(
+//  CHECK-SAME:   %[[OPERAND1:[a-zA-Z0-9_]+]]: tensor<?x?x?xi32>
+//  CHECK-SAME:   %[[OPERAND2:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
+//   CHECK-DAG:   %[[TILESIZE1:.+]] = arith.constant 10 : index
+//   CHECK-DAG:   %[[TILESIZE2:.+]] = arith.constant 30 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[OPERAND1]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[OPERAND1]], %[[C1]]
+//   CHECK-DAG:   %[[D2:.+]] = tensor.dim %[[OPERAND1]], %[[C2]]
+//   CHECK-DAG:   %[[IDX:.+]] = iree_input.dispatch.workgroup.id[0]
+//   CHECK-DAG:   %[[COUNTX:.+]] = iree_input.dispatch.workgroup.count[0]
+//   CHECK-DAG:   %[[IDY:.+]] = iree_input.dispatch.workgroup.id[1]
+//   CHECK-DAG:   %[[COUNTY:.+]] = iree_input.dispatch.workgroup.count[1]
+//   CHECK-DAG:   %[[OFFSETY:.+]] = affine.apply #[[MAP0]]()[%[[IDY]]]
+//   CHECK-DAG:   %[[STEPY:.+]] = affine.apply #[[MAP0]]()[%[[COUNTY]]]
+//       CHECK:   %[[RESULT:.+]]:2 = scf.for %[[IV0:.+]] = %[[OFFSETY]] to %[[D0]] step %[[STEPY]]
+//  CHECK-SAME:       iter_args(%[[INIT1:.+]] = %[[OPERAND1]], %[[INIT2:.+]] = %[[OPERAND2]])
+//   CHECK-DAG:     %[[USED_TILESIZE1:.+]] = affine.min #[[MAP1]](%[[IV0]])[%[[TILESIZE1]], %[[D0]]]
+//   CHECK-DAG:     %[[OFFSETX:.+]] = affine.apply #[[MAP2]]()[%[[IDX]]]
+//   CHECK-DAG:     %[[STEPX:.+]] = affine.apply #[[MAP2]]()[%[[COUNTX]]]
+//       CHECK:     %[[RESULT_INNER:.+]]:2 = scf.for %[[IV1:.+]] = %[[OFFSETX]] to %[[D2]] step %[[STEPX]]
+//  CHECK-SAME:         iter_args(%[[INIT3:.+]] = %[[INIT1]], %[[INIT4:.+]] = %[[INIT2]])
+//   CHECK-DAG:       %[[USED_TILESIZE2:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[TILESIZE2]], %[[D2]]]
+//       CHECK:       %[[OPERAND1_SLICE:.+]] = tensor.extract_slice %[[INIT3]][%[[IV0]], 0, %[[IV1]]]
+//  CHECK-SAME:           [%[[USED_TILESIZE1]], %[[D1]], %[[USED_TILESIZE2]]]
+//       CHECK:       %[[OPERAND2_SLICE:.+]] = tensor.extract_slice %[[INIT4]][%[[IV0]], 0, %[[IV1]]]
+//  CHECK-SAME:           [%[[USED_TILESIZE1]], %[[D1]], %[[USED_TILESIZE2]]]
+//       CHECK:       %[[SORT_SLICE:.+]]:2 = iree_linalg_ext.sort
+//  CHECK-SAME:           __internal_linalg_transform__ = "distribute_output"
+//  CHECK-SAME:           outs(%[[OPERAND1_SLICE]], %[[OPERAND2_SLICE]]
+//       CHECK:       %[[YIELD1:.+]] = tensor.insert_slice %[[SORT_SLICE]]#0
+//  CHECK-SAME:           into %[[INIT3]][%[[IV0]], 0, %[[IV1]]]
+//       CHECK:       %[[YIELD2:.+]] = tensor.insert_slice %[[SORT_SLICE]]#1
+//  CHECK-SAME:           into %[[INIT4]][%[[IV0]], 0, %[[IV1]]]
+//       CHECK:       scf.yield %[[YIELD1]], %[[YIELD2]]
+//       CHECK:     scf.yield %[[RESULT_INNER]]#0, %[[RESULT_INNER]]#1
+//       CHECK:   return %[[RESULT]]#0, %[[RESULT]]#1
+
+// -----
+
+func @sort_3d_multi_result_distribute_memref(
+  %arg0: memref<?x?x?xi32>, %arg1 : memref<?x?x?xf32>) {
+  iree_linalg_ext.sort
+      {__internal_linalg_transform__ = "distribute_input"}
+      dimension(1)
+      outs(%arg0, %arg1 : memref<?x?x?xi32>, memref<?x?x?xf32>) {
+      ^bb0(%arg2: i32, %arg3: i32, %arg4 : f32, %arg5 : f32):  // no predecessors
+        %0 = arith.cmpf ogt, %arg4, %arg5 : f32
+        iree_linalg_ext.yield %0 : i1
+      }
+  return
+}
+//   CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 10)>
+//   CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//   CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0] -> (s0 * 30)>
+//   CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0, s1] -> (30, -d0 + s1)>
+//       CHECK: func @sort_3d_multi_result_distribute_memref(
+//  CHECK-SAME:   %[[OPERAND1:[a-zA-Z0-9_]+]]: memref<?x?x?xi32>
+//  CHECK-SAME:   %[[OPERAND2:[a-zA-Z0-9_]+]]: memref<?x?x?xf32>
+//   CHECK-DAG:   %[[TILESIZE1:.+]] = arith.constant 10 : index
+//   CHECK-DAG:   %[[TILESIZE2:.+]] = arith.constant 30 : index
+//   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+//   CHECK-DAG:   %[[D0:.+]] = memref.dim %[[OPERAND1]], %[[C0]]
+//   CHECK-DAG:   %[[D1:.+]] = memref.dim %[[OPERAND1]], %[[C1]]
+//   CHECK-DAG:   %[[D2:.+]] = memref.dim %[[OPERAND1]], %[[C2]]
+//   CHECK-DAG:   %[[IDX:.+]] = iree_input.dispatch.workgroup.id[0]
+//   CHECK-DAG:   %[[COUNTX:.+]] = iree_input.dispatch.workgroup.count[0]
+//   CHECK-DAG:   %[[IDY:.+]] = iree_input.dispatch.workgroup.id[1]
+//   CHECK-DAG:   %[[COUNTY:.+]] = iree_input.dispatch.workgroup.count[1]
+//   CHECK-DAG:   %[[OFFSETY:.+]] = affine.apply #[[MAP0]]()[%[[IDY]]]
+//   CHECK-DAG:   %[[STEPY:.+]] = affine.apply #[[MAP0]]()[%[[COUNTY]]]
+//       CHECK:   scf.for %[[IV0:.+]] = %[[OFFSETY]] to %[[D0]] step %[[STEPY]]
+//   CHECK-DAG:     %[[USED_TILESIZE1:.+]] = affine.min #[[MAP1]](%[[IV0]])[%[[TILESIZE1]], %[[D0]]]
+//   CHECK-DAG:     %[[OFFSETX:.+]] = affine.apply #[[MAP2]]()[%[[IDX]]]
+//   CHECK-DAG:     %[[STEPX:.+]] = affine.apply #[[MAP2]]()[%[[COUNTX]]]
+//       CHECK:     scf.for %[[IV1:.+]] = %[[OFFSETX]] to %[[D2]] step %[[STEPX]]
+//   CHECK-DAG:       %[[USED_TILESIZE2:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[TILESIZE2]], %[[D2]]]
+//       CHECK:       %[[OPERAND1_SLICE:.+]] = memref.subview %[[OPERAND1]][%[[IV0]], 0, %[[IV1]]]
+//  CHECK-SAME:           [%[[USED_TILESIZE1]], %[[D1]], %[[USED_TILESIZE2]]]
+//       CHECK:       %[[OPERAND2_SLICE:.+]] = memref.subview %[[OPERAND2]][%[[IV0]], 0, %[[IV1]]]
+//  CHECK-SAME:           [%[[USED_TILESIZE1]], %[[D1]], %[[USED_TILESIZE2]]]
+//       CHECK:       iree_linalg_ext.sort
+//  CHECK-SAME:           __internal_linalg_transform__ = "distribute_output"
+//  CHECK-SAME:           outs(%[[OPERAND1_SLICE]], %[[OPERAND2_SLICE]]
+
+// -----
+
+func @slice_insert(%source :tensor<?x?xf32>, %dest: tensor<?x?xf32>,
+                   %idx0 : index, %idx1 : index) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = tensor.dim %source, %c0 : tensor<?x?xf32>
+  %1 = tensor.dim %source, %c1 : tensor<?x?xf32>
+  %2 = tensor.insert_slice %source into %dest[%idx0, %idx1] [%0, %1] [1, 1]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<?x?xf32> into tensor<?x?xf32>
+  return %2 : tensor<?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
+//      CHECK: func @slice_insert(
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9_]+]]: index
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] =
+//      CHECK:     %[[YIELD1:.+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] =
+//      CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[IV0]], %[[IV1]]]
+//      CHECK:       %[[OFFSET0:.+]] = affine.apply #[[MAP2]](%[[IV0]])[%[[ARG2]]]
+//      CHECK:       %[[OFFSET1:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%[[ARG3]]]
+//      CHECK:       %[[UPDATE:.+]] = tensor.insert_slice %[[SLICE]]
+// CHECK-SAME:         into %{{.+}}[%[[OFFSET0]], %[[OFFSET1]]]
+//      CHECK:       scf.yield %[[UPDATE]]
+//      CHECK:     scf.yield %[[YIELD1]]
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @slice_insert_rank_reduce(%source : tensor<?x?xf32>, %dest: tensor<?x?x?xf32>,
+                   %idx0 : index, %idx1 : index) -> tensor<?x?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = tensor.dim %source, %c0 : tensor<?x?xf32>
+  %1 = tensor.dim %source, %c1 : tensor<?x?xf32>
+  %2 = tensor.insert_slice %source into %dest[%idx0, 0, %idx1] [%0, 1, %1] [1, 1, 1]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<?x?xf32> into tensor<?x?x?xf32>
+  return %2 : tensor<?x?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
+//      CHECK: func @slice_insert_rank_reduce(
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
+// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9_]+]]: index
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] =
+//      CHECK:     %[[YIELD1:.+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] =
+//      CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[IV0]], %[[IV1]]]
+//      CHECK:       %[[OFFSET0:.+]] = affine.apply #[[MAP2]](%[[IV0]])[%[[ARG2]]]
+//      CHECK:       %[[OFFSET1:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%[[ARG3]]]
+//      CHECK:       %[[UPDATE:.+]] = tensor.insert_slice %[[SLICE]]
+// CHECK-SAME:         into %{{.+}}[%[[OFFSET0]], 0, %[[OFFSET1]]]
+//      CHECK:       scf.yield %[[UPDATE]]
+//      CHECK:     scf.yield %[[YIELD1]]
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @fft_1d_stage_5(%arg0: tensor<1024xf32>, %arg1: tensor<1024xf32>,
+    %arg2: tensor<16xf32>, %arg3: tensor<16xf32>) -> (tensor<1024xf32>, tensor<1024xf32>) {
+  %cst1 = arith.constant 5 : index
+  %0:2 = iree_linalg_ext.fft
+  {__internal_linalg_transform__ = "tiling_1d_stage5_fft_input"}
+    ins(%cst1, %arg2, %arg3: index, tensor<16xf32>, tensor<16xf32>)
+    outs(%arg0, %arg1: tensor<1024xf32>, tensor<1024xf32>)
+  : tensor<1024xf32>, tensor<1024xf32>
+  return %0#0, %0#1 : tensor<1024xf32>, tensor<1024xf32>
+}
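+// The 1-d FFT is tiled with size 32 along its single dimension: one scf.for
+// over [0, 1024) extracts ?xf32 slices of the real and imaginary outputs,
+// applies the tiled fft, and inserts the results back.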
+// CHECK-DAG:  #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (32, -d0 + s1)>
+// CHECK:      func @fft_1d_stage_5(
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]
+// CHECK-SAME:   %[[COEF_REAL:[a-zA-Z0-9_]+]]
+// CHECK-SAME:   %[[COEF_IMAG:[a-zA-Z0-9_]+]]
+// CHECK-DAG:    %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:    %[[C5:.+]] = arith.constant 5 : index
+// CHECK-DAG:    %[[C32:.+]] = arith.constant 32 : index
+// CHECK-DAG:    %[[C1024:.+]] = arith.constant 1024 : index
+// CHECK:        %[[RES:.+]]:2 = scf.for %[[I:.+]] = %[[C0]] to %[[C1024]] step %[[C32]]
+// CHECK-SAME:       iter_args(%[[ARG5:.+]] = %[[ARG0]], %[[ARG6:.+]] = %[[ARG1]])
+// CHECK-SAME:       -> (tensor<1024xf32>, tensor<1024xf32>) {
+// CHECK:          %[[SIZE:.+]] = affine.min #[[MAP0]](%[[I]])[%[[C32]], %[[C1024]]]
+// CHECK:          %[[SLICE1:.+]] = tensor.extract_slice %[[ARG5]][%[[I]]] [%[[SIZE]]] [1] : tensor<1024xf32> to tensor<?xf32>
+// CHECK:          %[[SLICE2:.+]] = tensor.extract_slice %[[ARG6]][%[[I]]] [%[[SIZE]]] [1] : tensor<1024xf32> to tensor<?xf32>
+// CHECK:          %[[FFT:.+]]:2 = iree_linalg_ext.fft
+// CHECK-SAME:       {__internal_linalg_transform__ = "tiling_1d_stage5_fft_output"}
+// CHECK-SAME:       ins(%[[C5]], %[[COEF_REAL]], %[[COEF_IMAG]] : index, tensor<16xf32>, tensor<16xf32>)
+// CHECK-SAME:       outs(%[[SLICE1]], %[[SLICE2]] : tensor<?xf32>, tensor<?xf32>)
+// CHECK:          %[[INSERT1:.+]] = tensor.insert_slice %[[FFT]]#0 into %[[ARG5]][%[[I]]] [%[[SIZE]]] [1] : tensor<?xf32> into tensor<1024xf32>
+// CHECK:          %[[INSERT2:.+]] = tensor.insert_slice %[[FFT]]#1 into %[[ARG6]][%[[I]]] [%[[SIZE]]] [1] : tensor<?xf32> into tensor<1024xf32>
+// CHECK:          scf.yield %[[INSERT1]], %[[INSERT2]]
+// CHECK:        return %[[RES]]#0, %[[RES]]#1 : tensor<1024xf32>, tensor<1024xf32>
+
+// -----
+
+func @fft_2d_stage_5(%arg0: tensor<3x1024xf32>, %arg1: tensor<3x1024xf32>,
+    %arg2: tensor<16xf32>, %arg3: tensor<16xf32>) -> (tensor<3x1024xf32>, tensor<3x1024xf32>) {
+  %cst1 = arith.constant 5 : index
+  %0:2 = iree_linalg_ext.fft
+  {__internal_linalg_transform__ = "tiling_2d_stage5_fft_input"}
+    ins(%cst1, %arg2, %arg3: index, tensor<16xf32>, tensor<16xf32>)
+    outs(%arg0, %arg1: tensor<3x1024xf32>, tensor<3x1024xf32>)
+  : tensor<3x1024xf32>, tensor<3x1024xf32>
+  return %0#0, %0#1 : tensor<3x1024xf32>, tensor<3x1024xf32>
+}
+// CHECK-DAG:  #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+// CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (32, -d0 + s1)>
+// CHECK:      func @fft_2d_stage_5(
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]
+// CHECK-SAME:   %[[COEF_REAL:[a-zA-Z0-9_]+]]
+// CHECK-SAME:   %[[COEF_IMAG:[a-zA-Z0-9_]+]]
+// CHECK-DAG:    %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:    %[[C3:.+]] = arith.constant 3 : index
+// CHECK-DAG:    %[[C5:.+]] = arith.constant 5 : index
+// CHECK-DAG:    %[[C10:.+]] = arith.constant 10 : index
+// CHECK-DAG:    %[[C32:.+]] = arith.constant 32 : index
+// CHECK-DAG:    %[[C1024:.+]] = arith.constant 1024 : index
+// CHECK:        %[[RES:.+]]:2 = scf.for %[[I:.+]] = %[[C0]] to %[[C3]] step %[[C10]]
+// CHECK-SAME:       iter_args(%[[ARG5:.+]] = %[[ARG0]], %[[ARG6:.+]] = %[[ARG1]])
+// CHECK-SAME:       -> (tensor<3x1024xf32>, tensor<3x1024xf32>) {
+// CHECK:          %[[SZ1:.+]] = affine.min #[[MAP0]](%[[I]])[%[[C10]], %[[C3]]]
+// CHECK:          %{{.+}} = scf.for %[[J:.+]] = %[[C0]] to %[[C1024]] step %[[C32]]
+// CHECK-SAME:         iter_args(%[[ARG8:.+]] = %[[ARG5]], %[[ARG9:.+]] = %[[ARG6]]) -> (tensor<3x1024xf32>, tensor<3x1024xf32>) {
+// CHECK:            %[[SZ2:.+]] = affine.min #[[MAP1]](%[[J]])[%[[C32]], %[[C1024]]]
+// CHECK:            %[[SLICE1:.+]] = tensor.extract_slice %[[ARG8]][%[[I]], %[[J]]] [%[[SZ1]], %[[SZ2]]] [1, 1]
+// CHECK:            %[[SLICE2:.+]] = tensor.extract_slice %[[ARG9]][%[[I]], %[[J]]] [%[[SZ1]], %[[SZ2]]] [1, 1]
+// CHECK:            %[[FFT:.+]]:2 = iree_linalg_ext.fft
+// CHECK-SAME:         {__internal_linalg_transform__ = "tiling_2d_stage5_fft_output"}
+// CHECK-SAME:         ins(%[[C5]], %[[COEF_REAL]], %[[COEF_IMAG]] : index, tensor<16xf32>, tensor<16xf32>)
+// CHECK-SAME:         outs(%[[SLICE1]], %[[SLICE2]] : tensor<?x?xf32>, tensor<?x?xf32>)
+// CHECK:            %[[INSERT1:.+]] = tensor.insert_slice %[[FFT]]#0 into %[[ARG8]][%[[I]], %[[J]]] [%[[SZ1]], %[[SZ2]]] [1, 1]
+// CHECK:            %[[INSERT2:.+]] = tensor.insert_slice %[[FFT]]#1 into %[[ARG9]][%[[I]], %[[J]]] [%[[SZ1]], %[[SZ2]]] [1, 1]
+// CHECK:            scf.yield %[[INSERT1]], %[[INSERT2]] : tensor<3x1024xf32>, tensor<3x1024xf32>
+
+// -----
+
+func @fft_1d_stage_5_memref(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>,
+    %arg2: memref<16xf32>, %arg3: memref<16xf32>) {
+  %cst1 = arith.constant 5 : index
+  iree_linalg_ext.fft
+  {__internal_linalg_transform__ = "tiling_1d_stage5_fft_input"}
+    ins(%cst1, %arg2, %arg3: index, memref<16xf32>, memref<16xf32>)
+    outs(%arg0, %arg1: memref<1024xf32>, memref<1024xf32>)
+  return
+}
+// CHECK-DAG:  #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (32, -d0 + s1)>
+// CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
+// CHECK:      func @fft_1d_stage_5_memref(
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]
+// CHECK-SAME:   %[[COEF_REAL:[a-zA-Z0-9_]+]]
+// CHECK-SAME:   %[[COEF_IMAG:[a-zA-Z0-9_]+]]
+// CHECK-DAG:    %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:    %[[C5:.+]] = arith.constant 5 : index
+// CHECK-DAG:    %[[C32:.+]] = arith.constant 32 : index
+// CHECK-DAG:    %[[C1024:.+]] = arith.constant 1024 : index
+// CHECK:        scf.for %[[I:.+]] = %[[C0]] to %[[C1024]] step %[[C32]] {
+// CHECK:          %[[SZ:.+]] = affine.min #[[MAP0]](%[[I]])[%[[C32]], %[[C1024]]]
+// CHECK:          %[[SUB1:.+]] = memref.subview %[[ARG0]][%[[I]]] [%[[SZ]]] [1] : memref<1024xf32> to memref<?xf32, #[[MAP1]]>
+// CHECK:          %[[SUB2:.+]] = memref.subview %[[ARG1]][%[[I]]] [%[[SZ]]] [1] : memref<1024xf32> to memref<?xf32, #[[MAP1]]>
+// CHECK:          iree_linalg_ext.fft
+// CHECK-SAME:       {__internal_linalg_transform__ = "tiling_1d_stage5_fft_output"}
+// CHECK-SAME:       ins(%[[C5]], %[[COEF_REAL]], %[[COEF_IMAG]] : index, memref<16xf32>, memref<16xf32>)
+// CHECK-SAME:       outs(%[[SUB1]], %[[SUB2]] : memref<?xf32, #[[MAP1]]>, memref<?xf32, #[[MAP1]]>)
+
+// -----
+
+func @reverse_memref(%arg0: memref<?xi32>, %arg1: memref<?xi32>) {
+  iree_linalg_ext.reverse
+    {__internal_linalg_transform__ = "tiling_input"}
+    dimensions(dense<0> : tensor<1xi64>)
+    ins(%arg0: memref<?xi32>)
+    outs(%arg1: memref<?xi32>)
+  return
+}
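+// Reversing along dimension 0 maps an input tile at offset i of size s to an
+// output tile at offset dim - i - s, which is exactly what #MAP2
+// (s0 - s1 - s2) computes below.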
+// CHECK-DAG:  #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+// CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
+// CHECK-DAG:  #[[MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s0 - s1 - s2)>
+// CHECK:      func @reverse_memref(
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]
+// CHECK-DAG:    %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:    %[[C10:.+]] = arith.constant 10 : index
+// CHECK-DAG:    %[[D0:.+]] = memref.dim %[[ARG0]], %[[C0]] : memref<?xi32>
+// CHECK:        scf.for %[[I:.+]] = %[[C0]] to %[[D0]] step %[[C10]] {
+// CHECK:          %[[SIZE:.+]] = affine.min #[[MAP0]](%[[I]])[%[[C10]], %[[D0]]]
+// CHECK:          %[[SUB_IN:.+]] =  memref.subview %[[ARG0]][%[[I]]] [%[[SIZE]]] [1]
+// CHECK:          %[[T0:.+]] = memref.dim %[[ARG0]], %[[C0]] : memref<?xi32>
+// CHECK:          %[[IDX:.+]] = affine.apply #[[MAP2]]()[%[[T0]], %[[I]], %[[SIZE]]]
+// CHECK:          %[[SUB_OUT:.+]] = memref.subview %[[ARG1]][%[[IDX]]] [%[[SIZE]]] [1]
+// CHECK:          iree_linalg_ext.reverse
+// CHECK-SAME:       {__internal_linalg_transform__ = "tiling_output"}
+// CHECK-SAME:       dimensions(dense<0> : tensor<1xi64>)
+// CHECK-SAME:       ins(%[[SUB_IN]]
+// CHECK-SAME:       outs(%[[SUB_OUT]]
+
+// -----
+
+func @reverse_tensor_multi_dim(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xi32>
+  %d1 = tensor.dim %arg0, %c1 : tensor<?x?xi32>
+  %init = linalg.init_tensor [%d0, %d1] : tensor<?x?xi32>
+  %0 = iree_linalg_ext.reverse
+         {__internal_linalg_transform__ = "tiling_input"}
+         dimensions(dense<[0, 1]> : tensor<2xi64>)
+         ins(%arg0: tensor<?x?xi32>)
+         outs(%init: tensor<?x?xi32>) : tensor<?x?xi32>
+  return %0 : tensor<?x?xi32>
+}
+// CHECK-DAG:  #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+// CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+// CHECK-DAG:  #[[MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s0 - s1 - s2)>
+// CHECK:      func @reverse_tensor_multi_dim(
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]
+// CHECK-DAG:    %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:    %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:    %[[C10:.+]] = arith.constant 10 : index
+// CHECK-DAG:    %[[C20:.+]] = arith.constant 20 : index
+// CHECK-DAG:    %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xi32>
+// CHECK-DAG:    %[[D1:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xi32>
+// CHECK:        %[[INIT:.+]] = linalg.init_tensor [%[[D0]], %[[D1]]] : tensor<?x?xi32>
+// CHECK-DAG:    %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xi32>
+// CHECK-DAG:    %[[D1:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xi32>
+// CHECK:        %[[RES:.+]] = scf.for %[[I:.+]] = %[[C0]] to %[[D0]] step %[[C10]]
+// CHECK-SAME:     iter_args(%[[INIT2:.+]] = %[[INIT]]) -> (tensor<?x?xi32>) {
+// CHECK:          %[[SIZE_I:.+]] = affine.min #[[MAP0]](%[[I]])[%[[C10]], %[[D0]]]
+// CHECK:          %[[RES2:.+]] = scf.for %[[J:.+]] = %[[C0]] to %[[D1]] step %[[C20]]
+// CHECK-SAME:       iter_args(%[[INIT3:.+]] = %[[INIT2]]) -> (tensor<?x?xi32>) {
+// CHECK:            %[[SIZE_J:.+]] = affine.min #[[MAP1]](%[[J]])[%[[C20]], %[[D1]]]
+// CHECK:            %[[SUB_IN:.+]] = tensor.extract_slice
+// CHECK-SAME:         %[[ARG0]][%[[I]], %[[J]]] [%[[SIZE_I]], %[[SIZE_J]]] [1, 1]
+// CHECK:            %[[T0:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?xi32>
+// CHECK:            %[[IDX0:.+]] = affine.apply #[[MAP2]]()[%[[T0]], %[[I]], %[[SIZE_I]]]
+// CHECK:            %[[T1:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xi32>
+// CHECK:            %[[IDX1:.+]] = affine.apply #[[MAP2]]()[%[[T1]], %[[J]], %[[SIZE_J]]]
+// CHECK:            %[[SUB_INIT:.+]] = tensor.extract_slice
+// CHECK-SAME:         %[[INIT]][%[[IDX0]], %[[IDX1]]] [%[[SIZE_I]], %[[SIZE_J]]] [1, 1]
+// CHECK:            %[[REV:.+]] = iree_linalg_ext.reverse
+// CHECK-SAME:          {__internal_linalg_transform__ = "tiling_output"}
+// CHECK-SAME:          dimensions(dense<[0, 1]> : tensor<2xi64>)
+// CHECK-SAME:          ins(%[[SUB_IN]]
+// CHECK-SAME:          outs(%[[SUB_INIT]]
+// CHECK:            %[[RES3:.+]] = tensor.insert_slice %[[REV]] into
+// CHECK-SAME:         %[[INIT3]][%[[IDX0]], %[[IDX1]]] [%[[SIZE_I]], %[[SIZE_J]]] [1, 1]
+// CHECK:            scf.yield %[[RES3]]
+// CHECK:          scf.yield %[[RES2]]
+// CHECK:        return %[[RES]]
+
+// -----
+
+func @dynamic_insert_slice(%arg0 : tensor<?xf32>, %arg1 : tensor<?x?xf32>,
+    %arg2 : index, %arg3 : index) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %d0 = tensor.dim %arg0, %c0 : tensor<?xf32>
+  %0 = tensor.insert_slice %arg0 into %arg1[%arg2, %arg3] [1, %d0] [1, 1]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<?xf32> into tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
+//      CHECK: func @dynamic_insert_slice(
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<?xf32>
+// CHECK-SAME:     %[[ARG1:.+]]: tensor<?x?xf32>
+// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:     %[[ARG3:[a-zA-Z0-9_]+]]: index
+//  CHECK-DAG:  %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:  %[[C10:.+]] = arith.constant 10 : index
+//      CHECK:  %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?xf32>
+//      CHECK:  %[[RESULT:.+]] = scf.for %[[ARG4:.+]] = %[[C0]] to %[[D0]]
+// CHECK-SAME:      step %[[C10]] iter_args(%[[ARG5:.+]] = %[[ARG1]])
+//      CHECK:    %[[TILESIZE:.+]] = affine.min #[[MAP0]](%[[ARG4]])[%[[C10]], %[[D0]]]
+//      CHECK:    %[[EXTRACT:.+]] = tensor.extract_slice %[[ARG0]][%[[ARG4]]] [%[[TILESIZE]]]
+//      CHECK:    %[[OFFSET:.+]] = affine.apply #[[MAP1]](%[[ARG4]])[%[[ARG3]]]
+//      CHECK:    %[[INSERT:.+]] = tensor.insert_slice %[[EXTRACT]] into %[[ARG5]]
+// CHECK-SAME:        [%[[ARG2]], %[[OFFSET]]] [1, %[[TILESIZE]]]
+//      CHECK:    scf.yield %[[INSERT]]
+//      CHECK:  return %[[RESULT]]
+
+// -----
+
+func @insert_slice_rank_reduced_inner(%arg0 : tensor<?xf32>,
+    %arg1 : tensor<?x?x?xf32>, %arg2: index, %arg3 : index, %arg4 : index) -> tensor<?x?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %d0 = tensor.dim %arg0, %c0 : tensor<?xf32>
+  %0 = tensor.insert_slice %arg0 into %arg1[%arg2, %arg3, %arg4] [1, %d0, 1] [1, 1, 1]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<?xf32> into tensor<?x?x?xf32>
+  return %0 : tensor<?x?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
+//      CHECK: func @insert_slice_rank_reduced_inner(
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<?xf32>
+// CHECK-SAME:     %[[ARG1:.+]]: tensor<?x?x?xf32>
+// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:     %[[ARG3:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:     %[[ARG4:[a-zA-Z0-9_]+]]: index
+//  CHECK-DAG:   %[[LB:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[STEP:.+]] = arith.constant 10 : index
+//      CHECK:   %[[UB:.+]] = tensor.dim %[[ARG0]]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:[a-zA-Z0-9_]+]] = %[[LB]]
+// CHECK-SAME:       to %[[UB]] step %[[STEP]] iter_args(%[[ARG6:.+]] = %[[ARG1]])
+//      CHECK:     %[[TILESIZE:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[STEP]], %[[UB]]]
+//      CHECK:     %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[IV0]]] [%[[TILESIZE]]]
+//      CHECK:     %[[APPLY:.+]] = affine.apply #[[MAP1]](%[[IV0]])[%[[ARG3]]]
+//      CHECK:     %[[YIELD:.+]] = tensor.insert_slice %[[SLICE]] into %[[ARG6]]
+// CHECK-SAME:         [%[[ARG2]], %[[APPLY]], %[[ARG4]]] [1, %[[TILESIZE]], 1]
+//      CHECK:     scf.yield %[[YIELD]]
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @extract_slice(%arg0 : tensor<?x?xf32>, %arg1: index,
+    %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index,
+    %arg6 : index) -> tensor<?x?xf32> {
+  %0 = tensor.extract_slice %arg0[%arg1, %arg2] [%arg3, %arg4] [%arg5, %arg6]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<?x?xf32> to tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
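+// Tiling a strided tensor.extract_slice composes the tile offset with the
+// slice stride and base offset via #MAP2 (d0 * s0 + s1): each tile is read
+// from the source at iv * stride + offset and re-inserted into the result
+// with unit stride.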
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
+//      CHECK: func @extract_slice
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32>
+// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG4:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG5:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG6:[a-zA-Z0-9]+]]: index
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor [%[[ARG3]], %[[ARG4]]]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:.+]] = %[[C0]]
+// CHECK-SAME:       to %[[ARG3]] step %[[C10]]
+// CHECK-SAME:       iter_args(%[[ITER1:.+]] = %[[INIT]]) -> (tensor<?x?xf32>) {
+//      CHECK:     %[[TILE_Y:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[ARG3]]]
+//      CHECK:     %[[YIELD:.+]] = scf.for %[[IV1:.+]] = %[[C0]]
+// CHECK-SAME:         to %[[ARG4]] step %[[C20]]
+// CHECK-SAME:         iter_args(%[[ITER2:.+]] = %[[ITER1]]) -> (tensor<?x?xf32>) {
+//      CHECK:       %[[TILE_X:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C20]], %[[ARG4]]]
+//  CHECK-DAG:       %[[OFFSET_Y:.+]] = affine.apply #[[MAP2]](%[[IV0]])[%[[ARG5]], %[[ARG1]]]
+//  CHECK-DAG:       %[[OFFSET_X:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%[[ARG6]], %[[ARG2]]]
+//      CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
+// CHECK-SAME:           [%[[OFFSET_Y]], %[[OFFSET_X]]] [%[[TILE_Y]], %[[TILE_X]]] [%[[ARG5]], %[[ARG6]]]
+//      CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[SLICE]] into %[[ITER2]]
+// CHECK-SAME:           [%[[IV0]], %[[IV1]]] [%[[TILE_Y]], %[[TILE_X]]] [1, 1]
+//      CHECK:       scf.yield %[[INSERT]]
+//      CHECK:     }
+//      CHECK:     scf.yield %[[YIELD]]
+//      CHECK:   }
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @extract_slice_static(%arg0 : tensor<50x60xf32>) -> tensor<20x30xf32> {
+  %0 = tensor.extract_slice %arg0[2, 3] [20, 30] [5, 6]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<50x60xf32> to tensor<20x30xf32>
+  return %0 : tensor<20x30xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
+//      CHECK: func @extract_slice_static
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<50x60xf32>
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+//  CHECK-DAG:   %[[C3:.+]] = arith.constant 3 : index
+//  CHECK-DAG:   %[[C5:.+]] = arith.constant 5 : index
+//  CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[C30:.+]] = arith.constant 30 : index
+//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor [20, 30]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:.+]] = %[[C0]]
+// CHECK-SAME:       to %[[C20]] step %[[C10]]
+// CHECK-SAME:       iter_args(%[[ITER1:.+]] = %[[INIT]]) -> (tensor<20x30xf32>) {
+//      CHECK:     %[[TILE_Y:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[C20]]]
+//      CHECK:     %[[YIELD:.+]] = scf.for %[[IV1:.+]] = %[[C0]]
+// CHECK-SAME:         to %[[C30]] step %[[C20]]
+// CHECK-SAME:         iter_args(%[[ITER2:.+]] = %[[ITER1]]) -> (tensor<20x30xf32>) {
+//      CHECK:       %[[TILE_X:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C20]], %[[C30]]]
+//  CHECK-DAG:       %[[OFFSET_Y:.+]] = affine.apply #[[MAP2]](%[[IV0]])[%[[C5]], %[[C2]]]
+//  CHECK-DAG:       %[[OFFSET_X:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%[[C6]], %[[C3]]]
+//      CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
+// CHECK-SAME:           [%[[OFFSET_Y]], %[[OFFSET_X]]] [%[[TILE_Y]], %[[TILE_X]]] [5, 6]
+//      CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[SLICE]] into %[[ITER2]]
+// CHECK-SAME:           [%[[IV0]], %[[IV1]]] [%[[TILE_Y]], %[[TILE_X]]] [1, 1]
+//      CHECK:       scf.yield %[[INSERT]]
+//      CHECK:     }
+//      CHECK:     scf.yield %[[YIELD]]
+//      CHECK:   }
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @extract_slice_reduced_rank_outer(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
+    %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index,
+    %arg7 : index, %arg8 : index) -> tensor<?x?xf32> {
+  %0 = tensor.extract_slice %arg0[%arg1, %arg2, %arg3] [1, %arg4, %arg5] [%arg6, %arg7, %arg8]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<?x?x?xf32> to tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
+//      CHECK: func @extract_slice_reduced_rank_outer
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?xf32>
+// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG4:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG5:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG6:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG7:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG8:[a-zA-Z0-9]+]]: index
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor [%[[ARG4]], %[[ARG5]]]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:.+]] = %[[C0]]
+// CHECK-SAME:       to %[[ARG4]] step %[[C10]]
+// CHECK-SAME:       iter_args(%[[ITER1:.+]] = %[[INIT]]) -> (tensor<?x?xf32>) {
+//      CHECK:     %[[TILE_Y:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[ARG4]]]
+//      CHECK:     %[[YIELD:.+]] = scf.for %[[IV1:.+]] = %[[C0]]
+// CHECK-SAME:         to %[[ARG5]] step %[[C20]]
+// CHECK-SAME:         iter_args(%[[ITER2:.+]] = %[[ITER1]]) -> (tensor<?x?xf32>) {
+//      CHECK:       %[[TILE_X:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C20]], %[[ARG5]]]
+//  CHECK-DAG:       %[[OFFSET_Y:.+]] = affine.apply #[[MAP2]](%[[IV0]])[%[[ARG7]], %[[ARG2]]]
+//  CHECK-DAG:       %[[OFFSET_X:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%[[ARG8]], %[[ARG3]]]
+//      CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
+// CHECK-SAME:           [%[[ARG1]], %[[OFFSET_Y]], %[[OFFSET_X]]]
+// CHECK-SAME:           [1, %[[TILE_Y]], %[[TILE_X]]] [%[[ARG6]], %[[ARG7]], %[[ARG8]]]
+//      CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[SLICE]] into %[[ITER2]]
+// CHECK-SAME:           [%[[IV0]], %[[IV1]]] [%[[TILE_Y]], %[[TILE_X]]] [1, 1]
+//      CHECK:       scf.yield %[[INSERT]]
+//      CHECK:     }
+//      CHECK:     scf.yield %[[YIELD]]
+//      CHECK:   }
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @extract_slice_reduced_rank_middle(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
+    %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index,
+    %arg7 : index, %arg8 : index) -> tensor<?x?xf32> {
+  %0 = tensor.extract_slice %arg0[%arg1, %arg2, %arg3] [%arg4, 1, %arg5] [%arg6, %arg7, %arg8]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<?x?x?xf32> to tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
+//      CHECK: func @extract_slice_reduced_rank_middle
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?xf32>
+// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG4:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG5:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG6:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG7:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG8:[a-zA-Z0-9]+]]: index
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor [%[[ARG4]], %[[ARG5]]]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:.+]] = %[[C0]]
+// CHECK-SAME:       to %[[ARG4]] step %[[C10]]
+// CHECK-SAME:       iter_args(%[[ITER1:.+]] = %[[INIT]]) -> (tensor<?x?xf32>) {
+//      CHECK:     %[[TILE_Y:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[ARG4]]]
+//      CHECK:     %[[YIELD:.+]] = scf.for %[[IV1:.+]] = %[[C0]]
+// CHECK-SAME:         to %[[ARG5]] step %[[C20]]
+// CHECK-SAME:         iter_args(%[[ITER2:.+]] = %[[ITER1]]) -> (tensor<?x?xf32>) {
+//      CHECK:       %[[TILE_X:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C20]], %[[ARG5]]]
+//  CHECK-DAG:       %[[OFFSET_Y:.+]] = affine.apply #[[MAP2]](%[[IV0]])[%[[ARG6]], %[[ARG1]]]
+//  CHECK-DAG:       %[[OFFSET_X:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%[[ARG8]], %[[ARG3]]]
+//      CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
+// CHECK-SAME:           [%[[OFFSET_Y]], %[[ARG2]], %[[OFFSET_X]]]
+// CHECK-SAME:           [%[[TILE_Y]], 1, %[[TILE_X]]] [%[[ARG6]], %[[ARG7]], %[[ARG8]]]
+//      CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[SLICE]] into %[[ITER2]]
+// CHECK-SAME:           [%[[IV0]], %[[IV1]]] [%[[TILE_Y]], %[[TILE_X]]] [1, 1]
+//      CHECK:       scf.yield %[[INSERT]]
+//      CHECK:     }
+//      CHECK:     scf.yield %[[YIELD]]
+//      CHECK:   }
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @extract_slice_reduced_rank_inner(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
+    %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index,
+    %arg7 : index, %arg8 : index) -> tensor<?x?xf32> {
+  %0 = tensor.extract_slice %arg0[%arg1, %arg2, %arg3] [%arg4, %arg5, 1] [%arg6, %arg7, %arg8]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<?x?x?xf32> to tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
+//      CHECK: func @extract_slice_reduced_rank_inner
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?xf32>
+// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG4:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG5:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG6:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG7:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG8:[a-zA-Z0-9]+]]: index
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor [%[[ARG4]], %[[ARG5]]]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:.+]] = %[[C0]]
+// CHECK-SAME:       to %[[ARG4]] step %[[C10]]
+// CHECK-SAME:       iter_args(%[[ITER1:.+]] = %[[INIT]]) -> (tensor<?x?xf32>) {
+//      CHECK:     %[[TILE_Y:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[ARG4]]]
+//      CHECK:     %[[YIELD:.+]] = scf.for %[[IV1:.+]] = %[[C0]]
+// CHECK-SAME:         to %[[ARG5]] step %[[C20]]
+// CHECK-SAME:         iter_args(%[[ITER2:.+]] = %[[ITER1]]) -> (tensor<?x?xf32>) {
+//      CHECK:       %[[TILE_X:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C20]], %[[ARG5]]]
+//  CHECK-DAG:       %[[OFFSET_Y:.+]] = affine.apply #[[MAP2]](%[[IV0]])[%[[ARG6]], %[[ARG1]]]
+//  CHECK-DAG:       %[[OFFSET_X:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%[[ARG7]], %[[ARG2]]]
+//      CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
+// CHECK-SAME:           [%[[OFFSET_Y]], %[[OFFSET_X]], %[[ARG3]]]
+// CHECK-SAME:           [%[[TILE_Y]], %[[TILE_X]], 1] [%[[ARG6]], %[[ARG7]], %[[ARG8]]]
+//      CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[SLICE]] into %[[ITER2]]
+// CHECK-SAME:           [%[[IV0]], %[[IV1]]] [%[[TILE_Y]], %[[TILE_X]]] [1, 1]
+//      CHECK:       scf.yield %[[INSERT]]
+//      CHECK:     }
+//      CHECK:     scf.yield %[[YIELD]]
+//      CHECK:   }
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @extract_slice_reduced_rank_two_dims_1(%arg0 : tensor<?x?x?x?xf32>, %arg1 : index,
+    %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index,
+    %arg7 : index, %arg8 : index, %arg9 : index, %arg10 : index) -> tensor<?x?xf32> {
+  %0 = tensor.extract_slice %arg0[%arg1, %arg2, %arg3, %arg4] [%arg5, 1, %arg6, 1] [%arg7, %arg8, %arg9, %arg10]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<?x?x?x?xf32> to tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
+//      CHECK: func @extract_slice_reduced_rank_two_dims_1
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?x?xf32>
+// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG4:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG5:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG6:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG7:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG8:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG9:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG10:[a-zA-Z0-9]+]]: index
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor [%[[ARG5]], %[[ARG6]]]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:.+]] = %[[C0]]
+// CHECK-SAME:       to %[[ARG5]] step %[[C10]]
+// CHECK-SAME:       iter_args(%[[ITER1:.+]] = %[[INIT]]) -> (tensor<?x?xf32>) {
+//      CHECK:     %[[TILE_Y:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[ARG5]]]
+//      CHECK:     %[[YIELD:.+]] = scf.for %[[IV1:.+]] = %[[C0]]
+// CHECK-SAME:         to %[[ARG6]] step %[[C20]]
+// CHECK-SAME:         iter_args(%[[ITER2:.+]] = %[[ITER1]]) -> (tensor<?x?xf32>) {
+//      CHECK:       %[[TILE_X:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C20]], %[[ARG6]]]
+//  CHECK-DAG:       %[[OFFSET_Y:.+]] = affine.apply #[[MAP2]](%[[IV0]])[%[[ARG7]], %[[ARG1]]]
+//  CHECK-DAG:       %[[OFFSET_X:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%[[ARG9]], %[[ARG3]]]
+//      CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
+// CHECK-SAME:           [%[[OFFSET_Y]], %[[ARG2]], %[[OFFSET_X]], %[[ARG4]]]
+// CHECK-SAME:           [%[[TILE_Y]], 1, %[[TILE_X]], 1]
+// CHECK-SAME:           [%[[ARG7]], %[[ARG8]], %[[ARG9]], %[[ARG10]]]
+//      CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[SLICE]] into %[[ITER2]]
+// CHECK-SAME:           [%[[IV0]], %[[IV1]]] [%[[TILE_Y]], %[[TILE_X]]] [1, 1]
+//      CHECK:       scf.yield %[[INSERT]]
+//      CHECK:     }
+//      CHECK:     scf.yield %[[YIELD]]
+//      CHECK:   }
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @extract_slice_reduced_rank_two_dims_2(%arg0 : tensor<?x?x?x?xf32>, %arg1 : index,
+    %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index,
+    %arg7 : index, %arg8 : index, %arg9 : index, %arg10 : index) -> tensor<?x?xf32> {
+  %0 = tensor.extract_slice %arg0[%arg1, %arg2, %arg3, %arg4] [%arg5, 1, 1, %arg6] [%arg7, %arg8, %arg9, %arg10]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<?x?x?x?xf32> to tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
+//      CHECK: func @extract_slice_reduced_rank_two_dims_2
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?x?xf32>
+// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG4:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG5:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG6:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG7:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG8:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG9:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG10:[a-zA-Z0-9]+]]: index
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor [%[[ARG5]], %[[ARG6]]]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:.+]] = %[[C0]]
+// CHECK-SAME:       to %[[ARG5]] step %[[C10]]
+// CHECK-SAME:       iter_args(%[[ITER1:.+]] = %[[INIT]]) -> (tensor<?x?xf32>) {
+//      CHECK:     %[[TILE_Y:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[ARG5]]]
+//      CHECK:     %[[YIELD:.+]] = scf.for %[[IV1:.+]] = %[[C0]]
+// CHECK-SAME:         to %[[ARG6]] step %[[C20]]
+// CHECK-SAME:         iter_args(%[[ITER2:.+]] = %[[ITER1]]) -> (tensor<?x?xf32>) {
+//      CHECK:       %[[TILE_X:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C20]], %[[ARG6]]]
+//  CHECK-DAG:       %[[OFFSET_Y:.+]] = affine.apply #[[MAP2]](%[[IV0]])[%[[ARG7]], %[[ARG1]]]
+//  CHECK-DAG:       %[[OFFSET_X:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%[[ARG10]], %[[ARG4]]]
+//      CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
+// CHECK-SAME:           [%[[OFFSET_Y]], %[[ARG2]], %[[ARG3]], %[[OFFSET_X]]]
+// CHECK-SAME:           [%[[TILE_Y]], 1, 1, %[[TILE_X]]]
+// CHECK-SAME:           [%[[ARG7]], %[[ARG8]], %[[ARG9]], %[[ARG10]]]
+//      CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[SLICE]] into %[[ITER2]]
+// CHECK-SAME:           [%[[IV0]], %[[IV1]]] [%[[TILE_Y]], %[[TILE_X]]] [1, 1]
+//      CHECK:       scf.yield %[[INSERT]]
+//      CHECK:     }
+//      CHECK:     scf.yield %[[YIELD]]
+//      CHECK:   }
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @extract_slice_reduced_rank_two_dims_3(%arg0 : tensor<?x?x?x?xf32>, %arg1 : index,
+    %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index,
+    %arg7 : index, %arg8 : index, %arg9 : index, %arg10 : index) -> tensor<?x?xf32> {
+  %0 = tensor.extract_slice %arg0[%arg1, %arg2, %arg3, %arg4] [1, %arg5, 1, %arg6] [%arg7, %arg8, %arg9, %arg10]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<?x?x?x?xf32> to tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
+//      CHECK: func @extract_slice_reduced_rank_two_dims_3
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?x?xf32>
+// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG4:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG5:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG6:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG7:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG8:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG9:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG10:[a-zA-Z0-9]+]]: index
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor [%[[ARG5]], %[[ARG6]]]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:.+]] = %[[C0]]
+// CHECK-SAME:       to %[[ARG5]] step %[[C10]]
+// CHECK-SAME:       iter_args(%[[ITER1:.+]] = %[[INIT]]) -> (tensor<?x?xf32>) {
+//      CHECK:     %[[TILE_Y:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[ARG5]]]
+//      CHECK:     %[[YIELD:.+]] = scf.for %[[IV1:.+]] = %[[C0]]
+// CHECK-SAME:         to %[[ARG6]] step %[[C20]]
+// CHECK-SAME:         iter_args(%[[ITER2:.+]] = %[[ITER1]]) -> (tensor<?x?xf32>) {
+//      CHECK:       %[[TILE_X:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C20]], %[[ARG6]]]
+//  CHECK-DAG:       %[[OFFSET_Y:.+]] = affine.apply #[[MAP2]](%[[IV0]])[%[[ARG8]], %[[ARG2]]]
+//  CHECK-DAG:       %[[OFFSET_X:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%[[ARG10]], %[[ARG4]]]
+//      CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
+// CHECK-SAME:           [%[[ARG1]], %[[OFFSET_Y]], %[[ARG3]], %[[OFFSET_X]]]
+// CHECK-SAME:           [1, %[[TILE_Y]], 1, %[[TILE_X]]]
+// CHECK-SAME:           [%[[ARG7]], %[[ARG8]], %[[ARG9]], %[[ARG10]]]
+//      CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[SLICE]] into %[[ITER2]]
+// CHECK-SAME:           [%[[IV0]], %[[IV1]]] [%[[TILE_Y]], %[[TILE_X]]] [1, 1]
+//      CHECK:       scf.yield %[[INSERT]]
+//      CHECK:     }
+//      CHECK:     scf.yield %[[YIELD]]
+//      CHECK:   }
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @extract_slice_reduced_rank_two_dims_4(%arg0 : tensor<?x?x?x?xf32>, %arg1 : index,
+    %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index,
+    %arg7 : index, %arg8 : index, %arg9 : index, %arg10 : index) -> tensor<?x?xf32> {
+  %0 = tensor.extract_slice %arg0[%arg1, %arg2, %arg3, %arg4] [%arg5, %arg6, 1, 1] [%arg7, %arg8, %arg9, %arg10]
+      {__internal_linalg_transform__ = "tiling_input"} : tensor<?x?x?x?xf32> to tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (10, -d0 + s1)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
+//      CHECK: func @extract_slice_reduced_rank_two_dims_4
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?x?xf32>
+// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG4:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG5:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG6:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG7:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG8:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG9:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:     %[[ARG10:[a-zA-Z0-9]+]]: index
+//  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-DAG:   %[[C10:.+]] = arith.constant 10 : index
+//  CHECK-DAG:   %[[C20:.+]] = arith.constant 20 : index
+//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor [%[[ARG5]], %[[ARG6]]]
+//      CHECK:   %[[RESULT:.+]] = scf.for %[[IV0:.+]] = %[[C0]]
+// CHECK-SAME:       to %[[ARG5]] step %[[C10]]
+// CHECK-SAME:       iter_args(%[[ITER1:.+]] = %[[INIT]]) -> (tensor<?x?xf32>) {
+//      CHECK:     %[[TILE_Y:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[C10]], %[[ARG5]]]
+//      CHECK:     %[[YIELD:.+]] = scf.for %[[IV1:.+]] = %[[C0]]
+// CHECK-SAME:         to %[[ARG6]] step %[[C20]]
+// CHECK-SAME:         iter_args(%[[ITER2:.+]] = %[[ITER1]]) -> (tensor<?x?xf32>) {
+//      CHECK:       %[[TILE_X:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[C20]], %[[ARG6]]]
+//  CHECK-DAG:       %[[OFFSET_Y:.+]] = affine.apply #[[MAP2]](%[[IV0]])[%[[ARG7]], %[[ARG1]]]
+//  CHECK-DAG:       %[[OFFSET_X:.+]] = affine.apply #[[MAP2]](%[[IV1]])[%[[ARG8]], %[[ARG2]]]
+//      CHECK:       %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
+// CHECK-SAME:           [%[[OFFSET_Y]], %[[OFFSET_X]], %[[ARG3]], %[[ARG4]]]
+// CHECK-SAME:           [%[[TILE_Y]], %[[TILE_X]], 1, 1]
+// CHECK-SAME:           [%[[ARG7]], %[[ARG8]], %[[ARG9]], %[[ARG10]]]
+//      CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[SLICE]] into %[[ITER2]]
+// CHECK-SAME:           [%[[IV0]], %[[IV1]]] [%[[TILE_Y]], %[[TILE_X]]] [1, 1]
+//      CHECK:       scf.yield %[[INSERT]]
+//      CHECK:     }
+//      CHECK:     scf.yield %[[YIELD]]
+//      CHECK:   }
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @scan_1d(%0: tensor<128xi32>) -> tensor<128xi32> {
+  %c0 = linalg.init_tensor [] : tensor<i32>
+  %1 = linalg.init_tensor [128] : tensor<128xi32>
+  %2:2 = iree_linalg_ext.scan
+    {__internal_linalg_transform__ = "outer_reduce_input"}
+    dimension(0) inclusive(true)
+    ins(%0 : tensor<128xi32>) outs(%1, %c0 : tensor<128xi32>, tensor<i32>) {
+    ^bb0(%arg0 : i32, %arg1 : i32):
+      %sum = arith.addi %arg0, %arg1 : i32
+      iree_linalg_ext.yield %sum : i32
+  } -> tensor<128xi32>, tensor<i32>
+  return %2#0 : tensor<128xi32>
+}
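+// dimension(0) is the scanned (sequential) dimension, so a 1-d scan leaves
+// nothing to tile; the pattern only rewrites the marker from
+// "outer_reduce_input" to "outer_reduce_output" and keeps the op unchanged.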
+//      CHECK: func @scan_1d(
+// CHECK-SAME:   %[[OPERAND:.+]]: tensor<128xi32>
+//      CHECK:   %[[ACC:.+]] = linalg.init_tensor [] : tensor<i32>
+//      CHECK:   %[[OUTPUT:.+]] = linalg.init_tensor [128] : tensor<128xi32>
+//      CHECK:   %[[RESULT:.+]]:2 = iree_linalg_ext.scan
+// CHECK-SAME:           __internal_linalg_transform__ = "outer_reduce_output"
+// CHECK-SAME:       ins(%[[OPERAND]] :
+// CHECK-SAME:       outs(%[[OUTPUT]], %[[ACC]] :
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
+func @scan_2d(%0: tensor<16x32xi32>) -> tensor<16x32xi32> {
+  %c0 = linalg.init_tensor [32] : tensor<32xi32>
+  %1 = linalg.init_tensor [16, 32] : tensor<16x32xi32>
+  %2:2 = iree_linalg_ext.scan
+    {__internal_linalg_transform__ = "outer_reduce_input"}
+    dimension(0) inclusive(true)
+    ins(%0 : tensor<16x32xi32>) outs(%1, %c0 : tensor<16x32xi32>, tensor<32xi32>) {
+    ^bb0(%arg0 : i32, %arg1 : i32):
+      %sum = arith.addi %arg0, %arg1 : i32
+      iree_linalg_ext.yield %sum : i32
+  } -> tensor<16x32xi32>, tensor<32xi32>
+  return %2#0 : tensor<16x32xi32>
+}
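+// The scan runs along dimension 0, so only the parallel dimension 1 is tiled
+// (size 20); the 32-element accumulator is sliced along the same dimension.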
+//  CHECK-DAG:  #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//      CHECK:  func @scan_2d(
+// CHECK-SAME:    %[[ARG0:[a-zA-Z0-9_]+]]
+//      CHECK:    %[[C0:.+]] = arith.constant 0 : index
+//      CHECK:    %[[C16:.+]] = arith.constant 16 : index
+//      CHECK:    %[[C32:.+]] = arith.constant 32 : index
+//      CHECK:    %[[C20:.+]] = arith.constant 20 : index
+//      CHECK:    %[[ACC:.+]] = linalg.init_tensor [32] : tensor<32xi32>
+//      CHECK:    %[[OUTPUT:.+]] = linalg.init_tensor [16, 32] : tensor<16x32xi32>
+//      CHECK:    %[[RESULT:.+]]:2 = scf.for %[[I:.+]] = %[[C0]] to %[[C32]] step %[[C20]]
+// CHECK-SAME:      iter_args(%[[ARG2:.+]] = %[[OUTPUT]], %[[ARG3:.+]] = %[[ACC]])
+//      CHECK:      %[[SIZE:.+]] = affine.min #[[MAP0]](%[[I]])[%[[C20]], %[[C32]]]
+//      CHECK:      %[[UPDATE_SLICE_IN:.+]] = tensor.extract_slice %[[ARG0]][0, %[[I]]] [%[[C16]], %[[SIZE]]]
+//      CHECK:      %[[UPDATE_SLICE_OUT:.+]] = tensor.extract_slice %[[ARG2]][0, %[[I]]] [%[[C16]], %[[SIZE]]]
+//      CHECK:      %[[UPDATE_SLICE_ACC:.+]] = tensor.extract_slice %[[ARG3]][%[[I]]] [%[[SIZE]]]
+//      CHECK:      %[[SCAN_TILE:.+]]:2 = iree_linalg_ext.scan
+// CHECK-SAME:       {__internal_linalg_transform__ = "outer_reduce_output"}
+// CHECK-SAME:       dimension(0) inclusive(true)
+// CHECK-SAME:       ins(%[[UPDATE_SLICE_IN]]
+// CHECK-SAME:       outs(%[[UPDATE_SLICE_OUT]], %[[UPDATE_SLICE_ACC]]
+//      CHECK:       %[[YIELD:.+]] = tensor.insert_slice %[[SCAN_TILE]]#0 into %[[ARG2]][0, %[[I]]]
+// CHECK-SAME:           [%[[C16]], %[[SIZE]]]
+//      CHECK:       %[[ACC_YIELD:.+]] = tensor.insert_slice %[[SCAN_TILE]]#1 into %[[ARG3]][%[[I]]]
+// CHECK-SAME:           [%[[SIZE]]]
+//      CHECK:       scf.yield %[[YIELD]], %[[ACC_YIELD]] : tensor<16x32xi32>, tensor<32xi32>
+//      CHECK:   return %[[RESULT]]#0
+
+// -----
+
+func @scan_2d_memref(%0: memref<16x32xi32>, %1: memref<16x32xi32>) {
+  %c0 = memref.alloc() : memref<32xi32>
+  iree_linalg_ext.scan
+    {__internal_linalg_transform__ = "outer_reduce_input"}
+    dimension(0) inclusive(true)
+    ins(%0 : memref<16x32xi32>) outs(%1, %c0 : memref<16x32xi32>, memref<32xi32>) {
+    ^bb0(%arg0 : i32, %arg1 : i32):
+      %sum = arith.addi %arg0, %arg1 : i32
+      iree_linalg_ext.yield %sum : i32
+  }
+  return
+}
+//  CHECK-DAG:  #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (20, -d0 + s1)>
+//  CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0, d1)[s0] -> (d0 * 32 + s0 + d1)>
+//      CHECK:  func @scan_2d_memref(
+// CHECK-SAME:    %[[ARG0:[a-zA-Z0-9_]+]]
+// CHECK-SAME:    %[[ARG1:[a-zA-Z0-9_]+]]
+//      CHECK:    %[[C0:.+]] = arith.constant 0 : index
+//      CHECK:    %[[C16:.+]] = arith.constant 16 : index
+//      CHECK:    %[[C32:.+]] = arith.constant 32 : index
+//      CHECK:    %[[C20:.+]] = arith.constant 20 : index
+//      CHECK:    %[[ACC:.+]] = memref.alloc() : memref<32xi32>
+//      CHECK:    scf.for %[[I:.+]] = %[[C0]] to %[[C32]] step %[[C20]]
+//      CHECK:      %[[SIZE:.+]] = affine.min #[[MAP0]](%[[I]])[%[[C20]], %[[C32]]]
+//      CHECK:      %[[UPDATE_SLICE_IN:.+]] = memref.subview %[[ARG0]][0, %[[I]]] [%[[C16]], %[[SIZE]]]
+//      CHECK:      %[[UPDATE_SLICE_OUT:.+]] = memref.subview %[[ARG1]][0, %[[I]]] [%[[C16]], %[[SIZE]]]
+//      CHECK:      %[[UPDATE_SLICE_ACC:.+]] = memref.subview %[[ACC]][%[[I]]] [%[[SIZE]]]
+//      CHECK:      iree_linalg_ext.scan
+// CHECK-SAME:       {__internal_linalg_transform__ = "outer_reduce_output"}
+// CHECK-SAME:       dimension(0) inclusive(true)
+// CHECK-SAME:       ins(%[[UPDATE_SLICE_IN]]
+// CHECK-SAME:       outs(%[[UPDATE_SLICE_OUT]], %[[UPDATE_SLICE_ACC]]
+//      CHECK:   return
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/bufferize.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/bufferize.mlir
new file mode 100644
index 0000000..5ca985e
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/bufferize.mlir
@@ -0,0 +1,34 @@
+// RUN: iree-dialects-opt -linalg-interp-transforms %s | FileCheck %s
+
+// CHECK-LABEL: func @matmul_tensors(
+// CHECK-SAME:    %[[TA:[0-9a-z]+]]: memref<128x128xf32
+// CHECK-SAME:    %[[TB:[0-9a-z]+]]: memref<128x128xf32
+// CHECK-SAME:    %[[TC:[0-9a-z]+]]: memref<128x128xf32
+// CHECK-NOT:   -> tensor
+func @matmul_tensors(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32> { linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+  // CHECK: linalg.matmul ins(%[[TA]], %[[TB]] : memref{{.*}}, memref{{.*}} outs(%[[TC]] : memref{{.*}})
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+
+  // CHECK: return
+  // CHECK-NOT: %{{.*}}
+  return %0 : tensor<128x128xf32>
+// CHECK: }
+}
+
+
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = operation "linalg.matmul"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@matmul_tensors](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
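+// The sequence below only runs bufferization; the CHECK lines above verify
+// that the tensor arguments and results have been rewritten to memrefs.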
+iree_linalg_transform.sequence {
+  bufferize
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/double-tiling.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/double-tiling.mlir
new file mode 100644
index 0000000..74d6cff
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/double-tiling.mlir
@@ -0,0 +1,43 @@
+// RUN: iree-dialects-opt -linalg-interp-transforms %s | FileCheck %s
+
+// This test verifies that a non-trivial transformation (two levels of tiling,
+// then padding and vectorization) completes successfully.
+
+// CHECK-LABEL: func @matmul_tensors(
+func @matmul_tensors(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32> { linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+  // Pack transposed padding of 1st operand.
+  //      CHECK:    tensor.pad
+  //      CHECK:    linalg.generic
+
+  // Pack padding of 2nd operand.
+  //      CHECK:    tensor.pad
+
+  //      CHECK:      scf.for
+  //      CHECK:        scf.for
+  //      CHECK:          scf.for
+  //      CHECK:            scf.for
+  //      CHECK:              scf.for
+  //      CHECK:                linalg.generic
+  //      CHECK:                vector.contract
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+
+  return %0 : tensor<128x128xf32>
+}
+
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = operation "linalg.matmul"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@matmul_tensors](%0 : !pdl.operation)
+  rewrite %0 with "iree_linalg_transform.apply"
+}
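+// Two levels of tiling (32x32x32 with interchange, then 4x4x1), followed by
+// padding with packing/hoisting and vectorization of the padded tiles.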
+iree_linalg_transform.sequence {
+  %0 = match @pdl_target
+  %1 = tile %0 {interchange = [0, 2, 1], peel = [], scalarize_dyn_dims = false, sizes = [32, 32, 32]}
+  %2 = tile %1 {interchange = [0, 1, 2], peel = [], scalarize_dyn_dims = false, sizes = [4, 4, 1]}
+  %3 = pad %2 {pack_paddings = [1, 1, 1], hoist_paddings = [6, 6, 0], transpose_paddings = [[1, 0], [0, 1]]}
+  %4 = vectorize %3  {vectorize_padding = true}
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/drop-schedule.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/drop-schedule.mlir
new file mode 100644
index 0000000..c82252b
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/drop-schedule.mlir
@@ -0,0 +1,26 @@
+// RUN: iree-dialects-opt -linalg-drop-schedule %s | FileCheck %s
+
+func @matmul_tensors(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32> { linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+  return %0 : tensor<128x128xf32>
+}
+
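+// The pass is expected to erase the transform script, i.e. both the PDL
+// pattern and the sequence below, while leaving the payload IR above intact.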
+// CHECK-NOT: pdl.pattern
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = operation "linalg.matmul"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@matmul_tensors](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+// CHECK-NOT: iree_linalg_transform.sequence
+iree_linalg_transform.sequence {
+  %0 = match @pdl_target
+  tile %0 {sizes = [4, 4, 4], pad = false}
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/expert.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/expert.mlir
new file mode 100644
index 0000000..b5825ee
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/expert.mlir
@@ -0,0 +1,164 @@
+// RUN: iree-dialects-opt -linalg-transform-expert-expansion -split-input-file %s | FileCheck %s --check-prefix=EXPAND
+// RUN: iree-dialects-opt -linalg-transform-expert-expansion -linalg-interp-transforms -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: func @matmul_tensors
+// CHECK-NOT: linalg
+// CHECK: llvm
+func @matmul_tensors(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32> { linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+
+  return %0 : tensor<128x128xf32>
+}
+
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = operation "linalg.matmul"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@matmul_tensors](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  // This should match the strategy below.
+  // EXPAND-NOT: expert apply
+  // EXPAND: %[[OP:.*]] = match @pdl_target
+  // EXPAND: %[[HANDLE:.*]] = tile %[[OP]] {sizes = [4, 4, 4]}
+  // EXPAND: %[[HANDLE2:.*]] = vectorize %[[HANDLE]] {vectorize_padding = true}
+  // EXPAND: bufferize
+  // EXPAND: lower_vectors {multireduction_lowering = "innerreduce"}
+  // EXPAND: lower_to_llvm
+  %0 = match @pdl_target
+  expert apply "single_tiling" to %0
+  {
+    tile_sizes = [4, 4, 4],
+    vectorize_padding = true,
+    multireduction_lowering = "innerreduce"
+  }
+}
+
+// CHECK-NOT: @strategies
+// EXPAND-NOT: @strategies
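+// The @strategies module defines the "single_tiling" expert as a PDL pattern:
+// matching an iree_linalg_transform.expert op with that name rewrites it into
+// the tile -> vectorize -> bufferize -> lower_vectors -> lower_to_llvm chain,
+// forwarding the attributes captured above. The module itself is erased after
+// expansion, as the CHECK-NOT lines verify.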
+module @strategies {
+  pdl.pattern @single_tiling_matcher : benefit(1) {
+    %tile_sizes = attribute
+    %vectorize_padding = attribute
+    %multireduction_lowering = attribute
+    %name = attribute : "single_tiling"
+    %type = type : !pdl.operation
+    %target = operand : %type
+    %transformed = type
+    %root = operation "iree_linalg_transform.expert"(%target : !pdl.value) {
+      "expertName" = %name,
+      "tile_sizes" = %tile_sizes,
+      "vectorize_padding" = %vectorize_padding,
+      "multireduction_lowering" = %multireduction_lowering
+    } -> (%transformed : !pdl.type)
+
+    rewrite %root {
+      %tile = operation "iree_linalg_transform.tile"(%target : !pdl.value) {
+        "sizes" = %tile_sizes
+      } -> (%transformed : !pdl.type)
+      %handle = result 0 of %tile
+
+      %vectorize = operation "iree_linalg_transform.vectorize"(%handle : !pdl.value) {
+        "vectorize_padding" = %vectorize_padding
+      } -> (%transformed : !pdl.type)
+      %handle2 = result 0 of %vectorize
+
+      %bufferize = operation "iree_linalg_transform.bufferize"
+      %lower_vectors = operation "iree_linalg_transform.lower_vectors" {
+        "multireduction_lowering" = %multireduction_lowering
+      }
+      %lower_to_llvm = operation "iree_linalg_transform.lower_to_llvm"
+
+      replace %root with (%handle2 : !pdl.value)
+    }
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func @matmul_tensors2
+// CHECK-NOT: linalg
+// CHECK: llvm
+func @matmul_tensors2(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32> { linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+
+  return %0 : tensor<128x128xf32>
+}
+
+pdl.pattern @pdl_target2 : benefit(1) {
+  %args = pdl.operands
+  %results = pdl.types
+  %0 = pdl.operation "linalg.matmul"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  pdl.apply_native_constraint "nestedInFunc"[@matmul_tensors2](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  pdl.rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  // This should match the strategy below.
+  // EXPAND-NOT: expert apply
+  // EXPAND: %[[OP:.*]] = match @pdl_target2
+  // EXPAND: %[[HANDLE:.*]] = tile %[[OP]] {sizes = [32, 8, 8]}
+  // EXPAND: %[[HANDLE2:.*]] = tile %[[HANDLE]] {sizes = [4, 4, 4]}
+  // EXPAND: %[[HANDLE3:.*]] = vectorize %[[HANDLE2]] {vectorize_padding = false}
+  // EXPAND: bufferize
+  // EXPAND: lower_vectors {multireduction_lowering = "innerparallel"}
+  // EXPAND: lower_to_llvm
+  %0 = match @pdl_target2
+  %1 = tile %0 {sizes = [32, 8, 8]}
+  expert apply "single_tiling" to %1
+  {
+    tile_sizes = [4, 4, 4],
+    vectorize_padding = false,
+    multireduction_lowering = "innerparallel"
+  }
+}
+
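+// Same expansion strategy as above, but here the expert is applied to an
+// already-tiled handle, so the expanded script contains two tile levels.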
+module @strategies {
+  pdl.pattern @single_tiling_operand : benefit(1) {
+    %tile_sizes = attribute
+    %vectorize_padding = attribute
+    %multireduction_lowering = attribute
+    %name = attribute : "single_tiling"
+    %type = type : !pdl.operation
+    %target = operand : %type
+    %transformed = type
+    %root = operation "iree_linalg_transform.expert"(%target : !pdl.value) {
+      "expertName" = %name,
+      "tile_sizes" = %tile_sizes,
+      "vectorize_padding" = %vectorize_padding,
+      "multireduction_lowering" = %multireduction_lowering
+    } -> (%transformed : !pdl.type)
+
+    rewrite %root {
+      %tile = operation "iree_linalg_transform.tile"(%target : !pdl.value) {
+        "sizes" = %tile_sizes
+      } -> (%transformed : !pdl.type)
+      %handle = result 0 of %tile
+
+      %vectorize = operation "iree_linalg_transform.vectorize"(%handle : !pdl.value) {
+        "vectorize_padding" = %vectorize_padding
+      } -> (%transformed : !pdl.type)
+      %handle2 = result 0 of %vectorize
+
+      %bufferize = operation "iree_linalg_transform.bufferize"
+      %lower_vectors = operation "iree_linalg_transform.lower_vectors" {
+        "multireduction_lowering" = %multireduction_lowering
+      }
+      %lower_to_llvm = operation "iree_linalg_transform.lower_to_llvm"
+
+      replace %root with (%handle2 : !pdl.value)
+    }
+  }
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/failure.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/failure.mlir
new file mode 100644
index 0000000..f0ecf7c
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/failure.mlir
@@ -0,0 +1,176 @@
+// RUN: iree-dialects-opt -linalg-interp-transforms -split-input-file -verify-diagnostics -allow-unregistered-dialect %s
+
+// This cannot be vectorized because of the dynamic tensor shapes. We expect
+// the pass to fail and report an error at the vectorization operation below.
+func public @non_vectorizable(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
+  %0 = linalg.generic {
+      indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+      iterator_types = ["parallel"]}
+    ins(%arg0: tensor<?xf32>) outs(%arg1: tensor<?xf32>) {
+  ^bb0(%arg2: f32, %arg3: f32):
+    %1 = arith.mulf %arg2, %arg2 : f32
+    linalg.yield %1 : f32
+  } -> tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+pdl.pattern @target_pattern : benefit(1) {
+  %0 = operands
+  %1 = types
+  %2 = operation "linalg.generic"(%0 : !pdl.range<value>)  -> (%1 : !pdl.range<type>)
+  rewrite %2 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @target_pattern
+  // expected-error@below {{failed to apply}}
+  vectorize %0
+}
+
+// -----
+
+func public @no_loop(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
+  %0 = linalg.generic {
+      indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+      iterator_types = ["parallel"]}
+    ins(%arg0: tensor<?xf32>) outs(%arg1: tensor<?xf32>) {
+  ^bb0(%arg2: f32, %arg3: f32):
+    %1 = arith.mulf %arg2, %arg2 : f32
+    linalg.yield %1 : f32
+  } -> tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+pdl.pattern @target_pattern : benefit(1) {
+  %0 = operands
+  %1 = types
+  %2 = operation "linalg.generic"(%0 : !pdl.range<value>)  -> (%1 : !pdl.range<type>)
+  rewrite %2 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @target_pattern
+  // expected-error@below {{the transformed op is enclosed by 0 loops, but 1 expected}}
+  // expected-error@below {{failed to apply}}
+  get_parent_loop %0
+}
+
+// -----
+
+func private @prevent_dce()
+
+pdl.pattern @something : benefit(1) {
+  %0 = operands
+  %2 = operation "scf.for"(%0 : !pdl.range<value>)
+  rewrite %2 with "iree_linalg_transform.apply"
+}
+
+func public @loop(%lb: index, %ub: index, %step: index) {
+  scf.for %i = %lb to %ub step %step {
+    call @prevent_dce() : () -> ()
+  }
+  return
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @something
+  // expected-error@below {{NYI: cannot target the result of pipelining}}
+  // expected-error@below {{failed to apply}}
+  %1 = pipeline_loop %0
+  // expected-note@below {{use here}}
+  get_parent_loop %1
+}
+
+// -----
+
+func public @no_outlining() {
+  "some.operation"() ({}, {}) : () -> ()
+  return
+}
+
+pdl.pattern @some_operation : benefit(1) {
+  %0 = operation "some.operation"
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @some_operation
+  // Make sure we don't crash on the wrong operation type.
+  // expected-error@below {{failed to apply}}
+  outline_loop %0 {func_name = "outlined"}
+}
+
+// -----
+
+func @no_replacement(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>,
+  %arg2: tensor<128x128xf32> {linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+  // expected-error @below {{could not find replacement for tracked op}}
+  %0 = linalg.matmul {test.attrA}
+                     ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+  return %0 : tensor<128x128xf32>
+}
+
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = operation "linalg.matmul"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@no_replacement](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @pdl_target
+  // expected-error @below {{failed to apply}}
+  vectorize
+  tile %0
+}
+
+// -----
+
+func @repeated_match(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>,
+  %arg2: tensor<128x128xf32> {linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+  // expected-error @below {{operation tracked by two handles}}
+  %0 = linalg.matmul {test.attrA}
+                     ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+  return %0 : tensor<128x128xf32>
+}
+
+pdl.pattern @pdl_target1 : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = operation "linalg.matmul"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@repeated_match](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+// An exact copy of the above, but with a different name.
+pdl.pattern @pdl_target2 : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = operation "linalg.matmul"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@repeated_match](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  // expected-note @below {{handle}}
+  %0 = match @pdl_target1
+  // expected-error @below {{failed to apply}}
+  // expected-note @below {{handle}}
+  %1 = match @pdl_target2
+
+  // Add references to handles produced by match so that they are not DCE'd.
+  tile %0
+  tile %1
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/fuse.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/fuse.mlir
new file mode 100644
index 0000000..6a78eb3
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/fuse.mlir
@@ -0,0 +1,31 @@
+// RUN: iree-dialects-opt -linalg-interp-transforms %s | FileCheck %s
+
+
+// CHECK-LABEL: func @fuse_unary
+func @fuse_unary(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
+
+  //     CHECK:   scf.for
+  //     CHECK:     scf.for
+  //     CHECK:       linalg.elemwise_unary
+  //     CHECK:       linalg.elemwise_binary
+  %0 = linalg.elemwise_unary ins(%arg0 : tensor<?x?xf32>)
+                             outs(%arg1: tensor<?x?xf32>) -> tensor<?x?xf32>
+  %1 = linalg.elemwise_binary ins(%0, %arg0 : tensor<?x?xf32>, tensor<?x?xf32>)
+                             outs(%arg1: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %1 : tensor<?x?xf32>
+}
+
+
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = pdl.operation "linalg.elemwise_binary"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@fuse_unary](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @pdl_target
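+  // Tile the matched elemwise_binary op by [32, 32] and fuse its
+  // elemwise_unary producer into the resulting loop nest, as checked above.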
+  %1 = fuse %0 {tile_sizes = [32, 32], tile_interchange = [0, 1]}
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/generalize.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/generalize.mlir
new file mode 100644
index 0000000..ea12b9a
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/generalize.mlir
@@ -0,0 +1,27 @@
+// RUN: iree-dialects-opt -linalg-interp-transforms %s | FileCheck %s
+
+
+// CHECK-LABEL: func @generalize_unary
+func @generalize_unary(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
+
+  // CHECK-NOT:   linalg.elemwise_unary
+  //     CHECK:   linalg.generic
+  %0 = linalg.elemwise_unary ins(%arg0 : tensor<?x?xf32>)
+                             outs(%arg1: tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = pdl.operation "linalg.elemwise_unary"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@generalize_unary](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @pdl_target
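+  // Rewrite the matched named op into an equivalent linalg.generic.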
+  generalize %0
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/interchange.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/interchange.mlir
new file mode 100644
index 0000000..e988133
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/interchange.mlir
@@ -0,0 +1,34 @@
+// RUN: iree-dialects-opt -linalg-interp-transforms %s | FileCheck %s
+
+//       CHECK: #[[$MAP:.*]] = affine_map<(d0, d1) -> (d1, d0)>
+
+// CHECK-LABEL: func @interchange_generic
+func @interchange_generic(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
+
+  //      CHECK:   linalg.generic
+  // CHECK-SAME:   indexing_maps = [#[[$MAP]], #[[$MAP]]
+  %0 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } ins(%arg0 : tensor<?x?xf32>) outs(%arg1 : tensor<?x?xf32>) {
+  ^bb0(%arg2: f32, %arg3: f32):
+    %1 = math.exp %arg2 : f32
+    linalg.yield %1 : f32
+  } -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+
+
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = pdl.operation "linalg.generic"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@interchange_generic](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @pdl_target
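+  // Swap the two parallel iterators; both identity indexing maps become
+  // (d0, d1) -> (d1, d0), as checked above.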
+  interchange %0 {iterator_interchange = [1, 0]}
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/invalid.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/invalid.mlir
new file mode 100644
index 0000000..d9c7e28
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/invalid.mlir
@@ -0,0 +1,59 @@
+// RUN: iree-dialects-opt %s -split-input-file -verify-diagnostics
+
+iree_linalg_transform.sequence {
+  %0 = match @match
+  // expected-error@below {{result #0 has more than one use}}
+  %1 = tile %0
+  // expected-note@below {{used here as operand #0}}
+  tile %1
+  // expected-note@below {{used here as operand #0}}
+  vectorize %1
+}
+
+// -----
+
+iree_linalg_transform.sequence {
+  %0 = match @match
+  // expected-error@below {{"sizes" and "scalarize_dyn_dims" attributes are mutually exclusive}}
+  tile %0 {sizes = [1,2,3], scalarize_dyn_dims = true}
+}
+
+// -----
+
+iree_linalg_transform.sequence {
+  %0 = match @match
+  // expected-error@below {{expects iterator_interchange to be a permutation, found [1, 1]}}
+  interchange %0 {iterator_interchange = [1, 1]}
+}
+
+// -----
+
+iree_linalg_transform.sequence {
+  %0 = match @match
+  // expected-error@below {{expects interchange to be a permutation, found [1, 1]}}
+  fuse %0 {tile_sizes=[0, 1], tile_interchange = [1, 1]}
+}
+
+// -----
+
+iree_linalg_transform.sequence {
+  %0 = match @match
+  // expected-error@below {{expects pack_paddings to contain booleans (0/1), found [1, 7]}}
+  pad %0 {pack_paddings=[1, 7]}
+}
+
+// -----
+
+iree_linalg_transform.sequence {
+  %0 = match @match
+  // expected-error@below {{expects hoist_paddings to contain positive integers, found [1, -7]}}
+  pad %0 {hoist_paddings=[1, -7]}
+}
+
+// -----
+
+iree_linalg_transform.sequence {
+  %0 = match @match
+  // expected-error@below {{expects transpose_paddings to be a permutation, found [1, 1]}}
+  pad %0 {transpose_paddings=[[1, 1]]}
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/pad.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/pad.mlir
new file mode 100644
index 0000000..d6d627b
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/pad.mlir
@@ -0,0 +1,42 @@
+// RUN: iree-dialects-opt -linalg-interp-transforms %s | FileCheck %s
+
+
+// CHECK-LABEL: func @pad_unary
+func @pad_unary(%arg0: tensor<24x12xf32>,
+                %arg1: tensor<24x12xf32>) -> tensor<24x12xf32> {
+  %c0 = arith.constant 0 : index
+  %c12 = arith.constant 12 : index
+  %c4 = arith.constant 4 : index
+
+  //     CHECK:   scf.for
+  //     CHECK:     tensor.pad
+  //     CHECK:     linalg.generic
+  //     CHECK:   scf.for
+  %0 = scf.for %arg3 = %c0 to %c12 step %c4 iter_args(%arg2 = %arg1) -> (tensor<24x12xf32>) {
+    %1 = tensor.extract_slice %arg0[0, %arg3] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
+    %2 = tensor.extract_slice %arg2[0, %arg3] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32>
+
+    //     CHECK:     linalg.generic
+    //     CHECK:     tensor.pad
+    //     CHECK:     linalg.elemwise_unary
+    %3 = linalg.elemwise_unary ins(%1 : tensor<24x4xf32>)
+                              outs(%2: tensor<24x4xf32>) -> tensor<24x4xf32>
+    %4 = tensor.insert_slice %3 into %arg2[0, %arg3] [24, 4] [1, 1] : tensor<24x4xf32> into tensor<24x12xf32>
+    scf.yield %4 : tensor<24x12xf32>
+  }
+  return %0 : tensor<24x12xf32>
+}
+
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = pdl.operation "linalg.elemwise_unary"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@pad_unary](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @pdl_target
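+  // Pad the matched op: pack_paddings takes per-operand booleans,
+  // hoist_paddings per-operand hoisting depths, and transpose_paddings
+  // per-operand layout permutations (invalid.mlir checks these constraints).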
+  %1 = pad %0 {pack_paddings=[1, 1], hoist_paddings=[1, 0], transpose_paddings=[[1, 0], [0, 1]]}
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir
new file mode 100644
index 0000000..7ff0112
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir
@@ -0,0 +1,33 @@
+// RUN: iree-dialects-opt %s | FileCheck %s
+
+// CHECK: iree_linalg_transform.sequence
+iree_linalg_transform.sequence {
+  // CHECK: %[[OPS:.*]] = match @{{.*}}
+  %0 = match @match1
+  // CHECK: %[[TILED:.*]] = tile %[[OPS]] {
+  // CHECK-DAG: sizes = [4, 4, 4]
+  // CHECK: }
+  %1 = tile %0 {sizes = [4, 4, 4]}
+  // CHECK: %[[TILED2:.*]] = tile %[[TILED]]
+  %2 = tile %1 {sizes = [2, 2, 2]}
+  // CHECK: %[[PADDED:.*]] = pad %[[TILED2]] {pack_paddings = [1, 1, 0]}
+  %3 = pad %2 {pack_paddings = [1, 1, 0]}
+  // CHECK: decompose
+  decompose
+  // CHECK: %{{.*}} = vectorize %[[PADDED]] {vectorize_padding = true}
+  %4 = vectorize %3 {vectorize_padding = true}
+  // CHECK: %[[OPS2:.*]] = match @{{.*}}
+  %5 = match @match2
+  // CHECK: %{{.*}} = vectorize %[[OPS2]]
+  vectorize %5
+  // CHECK-NOT: %
+  // CHECK: vectorize
+  // CHECK-NOT: %
+  vectorize
+  // CHECK: bufferize
+  bufferize
+  // CHECK: lower_vectors {multireduction_lowering = "innerreduce"}
+  lower_vectors {multireduction_lowering = "innerreduce"}
+  // CHECK: lower_to_llvm
+  lower_to_llvm
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/scoped.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/scoped.mlir
new file mode 100644
index 0000000..6964ef1
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/scoped.mlir
@@ -0,0 +1,30 @@
+// RUN: iree-dialects-opt -test-wrap-scope='opname=arith.addi' %s | FileCheck %s --check-prefix WRAP
+// RUN: iree-dialects-opt -test-unwrap-scope %s | FileCheck %s --check-prefix UNWRAP
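+// util.scope wraps an op in a region that only sees the values passed in as
+// operands (rebound as block arguments) and yields results via util.forward.
+// The two test passes exercise wrapping a matched op in a scope and inlining
+// a scope back into its parent.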
+
+// WRAP-LABEL: @test_wrap
+// WRAP-SAME: (%[[ARG0:.*]]: i32) -> i32
+func @test_wrap(%arg0: i32) -> i32 {
+  // WRAP: %[[V:.*]] = iree_linalg_transform.util.scope(%[[ARG0]], %[[ARG0]]) {
+  // WRAP-NEXT: ^[[B:.*]](%[[ARG1:.*]]: i32, %[[ARG2:.*]]: i32):
+  // WRAP-NEXT: %[[ADD:.*]] = arith.addi %[[ARG2]], %[[ARG2]]
+  // WRAP-NEXT: iree_linalg_transform.util.forward %[[ADD]]
+  // WRAP-NEXT: } : (i32, i32) -> i32
+  %0 = arith.addi %arg0, %arg0 : i32
+  // WRAP: return %[[V]]
+  return %0 : i32
+}
+
+// UNWRAP-LABEL: @test_unwrap
+// UNWRAP-SAME: (%[[ARG0:.*]]: i32) -> (i32, i32)
+func @test_unwrap(%arg0: i32) -> (i32, i32) {
+  // UNWRAP: %[[V0:.*]] = arith.addi %[[ARG0]], %[[ARG0]]
+  // UNWRAP-NEXT: %[[V1:.*]] = arith.addi %[[V0]], %[[ARG0]]
+  %0:2 = iree_linalg_transform.util.scope(%arg0) {
+  ^bb0(%arg1: i32):
+    %1 = arith.addi %arg1, %arg1 : i32
+    %2 = arith.addi %1, %arg1 : i32
+    iree_linalg_transform.util.forward %1, %2 : i32, i32
+  } : (i32) -> (i32, i32)
+  // UNWRAP-NEXT: return %[[V0]], %[[V1]]
+  return %0#0, %0#1 : i32, i32
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/selective-targeting.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/selective-targeting.mlir
new file mode 100644
index 0000000..fdcd2f9
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/selective-targeting.mlir
@@ -0,0 +1,134 @@
+// RUN: iree-dialects-opt %s -linalg-interp-transforms -split-input-file | FileCheck %s
+
+// CHECK-LABEL: func @matmul_tensors(
+func @matmul_tensors(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>,
+  %arg3: tensor<128x128xf32>, %arg4: tensor<128x128xf32>, %arg5: tensor<128x128xf32>,
+  %arg6: tensor<128x128xf32> {linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+  // This operation is marked for tiling only.
+  // CHECK-COUNT-3: scf.for
+  // CHECK-COUNT-3: tensor.extract_slice
+  // CHECK: linalg.matmul
+  // CHECK-SAME: -> tensor<4x4xf32>
+  %0 = linalg.matmul {test.attrA}
+                      ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+
+  // This operation is marked for tiling and vectorization.
+  // Note that the loop-invariant read is hoisted out of the innermost loop.
+  // CHECK: scf.for
+  // CHECK:   scf.for
+  // CHECK:     vector.transfer_read
+  // CHECK:     scf.for
+  // CHECK:       vector.transfer_read
+  // CHECK:       vector.transfer_read
+  // CHECK:       vector.contract
+  // CHECK-NOT:   linalg.matmul
+  // CHECK:       vector.transfer_write
+  %1 = linalg.matmul {test.attrA, test.attrC}
+                      ins(%arg3, %arg4: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg5: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+
+  // This operation is marked for vectorization only.
+  // CHECK-NOT: scf.for
+  // CHECK-COUNT-3: vector.transfer_read
+  // CHECK: vector.contract
+  // CHECK-SAME: into vector<128x128xf32>
+  // CHECK: vector.transfer_write
+  %2 = linalg.matmul {test.attrC}
+                      ins(%0, %1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg6: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+
+  return %2 : tensor<128x128xf32>
+}
+
+// Match matmul operations inside @matmul_tensors with test.attrA set.
+pdl.pattern @pdl_target_attrA : benefit(1) {
+  %args = operands
+  %results = types
+  %attr = attribute
+  %0 = operation "linalg.matmul"(%args : !pdl.range<value>) {"test.attrA" = %attr}-> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@matmul_tensors](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+// Match matmul operations inside @matmul_tensors with test.attrC set.
+pdl.pattern @pdl_target_attrC : benefit(1) {
+  %args = operands
+  %results = types
+  %attr = attribute
+  %0 = operation "linalg.matmul"(%args : !pdl.range<value>) {"test.attrC" = %attr}-> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@matmul_tensors](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @pdl_target_attrA
+  tile %0 {sizes = [4, 4, 4]}
+  %1 = match @pdl_target_attrC
+  vectorize %1
+}
+
+// -----
+
+// CHECK-LABEL: @vectorize_one
+func @vectorize_one(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>,
+  %arg3: tensor<128x128xf32> {linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+  // CHECK: vector.contract
+  %0 = linalg.matmul {test.attrA}
+                     ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+  // CHECK: linalg.matmul
+  %1 = linalg.matmul ins(%arg0, %0: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg3: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+  return %1 : tensor<128x128xf32>
+}
+
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %attr = attribute
+  %0 = operation "linalg.matmul"(%args : !pdl.range<value>) {"test.attrA" = %attr}-> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@vectorize_one](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @pdl_target
+  vectorize %0
+}
+
+
+// -----
+
+// CHECK-LABEL: @vectorize_all
+func @vectorize_all(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>,
+  %arg3: tensor<128x128xf32> {linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+  // CHECK: vector.contract
+  %0 = linalg.matmul {test.attrA}
+                     ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+  // CHECK: vector.contract
+  %1 = linalg.matmul ins(%arg0, %0: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg3: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+  return %1 : tensor<128x128xf32>
+}
+
+iree_linalg_transform.sequence {
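+  // With no target operand, vectorize applies to every eligible op in the
+  // module, so both matmuls above turn into vector.contract.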
+  vectorize
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir
new file mode 100644
index 0000000..adffa86
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir
@@ -0,0 +1,33 @@
+// RUN: iree-dialects-opt -linalg-interp-transforms %s | FileCheck %s
+
+// CHECK-LABEL: func @matmul_tensors
+// CHECK-NOT: linalg
+// CHECK: llvm
+func @matmul_tensors(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32> {linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+
+  return %0 : tensor<128x128xf32>
+}
+
+
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = pdl.operation "linalg.matmul"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@matmul_tensors](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @pdl_target
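+  // End-to-end script: tile, vectorize (padding included), bufferize, lower
+  // the vector ops, then lower to the LLVM dialect; no linalg ops remain.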
+  %1 = tile %0 {sizes = [4, 4, 4]}
+  %2 = vectorize %1 {vectorize_padding = true}
+  bufferize
+  lower_vectors {multireduction_lowering = "innerreduce"}
+  lower_to_llvm
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/tile-interchange.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/tile-interchange.mlir
new file mode 100644
index 0000000..88286aa
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/tile-interchange.mlir
@@ -0,0 +1,72 @@
+// RUN: iree-dialects-opt -linalg-interp-transforms -split-input-file %s | FileCheck %s
+
+#map0 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// Check that vectorization applies after interchange+tiling.
+
+// CHECK-LABEL: @matmul_021
+// CHECK-NOT: linalg.generic
+// CHECK: vector.contract
+func public @matmul_021(%arg0: tensor<39x154xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<154x5xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<39x5xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<39x5xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
+  %0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<39x154xf32>, tensor<154x5xf32>) outs(%arg2 : tensor<39x5xf32>) {
+  ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
+    %1 = arith.mulf %arg3, %arg4 : f32
+    %2 = arith.addf %arg5, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<39x5xf32>
+  return %0 : tensor<39x5xf32>
+}
+
+pdl.pattern @target_pattern : benefit(1) {
+  %0 = operands
+  %1 = types
+  %2 = operation "linalg.generic"(%0 : !pdl.range<value>)  -> (%1 : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc" [@matmul_021](%2 : !pdl.operation)
+  rewrite %2 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @target_pattern
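+  // Tile with interchanged loops first, tile again, then vectorize; the
+  // CHECK lines above verify no linalg.generic remains.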
+  %1 = tile %0 {interchange = [0, 2, 1], sizes = [3, 5, 14]}
+  %2 = tile %1 {sizes = [3, 5, 2]}
+  %3 = vectorize %2 {vectorize_padding = true}
+}
+
+
+// -----
+
+#map0 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// Check that vectorization applies after interchange+tiling.
+
+// CHECK-LABEL: @matmul_210
+// CHECK-NOT: linalg.generic
+// CHECK: vector.contract
+func public @matmul_210(%arg0: tensor<39x154xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<154x5xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<39x5xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<39x5xf32> attributes {passthrough = ["noinline", ["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} {
+  %0 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<39x154xf32>, tensor<154x5xf32>) outs(%arg2 : tensor<39x5xf32>) {
+  ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
+    %1 = arith.mulf %arg3, %arg4 : f32
+    %2 = arith.addf %arg5, %1 : f32
+    linalg.yield %2 : f32
+  } -> tensor<39x5xf32>
+  return %0 : tensor<39x5xf32>
+}
+
+pdl.pattern @target_pattern : benefit(1) {
+  %0 = operands
+  %1 = types
+  %2 = operation "linalg.generic"(%0 : !pdl.range<value>)  -> (%1 : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc" [@matmul_210](%2 : !pdl.operation)
+  rewrite %2 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @target_pattern
+  %1 = tile %0 {interchange = [2, 1, 0], sizes = [3, 5, 14]}
+  %2 = tile %1 {sizes = [3, 5, 2]}
+  %3 = vectorize %2 {vectorize_padding = true}
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/tile.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/tile.mlir
new file mode 100644
index 0000000..ba94d44
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/tile.mlir
@@ -0,0 +1,44 @@
+// RUN: iree-dialects-opt -linalg-interp-transforms %s | FileCheck %s
+
+// CHECK-LABEL: func @matmul_tensors(
+// CHECK-SAME:    %[[TA:[0-9a-z]+]]: tensor<128x128xf32>
+// CHECK-SAME:    %[[TB:[0-9a-z]+]]: tensor<128x128xf32>
+// CHECK-SAME:    %[[TC:[0-9a-z]+]]: tensor<128x128xf32>
+// CHECK-SAME:  -> tensor<128x128xf32> {
+func @matmul_tensors(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32> {linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+//      CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[TC]]) -> (tensor<128x128xf32>) {
+//      CHECK:   %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -> (tensor<128x128xf32>) {
+//      CHECK:     %[[TD2:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC2:.*]] = %[[TC1]]) -> (tensor<128x128xf32>) {
+//      CHECK:       %[[sTA:.*]] = tensor.extract_slice %[[TA]][{{.*}}] : tensor<128x128xf32> to tensor<4x4xf32>
+//      CHECK:       %[[sTB:.*]] = tensor.extract_slice %[[TB]][{{.*}}] : tensor<128x128xf32> to tensor<4x4xf32>
+//      CHECK:       %[[sTC:.*]] = tensor.extract_slice %[[TC2]][{{.*}}] : tensor<128x128xf32> to tensor<4x4xf32>
+//      CHECK:       %[[sTD:.*]] = linalg.matmul {{.*}} ins(%[[sTA]], %[[sTB]] : tensor<4x4xf32>, tensor<4x4xf32>)
+// CHECK-SAME:                                  outs(%[[sTC]] : tensor<4x4xf32>)  -> tensor<4x4xf32>
+//      CHECK:       %[[TD:.*]] = tensor.insert_slice %[[sTD]] into %[[TC2]][{{.*}}]  : tensor<4x4xf32> into tensor<128x128xf32>
+//      CHECK:       scf.yield %[[TD]] : tensor<128x128xf32>
+//      CHECK:     scf.yield %[[TD2]] : tensor<128x128xf32>
+//      CHECK:   scf.yield %[[TD1]] : tensor<128x128xf32>
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+
+//      CHECK: return %[[TD0]] : tensor<128x128xf32>
+  return %0 : tensor<128x128xf32>
+}
+
+
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = operation "linalg.matmul"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@matmul_tensors](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @pdl_target
+  tile %0 {sizes = [4, 4, 4]}
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/vectorize-transforms.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/vectorize-transforms.mlir
new file mode 100644
index 0000000..60864ee
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/vectorize-transforms.mlir
@@ -0,0 +1,16 @@
+// This test only checks that the content of the file parses.
+// RUN: iree-dialects-opt %s
+
+pdl.pattern @pdl_target : benefit(1) {
+  %args = operands
+  %results = types
+  %0 = operation "linalg.matmul"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+  apply_native_constraint "nestedInFunc"[@matmul_tensors](%0 : !pdl.operation)
+  // TODO: we don't want this, but it is the required terminator for pdl.pattern
+  rewrite %0 with "iree_linalg_transform.apply"
+}
+
+iree_linalg_transform.sequence {
+  %0 = match @pdl_target
+  vectorize %0 {vectorize_padding = true}
+}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/vectorize.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/vectorize.mlir
new file mode 100644
index 0000000..303ff83
--- /dev/null
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/vectorize.mlir
@@ -0,0 +1,21 @@
+// RUN: iree-dialects-opt -linalg-interp-transforms -linalg-transform-file-name=%p/vectorize-transforms.mlir %s | FileCheck %s
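+// The -linalg-transform-file-name option makes the interpreter pass read the
+// transform script from the given file (here vectorize-transforms.mlir)
+// rather than from the input module itself.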
+
+// CHECK-LABEL: func @matmul_tensors(
+// CHECK-SAME:    %[[TA:[0-9a-z]+]]: tensor<128x128xf32>
+// CHECK-SAME:    %[[TB:[0-9a-z]+]]: tensor<128x128xf32>
+// CHECK-SAME:    %[[TC:[0-9a-z]+]]: tensor<128x128xf32>
+// CHECK-SAME:  -> tensor<128x128xf32> {
+func @matmul_tensors(
+  %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32> {linalg.inplaceable = true})
+    -> tensor<128x128xf32> {
+  // CHECK: %[[VA:.*]] = vector.transfer_read %[[TA]]
+  // CHECK: %[[VB:.*]] = vector.transfer_read %[[TB]]
+  // CHECK: %[[VC:.*]] = vector.transfer_read %[[TC]]
+  // CHECK: %[[VCU:.*]] = vector.contract {{.*}} %[[VA]], %[[VB]], %[[VC]]
+  // CHECK: vector.transfer_write %[[VCU]], %[[TC]]
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>)
+    -> tensor<128x128xf32>
+
+  return %0 : tensor<128x128xf32>
+}