[NFC] Remove trailing whitespace from *.mlir files. (#15757)

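This is a mechanical cleanup: every hunk below only drops trailing spaces at end-of-line; no MLIR semantics or FileCheck behavior changes. For reference, a minimal sketch of a script that performs this kind of cleanup is included here (a hypothetical helper for illustration, not necessarily the exact tooling used for this change):

    #!/usr/bin/env python3
    # Hypothetical helper: strip trailing whitespace from every *.mlir file
    # under a given root directory. Illustrative only.
    import sys
    from pathlib import Path

    def strip_trailing_whitespace(root: str) -> int:
        """Rewrite each *.mlir file in place, dropping trailing spaces/tabs."""
        changed = 0
        for path in Path(root).rglob("*.mlir"):
            original = path.read_text()
            cleaned = "\n".join(line.rstrip() for line in original.splitlines())
            # Preserve the final newline if the file ended with one.
            if original.endswith("\n"):
                cleaned += "\n"
            if cleaned != original:
                path.write_text(cleaned)
                changed += 1
        return changed

    if __name__ == "__main__":
        root = sys.argv[1] if len(sys.argv) > 1 else "."
        print(f"Cleaned {strip_trailing_whitespace(root)} file(s)")
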
diff --git a/compiler/plugins/input/StableHLO/stablehlo-iree/Conversion/Preprocessing/test/canonicalization.mlir b/compiler/plugins/input/StableHLO/stablehlo-iree/Conversion/Preprocessing/test/canonicalization.mlir
index af6df8e..df6839f 100644
--- a/compiler/plugins/input/StableHLO/stablehlo-iree/Conversion/Preprocessing/test/canonicalization.mlir
+++ b/compiler/plugins/input/StableHLO/stablehlo-iree/Conversion/Preprocessing/test/canonicalization.mlir
@@ -655,7 +655,7 @@
   func.return %0 : tensor<5x0xi32>
 }
 // CHECK:   %[[EMPTY:.+]] = tensor.empty() : tensor<5x0xi32>
-// CHECK:   return %[[EMPTY]] 
+// CHECK:   return %[[EMPTY]]
 
 // -----
 
@@ -689,7 +689,7 @@
 
 // CHECK:   %[[EMPTY:.+]] = tensor.empty() : tensor<1x0xi32>
 // CHECK:   %[[SCATTER:.+]] = "stablehlo.scatter"(%arg0, %0, %arg2)
-// CHECK:   return %[[SCATTER]] 
+// CHECK:   return %[[SCATTER]]
 
 // -----
 
diff --git a/compiler/plugins/input/StableHLO/stablehlo-iree/Conversion/Preprocessing/test/stablehlo_to_stablehlo.mlir b/compiler/plugins/input/StableHLO/stablehlo-iree/Conversion/Preprocessing/test/stablehlo_to_stablehlo.mlir
index ce1feda..8e36b84 100644
--- a/compiler/plugins/input/StableHLO/stablehlo-iree/Conversion/Preprocessing/test/stablehlo_to_stablehlo.mlir
+++ b/compiler/plugins/input/StableHLO/stablehlo-iree/Conversion/Preprocessing/test/stablehlo_to_stablehlo.mlir
@@ -211,7 +211,7 @@
 
 // -----
 
-// CHECK-LABEL: @rng_bitcast_f32 
+// CHECK-LABEL: @rng_bitcast_f32
 // CHECK-SAME:  (%[[ARG0:.*]]: tensor<4xi32>)
 func.func @rng_bitcast_f32(%arg0: tensor<4xi32>) -> (tensor<4xi32>, tensor<8xf32>) {
   // CHECK: %[[OUT_STATE:.*]], %[[OUT_INT:.*]] = stablehlo.rng_bit_generator %[[ARG0]]
@@ -413,7 +413,7 @@
     %7 = "stablehlo.compare"(%arg0, %arg1) {comparison_direction = #stablehlo<comparison_direction GT>} : (tensor<f32>, tensor<f32>) -> tensor<i1>
     "stablehlo.return"(%7) : (tensor<i1>) -> ()
   }) {dimension = 1 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> (tensor<16x16xf32>, tensor<16x16xi32>)
-  %1 = "stablehlo.slice"(%0#0) { start_indices = dense<[0, 0]> : tensor<2xi64>, limit_indices = dense<[16, 8]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64> } : (tensor<16x16xf32>) -> tensor<16x8xf32> 
+  %1 = "stablehlo.slice"(%0#0) { start_indices = dense<[0, 0]> : tensor<2xi64>, limit_indices = dense<[16, 8]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64> } : (tensor<16x16xf32>) -> tensor<16x8xf32>
   %2 = "stablehlo.slice"(%0#1) { start_indices = dense<[0, 0]> : tensor<2xi64>, limit_indices = dense<[16, 8]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64> } : (tensor<16x16xi32>) -> tensor<16x8xi32>
   return %1, %2 : tensor<16x8xf32>, tensor<16x8xi32>
 }
@@ -434,7 +434,7 @@
     %7 = "stablehlo.compare"(%arg0, %arg1) {comparison_direction = #stablehlo<comparison_direction GT>} : (tensor<f32>, tensor<f32>) -> tensor<i1>
     "stablehlo.return"(%7) : (tensor<i1>) -> ()
   }) {dimension = 2 : i64, is_stable = true} : (tensor<16x16x16xf32>, tensor<16x16x16xi32>) -> (tensor<16x16x16xf32>, tensor<16x16x16xi32>)
-  %1 = "stablehlo.slice"(%0#0) { start_indices = dense<[0, 0, 0]> : tensor<3xi64>, limit_indices = dense<[16, 16, 8]> : tensor<3xi64>, strides = dense<[1, 1, 1]> : tensor<3xi64> } : (tensor<16x16x16xf32>) -> tensor<16x16x8xf32> 
+  %1 = "stablehlo.slice"(%0#0) { start_indices = dense<[0, 0, 0]> : tensor<3xi64>, limit_indices = dense<[16, 16, 8]> : tensor<3xi64>, strides = dense<[1, 1, 1]> : tensor<3xi64> } : (tensor<16x16x16xf32>) -> tensor<16x16x8xf32>
   %2 = "stablehlo.slice"(%0#1) { start_indices = dense<[0, 0, 0]> : tensor<3xi64>, limit_indices = dense<[16, 16, 8]> : tensor<3xi64>, strides = dense<[1, 1, 1]> : tensor<3xi64> } : (tensor<16x16x16xi32>) -> tensor<16x16x8xi32>
   return %1, %2 : tensor<16x16x8xf32>, tensor<16x16x8xi32>
 }
@@ -455,7 +455,7 @@
     %7 = "stablehlo.compare"(%arg0, %arg1) {comparison_direction = #stablehlo<comparison_direction GT>} : (tensor<f32>, tensor<f32>) -> tensor<i1>
     "stablehlo.return"(%7) : (tensor<i1>) -> ()
   }) {dimension = 2 : i64, is_stable = true} : (tensor<16x16x16xf32>, tensor<16x16x16xi32>) -> (tensor<16x16x16xf32>, tensor<16x16x16xi32>)
-  %1 = "stablehlo.slice"(%0#0) { start_indices = dense<[0, 0, 0]> : tensor<3xi64>, limit_indices = dense<[16, 16, 8]> : tensor<3xi64>, strides = dense<[1, 1, 1]> : tensor<3xi64> } : (tensor<16x16x16xf32>) -> tensor<16x16x8xf32> 
+  %1 = "stablehlo.slice"(%0#0) { start_indices = dense<[0, 0, 0]> : tensor<3xi64>, limit_indices = dense<[16, 16, 8]> : tensor<3xi64>, strides = dense<[1, 1, 1]> : tensor<3xi64> } : (tensor<16x16x16xf32>) -> tensor<16x16x8xf32>
   %2 = "stablehlo.slice"(%0#1) { start_indices = dense<[0, 0, 0]> : tensor<3xi64>, limit_indices = dense<[16, 16, 8]> : tensor<3xi64>, strides = dense<[1, 1, 1]> : tensor<3xi64> } : (tensor<16x16x16xi32>) -> tensor<16x16x8xi32>
   return %1, %2 : tensor<16x16x8xf32>, tensor<16x16x8xi32>
 }
@@ -478,8 +478,8 @@
 
 func.func private @top_k_gt_f32_comparator(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i32>, %arg3: tensor<i32>) -> tensor<i1> {
   %0 = stablehlo.compare  GT, %arg0, %arg1 : (tensor<f32>, tensor<f32>) -> tensor<i1>
-  stablehlo.return %0 : tensor<i1> 
-} 
+  stablehlo.return %0 : tensor<i1>
+}
 
 // CHECK-LABEL: @custom_call_topk
 // CHECK-SAME: %[[ARG0:.+]]: tensor<1x160xf32>
@@ -496,8 +496,8 @@
 
 func.func private @bottom_k_gt_f32_comparator(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i32>, %arg3: tensor<i32>) -> tensor<i1> {
   %0 = stablehlo.compare  LT, %arg0, %arg1 : (tensor<f32>, tensor<f32>) -> tensor<i1>
-  stablehlo.return %0 : tensor<i1> 
-} 
+  stablehlo.return %0 : tensor<i1>
+}
 
 // CHECK-LABEL: @custom_call_bottomk
 // CHECK-SAME: %[[ARG0:.+]]: tensor<1x160xf32>
diff --git a/compiler/plugins/input/Torch/torch-iree/InputConversion/test/torch_to_iree.mlir b/compiler/plugins/input/Torch/torch-iree/InputConversion/test/torch_to_iree.mlir
index e89142b..493965a 100644
--- a/compiler/plugins/input/Torch/torch-iree/InputConversion/test/torch_to_iree.mlir
+++ b/compiler/plugins/input/Torch/torch-iree/InputConversion/test/torch_to_iree.mlir
@@ -31,7 +31,7 @@
 // -----
 
 // Verify we can decompose complex ops
-// CHECK: func @main(%arg0: tensor<2x3x4xf32>) -> (tensor<2x3x4xf32>, tensor<2x3x4xf32>) 
+// CHECK: func @main(%arg0: tensor<2x3x4xf32>) -> (tensor<2x3x4xf32>, tensor<2x3x4xf32>)
 // CHECK: tensor.empty
 module {
   func.func @main(%arg0: !torch.vtensor<[2,3,4],f32>) -> (!torch.vtensor<[2,3,4],f32>, !torch.vtensor<[2,3,4],f32>) {
diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/test/lower_to_ukernel_ops.mlir b/compiler/src/iree/compiler/Codegen/Common/CPU/test/lower_to_ukernel_ops.mlir
index c6aee63..5e769aa 100644
--- a/compiler/src/iree/compiler/Codegen/Common/CPU/test/lower_to_ukernel_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/CPU/test/lower_to_ukernel_ops.mlir
@@ -621,8 +621,8 @@
   %dim_2 = tensor.dim %arg0, %c3 : tensor<?x?x?x?xi8>
   %0 = tensor.empty(%dim, %dim_0, %dim_1, %dim_2) : tensor<?x?x?x?xi32>
   %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
-                                        affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], 
-                        iterator_types = ["parallel", "parallel", "parallel", "parallel"]} 
+                                        affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
+                        iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
                         ins(%arg0 : tensor<?x?x?x?xi8>) outs(%0 : tensor<?x?x?x?xi32>) {
   ^bb0(%in: i8, %out: i32):
     %5 = arith.extsi %in : i8 to i32
@@ -634,8 +634,8 @@
   %dim_6 = tensor.dim %arg1, %c3 : tensor<?x?x?x?xi8>
   %2 = tensor.empty(%dim_3, %dim_4, %dim_5, %dim_6) : tensor<?x?x?x?xi32>
   %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
-                                        affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], 
-                        iterator_types = ["parallel", "parallel", "parallel", "parallel"]} 
+                                        affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
+                        iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
                         ins(%arg1 : tensor<?x?x?x?xi8>) outs(%2 : tensor<?x?x?x?xi32>) {
   ^bb0(%in: i8, %out: i32):
     %5 = arith.extsi %in : i8 to i32
@@ -683,8 +683,8 @@
   %dim_2 = tensor.dim %arg0, %c3 : tensor<?x?x?x?xi16>
   %0 = tensor.empty(%dim, %dim_0, %dim_1, %dim_2) : tensor<?x?x?x?xi32>
   %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
-                                        affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], 
-                        iterator_types = ["parallel", "parallel", "parallel", "parallel"]} 
+                                        affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
+                        iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
                         ins(%arg0 : tensor<?x?x?x?xi16>) outs(%0 : tensor<?x?x?x?xi32>) {
   ^bb0(%in: i16, %out: i32):
     %5 = arith.extsi %in : i16 to i32
@@ -696,8 +696,8 @@
   %dim_6 = tensor.dim %arg1, %c3 : tensor<?x?x?x?xi4>
   %2 = tensor.empty(%dim_3, %dim_4, %dim_5, %dim_6) : tensor<?x?x?x?xi32>
   %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
-                                        affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], 
-                        iterator_types = ["parallel", "parallel", "parallel", "parallel"]} 
+                                        affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
+                        iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
                         ins(%arg1 : tensor<?x?x?x?xi4>) outs(%2 : tensor<?x?x?x?xi32>) {
   ^bb0(%in: i4, %out: i32):
     %5 = arith.extui %in : i4 to i32
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile_reduction.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile_reduction.mlir
index 60cc518..1f274c4 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile_reduction.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile_reduction.mlir
@@ -192,7 +192,7 @@
 //       CHECK:      scf.yield %{{.+}} : tensor<1x2x64xf32>
 //       CHECK:    scf.yield %{{.+}} : tensor<1x2x64xf32>
 
-//       CHECK:  linalg.generic 
+//       CHECK:  linalg.generic
 //  CHECK-SAME:    iterator_types = ["parallel", "reduction", "reduction"]
 //  CHECK-SAME:    ins(%[[LN]] : tensor<1x2x64xf32>)
 //  CHECK-SAME:    outs(%{{.+}} : tensor<1xf32>)
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir
index 1982183..af39659 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir
@@ -33,7 +33,7 @@
       %variant_op: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     transform.iree.workgroup_swizzle %0 { log_tile = 3 } : (!transform.any_op) -> ()
-    transform.yield 
+    transform.yield
   }
 } // module
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/decompose_batch_mmt4d_ops.mlir b/compiler/src/iree/compiler/Codegen/Common/test/decompose_batch_mmt4d_ops.mlir
index d6cab0d..8c2a5e4 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/decompose_batch_mmt4d_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/decompose_batch_mmt4d_ops.mlir
@@ -43,9 +43,9 @@
 func.func @batch_mmt4d_with_extened_inputs(%arg0: tensor<1x10x32x8x1xi8>, %arg1: tensor<1x80x32x4x1xi8>, %arg2: tensor<1x10x80x8x4xi32>) -> tensor<1x10x80x8x4xi32> {
   %c0_i32 = arith.constant 0 : i32
   %0 = tensor.empty() : tensor<1x10x32x8x1xi32>
-  %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>, 
-                                        affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>], 
-                       iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} 
+  %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>,
+                                        affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>],
+                       iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
                        ins(%arg0 : tensor<1x10x32x8x1xi8>) outs(%0 : tensor<1x10x32x8x1xi32>) {
   ^bb0(%in: i8, %out: i32):
     %6 = arith.extsi %in : i8 to i32
@@ -53,7 +53,7 @@
   } -> tensor<1x10x32x8x1xi32>
   %2 = tensor.empty() : tensor<1x80x32x4x1xi32>
   %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>,
-                                        affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>], 
+                                        affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>],
                        iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
                        ins(%arg1 : tensor<1x80x32x4x1xi8>) outs(%2 : tensor<1x80x32x4x1xi32>) {
   ^bb0(%in: i8, %out: i32):
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/iree_expand_strided_metadata.mlir b/compiler/src/iree/compiler/Codegen/Common/test/iree_expand_strided_metadata.mlir
index a506df4..ce8fa5a 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/iree_expand_strided_metadata.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/iree_expand_strided_metadata.mlir
@@ -12,7 +12,7 @@
 // CHECK-DAG:   %[[C128:.+]] = arith.constant 128 : index
 // CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG:   %[[BASE_BUFFER:.+]], %[[BASE_OFFSET:.+]], %[[BASE_SIZES:.+]]:2, %[[BASE_STRIDES:.+]]:2 = memref.extract_strided_metadata %arg0
-//     CHECK:   %[[SUB_OFFSET:.+]] = affine.apply #[[MAP]]()[%arg1, %arg2] 
+//     CHECK:   %[[SUB_OFFSET:.+]] = affine.apply #[[MAP]]()[%arg1, %arg2]
 //     CHECK:   return %[[BASE_BUFFER]], %[[SUB_OFFSET]], %[[C64]], %[[C64]], %[[C128]], %[[C1]]
 
 // -----
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reductions.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reductions.mlir
index 3459164..aeefd04 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/reductions.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/reductions.mlir
@@ -114,7 +114,7 @@
     indexing_maps = [affine_map<(d0) -> (d0)>,
                      affine_map<(d0) -> (d0)>],
     iterator_types = ["parallel"]}
-    ins(%5 : !out_tensor_t) outs(%6 : !out_tensor_t) {  
+    ins(%5 : !out_tensor_t) outs(%6 : !out_tensor_t) {
     ^bb0(%arg3: f32, %arg4: f32):
       %4 = math.sqrt %arg3 : f32
       linalg.yield %4 : f32
@@ -174,7 +174,7 @@
     indexing_maps = [affine_map<(d0) -> (d0)>,
                      affine_map<(d0) -> (d0)>],
     iterator_types = ["parallel"]}
-    ins(%6 : !out_tensor_t) outs(%7 : !out_tensor_t) {  
+    ins(%6 : !out_tensor_t) outs(%7 : !out_tensor_t) {
     ^bb0(%arg3: f32, %arg4: f32):
       %4 = math.sqrt %arg3 : f32
       linalg.yield %4 : f32
@@ -236,7 +236,7 @@
     indexing_maps = [affine_map<(d0) -> (d0)>,
                      affine_map<(d0) -> (d0)>],
     iterator_types = ["parallel"]}
-    ins(%6 : !out_tensor_t) outs(%7 : !out_tensor_t) {  
+    ins(%6 : !out_tensor_t) outs(%7 : !out_tensor_t) {
     ^bb0(%arg3: f32, %arg4: f32):
       %4 = math.sqrt %arg3 : f32
       linalg.yield %4 : f32
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir
index d2a67aa..91df2fd 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/reductions_codegen_spec.mlir
@@ -7,7 +7,7 @@
     %maybe_leading, %original_fill, %reduction, %maybe_trailing_0 =
       transform.iree.match_callback failures(propagate) "reduction"(%root)
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-    
+
     %_, %more_parallel_fill, %parallel_reduction, %combiner_op =
       transform.structured.split_reduction %reduction { split_factor = 2, insert_split_dimension = 1 }
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
@@ -18,7 +18,7 @@
     %outer_tiled, %grid_loop = transform.structured.tile_using_forall %fusion_root_1 tile_sizes [1]
       ( mapping = [#gpu.block<x>] )
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    
+
     %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
     transform.apply_patterns to %func {
       transform.apply_patterns.iree.bubble_expand
@@ -50,7 +50,7 @@
       tile_sizes [1] ( mapping = [#gpu.thread<z>] )
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.structured.fuse_into_containing_op %fusion_group_22_full into %block_loop_22 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    
+
 
     %fusion_group_21 = transform.merge_handles %maybe_leading_2, %more_parallel_fill_2
       : !transform.any_op
@@ -59,7 +59,7 @@
       tile_sizes [1, 1] ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.structured.fuse_into_containing_op %fusion_group_21 into %block_loop_21 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    
+
     // Step 3. Rank-reduce.
     // ===========================================================================
     transform.apply_patterns to %func {
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/repeated_matcher_use.mlir b/compiler/src/iree/compiler/Codegen/Common/test/repeated_matcher_use.mlir
index 3193304..f376ebc 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/repeated_matcher_use.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/repeated_matcher_use.mlir
@@ -26,7 +26,7 @@
     %dummy3 = tensor.empty() : tensor<10xf32>
     %c0 = arith.constant 0.0 : f32
     %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
-    
+
     // expected-remark @below {{first}}
     %first = linalg.generic {
       indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
@@ -82,7 +82,7 @@
     %c5 = arith.constant 5.0 : f32
     %operand5 = linalg.fill ins(%c5 : f32) outs(%dummy5 : tensor<10xf32>) -> tensor<10xf32>
     %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
-    
+
     %first = linalg.generic {
       indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
       iterator_types = ["parallel"]
@@ -134,7 +134,7 @@
     %dummy5 = tensor.empty() : tensor<10xf32>
     %c0 = arith.constant 0.0 : f32
     %operand = linalg.fill ins(%c0 : f32) outs(%dummy1 : tensor<10xf32>) -> tensor<10xf32>
-    
+
     %first = linalg.generic {
       indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
       iterator_types = ["parallel"]
@@ -231,7 +231,7 @@
     %0 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
       iterator_types = ["parallel", "parallel"]
-    } ins(%arg0: tensor<42x10xf32>) 
+    } ins(%arg0: tensor<42x10xf32>)
       outs(%init: tensor<10x42xf32>) {
     ^bb0(%arg1: f32, %arg2: f32):
       linalg.yield %arg1 : f32
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_match_partial_reduction.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_match_partial_reduction.mlir
index 3e4e546..a7bf5cd 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/transform_match_partial_reduction.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/transform_match_partial_reduction.mlir
@@ -11,7 +11,7 @@
   %result = linalg.generic {
     indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                      affine_map<(d0, d1) -> (d0)>],
-    iterator_types = ["parallel", "reduction"]} 
+    iterator_types = ["parallel", "reduction"]}
     ins(%arg0 : tensor<8x479xf32>)
     outs(%fill : tensor<8xf32>) {
   ^bb0(%in: f32, %out: f32):
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/transform_ops_invalid.mlir b/compiler/src/iree/compiler/Codegen/Common/test/transform_ops_invalid.mlir
index e8be084..5453916 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/transform_ops_invalid.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/transform_ops_invalid.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt %s --split-input-file --iree-transform-dialect-interpreter --verify-diagnostics 
+// RUN: iree-opt %s --split-input-file --iree-transform-dialect-interpreter --verify-diagnostics
 
 module attributes { transform.with_named_sequence } {
   transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/type_propagation.mlir b/compiler/src/iree/compiler/Codegen/Common/test/type_propagation.mlir
index 194644c..7a6176b 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/type_propagation.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/type_propagation.mlir
@@ -460,7 +460,7 @@
 //   CHECK-DAG:   %[[A_TENSOR:.+]] = flow.dispatch.tensor.load %[[A]]
 //   CHECK-DAG:   %[[B_TENSOR:.+]] = flow.dispatch.tensor.load %[[B]]
 //       CHECK:   %[[SORT:.+]]:2 = iree_linalg_ext.sort dimension(0)
-//  CHECK-SAME:       outs(%[[A_TENSOR]], %[[B_TENSOR]] : tensor<1xi32>, tensor<1xi8>) 
+//  CHECK-SAME:       outs(%[[A_TENSOR]], %[[B_TENSOR]] : tensor<1xi32>, tensor<1xi8>)
 //  CHECK-NEXT:     ^bb0(%[[ARG0:[a-zA-Z0-9]+]]: i32, %[[ARG1:[a-zA-Z0-9]+]]: i32, %[[ARG2:[a-zA-Z0-9]+]]: i8, %[[ARG3:[a-zA-Z0-9]+]]: i8)
 //   CHECK-DAG:       %[[CMPI:.+]] = arith.cmpi ult, %[[ARG0]], %[[ARG1]] : i32
 //       CHECK:       iree_linalg_ext.yield %[[CMPI]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_bindings.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_bindings.mlir
index 97c795a..a48ac2a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_bindings.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_bindings.mlir
@@ -17,7 +17,7 @@
   // CHECK: %[[OFFSET_PTR0:.+]] = llvm.getelementptr %[[BASE_PTR]][18]
   // CHECK: %[[OFFSET_D0:.+]] = llvm.mul %[[C5]], %[[C2]]
   // CHECK: %[[INDEX1:.+]] = llvm.add %[[OFFSET_D0]], %[[C1]]
-  // CHECK: %[[OFFSET_PTR1:.+]] = llvm.getelementptr %[[OFFSET_PTR0]][%[[INDEX1]]] 
+  // CHECK: %[[OFFSET_PTR1:.+]] = llvm.getelementptr %[[OFFSET_PTR0]][%[[INDEX1]]]
   // CHECK: %[[VALUE:.+]] = llvm.load %[[OFFSET_PTR1]]
   %c1 = arith.constant 1 : index
   %c5 = arith.constant 5 : index
@@ -105,7 +105,7 @@
   // CHECK: %[[OFFSET_ZEXT:.+]] = llvm.zext %[[OFFSET]]
   %offset = hal.interface.constant.load[0] : index
   %dim0 = hal.interface.constant.load[1]: index
-  
+
   // CHECK: %[[STATE3:.+]] = llvm.load %arg1
   // CHECK: %[[BINDING_PTRS:.+]] = llvm.extractvalue %[[STATE3]][10]
   // CHECK: %[[ARRAY_PTR:.+]] = llvm.getelementptr %[[BINDING_PTRS]][1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
index 273543b..466c259 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
@@ -40,6 +40,6 @@
     transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
     %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
     %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.yield 
+    transform.yield
   }
 } // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
index e33e07e..17b2157 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
@@ -145,6 +145,6 @@
     } : !transform.any_op
     transform.iree.apply_cse %func_8 : !transform.any_op
     transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> ()
-    transform.yield 
+    transform.yield
   }
 } ////  module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_address_space_function.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_address_space_function.mlir
index 59eb11d..1f1274f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_address_space_function.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/cast_address_space_function.mlir
@@ -1,9 +1,9 @@
 // RUN: iree-opt --iree-codegen-lower-ukernel-ops-to-calls --iree-llvmgpu-cast-address-space-function %s --split-input-file | FileCheck %s
 
 module {
-  func.func private @foo(memref<f32>, memref<f32, #gpu.address_space<workgroup>>, memref<f32, #gpu.address_space<workgroup>>) 
+  func.func private @foo(memref<f32>, memref<f32, #gpu.address_space<workgroup>>, memref<f32, #gpu.address_space<workgroup>>)
   func.func @bar() {
-    %alloc_1 = memref.alloc() : memref<110xf32, #gpu.address_space<workgroup>>    
+    %alloc_1 = memref.alloc() : memref<110xf32, #gpu.address_space<workgroup>>
     %alloc_2 = memref.alloc() : memref<128xf32>
     %alloc_3 = memref.alloc() : memref<128xf32, #gpu.address_space<workgroup>>
     %a1:4 = memref.extract_strided_metadata %alloc_1 : memref<110xf32, #gpu.address_space<workgroup>> -> memref<f32, #gpu.address_space<workgroup>>, index, index, index
@@ -14,12 +14,12 @@
   }
 }
 
-// CHECK:    func.func private @foo(memref<f32>, memref<f32>, memref<f32>) 
+// CHECK:    func.func private @foo(memref<f32>, memref<f32>, memref<f32>)
 
 // CHECK-LABEL: func.func @bar
-// CHECK:     %[[a1:.+]] = memref.alloc() : memref<110xf32, #gpu.address_space<workgroup>>    
-// CHECK:     %[[a2:.+]] = memref.alloc() : memref<128xf32, #gpu.address_space<workgroup>>    
-// CHECK:     %[[b1:.+]], %{{.*}}, %{{.*}}, %{{.*}} = memref.extract_strided_metadata %[[a1]] : memref<110xf32, #gpu.address_space<workgroup>> -> memref<f32, #gpu.address_space<workgroup>>, index, index, index 
+// CHECK:     %[[a1:.+]] = memref.alloc() : memref<110xf32, #gpu.address_space<workgroup>>
+// CHECK:     %[[a2:.+]] = memref.alloc() : memref<128xf32, #gpu.address_space<workgroup>>
+// CHECK:     %[[b1:.+]], %{{.*}}, %{{.*}}, %{{.*}} = memref.extract_strided_metadata %[[a1]] : memref<110xf32, #gpu.address_space<workgroup>> -> memref<f32, #gpu.address_space<workgroup>>, index, index, index
 // CHECK:     %[[b2:.+]], %{{.*}}, %{{.*}}, %{{.*}} = memref.extract_strided_metadata %[[a2]] : memref<128xf32, #gpu.address_space<workgroup>> -> memref<f32, #gpu.address_space<workgroup>>, index, index, index
 // CHECK:     %[[C1:.+]] = memref.memory_space_cast %[[b1]] : memref<f32, #gpu.address_space<workgroup>> to memref<f32>
 // CHECK:     %[[C2:.+]] = memref.memory_space_cast %[[b2]] : memref<f32, #gpu.address_space<workgroup>> to memref<f32>
@@ -29,11 +29,11 @@
 
 module {
   func.func @bar() {
-    %alloc_1 = memref.alloc() : memref<110xf32, #gpu.address_space<workgroup>>    
+    %alloc_1 = memref.alloc() : memref<110xf32, #gpu.address_space<workgroup>>
     %alloc_2 = memref.alloc() : memref<128xf32>
     %alloc_3 = memref.alloc() : memref<128xf32, #gpu.address_space<workgroup>>
-    iree_codegen.ukernel.generic "fastfunction" ins(%alloc_1, %alloc_2 : memref<110xf32, #gpu.address_space<workgroup>>, memref<128xf32>) 
-    outs(%alloc_3 : memref<128xf32, #gpu.address_space<workgroup>>) 
+    iree_codegen.ukernel.generic "fastfunction" ins(%alloc_1, %alloc_2 : memref<110xf32, #gpu.address_space<workgroup>>, memref<128xf32>)
+    outs(%alloc_3 : memref<128xf32, #gpu.address_space<workgroup>>)
     return
   }
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/create_async_groups.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/create_async_groups.mlir
index 1fc2285..099764b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/create_async_groups.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/create_async_groups.mlir
@@ -14,7 +14,7 @@
     vector.transfer_write %1, %0[%c0, %c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
     // CHECK-NOT: nvgpu.device_async_create_group
 
-    // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1  
+    // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1
     %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
     vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
     // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
@@ -27,7 +27,7 @@
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     transform.iree.create_async_groups %top_level_func {use_mma_sync} : (!transform.any_op) -> ()
-    transform.yield 
+    transform.yield
   }
 } // module
 
@@ -61,7 +61,7 @@
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     transform.iree.create_async_groups %top_level_func : (!transform.any_op) -> ()
-    transform.yield 
+    transform.yield
   }
 } // module
 
@@ -92,7 +92,7 @@
     %vector_transfer = transform.structured.match ops{["memref.alloc"]} in %top_level_func : (!transform.any_op) -> !transform.any_op
     // expected-error@below {{transform applied to the wrong op kind}}
     transform.iree.create_async_groups %vector_transfer : (!transform.any_op) -> ()
-    transform.yield 
+    transform.yield
   }
 } // module
 
@@ -124,7 +124,7 @@
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     transform.iree.create_async_groups %top_level_func : (!transform.any_op) -> ()
-    transform.yield 
+    transform.yield
   }
 } // module
 
@@ -149,7 +149,7 @@
     // CHECK-NOT: nvgpu.device_async_create_group
 
     // CHECK-NOT: nvgpu.device_async_copy
-    //     CHECK: vector.load    
+    //     CHECK: vector.load
     //     CHECK: vector.store
     %2 = vector.load %b[%c0, %c4] : memref<1024x1024xf16>, vector<1xf16>
     vector.store %2, %alloc_1[%c0, %c4, %c0] : memref<4x32x16xf16, #gpu.address_space<workgroup>>, vector<1xf16>
@@ -162,6 +162,6 @@
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     transform.iree.create_async_groups %top_level_func : (!transform.any_op) -> ()
-    transform.yield 
+    transform.yield
   }
 } // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir
index bc34439..aff4f69 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir
@@ -21,7 +21,7 @@
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
-    transform.yield 
+    transform.yield
   }
 } // module
 
@@ -154,7 +154,7 @@
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
-    transform.yield 
+    transform.yield
   }
 } // module
 
@@ -354,7 +354,7 @@
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
-    transform.yield 
+    transform.yield
   }
 } // module
 
@@ -542,7 +542,7 @@
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
-    transform.yield 
+    transform.yield
   }
 } // module
 
@@ -710,7 +710,7 @@
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
-    transform.yield 
+    transform.yield
   }
 } // module
 
@@ -860,7 +860,7 @@
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
-    transform.yield 
+    transform.yield
   }
 } // module
 
@@ -983,7 +983,7 @@
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %reordered_func = transform.iree.reorder_transpose %top_level_func : (!transform.any_op) -> !transform.any_op
     transform.iree.apply_cse %reordered_func : !transform.any_op
-    transform.yield 
+    transform.yield
   }
 } // module
 
@@ -1108,7 +1108,7 @@
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %transformed_func = transform.iree.layout_analysis_and_distribution %top_level_func : (!transform.any_op) -> (!transform.any_op)
-    transform.yield 
+    transform.yield
   }
 } // module
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/pack_shared_memory_alloc.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/pack_shared_memory_alloc.mlir
index 2c120d8..3c36ee0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/pack_shared_memory_alloc.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/pack_shared_memory_alloc.mlir
@@ -21,7 +21,7 @@
 //       CHECK:   memref.view %[[PACKED]][%[[C0]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<128xf32, #gpu.address_space<workgroup>>
 //       CHECK:   %[[C512:.+]] = arith.constant 512 : index
 //       CHECK:   memref.view %[[PACKED]][%[[C512]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<128xf32, #gpu.address_space<workgroup>>
-//       CHECK:   nvgpu.device_async_create_group 
+//       CHECK:   nvgpu.device_async_create_group
 //       CHECK:   nvgpu.device_async_wait %0 {numGroups = 0 : i32}
 //       CHECK:   gpu.barrier
 //       CHECK:   memref.view %[[PACKED]][%[[C0]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<32xf32, #gpu.address_space<workgroup>>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
index 839d389..46517b7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
@@ -113,7 +113,7 @@
 // CHECK:   %[[RHS:.+]] = transform.get_producer_of_operand %[[PADDED]][1]
 // CHECK:   %[[RHS_DPS:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS]]
 
-// CHECK:   transform.structured.tile_using_forall %[[LHS]] 
+// CHECK:   transform.structured.tile_using_forall %[[LHS]]
 // DEFAULT:  num_threads [1, 32, 4](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
 // OPTIONS:  num_threads [1, 64, 2](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
 // CHECK:   apply_patterns
@@ -122,10 +122,10 @@
 // CHECK:   transform.structured.match ops{["scf.if"]}
 // CHECK:   transform.scf.take_assumed_branch %{{.*}} take_else_branch
 
-// CHECK:   transform.structured.tile_using_forall %[[RHS_DPS]]  
+// CHECK:   transform.structured.tile_using_forall %[[RHS_DPS]]
 // DEFAULT:  num_threads [8, 16, 1](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
 // OPTIONS:  num_threads [2, 8, 8](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
-// CHECK:   apply_patterns 
+// CHECK:   apply_patterns
 // CHECK:   transform.iree.apply_licm
 // CHECK:   transform.iree.apply_cse
 
@@ -139,7 +139,7 @@
 // CHECK:   transform.structured.tile_using_forall
 // DEFAULT:  num_threads [1, 2, 64](mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>])
 // OPTIONS:  num_threads [1, 4, 32](mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>])
-// CHECK:   apply_patterns 
+// CHECK:   apply_patterns
 // CHECK:   transform.iree.apply_licm
 // CHECK:   transform.iree.apply_cse
 
@@ -211,7 +211,7 @@
 // OPTIONS:   factor = 3
 // CHECK:   apply_patterns
 // CHECK:     transform.apply_patterns.vector.transfer_to_scf   full_unroll = true
-// CHECK:   apply_patterns 
+// CHECK:   apply_patterns
 // CHECK:   transform.iree.apply_licm
 // CHECK:   transform.iree.apply_cse
 // CHECK:   transform.iree.create_async_groups
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
index 1769c4b..4468fd0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
@@ -24,7 +24,7 @@
       %6 = linalg.matmul ins(%p, %4 : tensor<250x500xf32>, tensor<500x1020xf32>) outs(%5 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
 
       flow.dispatch.tensor.store %6, %2, offsets=[0, 0], sizes=[250, 1020], strides=[1, 1] : tensor<250x1020xf32> -> !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
-      return 
+      return
     }
   }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
index ebf730c..4a2276d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
@@ -2,7 +2,7 @@
   transform.named_sequence @__transform_main(
       %variant_op: !transform.any_op {transform.consumed}) {
     %0 = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %forall, %tiled_fill = transform.structured.tile_using_forall %0 num_threads [5, 1] 
+    %forall, %tiled_fill = transform.structured.tile_using_forall %0 num_threads [5, 1]
     ( mapping = [#gpu.thread<y>, #gpu.thread<x>] )
     : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
@@ -28,7 +28,7 @@
     transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
     %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
     %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.iree.map_nested_forall_to_gpu_threads %memref_func 
+    transform.iree.map_nested_forall_to_gpu_threads %memref_func
       workgroup_dims = [10, 11, 1] : (!transform.any_op) -> ()
 
     // Late canonicalizations to cleanup and pass the checks
@@ -40,7 +40,7 @@
     } : !transform.any_op
     transform.iree.apply_licm %memref_func : !transform.any_op
     transform.iree.apply_cse %memref_func : !transform.any_op
-    transform.yield 
+    transform.yield
   }
 } // module
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir
index 04474cd..74d7e52 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_distribution_spec.mlir
@@ -1,11 +1,11 @@
 module attributes { transform.with_named_sequence } {
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
 
-    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op 
+    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op
       : (!transform.any_op) -> !transform.any_op
     %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
       : (!transform.any_op) -> !transform.any_op
-    %isolated = transform.get_parent_op %warp {isolated_from_above} 
+    %isolated = transform.get_parent_op %warp {isolated_from_above}
       : (!transform.any_op) -> !transform.any_op
     transform.iree.vector.warp_distribute %isolated
       : (!transform.any_op) -> ()
@@ -22,6 +22,6 @@
     transform.iree.apply_licm %func_op : !transform.any_op
     transform.iree.apply_cse %func_op : !transform.any_op
 
-    transform.yield 
+    transform.yield
   }
 } // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
index 33fa29e..29540c2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir
@@ -1,7 +1,7 @@
 module attributes { transform.with_named_sequence } {
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
 
-    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op 
+    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op
       : (!transform.any_op) -> !transform.any_op
     transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
       : (!transform.any_op) -> !transform.any_op
@@ -14,6 +14,6 @@
     } : !transform.any_op
     transform.iree.apply_licm %func_op : !transform.any_op
     transform.iree.apply_cse %func_op : !transform.any_op
-    transform.yield 
+    transform.yield
   }
 } // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir
index 504950e..2654797 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir
@@ -7,7 +7,7 @@
 //       CHECK:   memref.view %[[PACKED]][%[[C0]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<128xf32, #gpu.address_space<workgroup>>
 //       CHECK:   %[[C512:.+]] = arith.constant 512 : index
 //       CHECK:   memref.view %[[PACKED]][%[[C512]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<128xf32, #gpu.address_space<workgroup>>
-//       CHECK:   nvgpu.device_async_create_group 
+//       CHECK:   nvgpu.device_async_create_group
 //       CHECK:   nvgpu.device_async_wait %0 {numGroups = 0 : i32}
 //       CHECK:   gpu.barrier
 //       CHECK:   memref.view %[[PACKED]][%[[C0]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<32xf32, #gpu.address_space<workgroup>>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir
index 757baf4..3d1ce7f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir
@@ -33,7 +33,7 @@
       %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op
         : (!transform.any_op) -> !transform.any_op
       %promoted_matmul, %alloc_0, %alloc_1 =
-        transform.iree.promote_operands %matmul [0, 1] 
+        transform.iree.promote_operands %matmul [0, 1]
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
       // Late canonicalizations to cleanup and pass the checks.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir
index 6bfe345..82334e9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir
@@ -57,7 +57,7 @@
     } : !transform.any_op
     transform.iree.vector.vector_to_mma_conversion %func { use_wmma } : (!transform.any_op) -> ()
 
-    // Apply canonicalization post-hoc to trigger DCE and pass the test 
+    // Apply canonicalization post-hoc to trigger DCE and pass the test
     // (i.e. all vector.contract are dead).
     // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
     transform.apply_patterns to %func {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir
index 1558636..140bf1b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir
@@ -170,12 +170,12 @@
   %mask1 = vector.create_mask %i : vector<1xi1>
   // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1, %[[I]]
   %2 = vector.transfer_read %a[%c0, %c4], %cst_0, %mask1 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
-  vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>  
+  vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
   // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
   // CHECK: nvgpu.device_async_wait %[[G]]
 
   %3 = vector.transfer_read %a[%c0, %c4], %cst_0, %mask1 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
-  vector.transfer_write %3, %0[%c0, %c4, %c0], %mask1 {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>  
+  vector.transfer_write %3, %0[%c0, %c4, %c0], %mask1 {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
   // We cannot generate async copy if the write is masked.
   // CHECK-NOT: nvgpu.device_async_copy
   return
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir
index bd435a5..8aec774 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir
@@ -136,13 +136,13 @@
       func.func @subgroup_reduce_dynamic() {
         %c32_i64 = arith.constant 32 : i64
         %cst = arith.constant 0.000000e+00 : f32
-        %cst_0 = arith.constant 2.000000e+00 : f32 
+        %cst_0 = arith.constant 2.000000e+00 : f32
         %c0 = arith.constant 0 : index
         %0 = hal.interface.constant.load[0] : i32
-        %1 = hal.interface.constant.load[1] : i32 
+        %1 = hal.interface.constant.load[1] : i32
         %2 = arith.extui %0 : i32 to i64
         %3 = arith.extui %1 : i32 to i64
-        %4 = arith.shli %3, %c32_i64 : i64 
+        %4 = arith.shli %3, %c32_i64 : i64
         %5 = arith.ori %2, %4 : i64
         %6 = arith.index_castui %5 : i64 to index
         %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8xf32>>
@@ -155,7 +155,7 @@
         ^bb0(%in: f32, %out: f32):
           %14 = math.powf %in, %cst_0 : f32
           %15 = arith.addf %14, %out : f32
-          linalg.yield %15 : f32 
+          linalg.yield %15 : f32
         } -> tensor<8xf32>
         flow.dispatch.tensor.store %13, %7, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor<writeonly:tensor<8xf32>>
         return
diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir b/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir
index 74fc9b0..3ec11fb 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir
@@ -173,7 +173,7 @@
     flow.dispatch.tensor.store %empty, %binding, offsets = [0, 0], sizes = [%wl0, %wl1], strides = [1, 1]
         : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%wl0, %wl1}
     flow.return
-  }  
+  }
   return %result : tensor<?x?xf32>
 }
 
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir
index 5e4ec9b..6ebe9e4 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir
@@ -8,10 +8,10 @@
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type>
   %input = util.global.load.indirect %input_ptr : !util.ptr<!type> -> !type
   %output = tensor.empty() : !type
-  
+
   // Can collapse All (d0, d1, d2, d3, d4, d5)
-  %6 = linalg.generic { indexing_maps = [ 
-            affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, 
+  %6 = linalg.generic { indexing_maps = [
+            affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>,
             affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>],
             iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]
          }
@@ -43,10 +43,10 @@
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type>
   %input = util.global.load.indirect %input_ptr : !util.ptr<!type> -> !type
   %output = tensor.empty() : !type
-  
+
   // Can collapse (d0, d1) and (d5, d6)
-  %6 = linalg.generic { indexing_maps = [ 
-            affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d3, d5, d6)>, 
+  %6 = linalg.generic { indexing_maps = [
+            affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d4, d3, d5, d6)>,
             affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4, d5, d6)>],
             iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "parallel", "parallel"]
          }
@@ -78,10 +78,10 @@
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type>
   %input = util.global.load.indirect %input_ptr : !util.ptr<!type> -> !type
   %output = tensor.empty() : !type
-  
+
   // Can collapse (d0, d1) and (d3, d4, d5, d6, d7)
-  %result = linalg.generic { indexing_maps = [ 
-          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, 
+  %result = linalg.generic { indexing_maps = [
+          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>,
           affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>],
           iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "parallel", "parallel", "parallel"]
         }
@@ -112,10 +112,10 @@
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type>
   %input = util.global.load.indirect %input_ptr : !util.ptr<!type> -> !type
   %output = tensor.empty() : !type
-  
+
   // Can collapse (d0, d1) and (d6, d7)
-  %result = linalg.generic { indexing_maps = [ 
-          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, 
+  %result = linalg.generic { indexing_maps = [
+          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>,
           affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d4, d6, d7)>],
           iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "parallel", "parallel", "parallel"]
         }
@@ -149,16 +149,16 @@
   %input2 = util.global.load.indirect %input_ptr : !util.ptr<!type> -> !type
   %input3 = util.global.load.indirect %input_ptr : !util.ptr<!type> -> !type
   %output = tensor.empty() : !type
-  
+
   // Can collapse (d0, d1) and (d6, d7)
-  %result = linalg.generic { indexing_maps = [ 
-          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, 
-          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d4, d3, d5, d6, d7)>, 
-          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d4, d3, d2, d5, d6, d7)>, 
+  %result = linalg.generic { indexing_maps = [
+          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>,
+          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d4, d3, d5, d6, d7)>,
+          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d4, d3, d2, d5, d6, d7)>,
           affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>],
           iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "parallel", "parallel"]
         }
-  ins(%input, %input2, %input3  : !type, !type, !type) 
+  ins(%input, %input2, %input3  : !type, !type, !type)
   outs(%output : !type) {
   ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32):
     linalg.yield %arg1 : f32
@@ -190,10 +190,10 @@
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type>
   %input = util.global.load.indirect %input_ptr : !util.ptr<!type> -> !type
   %output = tensor.empty() : !type
-  
+
   // Can collapse (d2, d3) and (d6, d7)
-  %result = linalg.generic { indexing_maps = [ 
-          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, 
+  %result = linalg.generic { indexing_maps = [
+          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>,
           affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d5, d4, d6, d7)>],
           iterator_types = ["parallel", "reduction", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]
         }
@@ -226,9 +226,9 @@
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type_in>
   %input = util.global.load.indirect %input_ptr : !util.ptr<!type_in> -> !type_in
   %output = tensor.empty() : !type_out
-  
-  %result = linalg.generic { indexing_maps = [ 
-          affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>, 
+
+  %result = linalg.generic { indexing_maps = [
+          affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>,
           affine_map<(d0, d1, d2, d3) -> (d1, d2, d3, d0)>],
           iterator_types = ["parallel", "parallel", "parallel", "parallel"]
         }
@@ -258,8 +258,8 @@
   %c0 = arith.constant 0 : index
   %output = tensor.empty() : !type_out
   // Can collapse (d3, d0, d1)
-  %6 = linalg.generic { indexing_maps = [ 
-            affine_map<(d0, d1, d2, d3, d4, d5) -> (d3, d0, d1, d5)>, 
+  %6 = linalg.generic { indexing_maps = [
+            affine_map<(d0, d1, d2, d3, d4, d5) -> (d3, d0, d1, d5)>,
             affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d3, d0, d1, d4, d5)>],
             iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]
          }
@@ -291,8 +291,8 @@
   %c0 = arith.constant 0 : index
   %input = tensor.empty() : !type_in
   %output = tensor.empty() : !type_out
-  %6 = linalg.generic { indexing_maps = [ 
-            affine_map<(d0, d1, d2) -> (d0, d2)>, 
+  %6 = linalg.generic { indexing_maps = [
+            affine_map<(d0, d1, d2) -> (d0, d2)>,
             affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
             iterator_types = ["parallel", "parallel", "parallel"]
          }
@@ -318,10 +318,10 @@
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type_in>
   %input = util.global.load.indirect %input_ptr : !util.ptr<!type_in> -> !type_in
   %output = tensor.empty() : !type_out
-  
+
   // Can collapse (d0, d1) and (d6, d7)
-  %result = linalg.generic { indexing_maps = [ 
-          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>, 
+  %result = linalg.generic { indexing_maps = [
+          affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>,
           affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d3, d5, d4, d6, d7)>],
           iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "parallel", "parallel", "parallel"]
         }
@@ -352,7 +352,7 @@
 
   // Can collapse (d1, d3, d0)
   %result = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d0)>, 
+    indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d0)>,
                      affine_map<(d0, d1, d2, d3, d4) -> (d2, d1, d3, d0, d4)>],
     iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
   ins(%input : !type_in) outs(%output : !type_out) {
@@ -378,9 +378,9 @@
   %output = tensor.empty() : !type_out
 
   // Can collapse (d1, d0)
-  %result = linalg.generic { 
+  %result = linalg.generic {
     indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1, d0)>],
-    iterator_types = ["parallel", "parallel"] }     
+    iterator_types = ["parallel", "parallel"] }
   ins(%input : !type_in) outs(%output : !type_out) {
   ^bb0(%arg1: f32, %arg2: f32):
     linalg.yield %arg1 : f32
@@ -432,12 +432,12 @@
   %output1 = tensor.empty() : !type
   %output2 = tensor.empty() : !type
   %output3 = tensor.empty() : !type
-  
-  %6, %7, %8, %9 = linalg.generic { indexing_maps = [ 
-            affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4, d3, d5)>, 
-            affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4, d3, d5)>, 
-            affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4, d3, d5)>, 
-            affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4, d3, d5)>, 
+
+  %6, %7, %8, %9 = linalg.generic { indexing_maps = [
+            affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4, d3, d5)>,
+            affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4, d3, d5)>,
+            affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4, d3, d5)>,
+            affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4, d3, d5)>,
             affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4, d3, d5)>],
             iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]
          }
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_regions.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_regions.mlir
index d5c0ea0..966d54a 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_regions.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_regions.mlir
@@ -425,7 +425,7 @@
   return %0#1 : tensor<?xf32>
 }
 // CHECK: func @no_yield_dead_results
-// CHECK:   %[[RESULT:.+]] = flow.dispatch.region 
+// CHECK:   %[[RESULT:.+]] = flow.dispatch.region
 // CHECK:     %[[GENERIC:.+]]:2 = linalg.generic
 // CHECK:     flow.return %[[GENERIC]]#1
 // CHECK:   return %[[RESULT]]
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_workgroups.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_workgroups.mlir
index 85f263b..adabc66 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_workgroups.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_workgroups.mlir
@@ -22,10 +22,10 @@
   %0 = flow.dispatch.region -> (tensor<10x20xf32>) {
     %cst_0 = arith.constant dense<1.000000e+00> : tensor<10x20xf32>
     cf.cond_br %arg0, ^bb1, ^bb2
-  ^bb1:  // pred: ^bb0                                                                                                                                                                                                                                                                                             
+  ^bb1:  // pred: ^bb0
     %2 = tensor.empty() : tensor<10x20xf32>
     flow.return %2 : tensor<10x20xf32>
-  ^bb2:  // pred: ^bb0                                                                                                                                                                                                                                                                                             
+  ^bb2:  // pred: ^bb0
     flow.return %cst_0 : tensor<10x20xf32>
   }
   return %0 : tensor<10x20xf32>
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/raise_special_ops.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/raise_special_ops.mlir
index f4a3b70..802c855 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/raise_special_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/raise_special_ops.mlir
@@ -87,7 +87,7 @@
   } -> tensor<10x4096x4096xf16>
   %229 = tensor.empty() : tensor<10x4096xf16>
   %230 = linalg.fill ins(%cst_121 : f16) outs(%229 : tensor<10x4096xf16>) -> tensor<10x4096xf16>
-  %231 = linalg.generic 
+  %231 = linalg.generic
   {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>],
   iterator_types = ["parallel", "parallel", "reduction"]}
   ins(%228 : tensor<10x4096x4096xf16>) outs(%230 : tensor<10x4096xf16>) {
@@ -95,7 +95,7 @@
     %5290 = arith.addf %in, %out : f16
     linalg.yield %5290 : f16
   } -> tensor<10x4096xf16>
-  %232 = linalg.generic 
+  %232 = linalg.generic
   {indexing_maps = [
     affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
     affine_map<(d0, d1, d2) -> (d0, d1)>,
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir
index cab7e8b..b34db57 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir
@@ -114,7 +114,7 @@
   %cst = complex.constant [3.000000e+00 : f32, 1.000000e+01 : f32] : complex<f32>
   %0 = stream.tensor.sizeof tensor<6xcomplex<f32>> : index
   // CHECK: %[[I64NUMBER:.+]] = complex.constant [3.000000e+00 : f32, 1.000000e+01 : f32] : complex<f32>
-  // CHECK: %[[BITCAST:.+]] = complex.bitcast %[[I64NUMBER]] : complex<f32> to i64 
+  // CHECK: %[[BITCAST:.+]] = complex.bitcast %[[I64NUMBER]] : complex<f32> to i64
   // CHECK: %[[SPLAT_RES:.+]] = stream.async.splat %[[BITCAST]]
   %1 = stream.tensor.splat %cst : complex<f32> -> tensor<6xcomplex<f32>> in !stream.resource<*>{%0}
   // CHECK: return %[[SPLAT_RES]]
@@ -126,7 +126,7 @@
 // CHECK-LABEL: @denseTensorSplatDynamicComplexF32
 func.func @denseTensorSplatDynamicComplexF32(%arg0: !stream.resource<*>, %arg1: complex<f32>) -> (!stream.resource<*>) {
   %0 = stream.tensor.sizeof tensor<6xcomplex<f32>> : index
-  // CHECK: %[[BITCAST:.+]] = complex.bitcast %arg1 : complex<f32> to i64 
+  // CHECK: %[[BITCAST:.+]] = complex.bitcast %arg1 : complex<f32> to i64
   // CHECK: %[[SPLAT_RES:.+]] = stream.async.splat %[[BITCAST]]
   %1 = stream.tensor.splat %arg1 : complex<f32> -> tensor<6xcomplex<f32>> in !stream.resource<*>{%0}
   // CHECK: return %[[SPLAT_RES]]
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/propagate_subviews.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/propagate_subviews.mlir
index 188c211..23934c4 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/propagate_subviews.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/propagate_subviews.mlir
@@ -170,7 +170,7 @@
   cf.switch %flag : i32, [
     default: ^bb1(%resource0, %resource1 : !stream.resource<external>, !stream.resource<transient>),
     0: ^bb1(%resource0, %resource1 : !stream.resource<external>, !stream.resource<transient>)
-  ]  
+  ]
 
 // CHECK: ^bb1(%[[BB1_RESOURCE0:.+]]: !stream.resource<external>, %[[BB1_STORAGE_SIZE0:.+]]: index, %[[BB1_OFFSET0:.+]]: index, %[[BB1_LENGTH0:.+]]: index, %[[BB1_RESOURCE1:.+]]: !stream.resource<transient>, %[[BB1_STORAGE_SIZE1:.+]]: index, %[[BB1_OFFSET1:.+]]: index, %[[BB1_LENGTH1:.+]]: index):
 ^bb1(%bb1_resource0: !stream.resource<external>, %bb1_resource1: !stream.resource<transient>):
diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/hoist_into_globals.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/hoist_into_globals.mlir
index af4585f..191992d 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/hoist_into_globals.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/hoist_into_globals.mlir
@@ -258,7 +258,7 @@
 //       CHECK:   %[[CST:.+]] = arith.constant
 //       CHECK:   %[[EXPANDED:.+]] = tensor.expand_shape %[[CST]]
 //       CHECK:   %[[RESULT:.+]] = flow.dispatch.region
-//       CHECK:     %[[ADD:.+]] = linalg.generic 
+//       CHECK:     %[[ADD:.+]] = linalg.generic
 //  CHECK-SAME:     %[[EXPANDED]]
 //       CHECK:     flow.return %[[ADD]]
 //       CHECK:   return %[[RESULT]]
@@ -277,7 +277,7 @@
   func.func @main() -> (tensor<128xi8>) {
     %0 = arith.constant dense<0> : tensor<32xi8>
     %1 = arith.constant dense<0> : tensor<32xi8>
-    %2 = "iree_unregistered.const_expr"(%0, %1) 
+    %2 = "iree_unregistered.const_expr"(%0, %1)
     : (tensor<32xi8>, tensor<32xi8>) -> tensor<128xi8>
     return %2 : tensor<128xi8>
   }
@@ -294,7 +294,7 @@
   func.func @main() -> (tensor<129xi8>) {
     %0 = arith.constant dense<0> : tensor<32xi8>
     %1 = arith.constant dense<0> : tensor<32xi8>
-    %2 = "iree_unregistered.const_expr"(%0, %1) 
+    %2 = "iree_unregistered.const_expr"(%0, %1)
     : (tensor<32xi8>, tensor<32xi8>) -> tensor<129xi8>
     return %2 : tensor<129xi8>
   }
diff --git a/compiler/src/iree/compiler/Dialect/VMVX/Transforms/test/resolve_buffer_descriptors.mlir b/compiler/src/iree/compiler/Dialect/VMVX/Transforms/test/resolve_buffer_descriptors.mlir
index a33639a..225a908 100644
--- a/compiler/src/iree/compiler/Dialect/VMVX/Transforms/test/resolve_buffer_descriptors.mlir
+++ b/compiler/src/iree/compiler/Dialect/VMVX/Transforms/test/resolve_buffer_descriptors.mlir
@@ -8,7 +8,7 @@
     %base_buffer, %offset, %sizes:2, %strides:2 = vmvx.get_buffer_descriptor %0 : memref<64x64xf32, #map0> -> !util.buffer, index, index, index, index, index
     return %base_buffer, %offset, %sizes#0, %sizes#1, %strides#0, %strides#1 : !util.buffer, index, index, index, index, index
   }
-//     CHECK: #[[MAP:.+]] = affine_map<()[s0, s1, s2, s3, s4] -> (s0 + s1 * s2 + s3 * s4)>       
+//     CHECK: #[[MAP:.+]] = affine_map<()[s0, s1, s2, s3, s4] -> (s0 + s1 * s2 + s3 * s4)>
 //     CHECK: func @resolve_subview(
 // CHECK-DAG:   %[[BASE_BUFFER:.+]], %[[BASE_OFFSET:.+]], %[[BASE_SIZES:.+]]:2, %[[BASE_STRIDES:.+]]:2 = vmvx.get_buffer_descriptor %arg0
 // CHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/expand_vectors.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/expand_vectors.mlir
index bad1503..c6dfcd8 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/expand_vectors.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/expand_vectors.mlir
@@ -133,10 +133,10 @@
   %c0 = arith.constant 0 : index
   %dim = tensor.dim %arg0, %c0 : tensor<?xbf16>
   %0 = tensor.empty(%dim) : tensor<?xf32>
-  %casted0 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, 
-                                              affine_map<(d0) -> (d0)>], 
-                             iterator_types = ["parallel"]} 
-                             ins(%arg0 : tensor<?xbf16>) 
+  %casted0 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
+                                              affine_map<(d0) -> (d0)>],
+                             iterator_types = ["parallel"]}
+                             ins(%arg0 : tensor<?xbf16>)
                              outs(%0 : tensor<?xf32>) {
   ^bb0(%in: bf16, %out: f32):
     %2 = arith.extf %in : bf16 to f32
@@ -169,10 +169,10 @@
   %c0 = arith.constant 0 : index
   %dim = tensor.dim %arg1, %c0 : tensor<?xi8>
   %0 = tensor.empty(%dim) : tensor<?xi32>
-  %casted1 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, 
-                                              affine_map<(d0) -> (d0)>], 
-                             iterator_types = ["parallel"]} 
-                             ins(%arg1 : tensor<?xi8>) 
+  %casted1 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>,
+                                              affine_map<(d0) -> (d0)>],
+                             iterator_types = ["parallel"]}
+                             ins(%arg1 : tensor<?xi8>)
                              outs(%0 : tensor<?xi32>) {
   ^bb0(%in: i8, %out: i32):
     %2 = arith.extsi %in : i8 to i32
@@ -206,10 +206,10 @@
   %c1 = arith.constant 1 : index
   %dim = tensor.dim %arg0, %c1 : tensor<3x?xf16>
   %0 = tensor.empty(%dim) : tensor<3x?xf32>
-  %casted0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, 
-                                              affine_map<(d0, d1) -> (d0, d1)>], 
-                             iterator_types = ["parallel", "parallel"]} 
-                             ins(%arg0 : tensor<3x?xf16>) 
+  %casted0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                              affine_map<(d0, d1) -> (d0, d1)>],
+                             iterator_types = ["parallel", "parallel"]}
+                             ins(%arg0 : tensor<3x?xf16>)
                              outs(%0 : tensor<3x?xf32>) {
   ^bb0(%in: f16, %out: f32):
     %2 = arith.extf %in : f16 to f32
diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/fuse_dequantization_matmul.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/fuse_dequantization_matmul.mlir
index 3d0d010..fd8d196 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/fuse_dequantization_matmul.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/fuse_dequantization_matmul.mlir
@@ -7,11 +7,11 @@
     %1 = tensor.empty() : tensor<11008x32x128xf32>
     %2 = linalg.fill ins(%cst : f32) outs(%0 : tensor<11008xf32>) -> tensor<11008xf32>
     %3 = linalg.generic {
-        indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, 
-                         affine_map<(d0, d1, d2) -> (d0, d1)>, 
-                         affine_map<(d0, d1, d2) -> (d0, d1)>, 
-                         affine_map<(d0, d1, d2) -> (d0, d1, d2)>], 
-        iterator_types = ["parallel", "parallel", "parallel"]} 
+        indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                         affine_map<(d0, d1, d2) -> (d0, d1)>,
+                         affine_map<(d0, d1, d2) -> (d0, d1)>,
+                         affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
+        iterator_types = ["parallel", "parallel", "parallel"]}
         ins(%arg0, %arg2, %arg3 : tensor<11008x32x128xi4>, tensor<11008x32xf32>, tensor<11008x32xf32>) outs(%1 : tensor<11008x32x128xf32>) {
     ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
       %5 = arith.extui %in : i4 to i32
@@ -21,10 +21,10 @@
       linalg.yield %8 : f32
     } -> tensor<11008x32x128xf32>
     %4 = linalg.generic {
-        indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, 
-                         affine_map<(d0, d1, d2) -> (d0, d1, d2)>, 
-                         affine_map<(d0, d1, d2) -> (d0)>], 
-        iterator_types = ["parallel", "reduction", "reduction"]} 
+        indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>,
+                         affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                         affine_map<(d0, d1, d2) -> (d0)>],
+        iterator_types = ["parallel", "reduction", "reduction"]}
         ins(%arg1, %3 : tensor<32x128xf32>, tensor<11008x32x128xf32>) outs(%2 : tensor<11008xf32>) {
     ^bb0(%in: f32, %in_0: f32, %out: f32):
       %5 = arith.mulf %in, %in_0 : f32
diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/lift_generic_to_transpose_batch_matmul.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/lift_generic_to_transpose_batch_matmul.mlir
index e8eb5ee..4854bdd 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/lift_generic_to_transpose_batch_matmul.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/lift_generic_to_transpose_batch_matmul.mlir
@@ -5,11 +5,11 @@
     %c0_i32 = arith.constant 0 : i32
     %0 = tensor.empty() : tensor<11008x32xi32>
     %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<11008x32xi32>) -> tensor<11008x32xi32>
-    %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, 
-                                          affine_map<(d0, d1, d2) -> (d0, d1, d2)>, 
-                                          affine_map<(d0, d1, d2) -> (d0, d1)>], 
-                         iterator_types = ["parallel", "parallel", "reduction"]} 
-                         ins(%arg0, %arg1 : tensor<32x128xi16>, tensor<11008x32x128xi4>) 
+    %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>,
+                                          affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                                          affine_map<(d0, d1, d2) -> (d0, d1)>],
+                         iterator_types = ["parallel", "parallel", "reduction"]}
+                         ins(%arg0, %arg1 : tensor<32x128xi16>, tensor<11008x32x128xi4>)
                          outs(%1 : tensor<11008x32xi32>) {
     ^bb0(%in: i16, %in_0: i4, %out: i32):
       %3 = arith.extsi %in : i16 to i32
@@ -52,11 +52,11 @@
     %c0_i32 = arith.constant 0 : i32
     %0 = tensor.empty() : tensor<11008x32xi32>
     %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<11008x32xi32>) -> tensor<11008x32xi32>
-    %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, 
-                                          affine_map<(d0, d1, d2) -> (d2, d1)>, 
-                                          affine_map<(d0, d1, d2) -> (d0, d1)>], 
-                         iterator_types = ["parallel", "parallel", "reduction"]} 
-                         ins(%arg0, %arg1 : tensor<11008x32x128xi4>, tensor<128x32xi16>) 
+    %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                                          affine_map<(d0, d1, d2) -> (d2, d1)>,
+                                          affine_map<(d0, d1, d2) -> (d0, d1)>],
+                         iterator_types = ["parallel", "parallel", "reduction"]}
+                         ins(%arg0, %arg1 : tensor<11008x32x128xi4>, tensor<128x32xi16>)
                          outs(%1 : tensor<11008x32xi32>) {
     ^bb0(%in: i4, %in_0: i16, %out: i32):
       %3 = arith.extui %in : i4 to i32
@@ -101,11 +101,11 @@
     %c0_i32 = arith.constant 0 : i32
     %0 = tensor.empty() : tensor<11008x32x8xi32>
     %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<11008x32x8xi32>) -> tensor<11008x32x8xi32>
-    %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d1, d2)>, 
-                                          affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, 
-                                          affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>], 
-                         iterator_types = ["parallel", "parallel", "reduction", "parallel"]} 
-                         ins(%arg0, %arg1 : tensor<8x32x128xi16>, tensor<11008x32x128xi4>) 
+    %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d1, d2)>,
+                                          affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>,
+                                          affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>],
+                         iterator_types = ["parallel", "parallel", "reduction", "parallel"]}
+                         ins(%arg0, %arg1 : tensor<8x32x128xi16>, tensor<11008x32x128xi4>)
                          outs(%1 : tensor<11008x32x8xi32>) {
     ^bb0(%in: i16, %in_0: i4, %out: i32):
       %3 = arith.extsi %in : i16 to i32
@@ -151,11 +151,11 @@
     %dim = tensor.dim %arg0, %c1 : tensor<8x?x128xi16>
     %0 = tensor.empty(%dim) : tensor<11008x?x8xi32>
     %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<11008x?x8xi32>) -> tensor<11008x?x8xi32>
-    %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d1, d2)>, 
-                                          affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, 
-                                          affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>], 
-                         iterator_types = ["parallel", "parallel", "reduction", "parallel"]} 
-                         ins(%arg0, %arg1 : tensor<8x?x128xi16>, tensor<11008x?x128xi4>) 
+    %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d1, d2)>,
+                                          affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>,
+                                          affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>],
+                         iterator_types = ["parallel", "parallel", "reduction", "parallel"]}
+                         ins(%arg0, %arg1 : tensor<8x?x128xi16>, tensor<11008x?x128xi4>)
                          outs(%1 : tensor<11008x?x8xi32>) {
     ^bb0(%in: i16, %in_0: i4, %out: i32):
       %3 = arith.extsi %in : i16 to i32
diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/remove_zero_extent_tensors.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/remove_zero_extent_tensors.mlir
index dffb238..289f372 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/remove_zero_extent_tensors.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/remove_zero_extent_tensors.mlir
@@ -24,7 +24,7 @@
   return %1 : tensor<?x?xf32>
 }
 // CHECK: func @zero_sized_tensor_insert(%[[ARG0:.+]]: tensor<?x?xf32>
-// CHECK:   return %[[ARG0]] 
+// CHECK:   return %[[ARG0]]
 
 // -----
 
diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/set_encoding.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/set_encoding.mlir
index 2deef49..aac9f7f 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/set_encoding.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/set_encoding.mlir
@@ -676,10 +676,10 @@
       %arg2 : tensor<64x100x500xi32>) -> tensor<64x100x500xi32> {
   %0 = tensor.empty() : tensor<64x250x500xi32>
   %casted0 = arith.extui %arg0 : tensor<64x100x250xi8> to tensor<64x100x250xi32>
-  %casted1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, 
-                                              affine_map<(d0, d1, d2) -> (d0, d1, d2)>], 
-                              iterator_types = ["parallel", "parallel", "parallel"]} 
-                              ins(%arg1 : tensor<64x250x500xi8>) 
+  %casted1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                                              affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
+                              iterator_types = ["parallel", "parallel", "parallel"]}
+                              ins(%arg1 : tensor<64x250x500xi8>)
                               outs(%0 : tensor<64x250x500xi32>) {
   ^bb0(%in: i8, %out: i32):
       %2 = arith.extsi %in : i8 to i32
@@ -845,7 +845,7 @@
 }
 
 //      CHECK: func @batch_matmul_f32f32f32_narrow_MN(
-//      CHECK:   iree_linalg_ext.upper_bound_tile_size tensor<64x4x250xf32, #iree_linalg_ext.encoding<user =  BATCH_MATMUL, role =  LHS, element_types = [f32, f32, f32], matmul_narrow_M = 4 : index, matmul_narrow_N = 2 : index>> 
+//      CHECK:   iree_linalg_ext.upper_bound_tile_size tensor<64x4x250xf32, #iree_linalg_ext.encoding<user =  BATCH_MATMUL, role =  LHS, element_types = [f32, f32, f32], matmul_narrow_M = 4 : index, matmul_narrow_N = 2 : index>>
 //      CHECK:   iree_linalg_ext.upper_bound_tile_size tensor<64x250x2xf32, #iree_linalg_ext.encoding<user =  BATCH_MATMUL, role =  RHS, element_types = [f32, f32, f32], matmul_narrow_M = 4 : index, matmul_narrow_N = 2 : index>>
-//      CHECK:   iree_linalg_ext.upper_bound_tile_size tensor<64x4x2xf32, #iree_linalg_ext.encoding<user =  BATCH_MATMUL, role =  RESULT, element_types = [f32, f32, f32], matmul_narrow_M = 4 : index, matmul_narrow_N = 2 : index>> 
+//      CHECK:   iree_linalg_ext.upper_bound_tile_size tensor<64x4x2xf32, #iree_linalg_ext.encoding<user =  BATCH_MATMUL, role =  RESULT, element_types = [f32, f32, f32], matmul_narrow_M = 4 : index, matmul_narrow_N = 2 : index>>
 //      CHECK:   linalg.batch_matmul
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/convert_to_loops.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/convert_to_loops.mlir
index 5bc4e3a..43d9ad6 100644
--- a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/convert_to_loops.mlir
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/convert_to_loops.mlir
@@ -1417,7 +1417,7 @@
 
 func.func @unpack(%arg0: memref<1x4x6x6x2xf32>, %arg1: memref<1x6x6x8xf32>) {
   iree_linalg_ext.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] into %arg1 : (memref<1x4x6x6x2xf32> memref<1x6x6x8xf32>)
-  return 
+  return
 }
 
 // CHECK-DAG: #[[MAP:.+]] = affine_map<(d0) -> (d0 floordiv 2)>
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/tile_and_decompose_attention.mlir b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/tile_and_decompose_attention.mlir
index 322f8ab..4691721 100644
--- a/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/tile_and_decompose_attention.mlir
+++ b/llvm-external-projects/iree-dialects/test/Dialect/iree_linalg_ext/tile_and_decompose_attention.mlir
@@ -170,8 +170,8 @@
 // TILING-SAME:       [1, 1, 1] : tensor<?x?x?xf32> to tensor<?x?xf32>
 // TILING:          %[[EXTRACTED_SLICE_4:.+]] = tensor.extract_slice %[[QUERY]][0, 0, 0] [1, %[[DIM]], %[[DIM_0]]] [1, 1,
 // TILING-SAME:       1] : tensor<?x?x?xf32> to tensor<?x?xf32>
-// TILING:          %[[TILED_ATTENTION]]:3 = iree_linalg_ext.attention ins(%[[EXTRACTED_SLICE_4]], %[[EXTRACTED_SLICE]], %[[EXTRACTED_SLICE_3]] : 
-// TILING-SAME:                      outs(%[[ARG7]], %[[ARG8]], %[[ARG9]] : 
+// TILING:          %[[TILED_ATTENTION]]:3 = iree_linalg_ext.attention ins(%[[EXTRACTED_SLICE_4]], %[[EXTRACTED_SLICE]], %[[EXTRACTED_SLICE_3]] :
+// TILING-SAME:                      outs(%[[ARG7]], %[[ARG8]], %[[ARG9]] :
 // TILING-SAME:                      -> tensor<?x?xf32>, tensor<?xf32>, tensor<?xf32>
 // TILING:          scf.yield %[[TILED_ATTENTION]]#0, %[[TILED_ATTENTION]]#1, %[[TILED_ATTENTION]]#2 : tensor<?x?xf32>, tensor<?xf32>, tensor<?xf32>
 // TILING:        }
diff --git a/tests/e2e/stablehlo_ops/philox.mlir b/tests/e2e/stablehlo_ops/philox.mlir
index 119d8b7..24e5d4c 100644
--- a/tests/e2e/stablehlo_ops/philox.mlir
+++ b/tests/e2e/stablehlo_ops/philox.mlir
@@ -1,5 +1,5 @@
 func.func @philox_i32() {
-  %inp = util.unfoldable_constant dense<[1, 2, 3, 4]> : tensor<4xi32> 
+  %inp = util.unfoldable_constant dense<[1, 2, 3, 4]> : tensor<4xi32>
   %0:2 = "stablehlo.rng_bit_generator"(%inp) {rng_algorithm = #stablehlo<rng_algorithm PHILOX>} : (tensor<4xi32>) -> (tensor<4xi32>, tensor<8xi32>)
   check.expect_eq_const(%0#1, dense<[-1788415499, 854201270, -855525523, 2043148971, 110723240, 146396481, -1258660138, -1968502964]> : tensor<8xi32>) : tensor<8xi32>
   return
diff --git a/tests/e2e/stablehlo_ops/three_fry.mlir b/tests/e2e/stablehlo_ops/three_fry.mlir
index b06aef7..789476f 100644
--- a/tests/e2e/stablehlo_ops/three_fry.mlir
+++ b/tests/e2e/stablehlo_ops/three_fry.mlir
@@ -1,5 +1,5 @@
 func.func @three_fry_i32() {
-  %inp = util.unfoldable_constant dense<[1, 2, 3, 4]> : tensor<4xi32> 
+  %inp = util.unfoldable_constant dense<[1, 2, 3, 4]> : tensor<4xi32>
   %0:2 = "stablehlo.rng_bit_generator"(%inp) {rng_algorithm = #stablehlo<rng_algorithm THREE_FRY>} : (tensor<4xi32>) -> (tensor<4xi32>, tensor<8xi32>)
   check.expect_eq_const(%0#1, dense<[-1997982863, -261361928, -1008514867, 1226850200, 1419974734, -277475325, 1033030661, -1926332264]> : tensor<8xi32>) : tensor<8xi32>
   return
diff --git a/tests/transform_dialect/cpu/attention_codegen_spec.mlir b/tests/transform_dialect/cpu/attention_codegen_spec.mlir
index 73ffab9..c011025 100644
--- a/tests/transform_dialect/cpu/attention_codegen_spec.mlir
+++ b/tests/transform_dialect/cpu/attention_codegen_spec.mlir
@@ -65,9 +65,9 @@
     transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> ()
     transform.yield
   } // codegen
-  
+
   // Find `hal.executable.variant`.
-  transform.named_sequence @match_variant_for_codegen(%root: !transform.any_op {transform.readonly}) 
+  transform.named_sequence @match_variant_for_codegen(%root: !transform.any_op {transform.readonly})
     -> !transform.any_op {
     transform.match.operation_name %root ["hal.executable.variant"] : !transform.any_op
     transform.yield %root : !transform.any_op
@@ -78,7 +78,7 @@
     transform.foreach_match in %root
         @match_variant_for_codegen -> @codegen
       : (!transform.any_op) -> (!transform.any_op)
-    transform.yield 
+    transform.yield
   }
 } // module
 
diff --git a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
index f8ab046..622db6a 100644
--- a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
+++ b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
@@ -59,6 +59,6 @@
         matmul_packed_sizes = [8, 16, 32]
         matmul_inner_dims_order = [0, 1, 2]
       : (!transform.any_op) -> !transform.op<"linalg.generic">
-    transform.yield 
+    transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cpu/contraction-packing.mlir b/tests/transform_dialect/cpu/contraction-packing.mlir
index d103303..d8105de 100644
--- a/tests/transform_dialect/cpu/contraction-packing.mlir
+++ b/tests/transform_dialect/cpu/contraction-packing.mlir
@@ -26,9 +26,9 @@
   //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [16, 32]
   //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16]
   //      CHECK: linalg.generic
-  // CHECK-SAME:   indexing_maps = [#[[$map_lhs]], #[[$map_rhs]], #[[$map_res]]] 
+  // CHECK-SAME:   indexing_maps = [#[[$map_lhs]], #[[$map_rhs]], #[[$map_res]]]
   // CHECK-SAME:   iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]}
-  // CHECK-SAME:   ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>) 
+  // CHECK-SAME:   ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>)
   // CHECK-SAME:  outs(%{{.*}} : tensor<155x56x8x16xf32>)
   //      CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16]
   %0 = linalg.matmul
@@ -54,9 +54,9 @@
   //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [16, 32]
   //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16]
   //      CHECK: linalg.generic
-  // CHECK-SAME:   indexing_maps = [#[[$map_tlhs]], #[[$map_rhs]], #[[$map_res]]] 
+  // CHECK-SAME:   indexing_maps = [#[[$map_tlhs]], #[[$map_rhs]], #[[$map_res]]]
   // CHECK-SAME:   iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]}
-  // CHECK-SAME:   ins(%{{.*}} : tensor<18x155x8x32xf32>, tensor<18x56x16x32xf32>) 
+  // CHECK-SAME:   ins(%{{.*}} : tensor<18x155x8x32xf32>, tensor<18x56x16x32xf32>)
   // CHECK-SAME:  outs(%{{.*}} : tensor<155x56x8x16xf32>)
   //      CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16]
   %0 = linalg.generic #matmul_tnn_trait
@@ -87,9 +87,9 @@
   //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 32]
   //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16]
   //      CHECK: linalg.generic
-  // CHECK-SAME:   indexing_maps = [#[[$map_lhs]], #[[$map_trhs]], #[[$map_res]]] 
+  // CHECK-SAME:   indexing_maps = [#[[$map_lhs]], #[[$map_trhs]], #[[$map_res]]]
   // CHECK-SAME:   iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]}
-  // CHECK-SAME:   ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<56x18x16x32xf32>) 
+  // CHECK-SAME:   ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<56x18x16x32xf32>)
   // CHECK-SAME:  outs(%{{.*}} : tensor<155x56x8x16xf32>)
   //      CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16]
   %0 = linalg.generic #matmul_ntn_trait
@@ -120,9 +120,9 @@
   //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [16, 32]
   //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 16]
   //      CHECK: linalg.generic
-  // CHECK-SAME:   indexing_maps = [#[[$map_lhs]], #[[$map_rhs]], #[[$map_tres]]] 
+  // CHECK-SAME:   indexing_maps = [#[[$map_lhs]], #[[$map_rhs]], #[[$map_tres]]]
   // CHECK-SAME:   iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]}
-  // CHECK-SAME:   ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>) 
+  // CHECK-SAME:   ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>)
   // CHECK-SAME:  outs(%{{.*}} : tensor<56x155x8x16xf32>)
   //      CHECK: tensor.unpack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 16]
   %0 = linalg.generic #matmul_nnt_trait
@@ -140,15 +140,15 @@
   transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
     %matmul = transform.structured.match interface{LinalgOp} in %module_op
       : (!transform.any_op) -> (!transform.any_op)
-    
-    // Generalized packing rewrite extracts a gemm from any linalg op that contains 
+
+    // Generalized packing rewrite extracts a gemm from any linalg op that contains
     // one. This acts as a powerful normalization step: after this point, we have a
     // gemm (i.e. 3-D contraction with (m,n,k)=(8,16,32) ) on the 3 most minor
     // dimensions.
     transform.structured.pack_greedily %matmul
         matmul_packed_sizes = [8, 16, 32] matmul_inner_dims_order = [0, 1, 2]
       : (!transform.any_op) -> !transform.op<"linalg.generic">
-    transform.yield 
+    transform.yield
   }
 } // module
 
diff --git a/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir b/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir
index 18c675a..2fb784a 100644
--- a/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir
+++ b/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir
@@ -105,6 +105,6 @@
     transform.apply_patterns to %func_op {
       transform.apply_patterns.iree.fold_tensor_slice_into_transfer
     } : !transform.op<"func.func">
-    transform.yield 
+    transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
index df5231d..34eaa2f 100644
--- a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
+++ b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
@@ -22,6 +22,6 @@
     // =========================================================
     %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
     transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-    transform.yield 
+    transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cpu/transform_library.mlir b/tests/transform_dialect/cpu/transform_library.mlir
index b390561..3754c47 100644
--- a/tests/transform_dialect/cpu/transform_library.mlir
+++ b/tests/transform_dialect/cpu/transform_library.mlir
@@ -1,18 +1,18 @@
 module attributes { transform.with_named_sequence } {
-  transform.named_sequence @custom_matmul(%variant_op: !transform.any_op {transform.consumed}) {  
+  transform.named_sequence @custom_matmul(%variant_op: !transform.any_op {transform.consumed}) {
     %0 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-  
+
     %tiled_generic, %forall =
-      transform.structured.tile_using_forall %0 num_threads [2] 
+      transform.structured.tile_using_forall %0 num_threads [2]
       // TODO: IREE needs own workgroup mapping attribute.
       ( mapping = [#gpu.block<x>] )
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
       transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
         : (!transform.any_op) -> ()
-  
+
     // Canonicalization/CSE is needed before bufferization otherwise unnecessary
     // allocs will be created.
-    %func_op = transform.structured.match ops{["func.func"]} in %variant_op 
+    %func_op = transform.structured.match ops{["func.func"]} in %variant_op
         : (!transform.any_op) -> !transform.any_op
     transform.apply_patterns to %func_op {
       transform.apply_patterns.iree.fold_fill_into_pad
@@ -22,10 +22,10 @@
     } : !transform.any_op
     transform.iree.apply_cse %func_op : !transform.any_op
     %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
-    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
       : (!transform.any_op) -> !transform.any_op
     transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-  
+
     // CSE is needed on the workgroup_count region to pass this particular test.
     transform.iree.apply_cse %variant_op_3 : !transform.any_op
     %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
index 9982106..a14398d 100644
--- a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
@@ -68,6 +68,6 @@
     %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
     %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
 
-    transform.yield 
+    transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/mma.mlir b/tests/transform_dialect/cuda/mma.mlir
index 2093e1c..42aed57 100644
--- a/tests/transform_dialect/cuda/mma.mlir
+++ b/tests/transform_dialect/cuda/mma.mlir
@@ -37,7 +37,7 @@
     } : !transform.any_op
     transform.iree.vector.vector_to_mma_conversion %func { use_wmma } : (!transform.any_op) -> ()
 
-    // Apply canonicalization post-hoc to trigger DCE and pass the test 
+    // Apply canonicalization post-hoc to trigger DCE and pass the test
     // (i.e. all vector.contract are dead).
     // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
     transform.apply_patterns to %func {
@@ -85,7 +85,7 @@
     } : !transform.any_op
     transform.iree.vector.vector_to_mma_conversion %func { use_mma_sync } : (!transform.any_op) -> ()
 
-    // Apply canonicalization post-hoc to trigger DCE and pass the test 
+    // Apply canonicalization post-hoc to trigger DCE and pass the test
     // (i.e. all vector.contract are dead).
     // TODO: consider having the vector_to_mma_conversion do the DCE automatically.
     transform.apply_patterns to %func {
diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
index cea3833..9c1c8bf 100644
--- a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
@@ -19,7 +19,7 @@
     // Promote operands in order to test loading from shared memory.
     %matmul_2 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     %promoted_matmul, %alloc_0, %alloc_1 =
-      transform.iree.promote_operands %matmul_2 [0, 1] 
+      transform.iree.promote_operands %matmul_2 [0, 1]
         : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
 
@@ -70,6 +70,6 @@
     // ===========================================================================
     %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
     %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
-    transform.yield 
+    transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/reduction.mlir b/tests/transform_dialect/cuda/reduction.mlir
index 500eba6..2642b03 100644
--- a/tests/transform_dialect/cuda/reduction.mlir
+++ b/tests/transform_dialect/cuda/reduction.mlir
@@ -72,6 +72,6 @@
   //         CHECK:   vector.reduction <add>
   //         CHECK:   vector.transfer_write
   //         CHECK: gpu.barrier
-  
+
 //      EXEC: result[0]: hal.buffer_view
 // EXEC-NEXT: 8xf32=64 64 64 64 64 64 64 64
diff --git a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
index 0ab6d6a..13dc3f2 100644
--- a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
@@ -26,16 +26,16 @@
 
     // Step 3. Second level of tiling + fusion parallelizes to threads.
     // ===========================================================================
-    %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op 
+    %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
       : (!transform.any_op) -> !transform.any_op
     %block_combiner_op, %forall_block_combiner_op =
-      transform.structured.tile_using_forall %grid_combiner_op tile_sizes [1] 
+      transform.structured.tile_using_forall %grid_combiner_op tile_sizes [1]
       ( mapping = [#gpu.thread<z>] )
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.structured.fuse_into_containing_op %fill_1d into %forall_block_combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     // Canonicalizations.
-    %func_op = transform.structured.match ops{["func.func"]} in %variant_op 
+    %func_op = transform.structured.match ops{["func.func"]} in %variant_op
         : (!transform.any_op) -> !transform.any_op
     transform.apply_patterns to %func_op {
       transform.apply_patterns.iree.fold_fill_into_pad
@@ -45,20 +45,20 @@
     transform.iree.apply_licm %func_op : !transform.any_op
     transform.iree.apply_cse %func_op : !transform.any_op
 
-    %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op 
+    %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
       : (!transform.any_op) -> !transform.any_op
     %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
-      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op 
+      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
         : (!transform.any_op) -> !transform.any_op
     %block_more_parallel_op, %forall_block_more_parallel_op =
-      transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] 
+      transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1]
       ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
 
     // Step 4. Rank-reduce and vectorize.
     // ===========================================================================
-    %func = transform.structured.match ops{["func.func"]} in %variant_op 
+    %func = transform.structured.match ops{["func.func"]} in %variant_op
       : (!transform.any_op) -> !transform.any_op
     transform.apply_patterns to %func {
       transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
@@ -74,12 +74,12 @@
     } : !transform.any_op
     transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
     %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
-    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 
+    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
       : (!transform.any_op) -> !transform.any_op
 
     // Step 6. Post-bufferization mapping to blocks and threads.
     // ===========================================================================
-    %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 
+    %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3
       : (!transform.any_op) -> !transform.any_op
     transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> ()
     transform.iree.map_nested_forall_to_gpu_threads %func_5
@@ -93,7 +93,7 @@
       transform.apply_patterns.memref.fold_memref_alias_ops
       transform.apply_patterns.vector.cast_away_vector_leading_one_dim
     } : !transform.any_op
-    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 
+    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
       : (!transform.any_op) -> !transform.any_op
     // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
     // at this point.
@@ -115,6 +115,6 @@
     transform.iree.apply_licm %func_op_3 : !transform.any_op
     transform.iree.apply_cse %func_op_3 : !transform.any_op
 
-    transform.yield 
+    transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
index fb4bcd6..b036039 100644
--- a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
@@ -154,7 +154,7 @@
     transform.iree.apply_licm %func_op_3 : !transform.any_op
     transform.iree.apply_cse %func_op_3 : !transform.any_op
 
-    transform.yield 
+    transform.yield
   }
 } // module
 
diff --git a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
index b1479a0..956ff0e 100644
--- a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
@@ -18,7 +18,7 @@
 
     // Step 2. Split the reduction to get meatier parallelism.
     // ===========================================================================
-    %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2, %forall = 
+    %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2, %forall =
       transform.structured.tile_reduction_using_for %grid_reduction by tile_sizes = [0, 128]
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     %_1:2 =
@@ -36,7 +36,7 @@
     // 2nd op is [parallel, reduction] of 1x128, map the 1-dim to threadIdx.y to
     // trigger mapping of the reduction to threadIdx.x via predication via `if (x==0)`.
     %_3:2 =
-      transform.structured.tile_using_forall %block_combiner_op_2 tile_sizes [1] 
+      transform.structured.tile_using_forall %block_combiner_op_2 tile_sizes [1]
       ( mapping = [#gpu.thread<y>] )
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
@@ -87,7 +87,7 @@
       transform.apply_patterns.memref.fold_memref_alias_ops
       transform.apply_patterns.vector.cast_away_vector_leading_one_dim
     } : !transform.any_op
-    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 
+    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
       : (!transform.any_op) -> !transform.any_op
     %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
     transform.iree.vector.warp_distribute %func_7
@@ -102,7 +102,7 @@
     } : !transform.any_op
     transform.iree.apply_licm %func_7 : !transform.any_op
     transform.iree.apply_cse %func_7 : !transform.any_op
-  
-    transform.yield 
+
+    transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
index 345be1f..a255f23 100644
--- a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
@@ -109,6 +109,6 @@
     %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
     transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
 
-    transform.yield 
+    transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
index 7c1564f..d62558f 100644
--- a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
@@ -92,6 +92,6 @@
       : (!transform.any_op) -> !transform.any_op
     transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
 
-    transform.yield 
+    transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
index 67e3cb3..aefcd43 100644
--- a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
@@ -137,7 +137,7 @@
     } : !transform.any_op
     transform.iree.apply_licm %func_op_3 : !transform.any_op
     transform.iree.apply_cse %func_op_3 : !transform.any_op
-    
-    transform.yield 
+
+    transform.yield
   }
 } // module