Use better Linalg transform op builders (#11551)

This PR cherry-picks the following LLVM commit:
  f27514800cc50677d640deae555bf999653a4c6f

@hanhan FYI for the next LLVM integrate.
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformDialectStrategiesGPU.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformDialectStrategiesGPU.cpp
index 50c1e32..ff5bb28 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformDialectStrategiesGPU.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformDialectStrategiesGPU.cpp
@@ -165,8 +165,7 @@
   // Split the reduction into a parallel and combiner part, then tile the
   // parallel part and map it to a full warp so it works on vectors.
   auto tileReduction = b.create<transform::TileReductionUsingScfOp>(
-      pdlOperation, pdlOperation, pdlOperation, pdlOperation, gridReductionH,
-      b.getI64ArrayAttr({0, firstReductionSize}));
+      gridReductionH, ArrayRef<int64_t>({0, firstReductionSize}));
   Value blockParallelFillH = tileReduction.getFillOp();
   Value blockParallelOpH = tileReduction.getSplitLinalgOp();
   Value blockCombinerOpH = tileReduction.getCombiningLinalgOp();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy.mlir
index e25a9d5..0f3ab31 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy.mlir
@@ -35,7 +35,7 @@
 //         CHECK:   transform.iree.tile_to_foreach_thread_and_workgroup_count_region {{.*}} tile_sizes [1](mapping = [#gpu.block<x>])
 // CHECK-COUNT-3:   transform.structured.fuse_into_containing_op
 //         CHECK:   transform.iree.take_first
-//         CHECK:   transform.structured.tile_reduction_using_scf %{{.*}} {tile_sizes = [0, 64]}
+//         CHECK:   transform.structured.tile_reduction_using_scf %{{.*}} by tile_sizes = [0, 64]
 //         CHECK:   transform.structured.tile_to_foreach_thread_op %{{.*}} num_threads [0, 32]
 //    CHECK-SAME:      (mapping = [#gpu.thread<x>])
 //         CHECK:   transform.structured.tile_to_foreach_thread_op %{{.*}} tile_sizes [0, 2](mapping = [#gpu.thread<x>])
@@ -95,7 +95,7 @@
 
 //   CHECK-LABEL: func.func @group_reduction_128
 //         CHECK:   transform.structured.canonicalized_sequence failures(propagate)
-//         CHECK:   transform.structured.tile_reduction_using_scf %{{.*}} {tile_sizes = [0, 128]}
+//         CHECK:   transform.structured.tile_reduction_using_scf %{{.*}} by tile_sizes = [0, 128]
 //         CHECK:   transform.structured.tile_to_foreach_thread_op %{{.*}} num_threads [0, 32]
 //    CHECK-SAME:      (mapping = [#gpu.thread<x>])
 //         CHECK:   transform.structured.tile_to_foreach_thread_op %{{.*}} tile_sizes [0, 4](mapping = [#gpu.thread<x>])
@@ -136,7 +136,7 @@
 
 //   CHECK-LABEL: func.func @group_reduction_32
 //         CHECK:   transform.structured.canonicalized_sequence failures(propagate)
-//         CHECK:   transform.structured.tile_reduction_using_scf %{{.*}} {tile_sizes = [0, 32]}
+//         CHECK:   transform.structured.tile_reduction_using_scf %{{.*}} by tile_sizes = [0, 32]
 //         CHECK:   transform.structured.tile_to_foreach_thread_op %{{.*}} num_threads [0, 32]
 //    CHECK-SAME:      (mapping = [#gpu.thread<x>])
 //         CHECK:   transform.structured.tile_to_foreach_thread_op %{{.*}} tile_sizes [0, 1](mapping = [#gpu.thread<x>])
diff --git a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
index eb0a481..5d171d0 100644
--- a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
@@ -15,7 +15,7 @@
   // Step 2. Split the reduction to get meatier parallelism.
   // ===========================================================================
   %foreach_thread, %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2 = 
-    transform.structured.tile_reduction_using_scf %grid_reduction { tile_sizes = [0, 128] }
+    transform.structured.tile_reduction_using_scf %grid_reduction by tile_sizes = [0, 128]
   %_1:2 =
     transform.structured.tile_to_foreach_thread_op %block_more_parallel_op_2 num_threads [0, 32] 
     ( mapping = [#gpu.thread<x>] )
diff --git a/tests/transform_dialect/cuda/reduction_v3_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v3_codegen_spec.mlir
index 93107bb..31845ec 100644
--- a/tests/transform_dialect/cuda/reduction_v3_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/reduction_v3_codegen_spec.mlir
@@ -17,10 +17,12 @@
   // ===========================================================================
   %foreach_thread, %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2 = 
      transform.structured.tile_reduction_using_foreach_thread %grid_reduction 
-       { num_threads = [0, 1024], tile_sizes = [0, 1], mapping = [#gpu.thread<x>] }
+        by num_threads = [0, 1024], tile_sizes = [0, 1], mapping = [#gpu.thread<x>]
+
   // Fuse the fill and pointwise to privatize them. 
   transform.structured.fuse_into_containing_op %block_more_parallel_fill_op_2
     into %foreach_thread
+
   // block_combiner_op_2 op is [parallel, reduction] of 1x384 that cannot fuse.
   // map the 1-dim to threadIdx.y to trigger mapping of the reduction to 
   // threadIdx.x via predication via `if (x==0)`.
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 1f2fc6b..f1757ad 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 1f2fc6b557655caf802dac906768a5c8880a731c
+Subproject commit f1757ad433874b7c9f0cd755e90e221ddb5bde48