Revert enabling Vulkan ASan tests, with cleanup. (#8550)
Attempt 2 at https://github.com/google/iree/pull/8545, with fewer forward-looking changes :/
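
Most of the churn in the diff below is mechanical fallout from the commit bump: upstream linalg.fill now takes explicit ins/outs operand lists instead of the old parenthesized form. A minimal before/after sketch (SSA names are illustrative, taken from the tests touched here):

  // Old form, removed upstream:
  %fill = linalg.fill(%cst, %init) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
  // New form used throughout this patch:
  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>

The FileCheck patterns follow suit (matching linalg.fill on one line and the outs operand via CHECK-SAME), and TypePropagationPass switches from the removed FillOp adaptor accessors value()/output() to adaptor.inputs().front() and adaptor.outputs().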
diff --git a/build_tools/bazel_to_cmake/bazel_to_cmake_targets.py b/build_tools/bazel_to_cmake/bazel_to_cmake_targets.py
index 63c9133..aad06d4 100644
--- a/build_tools/bazel_to_cmake/bazel_to_cmake_targets.py
+++ b/build_tools/bazel_to_cmake/bazel_to_cmake_targets.py
@@ -97,6 +97,22 @@
"MhloDialect",
"MLIRMhloUtils",
],
+ "@mlir-hlo//:hlo_legalize_shape_ops_to_standard": [
+ "tensorflow::external_mhlo_includes",
+ "MhloShapeOpsToStandard",
+ ],
+ "@mlir-hlo//:hlo_legalize_to_arithmetic": [
+ "tensorflow::external_mhlo_includes",
+ "MhloToArithmeticConversion",
+ ],
+ "@mlir-hlo//:hlo_legalize_to_lhlo": [
+ "tensorflow::external_mhlo_includes",
+ "MhloToLhloConversion",
+ ],
+ "@mlir-hlo//:hlo_legalize_to_memref": [
+ "tensorflow::external_mhlo_includes",
+ "MhloToMemrefConversion",
+ ],
"@mlir-hlo//:legalize_control_flow": [
"tensorflow::external_mhlo_includes",
"MhloToStandard",
diff --git a/integrations/tensorflow/WORKSPACE b/integrations/tensorflow/WORKSPACE
index 3cd4a16..410fd2f 100644
--- a/integrations/tensorflow/WORKSPACE
+++ b/integrations/tensorflow/WORKSPACE
@@ -7,7 +7,7 @@
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
-TENSORFLOW_COMMIT = "58b667263ba5734ba175a7640f1a1d572e52bd3d"
+TENSORFLOW_COMMIT = "fe3fd49d08db3174730123cbab2fed8bbec9cf1b"
git_repository(
name = "org_tensorflow",
diff --git a/integrations/tensorflow/build_tools/bazel/iree-tf.bazelrc b/integrations/tensorflow/build_tools/bazel/iree-tf.bazelrc
index 2c26542..e20c24d 100644
--- a/integrations/tensorflow/build_tools/bazel/iree-tf.bazelrc
+++ b/integrations/tensorflow/build_tools/bazel/iree-tf.bazelrc
@@ -13,3 +13,4 @@
# Flags specific for working around tensorflow warnings.
build:generic_clang --copt=-Wno-inconsistent-missing-override --host_copt=-Wno-inconsistent-missing-override
+build:generic_clang --copt=-Wno-c++11-narrowing --host_copt=-Wno-c++11-narrowing
diff --git a/iree/compiler/Codegen/Common/TypePropagationPass.cpp b/iree/compiler/Codegen/Common/TypePropagationPass.cpp
index 851e64c..f120f46 100644
--- a/iree/compiler/Codegen/Common/TypePropagationPass.cpp
+++ b/iree/compiler/Codegen/Common/TypePropagationPass.cpp
@@ -238,7 +238,7 @@
if (outputType == legalizedOutputType) {
return rewriter.notifyMatchFailure(fillOp, "op already legal");
}
- Value value = adaptor.value();
+ Value value = adaptor.inputs().front();
Optional<Type> legalizedElementType =
getLegalizedElementType(value.getType());
if (!legalizedElementType) {
@@ -246,8 +246,8 @@
}
Value legalizedValue = convertElementType(
rewriter, fillOp->getLoc(), legalizedElementType.getValue(), value);
- rewriter.replaceOpWithNewOp<linalg::FillOp>(fillOp, legalizedValue,
- adaptor.output());
+ rewriter.replaceOpWithNewOp<linalg::FillOp>(
+ fillOp, ValueRange{legalizedValue}, ValueRange{adaptor.outputs()});
return success();
}
};
diff --git a/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir b/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir
index 6f2493f..0907d4c 100644
--- a/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir
+++ b/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir
@@ -74,7 +74,7 @@
%lhs_tile = flow.dispatch.tensor.load %lhs, offsets = [%iv0, 0], sizes = [%tilesize_y, %k], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%m, %k} -> tensor<?x?xf32>
%rhs_tile = flow.dispatch.tensor.load %rhs, offsets = [0, %iv1], sizes = [%k, %tilesize_x], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n} -> tensor<?x?xf32>
%init_tile = linalg.init_tensor [%tilesize_y, %tilesize_x] : tensor<?x?xf32>
- %fill_tile = linalg.fill(%cst, %init_tile) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %fill_tile = linalg.fill ins(%cst : f32) outs(%init_tile : tensor<?x?xf32>) -> tensor<?x?xf32>
%matmul_tile = linalg.matmul ins(%lhs_tile, %rhs_tile : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill_tile : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %matmul_tile, %result, offsets = [%iv0, %iv1], sizes = [%tilesize_y, %tilesize_x], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>{%m, %n}
}
@@ -90,7 +90,8 @@
// CHECK-DAG: %[[LHS_TILE:.+]] = flow.dispatch.tensor.load %[[LHS]]
// CHECK-DAG: %[[RHS_TILE:.+]] = flow.dispatch.tensor.load %[[RHS]]
// CHECK-DAG: %[[RESULT_TILE:.+]] = flow.dispatch.tensor.load %[[RESULT]]
-// CHECK: %[[FILL_TILE:.+]] = linalg.fill(%{{.+}}, %[[RESULT_TILE]])
+// CHECK: %[[FILL_TILE:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[RESULT_TILE]] :
// CHECK: %[[MATMUL_TILE:.+]] = linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]] : tensor<?x?xf32>, tensor<?x?xf32>)
// CHECK-SAME: outs(%[[FILL_TILE]] : tensor<?x?xf32>)
@@ -303,7 +304,7 @@
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c1, %c32, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x32x1024xf32> -> tensor<?x?x1024xf32>
%8 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [%c1, 1024, %c32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x1024x64xf32> -> tensor<?x1024x?xf32>
%9 = linalg.init_tensor [1, 32, 32] : tensor<1x32x32xf32>
- %10 = linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<1x32x32xf32> -> tensor<1x32x32xf32>
+ %10 = linalg.fill {__internal_linalg_transform__ = "workgroup"} ins(%cst : f32) outs(%9 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>
%11 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup", is_root_op} ins(%7, %8 : tensor<?x?x1024xf32>, tensor<?x1024x?xf32>) outs(%10 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>
%12 = tensor.cast %11 : tensor<1x32x32xf32> to tensor<?x?x?xf32>
flow.dispatch.tensor.store %12, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%c1, %c32, %c32], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:4x32x64xf32>
@@ -321,7 +322,8 @@
// CHECK-DAG: %[[LHS_TILE:.+]] = flow.dispatch.tensor.load %[[LHS]]
// CHECK-DAG: %[[RHS_TILE:.+]] = flow.dispatch.tensor.load %[[RHS]]
// CHECK-DAG: %[[RESULT_TILE:.+]] = flow.dispatch.tensor.load %[[RESULT]]
-// CHECK: %[[FILL_TILE:.+]] = linalg.fill(%{{.+}}, %[[RESULT_TILE]])
+// CHECK: %[[FILL_TILE:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[RESULT_TILE]] :
// CHECK: %[[MATMUL_TILE:.+]] = linalg.batch_matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[FILL_TILE]]
@@ -497,7 +499,7 @@
%8 = tensor.extract_slice %cst_0[%arg1] [64] [1] : tensor<64xf32> to tensor<64xf32>
%9 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:6400x64xf32> -> tensor<64x64xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x64xf32> -> tensor<64x64xf32>
- %11 = linalg.fill(%cst_1, %7) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
+ %11 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0], [8, 32, 0], [0, 0, 16]]>} ins(%9, %10 : tensor<64x64xf32>, tensor<64x64xf32>) outs(%11 : tensor<64x64xf32>) -> tensor<64x64xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8, %12 : tensor<64xf32>, tensor<64x64xf32>) outs(%7 : tensor<64x64xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
@@ -522,7 +524,8 @@
// CHECK-NOT: linalg.init_tensor
// CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[OUTPUT]]
// CHECK-NOT: linalg.init_tensor
-// CHECK: linalg.fill(%{{.+}}, %[[LOAD]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[LOAD]] :
// CHECK: %[[MATMUL:.+]] = linalg.matmul
// CHECK: %[[GENERIC:.+]] = linalg.generic
// CHECK-SAME: outs(%[[MATMUL]] :
@@ -557,7 +560,7 @@
%12 = affine.min affine_map<(d0) -> (-d0 + 49, 16)>(%arg1)
%13 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [16, %12], strides = [1, 1] : !flow.dispatch.tensor<readonly:16x49xf32> -> tensor<16x?xf32>
%14 = linalg.init_tensor [%10, %12] : tensor<?x?xf32>
- %15 = linalg.fill(%cst, %14) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
%16 = linalg.matmul ins(%11, %13 : tensor<?x16xf32>, tensor<16x?xf32>) outs(%15 : tensor<?x?xf32>) -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) {
^bb0(%arg2: f32, %arg3: f32):
diff --git a/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir b/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
index 5fb20f8..b9d84ee 100644
--- a/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
+++ b/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
@@ -99,7 +99,7 @@
%lhs_tile = flow.dispatch.tensor.load %lhs, offsets = [%iv0, 0], sizes = [%tilesize_y, %k], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%m, %k} -> tensor<?x?xf32>
%rhs_tile = flow.dispatch.tensor.load %rhs, offsets = [0, %iv1], sizes = [%k, %tilesize_x], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n} -> tensor<?x?xf32>
%init_tile = linalg.init_tensor [%tilesize_y, %tilesize_x] : tensor<?x?xf32>
- %fill_tile = linalg.fill(%cst, %init_tile) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %fill_tile = linalg.fill ins(%cst : f32) outs(%init_tile : tensor<?x?xf32>) -> tensor<?x?xf32>
%matmul_tile = linalg.matmul ins(%lhs_tile, %rhs_tile : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill_tile : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %matmul_tile, %result, offsets = [%iv0, %iv1], sizes = [%tilesize_y, %tilesize_x], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>{%m, %n}
}
@@ -136,7 +136,9 @@
// CHECK-DAG: %[[LHS_TILE:.+]] = memref.subview %[[LHS]][%[[IV0]], 0] [%[[TILESIZE_Y]], %[[K]]]
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[RHS]][0, %[[IV1]]] [%[[K]], %[[TILESIZE_X]]]
// CHECK-DAG: %[[RESULT_TILE:.+]] = memref.subview %[[RESULT]][%[[IV0]], %[[IV1]]] [%[[TILESIZE_Y]], %[[TILESIZE_X]]]
-// CHECK: linalg.fill(%[[CST]], %[[RESULT_TILE]])
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[CST]] :
+// CHECK-SAME: outs(%[[RESULT_TILE]] :
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[RESULT_TILE]]
diff --git a/iree/compiler/Codegen/Common/test/linalg_bufferize.mlir b/iree/compiler/Codegen/Common/test/linalg_bufferize.mlir
index a9eba4e..5e40783 100644
--- a/iree/compiler/Codegen/Common/test/linalg_bufferize.mlir
+++ b/iree/compiler/Codegen/Common/test/linalg_bufferize.mlir
@@ -736,7 +736,7 @@
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3)>(%arg1)[%workgroup_size_x]
%12 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c2, %11], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:2x3xf32> -> tensor<2x?xf32>
%13 = linalg.init_tensor [%9, %11] : tensor<?x?xf32>
- %14 = linalg.fill(%cst, %13) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
%15 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%10, %12 : tensor<?x2xf32>, tensor<2x?xf32>) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%9, %11], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:1x3xf32>
}
@@ -754,7 +754,8 @@
// CHECK-DAG: %[[LHS_TILE:.+]] = memref.subview %[[RESHAPE_LHS]][%[[IV0]], 0]
// CHECK-DAG: %[[RESULT_TILE:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]]
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[RHS]][0, %[[IV1]]]
-// CHECK: linalg.fill(%{{.+}}, %[[RESULT_TILE]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[RESULT_TILE]] :
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[RESULT_TILE]]
@@ -920,7 +921,7 @@
%2 = linalg.init_tensor [3, 9] : tensor<3x9xi32>
%3 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%4 = tensor.extract %3[] : tensor<i32>
- %5 = linalg.fill(%4, %2) : i32, tensor<3x9xi32> -> tensor<3x9xi32>
+ %5 = linalg.fill ins(%4 : i32) outs(%2 : tensor<3x9xi32>) -> tensor<3x9xi32>
flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [3, 9], strides = [1, 1] : tensor<3x9xi32> -> !flow.dispatch.tensor<writeonly:3x9xi32>
return
}
@@ -929,7 +930,9 @@
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
// CHECK: %[[LOAD:.+]] = memref.load %[[ARG0]]
-// CHECK: linalg.fill(%[[LOAD]], %[[RET0]])
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[LOAD]] :
+// CHECK-SAME: outs(%[[RET0]] :
// -----
@@ -992,7 +995,7 @@
%10 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 5)>(%arg1)[%workgroup_size_x]
%11 = tensor.extract_slice %cst[0, %arg1] [3, %10] [1, 1] : tensor<3x5xf32> to tensor<3x?xf32>
%12 = linalg.init_tensor [%8, %10] : tensor<?x?xf32>
- %13 = linalg.fill(%cst_0, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %13 = linalg.fill ins(%cst_0 : f32) outs(%12 : tensor<?x?xf32>) -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%9, %11 : tensor<?x3xf32>, tensor<3x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %1, offsets = [%arg0, %arg1], sizes = [%8, %10], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:5x5xf32>
}
@@ -1011,7 +1014,8 @@
// CHECK-DAG: %[[LHS_SUBVIEW:.+]] = memref.subview %[[LHS]][%[[IV0]], 0]
// CHECK-DAG: %[[RHS_SUBVIEW:.+]] = memref.subview %[[RHS]][0, %[[IV1]]]
// CHECK-DAG: %[[RESULT_SUBVIEW:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]]
-// CHECK: linalg.fill(%{{.+}}, %[[RESULT_SUBVIEW]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[RESULT_SUBVIEW]] :
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_SUBVIEW]], %[[RHS_SUBVIEW]]
// CHECK-SAME: outs(%[[RESULT_SUBVIEW]]
@@ -1067,7 +1071,7 @@
%5 = tensor.extract %4[] : tensor<f32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 4, 6, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x4x6x1xf32> -> tensor<1x4x6x1xf32>
%7 = linalg.init_tensor [1, 2, 2, 1] : tensor<1x2x2x1xf32>
- %8 = linalg.fill(%5, %7) : f32, tensor<1x2x2x1xf32> -> tensor<1x2x2x1xf32>
+ %8 = linalg.fill ins(%5 : f32) outs(%7 : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
%9 = linalg.pooling_nhwc_sum {
dilations = dense<1> : vector<2xi64>,
strides = dense<[2, 3]> : vector<2xi64>
@@ -1083,7 +1087,9 @@
// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1x4x6x1xf32>
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1x2x2x1xf32>
// CHECK: %[[INIT_VAL:.+]] = memref.load %[[INIT]][] : memref<f32>
-// CHECK: linalg.fill(%[[INIT_VAL]], %[[RET0]]) : f32, memref<1x2x2x1xf32>
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[INIT_VAL]] :
+// CHECK-SAME: outs(%[[RET0]] :
// CHECK: linalg.pooling_nhwc_sum
// CHECK-SAME: dilations = dense<1> : vector<2xi64>
// CHECK-SAME: strides = dense<[2, 3]> : vector<2xi64>
@@ -1212,7 +1218,7 @@
%cst = arith.constant 0.0 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %1 = linalg.fill(%cst, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
@@ -1238,7 +1244,8 @@
// CHECK-NOT: memref.alloc
// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
-// CHECK: linalg.fill(%{{.+}}, %[[OUTPUT]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[OUTPUT]] :
// CHECK-NEXT: linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: outs(%[[OUTPUT]] : memref<1x112x112x32xf32>)
// CHECK-NEXT: linalg.generic
@@ -1262,13 +1269,13 @@
%cst0 = arith.constant 0.0 : f32
%cst1 = arith.constant 1.0 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %1 = linalg.fill(%cst0, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %1 = linalg.fill ins(%cst0 : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
outs(%1 : tensor<1x112x112x32xf32>)
-> tensor<1x112x112x32xf32>
- %3 = linalg.fill(%cst1, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %3 = linalg.fill ins(%cst1 : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%4 = linalg.generic {
indexing_maps = [
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
@@ -1289,10 +1296,12 @@
// CHECK-LABEL: func @dont_use_buffer_for_operand_when_output_tensor_used()
// CHECK: %[[ALLOC:.+]] = memref.alloc
// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
-// CHECK: linalg.fill(%{{.+}}, %[[ALLOC]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[ALLOC]] :
// CHECK-NEXT: linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: outs(%[[ALLOC]] : memref<1x112x112x32xf32>)
-// CHECK-NEXT: linalg.fill(%{{.+}}, %[[OUTPUT]])
+// CHECK-NEXT: linalg.fill
+// CHECK-SAME: outs(%[[OUTPUT]] :
// CHECK-NEXT: linalg.generic
// CHECK-SAME: ins(%[[ALLOC]], %{{.+}} : memref<1x112x112x32xf32>, memref<32xf32>)
// CHECK-SAME: outs(%[[OUTPUT]] : memref<1x112x112x32xf32>)
@@ -1365,7 +1374,7 @@
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c1, %c32, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x32x1024xf32> -> tensor<?x?x1024xf32>
%8 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [%c1, 1024, %c32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x1024x64xf32> -> tensor<?x1024x?xf32>
%9 = linalg.init_tensor [1, 32, 32] : tensor<1x32x32xf32>
- %10 = linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<1x32x32xf32> -> tensor<1x32x32xf32>
+ %10 = linalg.fill {__internal_linalg_transform__ = "workgroup"} ins(%cst : f32) outs(%9 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>
%11 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup", is_root_op} ins(%7, %8 : tensor<?x?x1024xf32>, tensor<?x1024x?xf32>) outs(%10 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>
%12 = tensor.cast %11 : tensor<1x32x32xf32> to tensor<?x?x?xf32>
flow.dispatch.tensor.store %12, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%c1, %c32, %c32], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:4x32x64xf32>
@@ -1383,7 +1392,9 @@
// CHECK: %[[LHSV:.+]] = memref.subview %[[LHS]]
// CHECK: %[[RHSV:.+]] = memref.subview %[[RHS]]
// CHECK: %[[RESULTV:.+]] = memref.subview %[[RESULT]]
-// CHECK: linalg.fill(%[[ZERO]], %[[RESULTV]])
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[ZERO]] :
+// CHECK-SAME: outs(%[[RESULTV]] :
// CHECK: linalg.batch_matmul {{.*}} ins(%[[LHSV]], %[[RHSV]] : {{.*}}) outs(%[[RESULTV]]
// -----
@@ -1621,7 +1632,7 @@
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [64, 27], strides = [1, 1] : !flow.dispatch.tensor<readonly:12544x27xf32> -> tensor<64x27xf32>
%8 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [27, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:27x16xf32> -> tensor<27x16xf32>
%9 = linalg.init_tensor [64, 16] : tensor<64x16xf32>
- %10 = linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<64x16xf32> -> tensor<64x16xf32>
+ %10 = linalg.fill {__internal_linalg_transform__ = "workgroup"} ins(%cst : f32) outs(%9 : tensor<64x16xf32>) -> tensor<64x16xf32>
%11 = tensor.pad %7 low[0, 0] high[0, 5] {
^bb0(%arg2: index, %arg3: index): // no predecessors
tensor.yield %cst : f32
@@ -1639,23 +1650,29 @@
}
// CHECK-LABEL: func @padded_matmul()
-// CHECK-DAG: %[[LHS_PADDED:.+]] = memref.alloc() : memref<64x32xf32>
-// CHECK-DAG: %[[RHS_PADDED:.+]] = memref.alloc() : memref<32x16xf32>
-// CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32
-// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x27xf32>
-// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<27x16xf32>
-// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x16xf32>
-// CHECK-DAG: %[[LHS_V:.+]] = memref.subview %[[LHS]][%{{.*}}, 0] [64, 27] [1, 1]
-// CHECK-DAG: %[[RHS_V:.+]] = memref.subview %[[RHS]][0, %{{.*}}] [27, 16] [1, 1]
-// CHECK-DAG: %[[DST_V:.+]] = memref.subview %[[DST]][%{{.*}}, %{{.*}}] [64, 16] [1, 1]
-// CHECK: linalg.fill(%[[C0]], %[[DST_V]])
-// CHECK: linalg.fill(%[[C0]], %[[LHS_PADDED]]) : f32, memref<64x32xf32>
-// CHECK: %[[LHS_PADDED_INTER:.+]] = memref.subview %[[LHS_PADDED]][0, 0] [64, 27] [1, 1]
-// CHECK: linalg.generic {{.*}} ins(%[[LHS_V]] {{.*}} outs(%[[LHS_PADDED_INTER]]
-// CHECK: linalg.fill(%[[C0]], %[[RHS_PADDED]]) : f32, memref<32x16xf32>
-// CHECK: %[[RHS_PADDED_INTER:.+]] = memref.subview %[[RHS_PADDED]][0, 0] [27, 16] [1, 1]
-// CHECK: linalg.generic {{.*}} ins(%[[RHS_V]] {{.*}} outs(%[[RHS_PADDED_INTER]]
-// CHECK: linalg.matmul ins(%[[LHS_PADDED]], %[[RHS_PADDED]] : memref<64x32xf32>, memref<32x16xf32>)
+// CHECK-DAG: %[[LHS_PADDED:.+]] = memref.alloc() : memref<64x32xf32>
+// CHECK-DAG: %[[RHS_PADDED:.+]] = memref.alloc() : memref<32x16xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x27xf32>
+// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<27x16xf32>
+// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x16xf32>
+// CHECK-DAG: %[[LHS_V:.+]] = memref.subview %[[LHS]][%{{.*}}, 0] [64, 27] [1, 1]
+// CHECK-DAG: %[[RHS_V:.+]] = memref.subview %[[RHS]][0, %{{.*}}] [27, 16] [1, 1]
+// CHECK-DAG: %[[DST_V:.+]] = memref.subview %[[DST]][%{{.*}}, %{{.*}}] [64, 16] [1, 1]
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[C0]] :
+// CHECK-SAME: outs(%[[DST_V]] :
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[C0]] :
+// CHECK-SAME: outs(%[[LHS_PADDED]] :
+// CHECK: %[[LHS_PADDED_INTER:.+]] = memref.subview %[[LHS_PADDED]][0, 0] [64, 27] [1, 1]
+// CHECK: linalg.generic {{.*}} ins(%[[LHS_V]] {{.*}} outs(%[[LHS_PADDED_INTER]]
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[C0]] :
+// CHECK-SAME: outs(%[[RHS_PADDED]] :
+// CHECK: %[[RHS_PADDED_INTER:.+]] = memref.subview %[[RHS_PADDED]][0, 0] [27, 16] [1, 1]
+// CHECK: linalg.generic {{.*}} ins(%[[RHS_V]] {{.*}} outs(%[[RHS_PADDED_INTER]]
+// CHECK: linalg.matmul ins(%[[LHS_PADDED]], %[[RHS_PADDED]] : memref<64x32xf32>, memref<32x16xf32>)
// -----
@@ -1695,7 +1712,7 @@
tensor.yield %cst : f32
} : tensor<2x?xf32> to tensor<4x4xf32>
%15 = linalg.init_tensor [4, 4] : tensor<4x4xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<4x4xf32>) -> tensor<4x4xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%13, %14 : tensor<4x4xf32>, tensor<4x4xf32>) outs(%16 : tensor<4x4xf32>) -> tensor<4x4xf32>
%18 = tensor.extract_slice %17[0, 0] [%7, %9] [1, 1] : tensor<4x4xf32> to tensor<?x?xf32>
flow.dispatch.tensor.store %18, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>{%m, %n}
@@ -1720,12 +1737,15 @@
// CHECK-DAG: %[[TILE_N:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[N]]]
// CHECK-DAG: %[[ARG0_SV:.+]] = memref.subview %[[ARG0]]
// CHECK-DAG: %[[ARG1_SV:.+]] = memref.subview %[[ARG1]]
-// CHECK: linalg.fill(%{{.*}}, %[[ALLOC_ARG0]]
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[ALLOC_ARG0]] :
// CHECK: %[[ALLOC_ARG0_SV:.+]] = memref.subview %[[ALLOC_ARG0]]
// CHECK: linalg.generic {{.*}} ins(%[[ARG0_SV]] {{.*}} outs(%[[ALLOC_ARG0_SV]]
-// CHECK: linalg.fill(%{{.*}}, %[[ALLOC_ARG1]]
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[ALLOC_ARG1]] :
// CHECK: linalg.generic {{.*}} ins(%[[ARG1_SV]]
-// CHECK: linalg.fill(%{{.*}}, %[[ALLOC_RET0]]
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[ALLOC_RET0]] :
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[ALLOC_ARG0]], %[[ALLOC_ARG1]]
// CHECK-SAME: outs(%[[ALLOC_RET0]]
@@ -1756,8 +1776,8 @@
%7 = flow.dispatch.tensor.load %0, offsets = [0, %arg0], sizes = [%d0, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xi32>{%d0, %d1} -> tensor<?x?xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg0], sizes = [%d0, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xi32>{%d0, %d1} -> tensor<?x?xi32>
%13 = linalg.init_tensor [%6] : tensor<?xi32>
- %14 = linalg.fill(%c-2147483648_i32, %13) {__internal_linalg_transform__ = "workgroup", lowering_config = {tileSizes = [[128]]}} : i32, tensor<?xi32> -> tensor<?xi32>
- %17 = linalg.fill(%c0_i32, %13) {__internal_linalg_transform__ = "workgroup", lowering_config = {tileSizes = [[128]]}} : i32, tensor<?xi32> -> tensor<?xi32>
+ %14 = linalg.fill {__internal_linalg_transform__ = "workgroup", lowering_config = {tileSizes = [[128]]}} ins(%c-2147483648_i32 : i32) outs(%13 : tensor<?xi32>) -> tensor<?xi32>
+ %17 = linalg.fill {__internal_linalg_transform__ = "workgroup", lowering_config = {tileSizes = [[128]]}} ins(%c0_i32 : i32) outs(%13 : tensor<?xi32>) -> tensor<?xi32>
%18:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%7, %9 : tensor<?x?xi32>, tensor<?x?xi32>) outs(%14, %17 : tensor<?xi32>, tensor<?xi32>) attrs = {__internal_linalg_transform__ = "workgroup", lowering_config = {tileSizes = [[128]]}} {
^bb0(%arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32): // no predecessors
%19 = arith.cmpi sge, %arg1, %arg3 : i32
@@ -1784,9 +1804,11 @@
// CHECK-DAG: %[[ARG0_SV:.+]] = memref.subview %[[ARG0]]
// CHECK-DAG: %[[ARG1_SV:.+]] = memref.subview %[[ARG1]]
// CHECK-DAG: %[[RET0_SV:.+]] = memref.subview %[[RET0]]
-// CHECK-DAG: linalg.fill(%{{.*}}, %[[RET0_SV]]
+// CHECK-DAG: linalg.fill
+// CHECK-SAME: outs(%[[RET0_SV]] :
// CHECK-DAG: %[[RET1_SV:.+]] = memref.subview %[[RET1]]
-// CHECK-DAG: linalg.fill(%{{.*}}, %[[RET1_SV]]
+// CHECK-DAG: linalg.fill
+// CHECK-SAME: outs(%[[RET1_SV]] :
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[ARG0_SV]], %[[ARG1_SV]]
// CHECK-SAME: outs(%[[RET0_SV]], %[[RET1_SV]]
@@ -2020,7 +2042,7 @@
%9 = affine.min #map2(%arg1)
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [144, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:144x370xf32> -> tensor<144x?xf32>
%11 = linalg.init_tensor [%7, %9] : tensor<?x?xf32>
- %12 = linalg.fill(%cst, %11) {__internal_linalg_transform__ = "workgroup", lowering_config = #config0} : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %12 = linalg.fill {__internal_linalg_transform__ = "workgroup", lowering_config = #config0} ins(%cst : f32) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
%13 = scf.for %arg2 = %c0 to %c250 step %c32 iter_args(%arg3 = %12) -> (tensor<?x?xf32>) {
%14 = scf.for %arg4 = %c0 to %c370 step %c32 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
%15 = scf.for %arg6 = %c0 to %c144 step %c24 iter_args(%arg7 = %arg5) -> (tensor<?x?xf32>) {
@@ -2309,7 +2331,7 @@
%13 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%9, %k], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%m, %k} -> tensor<?x?xf32>
%15 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [%k, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n} -> tensor<?x?xf32>
%16 = linalg.init_tensor [%9, %10] : tensor<?x?xf32>
- %17 = linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %17 = linalg.fill {__internal_linalg_transform__ = "workgroup"} ins(%cst : f32) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
%18 = scf.for %arg2 = %c0 to %9 step %c4 iter_args(%arg3 = %17) -> (tensor<?x?xf32>) {
%20 = scf.for %arg4 = %c0 to %10 step %c4 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
%21 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%9, %arg2)
@@ -2360,7 +2382,8 @@
// CHECK-DAG: %[[LHS_SUBVIEW1:.+]] = memref.subview %[[LHS]]
// CHECK-DAG: %[[RHS_SUBVIEW1:.+]] = memref.subview %[[RHS]]
// CHECK-DAG: %[[OUT_SUBVIEW1:.+]] = memref.subview %[[OUT]]
-// CHECK: linalg.fill(%{{.+}}, %[[OUT_SUBVIEW1]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[OUT_SUBVIEW1]] :
// CHECK: scf.for
// CHECK: scf.for
// CHECK: %[[OUT_SUBVIEW2:.+]] = memref.subview %[[OUT_SUBVIEW1]]
@@ -2410,7 +2433,7 @@
%14 = affine.min affine_map<(d0) -> (-d0 + 1, 4)>(%arg1)
%15 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [3, %14], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n} -> tensor<?x?xf32>
%16 = linalg.init_tensor [%12, %14] : tensor<?x?xf32>
- %17 = linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %17 = linalg.fill {__internal_linalg_transform__ = "workgroup"} ins(%cst : f32) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
%18 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%13, %15 : tensor<?x?xf32>, tensor<?x?xf32>)
outs(%17: tensor<?x?xf32>) -> tensor<?x?xf32>
@@ -2438,7 +2461,8 @@
// CHECK-DAG: %[[LHS_SUBVIEW1:.+]] = memref.subview %[[LHS]]
// CHECK-DAG: %[[RHS_SUBVIEW1:.+]] = memref.subview %[[RHS]]
// CHECK-DAG: %[[OUT_SUBVIEW1:.+]] = memref.subview %[[OUT]]
-// CHECK: linalg.fill(%{{.+}}, %[[OUT_SUBVIEW1]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[OUT_SUBVIEW1]] :
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_SUBVIEW1]], %[[RHS_SUBVIEW1]] :
// CHECK-SAME: outs(%[[OUT_SUBVIEW1]] :
@@ -2486,7 +2510,7 @@
%ts_2_m = affine.min #map_min(%iv2)[%c16, %ts_m]
%ts_2_n = affine.min #map_min(%iv3)[%c8, %ts_n]
%tile_init_2 = linalg.init_tensor [%ts_2_m, %ts_2_n] : tensor<?x?xf32>
- %fill_tile_2 = linalg.fill(%cst, %tile_init_2) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %fill_tile_2 = linalg.fill ins(%cst : f32) outs(%tile_init_2 : tensor<?x?xf32>) -> tensor<?x?xf32>
%lhs_tile_2 = tensor.extract_slice %lhs_tile[%iv2, 0] [%ts_2_m, %k] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%rhs_tile_2 = tensor.extract_slice %rhs_tile[0, %iv3] [%k, %ts_2_n] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%matmul_tile_2 = linalg.matmul
@@ -2528,7 +2552,8 @@
// CHECK: scf.for %[[IV2:[a-zA-Z0-9]+]] =
// CHECK: scf.for %[[IV3:[a-zA-Z0-9]+]] =
// CHECK: %[[RESULT_TILE_2:.+]] = memref.subview %[[RESULT_TILE]][%[[IV2]], %[[IV3]]]
-// CHECK: linalg.fill(%{{.+}}, %[[RESULT_TILE_2]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[RESULT_TILE_2]] :
// CHECK-DAG: %[[LHS_TILE_2:.+]] = memref.subview %[[LHS_TILE]][%[[IV2]], 0]
// CHECK-DAG: %[[RHS_TILE_2:.+]] = memref.subview %[[RHS_TILE]][0, %[[IV3]]]
// CHECK: linalg.matmul
diff --git a/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir b/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
index bb8498b..8aef8cb 100644
--- a/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
+++ b/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
@@ -266,7 +266,7 @@
%8 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [%0, %3, %2], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:?x?x?xf32>{%0, %3, %2} -> tensor<?x?x?xf32>
%9 = linalg.init_tensor [%0, %1, %2] : tensor<?x?x?xf32>
- %10 = linalg.fill(%cst, %9) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
%11 = linalg.batch_matmul {lowering_config = #config}
ins(%7, %8 : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%10 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
flow.dispatch.tensor.store %11, %6, offsets = [0, 0, 0], sizes = [%0, %1, %2], strides = [1, 1, 1]
@@ -323,7 +323,7 @@
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 512], strides = [1, 1]
: !flow.dispatch.tensor<readonly:256x512xf32> -> tensor<256x512xf32>
%5 = linalg.init_tensor [128, 512] : tensor<128x512xf32>
- %6 = linalg.fill(%cst, %5) : f32, tensor<128x512xf32> -> tensor<128x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32>
%7 = linalg.matmul {lowering_config = #config}
ins(%3, %4 : tensor<128x256xf32>, tensor<256x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1]
@@ -352,7 +352,8 @@
// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %{{.+}}, offsets = [%[[IV0]], 0], sizes = [32, 256]
// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %{{.+}}, offsets = [0, %[[IV1]]], sizes = [256, 16]
// CHECK-DAG: %[[INIT:.+]] = linalg.init_tensor [32, 16]
-// CHECK-DAG: %[[FILL:.+]] = linalg.fill(%{{.+}}, %[[INIT]])
+// CHECK-DAG: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK-DAG: %[[GEMM:.+]] = linalg.matmul
// CHECK-SAME: outs(%[[FILL]] :
// CHECK: flow.dispatch.tensor.store %[[GEMM]]
@@ -787,7 +788,7 @@
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 96], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x96xf32> -> tensor<3x3x96xf32>
%5 = linalg.init_tensor [1, 80, 80, 96] : tensor<1x80x80x96xf32>
- %6 = linalg.fill(%cst, %5) : f32, tensor<1x80x80x96xf32> -> tensor<1x80x80x96xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x80x80x96xf32>) -> tensor<1x80x80x96xf32>
%7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #config, strides = dense<2> : tensor<2xi64>}
ins(%3, %4 : tensor<1x161x161x96xf32>, tensor<3x3x96xf32>) outs(%6 : tensor<1x80x80x96xf32>) -> tensor<1x80x80x96xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 80, 80, 96], strides = [1, 1, 1, 1]
@@ -816,7 +817,8 @@
// CHECK: scf.for %[[IV1:.+]] =
// CHECK: scf.for %[[IV2:.+]] =
// CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 20, 40, 48]
-// CHECK: %[[FILL:.+]] = linalg.fill(%{{.+}}, %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: %[[RESULT:.+]] = linalg.depthwise_conv_2d_nhwc_hwc
// CHECK-SAME: outs(%[[FILL]] :
// CHECK: flow.dispatch.tensor.store %[[RESULT]], %{{.+}}, offsets = [0, %[[IV0]], %[[IV1]], %[[IV2]]], sizes = [1, 20, 40, 48]
@@ -913,7 +915,7 @@
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [240, 40], strides = [1, 1]
: !flow.dispatch.tensor<readonly:240x40xf32> -> tensor<240x40xf32>
%5 = linalg.init_tensor [196, 40] : tensor<196x40xf32>
- %6 = linalg.fill(%cst, %5) : f32, tensor<196x40xf32> -> tensor<196x40xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<196x40xf32>) -> tensor<196x40xf32>
%7 = linalg.matmul {lowering_config = #config}
ins(%3, %4 : tensor<196x240xf32>, tensor<240x40xf32>) outs(%6 : tensor<196x40xf32>) -> tensor<196x40xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [196, 40], strides = [1, 1]
@@ -969,7 +971,7 @@
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [5, 5, 576], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:5x5x576xf32> -> tensor<5x5x576xf32>
%5 = linalg.init_tensor [1, 7, 7, 576] : tensor<1x7x7x576xf32>
- %6 = linalg.fill(%cst, %5) : f32, tensor<1x7x7x576xf32> -> tensor<1x7x7x576xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
%7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #config, strides = dense<1> : tensor<2xi64>}
ins(%3, %4 : tensor<1x11x11x576xf32>, tensor<5x5x576xf32>) outs(%6 : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 7, 7, 576], strides = [1, 1, 1, 1]
@@ -1021,7 +1023,7 @@
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [7, 7, 2048], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:7x7x2048xf32> -> tensor<7x7x2048xf32>
%1 = linalg.init_tensor [7] : tensor<7xf32>
- %2 = linalg.fill(%cst, %1) : f32, tensor<7xf32> -> tensor<7xf32>
+ %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<7xf32>) -> tensor<7xf32>
%3 = linalg.generic {
indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction", "reduction"]}
ins(%0 : tensor<7x7x2048xf32>) outs(%2 : tensor<7xf32>) attrs = {lowering_config = #config} {
@@ -1058,7 +1060,8 @@
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: %[[INIT0:.+]] = linalg.init_tensor
// CHECK: %[[INIT:.+]] = linalg.init_tensor
-// CHECK: %[[FILL:.+]] = linalg.fill(%{{.+}}, %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: %[[REDUCE:.+]] = linalg.generic
// CHECK-SAME: outs(%[[FILL]] :
// CHECK: %[[GENERIC:.+]] = linalg.generic
diff --git a/iree/compiler/Codegen/Common/test/type_propagation.mlir b/iree/compiler/Codegen/Common/test/type_propagation.mlir
index 75ea3d6..a56c31e 100644
--- a/iree/compiler/Codegen/Common/test/type_propagation.mlir
+++ b/iree/compiler/Codegen/Common/test/type_propagation.mlir
@@ -225,7 +225,7 @@
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<writeonly:?xi8>{%d}
%1 = linalg.init_tensor [%d] : tensor<?xi1>
%false = arith.constant false
- %2 = linalg.fill(%false, %1) : i1, tensor<?xi1> -> tensor<?xi1>
+ %2 = linalg.fill ins(%false : i1) outs(%1 : tensor<?xi1>) -> tensor<?xi1>
%3 = arith.extui %2 : tensor<?xi1> to tensor<?xi8>
flow.dispatch.tensor.store %3, %0, offsets=[0], sizes=[%d], strides=[1] : tensor<?xi8> -> !flow.dispatch.tensor<writeonly:?xi8>{%d}
return
@@ -235,5 +235,7 @@
// CHECK-DAG: %[[INIT:.+]] = linalg.init_tensor
// CHECK-DAG: %[[FALSE:.+]] = arith.constant false
// CHECK-DAG: %[[EXT_SCALAR:.+]] = arith.extui %[[FALSE]]
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[EXT_SCALAR]], %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[EXT_SCALAR]] :
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: flow.dispatch.tensor.store %[[FILL]], %[[OUT]]
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index 593810f..3445c60 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -193,7 +193,7 @@
%rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0, 0], sizes = [%B, %K, %N], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:?x?x?xf32>{%B, %K, %N} -> tensor<?x?x?xf32>
%init = linalg.init_tensor [%B, %M, %N] : tensor<?x?x?xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
%batch_gemm = linalg.batch_matmul
ins(%lhs, %rhs : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%fill : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
flow.dispatch.tensor.store %batch_gemm, %result_binding, offsets = [0, 0, 0], sizes = [%B, %M, %N], strides = [1, 1, 1]
@@ -240,7 +240,7 @@
%rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [256, 512], strides = [1, 1]
: !flow.dispatch.tensor<readonly:256x512xf32> -> tensor<256x512xf32>
%init = linalg.init_tensor [128, 512] : tensor<128x512xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<128x512xf32> -> tensor<128x512xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<128x512xf32>) -> tensor<128x512xf32>
%gemm = linalg.matmul {compilation_info = #compilation}
ins(%lhs, %rhs : tensor<128x256xf32>, tensor<256x512xf32>)
outs(%fill : tensor<128x512xf32>) -> tensor<128x512xf32>
@@ -570,7 +570,7 @@
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x225x225x3xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x16xf32> -> tensor<3x3x3x16xf32>
%5 = linalg.init_tensor [1, 112, 112, 16] : tensor<1x112x112x16xf32>
- %6 = linalg.fill(%cst, %5) : f32, tensor<1x112x112x16xf32> -> tensor<1x112x112x16xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
%7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 16], strides = [1, 1, 1, 1] : tensor<1x112x112x16xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x16xf32>
return
@@ -615,7 +615,7 @@
%filter = flow.dispatch.tensor.load %filter_binding, offsets = [0, 0, 0], sizes = [3, 3, 96], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x96xf32> -> tensor<3x3x96xf32>
%init = linalg.init_tensor [1, 80, 80, 96] : tensor<1x80x80x96xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<1x80x80x96xf32> -> tensor<1x80x80x96xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x80x80x96xf32>) -> tensor<1x80x80x96xf32>
%conv = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x161x161x96xf32>, tensor<3x3x96xf32>) outs(%fill : tensor<1x80x80x96xf32>) -> tensor<1x80x80x96xf32>
flow.dispatch.tensor.store %conv, %result_binding, offsets = [0, 0, 0, 0], sizes = [1, 80, 80, 96], strides = [1, 1, 1, 1]
@@ -707,7 +707,7 @@
%rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [240, 40], strides = [1, 1]
: !flow.dispatch.tensor<readonly:240x40xf32> -> tensor<240x40xf32>
%init = linalg.init_tensor [196, 40] : tensor<196x40xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<196x40xf32> -> tensor<196x40xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<196x40xf32>) -> tensor<196x40xf32>
%gemm = linalg.matmul ins(%lhs, %rhs : tensor<196x240xf32>, tensor<240x40xf32>)
outs(%fill : tensor<196x40xf32>) -> tensor<196x40xf32>
flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [196, 40], strides = [1, 1]
@@ -755,7 +755,7 @@
%filter = flow.dispatch.tensor.load %filter_binding, offsets = [0, 0, 0], sizes = [5, 5, 576], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:5x5x576xf32> -> tensor<5x5x576xf32>
%init = linalg.init_tensor [1, 7, 7, 576] : tensor<1x7x7x576xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<1x7x7x576xf32> -> tensor<1x7x7x576xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
%conv = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
ins(%input, %filter : tensor<1x11x11x576xf32>, tensor<5x5x576xf32>)
outs(%fill : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
@@ -803,7 +803,7 @@
%rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [512, 128], strides = [1, 1]
: !flow.dispatch.tensor<readonly:512x128xf32> -> tensor<512x128xf32>
%init = linalg.init_tensor [384, 128] : tensor<384x128xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<384x128xf32> -> tensor<384x128xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<384x128xf32>) -> tensor<384x128xf32>
%gemm = linalg.matmul ins(%lhs, %rhs : tensor<384x512xf32>, tensor<512x128xf32>)
outs(%fill : tensor<384x128xf32>) -> tensor<384x128xf32>
flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [384, 128], strides = [1, 1]
@@ -851,7 +851,7 @@
%rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [512, 128], strides = [1, 1]
: !flow.dispatch.tensor<readonly:512x128xf32> -> tensor<512x128xf32>
%init = linalg.init_tensor [384, 128] : tensor<384x128xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<384x128xf32> -> tensor<384x128xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<384x128xf32>) -> tensor<384x128xf32>
%gemm = linalg.matmul ins(%lhs, %rhs : tensor<384x512xf32>, tensor<512x128xf32>)
outs(%fill : tensor<384x128xf32>) -> tensor<384x128xf32>
flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [384, 128], strides = [1, 1]
@@ -896,7 +896,7 @@
%input = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [7, 7, 2048], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:7x7x2048xf32> -> tensor<7x7x2048xf32>
%init = linalg.init_tensor [7] : tensor<7xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<7xf32> -> tensor<7xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<7xf32>) -> tensor<7xf32>
%reduce = linalg.generic {
indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>],
iterator_types = ["parallel", "reduction", "reduction"]}
@@ -1110,7 +1110,7 @@
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:16x49xf32> -> tensor<16x49xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:33x49xf32> -> tensor<33x49xf32>
%7 = linalg.init_tensor [33, 49] : tensor<33x49xf32>
- %8 = linalg.fill(%cst, %7) : f32, tensor<33x49xf32> -> tensor<33x49xf32>
+ %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<33x49xf32>) -> tensor<33x49xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<33x16xf32>, tensor<16x49xf32>) outs(%8 : tensor<33x49xf32>) -> tensor<33x49xf32>
flow.dispatch.tensor.store %9, %3, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : tensor<33x49xf32> -> !flow.dispatch.tensor<writeonly:33x49xf32>
return
diff --git a/iree/compiler/Codegen/LLVMCPU/test/tile_fuse_and_vectorize.mlir b/iree/compiler/Codegen/LLVMCPU/test/tile_fuse_and_vectorize.mlir
index a9175c3..44ce48f 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/tile_fuse_and_vectorize.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/tile_fuse_and_vectorize.mlir
@@ -26,7 +26,7 @@
%7 = linalg.init_tensor [64, 64] : tensor<64x64xf32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [64, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:384x512xf32> -> tensor<64x512xf32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [512, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:512x128xf32> -> tensor<512x64xf32>
- %10 = linalg.fill(%cst, %7) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
%11 = linalg.matmul {lowering_config = #config} ins(%8, %9 : tensor<64x512xf32>, tensor<512x64xf32>) outs(%10 : tensor<64x64xf32>) -> tensor<64x64xf32>
%12 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<64x64xf32>) outs(%7 : tensor<64x64xf32>) {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
@@ -109,7 +109,7 @@
%14 = flow.dispatch.tensor.load %2, offsets = [%arg0, 0], sizes = [64, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:384x384xf32> -> tensor<64x384xf32>
%15 = flow.dispatch.tensor.load %3, offsets = [0, %arg1], sizes = [384, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:384x512xf32> -> tensor<384x64xf32>
%16 = linalg.init_tensor [64, 64] : tensor<64x64xf32>
- %17 = linalg.fill(%cst, %16) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
+ %17 = linalg.fill ins(%cst : f32) outs(%16 : tensor<64x64xf32>) -> tensor<64x64xf32>
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[], [32, 32, 32], [16, 16, 16]], native_vector_size = [16, 16, 16]>} ins(%14, %15 : tensor<64x384xf32>, tensor<384x64xf32>) outs(%17 : tensor<64x64xf32>) -> tensor<64x64xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%18, %11, %12 : tensor<64x64xf32>, tensor<64xi32>, tensor<64x64xf32>) outs(%13 : tensor<64x64xf32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: f32, %arg5: f32): // no predecessors
@@ -178,7 +178,7 @@
%16 = flow.dispatch.tensor.load %4, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:784x24xf32> -> tensor<16x24xf32>
%17 = flow.dispatch.tensor.load %5, offsets = [0, %arg1], sizes = [24, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x96xf32> -> tensor<24x49xf32>
%18 = linalg.init_tensor [16, 49] : tensor<16x49xf32>
- %19 = linalg.fill(%cst, %18) : f32, tensor<16x49xf32> -> tensor<16x49xf32>
+ %19 = linalg.fill ins(%cst : f32) outs(%18 : tensor<16x49xf32>) -> tensor<16x49xf32>
%20 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[], [16, 16, 32], [16, 16, 16]], native_vector_size = [16, 16, 16]>} ins(%16, %17 : tensor<16x24xf32>, tensor<24x49xf32>) outs(%19 : tensor<16x49xf32>) -> tensor<16x49xf32>
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%20, %11, %12, %13, %14 : tensor<16x49xf32>, tensor<49xf32>, tensor<49xf32>, tensor<49xf32>, tensor<49xf32>) outs(%15 : tensor<16x49xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
diff --git a/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir b/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
index 847e138..726c24a 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
@@ -47,8 +47,7 @@
: memref<1024x1024xf32> to memref<1024x256xf32, #map4>
%11 = memref.subview %2[%arg0, %arg1] [2, 256] [1, 1]
: memref<1024x1024xf32> to memref<2x256xf32, #map4>
- linalg.fill(%cst, %11) {lowering_config = #config}
- : f32, memref<2x256xf32, #map4>
+ linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%11 : memref<2x256xf32, #map4>)
linalg.matmul {lowering_config = #config}
ins(%8, %10 : memref<2x1024xf32, #map4>, memref<1024x256xf32, #map4>)
outs(%11 : memref<2x256xf32, #map4>)
@@ -130,7 +129,7 @@
%7 = memref.subview %0[%arg0, %arg1, 0] [1, 8, 1024] [1, 1, 1] : memref<4x32x1024xf32> to memref<1x8x1024xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>>
%8 = memref.subview %1[%arg0, 0, %arg2] [1, 1024, 32] [1, 1, 1] : memref<4x1024x64xf32> to memref<1x1024x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>>
%9 = memref.subview %2[%arg0, %arg1, %arg2] [1, 8, 32] [1, 1, 1] : memref<4x32x64xf32> to memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>
- linalg.fill(%cst, %9) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 8, 32, 32]]>} : f32, memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>
+ linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 8, 32, 32]]>} ins(%cst : f32) outs(%9 : memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>)
linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 8, 32, 32]]>} ins(%7, %8 : memref<1x8x1024xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>>, memref<1x1024x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>>) outs(%9 : memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>)
}
}
@@ -207,8 +206,7 @@
: memref<1024x1024xf32> to memref<1024x32xf32, #map4>
%11 = memref.subview %2[%arg0, %arg1] [2, 32] [1, 1]
: memref<1024x1024xf32> to memref<2x32xf32, #map4>
- linalg.fill(%cst, %11) {lowering_config = #config}
- : f32, memref<2x32xf32, #map4>
+ linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%11 : memref<2x32xf32, #map4>)
linalg.matmul {lowering_config = #config}
ins(%8, %10 : memref<2x1024xf32, #map4>, memref<1024x32xf32, #map4>)
outs(%11 : memref<2x32xf32, #map4>)
@@ -271,7 +269,7 @@
%cst_0 = arith.constant 0xFF800000 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1000xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<f32>
- linalg.fill(%cst_0, %1) {lowering_config = #config} : f32, memref<f32>
+ linalg.fill {lowering_config = #config} ins(%cst_0 : f32) outs(%1 : memref<f32>)
linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%0 : memref<1000xf32>) outs(%1 : memref<f32>) attrs = {lowering_config = #config} {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%2 = arith.cmpf ogt, %arg0, %arg1 : f32
diff --git a/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
index 62ea000..25370ff 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
@@ -61,9 +61,9 @@
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<3x4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<2x4xf32>
- linalg.fill(%cst, %2) : f32, memref<2x4xf32>
- linalg.matmul ins(%0, %1 : memref<2x3xf32>, memref<3x4xf32>) outs(%2 : memref<2x4xf32>)
- return
+ linalg.fill ins(%cst : f32) outs(%2 : memref<2x4xf32>)
+ linalg.matmul ins(%0, %1 : memref<2x3xf32>, memref<3x4xf32>) outs(%2 : memref<2x4xf32>)
+ return
}
}
}
@@ -98,7 +98,7 @@
%cst_0 = arith.constant 0xFF800000 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1000xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<f32>
- linalg.fill(%cst_0, %1) : f32, memref<f32>
+ linalg.fill ins(%cst_0 : f32) outs(%1 : memref<f32>)
linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%0 : memref<1000xf32>) outs(%1 : memref<f32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%2 = arith.cmpf ogt, %arg0, %arg1 : f32
@@ -309,7 +309,7 @@
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
: !flow.dispatch.tensor<readonly:256x1024xf32> -> tensor<256x1024xf32>
%15 = linalg.init_tensor [128, 1024] : tensor<128x1024xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<128x1024xf32> -> tensor<128x1024xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup", compilation_info = #compilation}
ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%16 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor<writeonly:128x1024xf32>
diff --git a/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir b/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
index 59735b5..58e9859 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
@@ -301,7 +301,7 @@
%7 = memref.subview %0[%arg0, %arg1, 0] [1, 8, 1024] [1, 1, 1] : memref<4x32x1024xf32> to memref<1x8x1024xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>>
%8 = memref.subview %1[%arg0, 0, %arg2] [1, 1024, 32] [1, 1, 1] : memref<4x1024x64xf32> to memref<1x1024x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>>
%9 = memref.subview %2[%arg0, %arg1, %arg2] [1, 8, 32] [1, 1, 1] : memref<4x32x64xf32> to memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>
- linalg.fill(%cst, %9) {lowering_config = #config} : f32, memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>
+ linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%9 : memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>)
// expected-error @+1 {{Received first tile dimension of 2 instead of 0 for LLVMGPUMatmulTensorCore}}
linalg.batch_matmul {lowering_config = #config} ins(%7, %8 : memref<1x8x1024xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>>, memref<1x1024x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>>) outs(%9 : memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>)
}
diff --git a/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir b/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir
index 2379836..0e2cf86 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir
@@ -68,7 +68,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1]
: !flow.dispatch.tensor<readonly:1024x1024xf32> -> tensor<1024x1024xf32>
%15 = linalg.init_tensor [1024, 1024] : tensor<1024x1024xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<1024x1024xf32> -> tensor<1024x1024xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%17 = linalg.matmul ins(%8, %10 : tensor<1024x1024xf32>, tensor<1024x1024xf32>)
outs(%16 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1]
@@ -135,7 +135,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1]
: !flow.dispatch.tensor<readonly:1024x1024xf32> -> tensor<1024x1024xf32>
%15 = linalg.init_tensor [1024, 1024] : tensor<1024x1024xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<1024x1024xf32> -> tensor<1024x1024xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%17 = linalg.generic #matmul_trait
ins(%8, %10 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%16 : tensor<1024x1024xf32>) {
^bb(%a: f32, %b: f32, %c: f32) :
@@ -185,7 +185,7 @@
%13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 2, 2, 1], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x2x2x1xf32> -> tensor<3x2x2x1xf32>
%20 = linalg.init_tensor [1, 2, 3, 1] : tensor<1x2x3x1xf32>
- %21 = linalg.fill(%cst, %20) : f32, tensor<1x2x3x1xf32> -> tensor<1x2x3x1xf32>
+ %21 = linalg.fill ins(%cst : f32) outs(%20 : tensor<1x2x3x1xf32>) -> tensor<1x2x3x1xf32>
%22 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
ins(%11, %13 : tensor<1x4x4x2xf32>, tensor<3x2x2x1xf32>) outs(%21 : tensor<1x2x3x1xf32>) -> tensor<1x2x3x1xf32>
flow.dispatch.tensor.store %22, %2, offsets = [0, 0, 0, 0], sizes = [1, 2, 3, 1], strides = [1, 1, 1, 1]
@@ -260,7 +260,7 @@
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [14, 14, 96], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:14x14x96xf32> -> tensor<14x14x96xf32>
%8 = linalg.init_tensor [96] : tensor<96xf32>
- %9 = linalg.fill(%cst, %8) : f32, tensor<96xf32> -> tensor<96xf32>
+ %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<96xf32>) -> tensor<96xf32>
%10 = linalg.generic {
indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2, d0)>, affine_map<(d0, d1, d2) -> (d0)>],
iterator_types = ["parallel", "reduction", "reduction"]}
@@ -352,7 +352,7 @@
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 16384], strides = [1, 1]
: !flow.dispatch.tensor<readonly:512x16384xf32> -> tensor<512x16384xf32>
%8 = linalg.init_tensor [16384] : tensor<16384xf32>
- %9 = linalg.fill(%cst, %8) : f32, tensor<16384xf32> -> tensor<16384xf32>
+ %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<16384xf32>) -> tensor<16384xf32>
%10 = linalg.generic {
indexing_maps = [#map3, #map4], iterator_types = ["parallel", "reduction"]}
ins(%5 : tensor<512x16384xf32>) outs(%9 : tensor<16384xf32>) {
@@ -402,7 +402,7 @@
%d = flow.dispatch.tensor.load %di, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1]
: !flow.dispatch.tensor<readonly:2048x512xf32> -> tensor<2048x512xf32>
%init = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
- %f = linalg.fill(%cst, %init) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
+ %f = linalg.fill ins(%cst : f32) outs(%init : tensor<2048x512xf32>) -> tensor<2048x512xf32>
%m = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%f : tensor<2048x512xf32>) -> tensor<2048x512xf32>
%init2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%a = linalg.generic {
@@ -511,7 +511,7 @@
%13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 1024, 64], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:4x1024x64xf32> -> tensor<4x1024x64xf32>
%17 = linalg.init_tensor [4, 32, 64] : tensor<4x32x64xf32>
- %18 = linalg.fill(%cst, %17) : f32, tensor<4x32x64xf32> -> tensor<4x32x64xf32>
+ %18 = linalg.fill ins(%cst : f32) outs(%17 : tensor<4x32x64xf32>) -> tensor<4x32x64xf32>
%19 = linalg.batch_matmul ins(%11, %13 : tensor<4x32x1024xf32>, tensor<4x1024x64xf32>)
outs(%18 : tensor<4x32x64xf32>) -> tensor<4x32x64xf32>
flow.dispatch.tensor.store %19, %2, offsets = [0, 0, 0], sizes = [4, 32, 64], strides = [1, 1, 1]
diff --git a/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir b/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
index ad61567..3badaf2 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
@@ -67,7 +67,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1]
: !flow.dispatch.tensor<readonly:1024x1024xf32> -> tensor<1024x1024xf32>
%15 = linalg.init_tensor [1024, 1024] : tensor<1024x1024xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<1024x1024xf32> -> tensor<1024x1024xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%17 = linalg.matmul ins(%8, %10 : tensor<1024x1024xf32>, tensor<1024x1024xf32>)
outs(%16 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1]
diff --git a/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir b/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir
index 3acdb83..eac1c9e 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir
@@ -20,7 +20,7 @@
%10 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%9]
%11 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 32)>(%8)
%12 = memref.subview %7[%10, %11] [32, 32] [1, 1] : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
- linalg.fill(%cst, %12) {__internal_linalg_transform__ = "vectorize"} : f32, memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
+ linalg.fill {__internal_linalg_transform__ = "vectorize"} ins(%cst : f32) outs(%12 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
scf.for %arg0 = %c0 to %c1024 step %c16 {
%13 = memref.subview %5[0, %arg0] [64, 16] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%14 = memref.subview %6[%arg0, 0] [16, 64] [1, 1] : memref<1024x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
diff --git a/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir b/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir
index c13e167..5d43eeb 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir
@@ -32,7 +32,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 512], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x512xf32> -> tensor<3x3x3x512xf32>
%22 = linalg.init_tensor [1, 112, 112, 512] : tensor<1x112x112x512xf32>
- %23 = linalg.fill(%cst, %22) : f32, tensor<1x112x112x512xf32> -> tensor<1x112x112x512xf32>
+ %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x225x225x3xf32>, tensor<3x3x3x512xf32>) outs(%23 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 512], strides = [1, 1, 1, 1]
@@ -86,7 +86,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%22 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %23 = linalg.fill(%cst, %22) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%23 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1]
@@ -139,7 +139,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x16xf32> -> tensor<3x3x3x16xf32>
%22 = linalg.init_tensor [1, 16, 16, 16] : tensor<1x16x16x16xf32>
- %23 = linalg.fill(%cst, %22) : f32, tensor<1x16x16x16xf32> -> tensor<1x16x16x16xf32>
+ %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x33x33x3xf32>, tensor<3x3x3x16xf32>) outs(%23 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 16], strides = [1, 1, 1, 1]
@@ -194,7 +194,7 @@
%16 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 144], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x144xf32> -> tensor<3x3x144xf32>
%23 = linalg.init_tensor [1, 28, 28, 144] : tensor<1x28x28x144xf32>
- %24 = linalg.fill(%cst, %23) : f32, tensor<1x28x28x144xf32> -> tensor<1x28x28x144xf32>
+ %24 = linalg.fill ins(%cst : f32) outs(%23 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
%25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%14, %16 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%24 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
flow.dispatch.tensor.store %25, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 144], strides = [1, 1, 1, 1]
@@ -248,7 +248,7 @@
%16 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 8], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x8xf32> -> tensor<3x3x8xf32>
%23 = linalg.init_tensor [1, 4, 4, 8] : tensor<1x4x4x8xf32>
- %24 = linalg.fill(%cst, %23) : f32, tensor<1x4x4x8xf32> -> tensor<1x4x4x8xf32>
+ %24 = linalg.fill ins(%cst : f32) outs(%23 : tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32>
%25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%14, %16 : tensor<1x9x9x8xf32>, tensor<3x3x8xf32>) outs(%24 : tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32>
flow.dispatch.tensor.store %25, %2, offsets = [0, 0, 0, 0], sizes = [1, 4, 4, 8], strides = [1, 1, 1, 1]
diff --git a/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
index a3a08a9..59d7b3b 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
@@ -32,7 +32,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1]
: !flow.dispatch.tensor<readonly:512x2048xf32> -> tensor<512x2048xf32>
%15 = linalg.init_tensor [1024, 2048] : tensor<1024x2048xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<1024x2048xf32> -> tensor<1024x2048xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<1024x512xf32>, tensor<512x2048xf32>) outs(%16 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1]
@@ -86,7 +86,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [96, 24], strides = [1, 1]
: !flow.dispatch.tensor<readonly:96x24xf32> -> tensor<96x24xf32>
%15 = linalg.init_tensor [3136, 24] : tensor<3136x24xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<3136x24xf32> -> tensor<3136x24xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<3136x96xf32>, tensor<96x24xf32>) outs(%16 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [3136, 24], strides = [1, 1]
@@ -140,7 +140,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [192, 64], strides = [1, 1]
: !flow.dispatch.tensor<readonly:192x64xf32> -> tensor<192x64xf32>
%15 = linalg.init_tensor [196, 64] : tensor<196x64xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<196x64xf32> -> tensor<196x64xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<196x64xf32>) -> tensor<196x64xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<196x192xf32>, tensor<192x64xf32>) outs(%16 : tensor<196x64xf32>) -> tensor<196x64xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [196, 64], strides = [1, 1]
@@ -189,7 +189,7 @@
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x16xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x96xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x96xf32>
- linalg.fill(%cst, %2) : f32, memref<12544x96xf32>
+ linalg.fill ins(%cst : f32) outs(%2 : memref<12544x96xf32>)
linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%0, %1 : memref<12544x16xf32>, memref<16x96xf32>) outs(%2 : memref<12544x96xf32>)
return
}
@@ -238,7 +238,7 @@
%8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [49, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:49x576xf32> -> tensor<49x576xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 160], strides = [1, 1] : !flow.dispatch.tensor<readonly:576x160xf32> -> tensor<576x160xf32>
%15 = linalg.init_tensor [49, 160] : tensor<49x160xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<49x160xf32> -> tensor<49x160xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<49x160xf32>) -> tensor<49x160xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<49x576xf32>, tensor<576x160xf32>) outs(%16 : tensor<49x160xf32>) -> tensor<49x160xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [49, 160], strides = [1, 1]
@@ -292,7 +292,7 @@
%14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 384], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:4x32x384xf32> -> tensor<4x32x384xf32>
%21 = linalg.init_tensor [4, 384, 384] : tensor<4x384x384xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<4x384x384xf32> -> tensor<4x384x384xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
%23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
ins(%11, %14 : tensor<4x384x32xf32>, tensor<4x32x384xf32>) outs(%22 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [4, 384, 384], strides = [1, 1, 1]
@@ -346,7 +346,7 @@
%14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 8], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:4x32x8xf32> -> tensor<4x32x8xf32>
%21 = linalg.init_tensor [4, 8, 8] : tensor<4x8x8xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<4x8x8xf32> -> tensor<4x8x8xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<4x8x8xf32>) -> tensor<4x8x8xf32>
%23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
ins(%11, %14 : tensor<4x8x32xf32>, tensor<4x32x8xf32>) outs(%22 : tensor<4x8x8xf32>) -> tensor<4x8x8xf32>
flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [4, 8, 8], strides = [1, 1, 1]
diff --git a/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir b/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir
index 74373dc..42ee057 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir
@@ -45,7 +45,7 @@
%21 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%24 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %25 = linalg.fill(%cst, %24) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %25 = linalg.fill ins(%cst : f32) outs(%24 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%26 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%19, %21 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%25 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
diff --git a/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir b/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
index 7db31c6..dd29936 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
@@ -117,7 +117,7 @@
%14 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 24, 24, 8], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:1x24x24x8xf32> -> tensor<1x24x24x8xf32>
%20 = linalg.init_tensor [1, 2, 2, 8] : tensor<1x2x2x8xf32>
- %21 = linalg.fill(%cst, %20) : f32, tensor<1x2x2x8xf32> -> tensor<1x2x2x8xf32>
+ %21 = linalg.fill ins(%cst : f32) outs(%20 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
%22 = linalg.pooling_nhwc_sum {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : vector<2xi64>, strides = dense<12> : vector<2xi64>}
ins(%14, %2 : tensor<1x24x24x8xf32>, tensor<12x12xf32>)
outs(%21 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
@@ -168,7 +168,7 @@
%13 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 76, 1, 1], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:1x76x1x1xf32> -> tensor<1x76x1x1xf32>
%18 = linalg.init_tensor [1, 38, 1, 1] : tensor<1x38x1x1xf32>
- %19 = linalg.fill(%cst, %18) : f32, tensor<1x38x1x1xf32> -> tensor<1x38x1x1xf32>
+ %19 = linalg.fill ins(%cst : f32) outs(%18 : tensor<1x38x1x1xf32>) -> tensor<1x38x1x1xf32>
%20 = linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<[2, 1]> : vector<2xi64>}
ins(%13, %2 : tensor<1x76x1x1xf32>, tensor<2x1xf32>)
outs(%19 : tensor<1x38x1x1xf32>) -> tensor<1x38x1x1xf32>
@@ -278,7 +278,7 @@
%14 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 21, 20, 1], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:1x21x20x1xf32> -> tensor<1x21x20x1xf32>
%18 = linalg.init_tensor [1, 19, 18, 1, 4] : tensor<1x19x18x1x4xf32>
- %19 = linalg.fill(%cst_9, %18) : f32, tensor<1x19x18x1x4xf32> -> tensor<1x19x18x1x4xf32>
+ %19 = linalg.fill ins(%cst_9 : f32) outs(%18 : tensor<1x19x18x1x4xf32>) -> tensor<1x19x18x1x4xf32>
%20 = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
ins(%14, %cst : tensor<1x21x20x1xf32>, tensor<3x3x1x4xf32>) outs(%19 : tensor<1x19x18x1x4xf32>) -> tensor<1x19x18x1x4xf32>
%21 = linalg.generic {
diff --git a/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
index 1887d47..f8b6fea 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
@@ -33,7 +33,7 @@
%14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 3, 32], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:1x3x32xf32> -> tensor<1x3x32xf32>
%21 = linalg.init_tensor [1, 3, 32] : tensor<1x3x32xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<1x3x32xf32> -> tensor<1x3x32xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<1x3x32xf32>) -> tensor<1x3x32xf32>
%23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
ins(%11, %14 : tensor<1x3x3xf32>, tensor<1x3x32xf32>) outs(%22 : tensor<1x3x32xf32>) -> tensor<1x3x32xf32>
flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [1, 3, 32], strides = [1, 1, 1]
@@ -87,7 +87,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 16], strides = [1, 1]
: !flow.dispatch.tensor<readonly:32x16xi8> -> tensor<32x16xi8>
%15 = linalg.init_tensor [64, 16] : tensor<64x16xi32>
- %16 = linalg.fill(%c0_i32, %15) : i32, tensor<64x16xi32> -> tensor<64x16xi32>
+ %16 = linalg.fill ins(%c0_i32 : i32) outs(%15 : tensor<64x16xi32>) -> tensor<64x16xi32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<64x32xi8>, tensor<32x16xi8>) outs(%16 : tensor<64x16xi32>) -> tensor<64x16xi32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [64, 16], strides = [1, 1]
@@ -145,7 +145,7 @@
%15 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [576, 273], strides = [1, 1]
: !flow.dispatch.tensor<readonly:576x273xf32> -> tensor<576x273xf32>
%16 = linalg.init_tensor [400, 273] : tensor<400x273xf32>
- %17 = linalg.fill(%cst, %16) : f32, tensor<400x273xf32> -> tensor<400x273xf32>
+ %17 = linalg.fill ins(%cst : f32) outs(%16 : tensor<400x273xf32>) -> tensor<400x273xf32>
%18 = linalg.matmul ins(%13, %15 : tensor<400x576xf32>, tensor<576x273xf32>) outs(%17 : tensor<400x273xf32>) -> tensor<400x273xf32>
%19 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
@@ -211,7 +211,7 @@
%15 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [512, 546], strides = [1, 1]
: !flow.dispatch.tensor<readonly:512x546xf32> -> tensor<512x546xf32>
%16 = linalg.init_tensor [25, 546] : tensor<25x546xf32>
- %17 = linalg.fill(%cst, %16) : f32, tensor<25x546xf32> -> tensor<25x546xf32>
+ %17 = linalg.fill ins(%cst : f32) outs(%16 : tensor<25x546xf32>) -> tensor<25x546xf32>
%18 = linalg.matmul ins(%13, %15 : tensor<25x512xf32>, tensor<512x546xf32>) outs(%17 : tensor<25x546xf32>) -> tensor<25x546xf32>
%19 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
@@ -287,7 +287,7 @@
%17 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1]
: !flow.dispatch.tensor<readonly:128x1024xf16> -> tensor<128x1024xf16>
%18 = linalg.init_tensor [256, 1024] : tensor<256x1024xf16>
- %19 = linalg.fill(%cst, %18) : f16, tensor<256x1024xf16> -> tensor<256x1024xf16>
+ %19 = linalg.fill ins(%cst : f16) outs(%18 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%20 = linalg.matmul ins(%15, %17 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%19 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%21 = linalg.generic {
indexing_maps = [#map5, #map5, #map5, #map5], iterator_types = ["parallel", "parallel"]}
diff --git a/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir b/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir
index 17257ae..75dbf88 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir
@@ -32,7 +32,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 512], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x512xf32> -> tensor<3x3x3x512xf32>
%22 = linalg.init_tensor [1, 112, 112, 512] : tensor<1x112x112x512xf32>
- %23 = linalg.fill(%cst, %22) : f32, tensor<1x112x112x512xf32> -> tensor<1x112x112x512xf32>
+ %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x225x225x3xf32>, tensor<3x3x3x512xf32>)
outs(%23 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
@@ -87,7 +87,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%22 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %23 = linalg.fill(%cst, %22) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%23 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1]
@@ -140,7 +140,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x16xf32> -> tensor<3x3x3x16xf32>
%22 = linalg.init_tensor [1, 16, 16, 16] : tensor<1x16x16x16xf32>
- %23 = linalg.fill(%cst, %22) : f32, tensor<1x16x16x16xf32> -> tensor<1x16x16x16xf32>
+ %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x33x33x3xf32>, tensor<3x3x3x16xf32>) outs(%23 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 16], strides = [1, 1, 1, 1]
@@ -194,7 +194,7 @@
%16 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 144], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x144xf32> -> tensor<3x3x144xf32>
%23 = linalg.init_tensor [1, 28, 28, 144] : tensor<1x28x28x144xf32>
- %24 = linalg.fill(%cst, %23) : f32, tensor<1x28x28x144xf32> -> tensor<1x28x28x144xf32>
+ %24 = linalg.fill ins(%cst : f32) outs(%23 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
%25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%14, %16 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%24 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
flow.dispatch.tensor.store %25, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 144], strides = [1, 1, 1, 1]
@@ -249,7 +249,7 @@
%16 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 8], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x8xf32> -> tensor<3x3x8xf32>
%23 = linalg.init_tensor [1, 1, 2, 8] : tensor<1x1x2x8xf32>
- %24 = linalg.fill(%cst, %23) : f32, tensor<1x1x2x8xf32> -> tensor<1x1x2x8xf32>
+ %24 = linalg.fill ins(%cst : f32) outs(%23 : tensor<1x1x2x8xf32>) -> tensor<1x1x2x8xf32>
%25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%14, %16 : tensor<1x3x5x8xf32>, tensor<3x3x8xf32>) outs(%24 : tensor<1x1x2x8xf32>) -> tensor<1x1x2x8xf32>
flow.dispatch.tensor.store %25, %2, offsets = [0, 0, 0, 0], sizes = [1, 1, 2, 8], strides = [1, 1, 1, 1]
diff --git a/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
index d46e6be..ebddd9f 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
@@ -30,7 +30,7 @@
%8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x512xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:512x2048xf32> -> tensor<512x2048xf32>
%15 = linalg.init_tensor [1024, 2048] : tensor<1024x2048xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<1024x2048xf32> -> tensor<1024x2048xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<1024x512xf32>, tensor<512x2048xf32>) outs(%16 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1]
@@ -82,7 +82,7 @@
%8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3136, 96], strides = [1, 1] : !flow.dispatch.tensor<readonly:3136x96xf32> -> tensor<3136x96xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [96, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:96x24xf32> -> tensor<96x24xf32>
%15 = linalg.init_tensor [3136, 24] : tensor<3136x24xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<3136x24xf32> -> tensor<3136x24xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<3136x96xf32>, tensor<96x24xf32>)
outs(%16 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
@@ -137,7 +137,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [192, 64], strides = [1, 1]
: !flow.dispatch.tensor<readonly:192x64xf32> -> tensor<192x64xf32>
%15 = linalg.init_tensor [196, 64] : tensor<196x64xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<196x64xf32> -> tensor<196x64xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<196x64xf32>) -> tensor<196x64xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<196x192xf32>, tensor<192x64xf32>) outs(%16 : tensor<196x64xf32>) -> tensor<196x64xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [196, 64], strides = [1, 1]
@@ -186,7 +186,7 @@
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x16xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x96xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x96xf32>
- linalg.fill(%cst, %2) : f32, memref<12544x96xf32>
+ linalg.fill ins(%cst : f32) outs(%2 : memref<12544x96xf32>)
linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%0, %1 : memref<12544x16xf32>, memref<16x96xf32>) outs(%2 : memref<12544x96xf32>)
return
@@ -238,7 +238,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 160], strides = [1, 1]
: !flow.dispatch.tensor<readonly:576x160xf32> -> tensor<576x160xf32>
%15 = linalg.init_tensor [49, 160] : tensor<49x160xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<49x160xf32> -> tensor<49x160xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<49x160xf32>) -> tensor<49x160xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<49x576xf32>, tensor<576x160xf32>) outs(%16 : tensor<49x160xf32>) -> tensor<49x160xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [49, 160], strides = [1, 1]
@@ -292,7 +292,7 @@
%14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 384], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:4x32x384xf32> -> tensor<4x32x384xf32>
%21 = linalg.init_tensor [4, 384, 384] : tensor<4x384x384xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<4x384x384xf32> -> tensor<4x384x384xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
%23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
ins(%11, %14 : tensor<4x384x32xf32>, tensor<4x32x384xf32>)
outs(%22 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
@@ -348,7 +348,7 @@
%14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 8], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:4x32x8xf32> -> tensor<4x32x8xf32>
%21 = linalg.init_tensor [4, 2, 8] : tensor<4x2x8xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<4x2x8xf32> -> tensor<4x2x8xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<4x2x8xf32>) -> tensor<4x2x8xf32>
%23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
ins(%11, %14 : tensor<4x2x32xf32>, tensor<4x32x8xf32>) outs(%22 : tensor<4x2x8xf32>) -> tensor<4x2x8xf32>
flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [4, 2, 8], strides = [1, 1, 1]
diff --git a/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir b/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir
index d7b8532..ab22b21 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir
@@ -56,7 +56,7 @@
%21 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1]
: !flow.dispatch.tensor<readonly:128x1024xf16> -> tensor<128x1024xf16>
%24 = linalg.init_tensor [256, 1024] : tensor<256x1024xf16>
- %25 = linalg.fill(%cst, %24) : f16, tensor<256x1024xf16> -> tensor<256x1024xf16>
+ %25 = linalg.fill ins(%cst : f16) outs(%24 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%26 = linalg.matmul ins(%19, %21 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%25 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%27 = linalg.generic {
indexing_maps = [#map5, #map5, #map5, #map5], iterator_types = ["parallel", "parallel"]}
@@ -133,7 +133,7 @@
%8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 8], strides = [1, 1] : !flow.dispatch.tensor<readonly:256x8xf16> -> tensor<256x8xf16>
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:8x1024xf16> -> tensor<8x1024xf16>
%15 = linalg.init_tensor [256, 1024] : tensor<256x1024xf16>
- %16 = linalg.fill(%cst, %15) : f16, tensor<256x1024xf16> -> tensor<256x1024xf16>
+ %16 = linalg.fill ins(%cst : f16) outs(%15 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<256x8xf16>, tensor<8x1024xf16>) outs(%16 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
diff --git a/iree/compiler/Codegen/SPIRV/test/create_fast_slow_path.mlir b/iree/compiler/Codegen/SPIRV/test/create_fast_slow_path.mlir
index 008d001..a318d01 100644
--- a/iree/compiler/Codegen/SPIRV/test/create_fast_slow_path.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/create_fast_slow_path.mlir
@@ -38,7 +38,7 @@
tensor.yield %cst : f32
} : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%20 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
- %21 = linalg.fill(%cst, %9) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} : f32, tensor<1x1x4x32xf32> -> tensor<1x1x4x32xf32>
+ %21 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} ins(%cst : f32) outs(%9 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%22 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>, strides = dense<2> : tensor<2xi64>} ins(%19, %20 : tensor<1x?x?x3xf32>, tensor<3x3x3x32xf32>) outs(%21 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22, %8 : tensor<1x1x4x32xf32>, tensor<1x1x4x32xf32>) outs(%9 : tensor<1x1x4x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
diff --git a/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir b/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
index 3dfe104..5c568c2 100644
--- a/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
@@ -47,7 +47,7 @@
%19 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:256x128xf16> -> tensor<256x128xf16>
 %21 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:128x1024xf16> -> tensor<128x1024xf16>
%24 = linalg.init_tensor [256, 1024] : tensor<256x1024xf16>
- %25 = linalg.fill(%cst, %24) : f16, tensor<256x1024xf16> -> tensor<256x1024xf16>
+ %25 = linalg.fill ins(%cst : f16) outs(%24 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%26 = linalg.matmul ins(%19, %21 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%25 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%27 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
diff --git a/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir b/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir
index aa3e66f..cf8ceae 100644
--- a/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir
@@ -27,7 +27,7 @@
%8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:4096x4096xf32> -> tensor<4096x4096xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:4096x4096xf32> -> tensor<4096x4096xf32>
%15 = linalg.init_tensor [4096, 4096] : tensor<4096x4096xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<4096x4096xf32> -> tensor<4096x4096xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
%17 = linalg.matmul ins(%8, %10 : tensor<4096x4096xf32>, tensor<4096x4096xf32>) outs(%16 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : tensor<4096x4096xf32> -> !flow.dispatch.tensor<writeonly:4096x4096xf32>
return
@@ -78,7 +78,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x512xf32>
%17 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:512x256xf32> -> tensor<512x256xf32>
%20 = linalg.init_tensor [1024, 256] : tensor<1024x256xf32>
- %21 = linalg.fill(%cst, %20) : f32, tensor<1024x256xf32> -> tensor<1024x256xf32>
+ %21 = linalg.fill ins(%cst : f32) outs(%20 : tensor<1024x256xf32>) -> tensor<1024x256xf32>
%22 = linalg.matmul ins(%15, %17 : tensor<1024x512xf32>, tensor<512x256xf32>) outs(%21 : tensor<1024x256xf32>) -> tensor<1024x256xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%22, %10 : tensor<1024x256xf32>, tensor<1024x256xf32>) outs(%13 : tensor<1024x256xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
diff --git a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_batch_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_batch_matmul.mlir
index 980ae2d..aaabd0c 100644
--- a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_batch_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_batch_matmul.mlir
@@ -51,7 +51,7 @@
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + 1024, s0)>(%arg1)[%workgroup_size_y]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 1024, s0)>(%arg2)[%workgroup_size_x]
%17 = linalg.init_tensor [%14, %15, %16] : tensor<?x?x?xf32>
- %18 = linalg.fill(%cst, %17) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
+ %18 = linalg.fill ins(%cst : f32) outs(%17 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
%19 = linalg.batch_matmul {lowering_config = #config} ins(%11, %13 : tensor<?x?x1024xf32>, tensor<?x1024x?xf32>) outs(%18 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
flow.dispatch.tensor.store %19, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%9, %10, %12], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:4x1024x1024xf32>
}
diff --git a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_conv.mlir b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_conv.mlir
index e4ba66d..631ce5e 100644
--- a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_conv.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_conv.mlir
@@ -55,7 +55,7 @@
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 16, s0)>(%arg2)[%workgroup_size_x]
%21 = linalg.init_tensor [1, %18, %19, %20] : tensor<1x?x?x?xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%23 = linalg.conv_2d_nhwc_hwcf {lowering_config = #config, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x?x?x8xf32>, tensor<3x3x8x?xf32>)
outs(%22 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
@@ -146,7 +146,7 @@
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 56, s0)>(%arg1)[%workgroup_size_y]
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 96, s0)>(%arg2)[%workgroup_size_x]
%21 = linalg.init_tensor [1, %18, %19, %20] : tensor<1x?x?x?xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%23 = linalg.depthwise_conv_2d_nhwc_hwc {lowering_config = #config, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%14, %15 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>)
outs(%22 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
@@ -257,7 +257,7 @@
%36 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%37 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
%38 = linalg.init_tensor [1, %35, %36, %37] : tensor<1x?x?x?xf32>
- %39 = linalg.fill(%cst, %38) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
+ %39 = linalg.fill ins(%cst : f32) outs(%38 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%40 = linalg.conv_2d_nhwc_hwcf {lowering_config = #config, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%32, %34 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%39 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%41 = linalg.generic {lowering_config = #config, indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%40, %16 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
@@ -386,7 +386,7 @@
%44 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%45 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
%46 = linalg.init_tensor [1, %43, %44, %45] : tensor<1x?x?x?xf32>
- %47 = linalg.fill(%cst, %46) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
+ %47 = linalg.fill ins(%cst : f32) outs(%46 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%48 = linalg.depthwise_conv_2d_nhwc_hwc {lowering_config = #config, dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%40, %42 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%47 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%49 = linalg.generic {lowering_config = #config, indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12, %48 : tensor<?xf32>, tensor<1x?x?x?xf32>) outs(%18 : tensor<1x?x?x?xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
diff --git a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_matmul.mlir
index bc18016..816824b 100644
--- a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_matmul.mlir
@@ -42,7 +42,7 @@
%11 = affine.min affine_map<(d0)[s0] -> (-d0 + 4096, s0)>(%arg0)[%workgroup_size_y]
%12 = affine.min affine_map<(d0)[s0] -> (-d0 + 4096, s0)>(%arg1)[%workgroup_size_x]
%13 = linalg.init_tensor [%11, %12] : tensor<?x?xf16>
- %14 = linalg.fill(%cst, %13) : f16, tensor<?x?xf16> -> tensor<?x?xf16>
+ %14 = linalg.fill ins(%cst : f16) outs(%13 : tensor<?x?xf16>) -> tensor<?x?xf16>
%15 = linalg.matmul {lowering_config = #config} ins(%8, %10 : tensor<?x4096xf16>, tensor<4096x?xf16>) outs(%14 : tensor<?x?xf16>) -> tensor<?x?xf16>
flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf16> -> !flow.dispatch.tensor<writeonly:4096x4096xf16>
}
@@ -106,7 +106,7 @@
%11 = affine.min affine_map<(d0)[s0] -> (-d0 + 4096, s0)>(%arg0)[%workgroup_size_y]
%12 = affine.min affine_map<(d0)[s0] -> (-d0 + 4096, s0)>(%arg1)[%workgroup_size_x]
%13 = linalg.init_tensor [%11, %12] : tensor<?x?xf32>
- %14 = linalg.fill(%cst, %13) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
%15 = linalg.matmul {lowering_config = #config} ins(%8, %10 : tensor<?x4096xf32>, tensor<4096x?xf32>) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:4096x4096xf32>
}
diff --git a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir
index 6b8bb0a..e770765 100644
--- a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir
@@ -66,7 +66,7 @@
%11 = memref.subview %2[%arg0, 0] [16, 128] [1, 1] : memref<256x128xf16> to memref<16x128xf16, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
%12 = memref.subview %3[0, %arg1] [128, 16] [1, 1] : memref<128x1024xf16> to memref<128x16xf16, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%13 = memref.subview %4[%arg0, %arg1] [16, 16] [1, 1] : memref<256x1024xf16> to memref<16x16xf16, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
- linalg.fill(%cst, %13) {lowering_config = #config} : f16, memref<16x16xf16, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
+ linalg.fill {lowering_config = #config} ins(%cst : f16) outs(%13 : memref<16x16xf16, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>)
linalg.matmul {lowering_config = #config}
ins(%11, %12 : memref<16x128xf16, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>, memref<128x16xf16, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>)
outs(%13 : memref<16x16xf16, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>)
diff --git a/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
index d1433c6..6c7ed93 100644
--- a/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
@@ -28,7 +28,7 @@
%10 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %9) -> (tensor<2x128xf32>) {
%11 = scf.for %arg4 = %c0 to %c128 step %c4 iter_args(%arg5 = %arg3) -> (tensor<2x128xf32>) {
%12 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<2x128xf32> to tensor<1x4xf32>
- %13 = linalg.fill(%cst, %12) {lowering_config = #config} : f32, tensor<1x4xf32> -> tensor<1x4xf32>
+ %13 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%12 : tensor<1x4xf32>) -> tensor<1x4xf32>
%14 = tensor.extract_slice %7[%arg2, 0] [1, 4] [1, 1] : tensor<2x4xf32> to tensor<1x4xf32>
%15 = tensor.extract_slice %8[0, %arg4] [4, 4] [1, 1] : tensor<4x128xf32> to tensor<4x4xf32>
%16 = linalg.matmul {lowering_config = #config} ins(%14, %15 : tensor<1x4xf32>, tensor<4x4xf32>) outs(%13 : tensor<1x4xf32>) -> tensor<1x4xf32>
diff --git a/iree/compiler/Codegen/Sandbox/test/fusion_expert.mlir b/iree/compiler/Codegen/Sandbox/test/fusion_expert.mlir
index f2cfcdd..beb6930 100644
--- a/iree/compiler/Codegen/Sandbox/test/fusion_expert.mlir
+++ b/iree/compiler/Codegen/Sandbox/test/fusion_expert.mlir
@@ -7,7 +7,7 @@
%d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
%d1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
%init = linalg.init_tensor [%d0, %d1] : tensor<?x?xf32>
- %0 = linalg.fill(%cst, %init) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %0 = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
%1 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[10, 20, 30]]>}
ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
@@ -40,7 +40,7 @@
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%init = linalg.init_tensor [20, 120] : tensor<20x120xf32>
- %0 = linalg.fill(%cst, %init) : f32, tensor<20x120xf32> -> tensor<20x120xf32>
+ %0 = linalg.fill ins(%cst : f32) outs(%init : tensor<20x120xf32>) -> tensor<20x120xf32>
%1 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[10, 20, 30]]>}
ins(%arg0, %arg1 : tensor<20x60xf32>, tensor<60x120xf32>)
outs(%0 : tensor<20x120xf32>) -> tensor<20x120xf32>
diff --git a/iree/compiler/Codegen/Sandbox/test/outline_one_parent_loop.mlir b/iree/compiler/Codegen/Sandbox/test/outline_one_parent_loop.mlir
index f6b671d..f774ec1 100644
--- a/iree/compiler/Codegen/Sandbox/test/outline_one_parent_loop.mlir
+++ b/iree/compiler/Codegen/Sandbox/test/outline_one_parent_loop.mlir
@@ -34,7 +34,7 @@
%13 = affine.min affine_map<(d0, d1) -> (8, -d0 + d1)>(%arg7, %5)
%14 = scf.for %arg9 = %c0 to %c16 step %c8 iter_args(%arg10 = %arg8) -> (tensor<?x16xf32>) {
%15 = tensor.extract_slice %arg10[%arg7, %arg9] [%13, 8] [1, 1] : tensor<?x16xf32> to tensor<?x8xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<?x8xf32> -> tensor<?x8xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<?x8xf32>) -> tensor<?x8xf32>
%17 = tensor.insert_slice %16 into %arg10[%arg7, %arg9] [%13, 8] [1, 1] : tensor<?x8xf32> into tensor<?x16xf32>
scf.yield %17 : tensor<?x16xf32>
}
diff --git a/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir b/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir
index 17a7944..6f8e598 100644
--- a/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir
+++ b/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir
@@ -113,8 +113,8 @@
%0 = flow.dispatch.tensor.load %arg0, offsets=[0], sizes=[9], strides = [1] : !flow.dispatch.tensor<readonly:9xi32> -> tensor<9xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets=[0], sizes=[9], strides = [1] : !flow.dispatch.tensor<readonly:9xi32> -> tensor<9xi32>
%2 = linalg.init_tensor [] : tensor<i32>
- %3 = linalg.fill(%c-2147483648_i32, %2) : i32, tensor<i32> -> tensor<i32>
- %4 = linalg.fill(%c0_i32, %2) : i32, tensor<i32> -> tensor<i32>
+ %3 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%2 : tensor<i32>) -> tensor<i32>
+ %4 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<i32>) -> tensor<i32>
flow.dispatch.tensor.store %3, %arg2, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.dispatch.tensor.store %4, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
@@ -137,14 +137,14 @@
// Used as a result; should remain after canonicalization.
%c-2147483648_i32 = arith.constant -2147483648 : i32
%ret0_init = linalg.init_tensor [] : tensor<i32>
- %ret0_value = linalg.fill(%c-2147483648_i32, %ret0_init) : i32, tensor<i32> -> tensor<i32>
+ %ret0_value = linalg.fill ins(%c-2147483648_i32 : i32) outs(%ret0_init : tensor<i32>) -> tensor<i32>
flow.dispatch.tensor.store %ret0_value, %ret0, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
// Unused as a result; should be stripped entirely.
%c0_i32 = arith.constant 0 : i32
%ret1_shaped = flow.dispatch.tie_shape %ret1 : !flow.dispatch.tensor<writeonly:?xi32>{%dim}
%ret1_init = linalg.init_tensor [%dim] : tensor<?xi32>
- %ret1_value = linalg.fill(%c0_i32, %ret1_init) : i32, tensor<?xi32> -> tensor<?xi32>
+ %ret1_value = linalg.fill ins(%c0_i32 : i32) outs(%ret1_init : tensor<?xi32>) -> tensor<?xi32>
flow.dispatch.tensor.store %ret1_value, %ret1_shaped, offsets = [0], sizes = [%dim], strides = [1] : tensor<?xi32> -> !flow.dispatch.tensor<writeonly:?xi32>{%dim}
flow.return
}
@@ -167,8 +167,8 @@
%0 = flow.dispatch.tensor.load %arg0, offsets=[0], sizes=[9], strides = [1] : !flow.dispatch.tensor<readonly:9xi32> -> tensor<9xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets=[0], sizes=[9], strides = [1] : !flow.dispatch.tensor<readonly:9xi32> -> tensor<9xi32>
%2 = linalg.init_tensor [] : tensor<i32>
- %3 = linalg.fill(%c-2147483648_i32, %2) : i32, tensor<i32> -> tensor<i32>
- %4 = linalg.fill(%c0_i32, %2) : i32, tensor<i32> -> tensor<i32>
+ %3 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%2 : tensor<i32>) -> tensor<i32>
+ %4 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<i32>) -> tensor<i32>
flow.dispatch.tensor.store %3, %arg2, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.dispatch.tensor.store %4, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<readwrite:i32>
flow.return
@@ -190,8 +190,8 @@
%val = tensor.extract %0[] : tensor<i32>
%1 = flow.dispatch.tensor.load %arg1, offsets=[0], sizes=[9], strides = [1] : !flow.dispatch.tensor<readonly:9xi32> -> tensor<9xi32>
%2 = linalg.init_tensor [] : tensor<i32>
- %3 = linalg.fill(%c-2147483648_i32, %2) : i32, tensor<i32> -> tensor<i32>
- %4 = linalg.fill(%val, %2) : i32, tensor<i32> -> tensor<i32>
+ %3 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%2 : tensor<i32>) -> tensor<i32>
+ %4 = linalg.fill ins(%val : i32) outs(%2 : tensor<i32>) -> tensor<i32>
flow.dispatch.tensor.store %3, %arg2, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.dispatch.tensor.store %4, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<readwrite:i32>
flow.return
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/convert_linalg_tensor_ops_after.mlir b/iree/compiler/Dialect/Flow/Transforms/test/convert_linalg_tensor_ops_after.mlir
index 3d8ccde..4dda3a4 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/convert_linalg_tensor_ops_after.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/convert_linalg_tensor_ops_after.mlir
@@ -9,7 +9,7 @@
%3 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%1)[%arg2, %arg4]
%4 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%2)[%arg3, %arg5]
%5 = linalg.init_tensor [%3, %4] : tensor<?x?xf32>
- %6 = linalg.fill(%0, %5) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %6 = linalg.fill ins(%0 : f32) outs(%5 : tensor<?x?xf32>) -> tensor<?x?xf32>
%7 = flow.tensor.update %arg0, %6[%arg2, %arg3] : tensor<?x?xf32>{%1, %2} -> %6 as tensor<?x?xf32>{%3, %4}
return %7 : tensor<?x?xf32>
}
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
index 60ff4c6..0755c49 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
@@ -90,7 +90,7 @@
%M = tensor.dim %A, %c0 : tensor<?x?xf32>
%N = tensor.dim %B, %c1 : tensor<?x?xf32>
%0 = linalg.init_tensor [%M, %N] : tensor<?x?xf32>
- %1 = linalg.fill(%zero, %0) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %1 = linalg.fill ins(%zero : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
%2 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %2 : tensor<?x?xf32>
@@ -115,7 +115,9 @@
// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[ARG0_CAPTURE]], {{.*}} : !flow.dispatch.tensor<readonly:?x?xf32>{%[[ARG0_DIM0_CAPTURE]], %[[ARG0_DIM1_CAPTURE]]}
// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[ARG1_CAPTURE]], {{.*}} : !flow.dispatch.tensor<readonly:?x?xf32>{%[[ARG1_DIM0_CAPTURE]], %[[ARG1_DIM1_CAPTURE]]}
// CHECK-DAG: %[[INIT:.+]] = linalg.init_tensor
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[ZERO]], %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[ZERO]] :
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: %[[RESULT:.+]] = linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]] : tensor<?x?xf32>, tensor<?x?xf32>)
// CHECK-SAME: outs(%[[FILL]] : tensor<?x?xf32>)
@@ -133,7 +135,7 @@
%N = tensor.dim %B, %c1 : tensor<?x?xf32>
%K = tensor.dim %A, %c1 : tensor<?x?xf32>
%0 = linalg.init_tensor [%M, %N] : tensor<?x?xf32>
- %1 = linalg.fill(%zero, %0) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %1 = linalg.fill ins(%zero : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
%2 = linalg.init_tensor [%M, %K] : tensor<?x?xf32>
%3 = linalg.generic
{indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
@@ -172,7 +174,9 @@
// CHECK: flow.dispatch.workgroups[%[[N]], %[[M]], %[[C1]]]
// CHECK: %[[ZERO:.+]] = arith.constant 0.0
// CHECK: %[[INIT:.+]] = linalg.init_tensor
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[ZERO]], %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[ZERO]] :
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: linalg.matmul
// CHECK: outs(%[[FILL]] : tensor<?x?xf32>)
@@ -230,13 +234,13 @@
%m = tensor.dim %0, %c0 : tensor<?x4xf32>
%n1 = tensor.dim %rhs1, %c1 : tensor<4x?xf32>
%init1 = linalg.init_tensor [%m, %n1] : tensor<?x?xf32>
- %fill1 = linalg.fill(%cst, %init1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %fill1 = linalg.fill ins(%cst : f32) outs(%init1 : tensor<?x?xf32>) -> tensor<?x?xf32>
%1 = linalg.matmul
ins(%0, %rhs1 : tensor<?x4xf32>, tensor<4x?xf32>)
outs(%fill1 : tensor<?x?xf32>) -> tensor<?x?xf32>
%n2 = tensor.dim %rhs2, %c1 : tensor<4x?xf32>
%init2 = linalg.init_tensor [%m, %n2] : tensor<?x?xf32>
- %fill2 = linalg.fill(%cst, %init2) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %fill2 = linalg.fill ins(%cst : f32) outs(%init2 : tensor<?x?xf32>) -> tensor<?x?xf32>
%2= linalg.matmul
ins(%0, %rhs2 : tensor<?x4xf32>, tensor<4x?xf32>)
outs(%fill2 : tensor<?x?xf32>) -> tensor<?x?xf32>
@@ -275,7 +279,7 @@
%3 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%1)[%arg2, %arg4]
%4 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%2)[%arg3, %arg5]
%5 = linalg.init_tensor [%3, %4] : tensor<?x?xf32>
- %6 = linalg.fill(%0, %5) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %6 = linalg.fill ins(%0 : f32) outs(%5 : tensor<?x?xf32>) -> tensor<?x?xf32>
%7 = flow.tensor.update %arg0, %6[%arg2, %arg3] : tensor<?x?xf32>{%1, %2} -> %6 as tensor<?x?xf32>{%3, %4}
return %7 : tensor<?x?xf32>
}
@@ -365,7 +369,7 @@
func @conv2d(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>) -> tensor<1x112x112x32xf32> {
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%cst = arith.constant 0.000000e+00 : f32
- %1 = linalg.fill(%cst, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
@@ -387,7 +391,7 @@
func @depthwise_conv2d(%input: tensor<1x113x113x96xf32>, %filter: tensor<3x3x96xf32>) -> tensor<1x56x56x96xf32> {
%cst = arith.constant 0.000000e+00 : f32
%1 = linalg.init_tensor [1, 56, 56, 96] : tensor<1x56x56x96xf32>
- %2 = linalg.fill(%cst, %1) : f32, tensor<1x56x56x96xf32> -> tensor<1x56x56x96xf32>
+ %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32>
%4 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%input, %filter : tensor<1x113x113x96xf32>, tensor<3x3x96xf32>) outs(%2 : tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32>
return %4 : tensor<1x56x56x96xf32>
}
@@ -448,7 +452,7 @@
func @fuse_non_tiled_reduction_fill(%input1: tensor<1000xf32>, %input2: tensor<1000xf32>, %offset: tensor<f32>) -> tensor<f32> {
%zero = arith.constant 0.0 : f32
%init = linalg.init_tensor [] : tensor<f32>
- %fill = linalg.fill(%zero, %init) : f32, tensor<f32> -> tensor<f32>
+ %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<f32>) -> tensor<f32>
%reduce = linalg.generic {
indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>, affine_map<(d0) -> ()>],
iterator_types = ["reduction"]}
@@ -701,8 +705,8 @@
%c0 = arith.constant 0 : index
%0 = tensor.dim %arg0, %c0 : tensor<?x?xi32>
%1 = linalg.init_tensor [%0] : tensor<?xi32>
- %2 = linalg.fill(%cmin, %1) : i32, tensor<?xi32> -> tensor<?xi32>
- %3 = linalg.fill(%c0_i32, %1) : i32, tensor<?xi32> -> tensor<?xi32>
+ %2 = linalg.fill ins(%cmin : i32) outs(%1 : tensor<?xi32>) -> tensor<?xi32>
+ %3 = linalg.fill ins(%c0_i32 : i32) outs(%1 : tensor<?xi32>) -> tensor<?xi32>
%4:2 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>,
affine_map<(d0, d1) -> (d1, d0)>,
@@ -788,7 +792,7 @@
%2 = tensor.dim %0, %c0 : tensor<?x?xf32>
%3 = tensor.dim %1, %c1 : tensor<?x?xf32>
%4 = linalg.init_tensor [%2, %3] : tensor<?x?xf32>
- %5 = linalg.fill(%cst, %4) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
%6 = linalg.matmul ins(%0, %1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%5 : tensor<?x?xf32>) -> tensor<?x?xf32>
%7 = tensor.dim %6, %c0 : tensor<?x?xf32>
%8 = tensor.dim %6, %c1 : tensor<?x?xf32>
@@ -927,7 +931,7 @@
func @pooling_nwhc_sum_static(%input: tensor<1x33x33x160xf32>) -> tensor<1x3x3x160xf32> {
%cst = arith.constant 0.0 : f32
%1 = linalg.init_tensor [1, 3, 3, 160] : tensor<1x3x3x160xf32>
- %2 = linalg.fill(%cst, %1) : f32, tensor<1x3x3x160xf32> -> tensor<1x3x3x160xf32>
+ %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x3x3x160xf32>) -> tensor<1x3x3x160xf32>
%3 = linalg.init_tensor [11, 11] : tensor<11x11xf32>
%4 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<11> : vector<2xi64>} ins(%input, %3 : tensor<1x33x33x160xf32>, tensor<11x11xf32>) outs(%2 : tensor<1x3x3x160xf32>) -> tensor<1x3x3x160xf32>
return %4 : tensor<1x3x3x160xf32>
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion.mlir b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion.mlir
index 6da28d8..d24826e 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion.mlir
@@ -3,7 +3,7 @@
func @fuse_conv2d_elementwise(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>, %offset: tensor<32xf32>) -> tensor<1x112x112x32xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %1 = linalg.fill(%cst, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
@@ -33,7 +33,8 @@
// CHECK: flow.dispatch.workgroups
// CHECK: %[[INIT:.+]] = linalg.init_tensor
-// CHECK: %[[FILL:.+]] = linalg.fill(%{{.+}}, %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: %[[CONV:.+]] = linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: outs(%[[FILL]] :
// CHECK: linalg.generic
@@ -46,7 +47,7 @@
-> (tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %1 = linalg.fill(%cst, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
@@ -80,7 +81,7 @@
func @dont_fuse_conv2d_with_non_identity_map(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>, %offset: tensor<32xf32>) -> tensor<1x112x112x32xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %1 = linalg.fill(%cst, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/infer_numeric_narrowing.mlir b/iree/compiler/Dialect/Flow/Transforms/test/infer_numeric_narrowing.mlir
index ac21741..ef440ec 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/infer_numeric_narrowing.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/infer_numeric_narrowing.mlir
@@ -18,7 +18,7 @@
[[3.900000e+01], [0.000000e+00], [1.270000e+02]]> : tensor<3x1xf32>
%init_value = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
- %1 = linalg.fill(%init_value, %0) : f32, tensor<5x1xf32> -> tensor<5x1xf32>
+ %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
%2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
return %2 : tensor<5x1xf32>
}
@@ -30,7 +30,7 @@
[[-3.900000e+01], [0.000000e+00], [1.270000e+02]]> : tensor<3x1xf32>
%init_value = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
- %1 = linalg.fill(%init_value, %0) : f32, tensor<5x1xf32> -> tensor<5x1xf32>
+ %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
%2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
return %2 : tensor<5x1xf32>
}
@@ -43,7 +43,7 @@
[[0.000000e+00], [0.000000e+00], [-1.000000e+00]]> : tensor<3x1xf32>
%init_value = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
- %1 = linalg.fill(%init_value, %0) : f32, tensor<5x1xf32> -> tensor<5x1xf32>
+ %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
%2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
return %2 : tensor<5x1xf32>
}
@@ -56,7 +56,7 @@
[[1.000000e+00], [1.000000e+00], [2.000000e+00]]> : tensor<3x1xf32>
%init_value = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
- %1 = linalg.fill(%init_value, %0) : f32, tensor<5x1xf32> -> tensor<5x1xf32>
+ %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
%2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
return %2 : tensor<5x1xf32>
}
@@ -69,7 +69,7 @@
[[-1.000000e+00], [-1.000000e+00], [-2.000000e+00]]> : tensor<3x1xf32>
%init_value = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
- %1 = linalg.fill(%init_value, %0) : f32, tensor<5x1xf32> -> tensor<5x1xf32>
+ %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
%2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
return %2 : tensor<5x1xf32>
}
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/matmul_to_mmt4d.mlir b/iree/compiler/Dialect/Flow/Transforms/test/matmul_to_mmt4d.mlir
index 39a5eb8..fa5b8ea 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/matmul_to_mmt4d.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/matmul_to_mmt4d.mlir
@@ -71,7 +71,7 @@
func @check_mmt4d_with_init_tensor_and_fill(%arg0: tensor<24x8xf32>, %arg1: tensor<8x32xf32>) -> tensor<24x32xf32> {
%c0 = arith.constant 0.0 : f32
%0 = linalg.init_tensor [24, 32] : tensor<24x32xf32>
- %1 = linalg.fill(%c0, %0) : f32, tensor<24x32xf32> -> tensor<24x32xf32>
+ %1 = linalg.fill ins(%c0 : f32) outs(%0 : tensor<24x32xf32>) -> tensor<24x32xf32>
%2 = linalg.matmul ins(%arg0, %arg1 : tensor<24x8xf32>, tensor<8x32xf32>) outs(%1 : tensor<24x32xf32>) -> tensor<24x32xf32>
return %2 : tensor<24x32xf32>
}
@@ -85,7 +85,8 @@
// CHECK: %[[RHS4D:.+]] = tensor.expand_shape %[[RHS]]
// CHECK-SAME: tensor<8x32xf32> into tensor<4x2x8x4xf32>
// CHECK: %[[DST_INIT:.+]] = linalg.init_tensor [3, 8, 8, 4] : tensor<3x8x8x4xf32>
-// CHECK: [[DST:.+]] linalg.fill(%[[ZERO:.+]], %[[DST_INIT]])
+// CHECK: [[DST:.+]] linalg.fill
+// CHECK-SAME: outs(%[[DST_INIT]] :
// -----
func @check_mmt4d_i8_static_pad(%arg0: tensor<3x5xi8>, %arg1: tensor<5x2xi8>, %arg2: tensor<3x2xi32>) -> tensor<3x2xi32> {
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/optimize_numerics.mlir b/iree/compiler/Dialect/Flow/Transforms/test/optimize_numerics.mlir
index a39ebe0..ae14cd0 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/optimize_numerics.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/optimize_numerics.mlir
@@ -60,9 +60,9 @@
func @cast_fill(%arg0 : f32, %arg1 : tensor<3xf32>) -> tensor<3xi8> {
// CHECK: %[[SCALAR:.*]] = arith.fptosi %arg0 : f32 to i8
// CHECK: %[[INIT:.*]] = arith.fptosi %arg1 : tensor<3xf32> to tensor<3xi8>
- // CHECK: %[[RESULT:.*]] = linalg.fill(%[[SCALAR]], %[[INIT]]) : i8, tensor<3xi8> -> tensor<3xi8>
+ // CHECK: %[[RESULT:.*]] = linalg.fill ins(%[[SCALAR]] : i8) outs(%[[INIT]] : tensor<3xi8>) -> tensor<3xi8>
// CHECK: return %[[RESULT]]
- %0 = linalg.fill(%arg0, %arg1) : f32, tensor<3xf32> -> tensor<3xf32>
+ %0 = linalg.fill ins(%arg0 : f32) outs(%arg1 : tensor<3xf32>) -> tensor<3xf32>
%1 = arith.fptosi %0 : tensor<3xf32> to tensor<3xi8>
return %1 : tensor<3xi8>
}
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/pad_tensor_to_tensor.mlir b/iree/compiler/Dialect/Flow/Transforms/test/pad_tensor_to_tensor.mlir
index 316023c..b3cf275 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/pad_tensor_to_tensor.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/pad_tensor_to_tensor.mlir
@@ -28,7 +28,9 @@
// CHECK-DAG: %[[RD0:.+]] = affine.apply #[[MAP0]]()[%[[ARG3]], %[[D0]]]
// CHECK-DAG: %[[RD1:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[D1]]]
// CHECK: %[[INIT:.+]] = linalg.init_tensor [%[[RD0]], %[[RD1]]]
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[VAL]], %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[VAL]] :
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[ARG0]] into %[[FILL]][4, %[[ARG2]]] [%[[D0]], %[[D1]]] [1, 1]
// CHECK: return %[[RESULT]]
@@ -53,6 +55,8 @@
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<f32>
// CHECK-DAG: %[[VAL:.+]] = tensor.extract %[[ARG1]]
// CHECK: %[[INIT:.+]] = linalg.init_tensor [18, 12]
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[VAL]], %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[VAL]] :
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[ARG0]] into %[[FILL]][4, 5] [12, 4] [1, 1]
// CHECK: return %[[RESULT]]
diff --git a/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir b/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
index 6f417c8..a147963 100644
--- a/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
+++ b/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
@@ -20,7 +20,7 @@
scf.for %i = %1 to %count step %2 {
%3 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%i)[%count, %workgroup_size_0]
%4 = linalg.init_tensor [%3] : tensor<?xi64>
- %5 = linalg.fill(%value, %4) : i64, tensor<?xi64> -> tensor<?xi64>
+ %5 = linalg.fill ins(%value : i64) outs(%4 : tensor<?xi64>) -> tensor<?xi64>
flow.dispatch.tensor.store %5, %out, offsets = [%i], sizes = [%3], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:?xi64>{%count}
}
return
diff --git a/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir b/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
index 4daad1e..a701eae 100644
--- a/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
+++ b/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
@@ -21,7 +21,7 @@
scf.for %i = %1 to %count step %2 {
%3 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%i)[%count, %workgroup_size_0]
%4 = linalg.init_tensor [%3] : tensor<?xi64>
- %5 = linalg.fill(%value, %4) : i64, tensor<?xi64> -> tensor<?xi64>
+ %5 = linalg.fill ins(%value : i64) outs(%4 : tensor<?xi64>) -> tensor<?xi64>
flow.dispatch.tensor.store %5, %out, offsets = [%i], sizes = [%3], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:?xi64>{%count}
}
return
diff --git a/iree/compiler/InputConversion/Common/test/linalg_quantized_matmul_to_matmul.mlir b/iree/compiler/InputConversion/Common/test/linalg_quantized_matmul_to_matmul.mlir
index 1ef6e39..67a0c07 100644
--- a/iree/compiler/InputConversion/Common/test/linalg_quantized_matmul_to_matmul.mlir
+++ b/iree/compiler/InputConversion/Common/test/linalg_quantized_matmul_to_matmul.mlir
@@ -30,7 +30,9 @@
// CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%[[LHS]], %[[RHS]] : tensor<?x?xi8>, tensor<?x?xi8>) outs(%[[ACC]] : tensor<?x?xi32>)
// CHECK-DAG: %[[INIT_RESULT:.+]] = linalg.init_tensor
// CHECK-DAG: %[[INIT_LHS_SUMS_ACC:.+]] = linalg.init_tensor
-// CHECK: %[[ZERO_LHS_SUMS_ACC:.+]] = linalg.fill(%[[C0_I32]], %[[INIT_LHS_SUMS_ACC]])
+// CHECK: %[[ZERO_LHS_SUMS_ACC:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[C0_I32]] :
+// CHECK-SAME: outs(%[[INIT_LHS_SUMS_ACC]] :
// CHECK: %[[LHS_SUMS:.+]] = linalg.generic
// CHECK-SAME: "parallel", "reduction"
// CHECK-SAME: ins(%[[LHS]] : tensor<?x?xi8>)
@@ -55,7 +57,9 @@
// CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%[[LHS]], %[[RHS]] : tensor<?x?xi8>, tensor<?x?xi8>) outs(%[[ACC]] : tensor<?x?xi32>)
// CHECK-DAG: %[[INIT_RESULT:.+]] = linalg.init_tensor
// CHECK-DAG: %[[INIT_RHS_SUMS_ACC:.+]] = linalg.init_tensor
-// CHECK: %[[ZERO_RHS_SUMS_ACC:.+]] = linalg.fill(%[[C0_I32]], %[[INIT_RHS_SUMS_ACC]])
+// CHECK: %[[ZERO_RHS_SUMS_ACC:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[C0_I32]] :
+// CHECK-SAME: outs(%[[INIT_RHS_SUMS_ACC]] :
// CHECK: %[[RHS_SUMS:.+]] = linalg.generic
// CHECK-SAME: "reduction", "parallel"
// CHECK-SAME: ins(%[[RHS]] : tensor<?x?xi8>)
@@ -80,13 +84,17 @@
// CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%[[LHS]], %[[RHS]] : tensor<?x?xi8>, tensor<?x?xi8>) outs(%[[ACC]] : tensor<?x?xi32>)
// CHECK-DAG: %[[INIT_RESULT:.+]] = linalg.init_tensor
// CHECK-DAG: %[[INIT_LHS_SUMS_ACC:.+]] = linalg.init_tensor
-// CHECK: %[[ZERO_LHS_SUMS_ACC:.+]] = linalg.fill(%[[C0_I32]], %[[INIT_LHS_SUMS_ACC]])
+// CHECK: %[[ZERO_LHS_SUMS_ACC:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[C0_I32]] :
+// CHECK-SAME: outs(%[[INIT_LHS_SUMS_ACC]] :
// CHECK: %[[LHS_SUMS:.+]] = linalg.generic
// CHECK-SAME: "parallel", "reduction"
// CHECK-SAME: ins(%[[LHS]] : tensor<?x?xi8>)
// CHECK-SAME: outs(%[[ZERO_LHS_SUMS_ACC]] : tensor<?xi32>)
// CHECK: %[[INIT_RHS_SUMS_ACC:.+]] = linalg.init_tensor
-// CHECK: %[[ZERO_RHS_SUMS_ACC:.+]] = linalg.fill(%[[C0_I32]], %[[INIT_RHS_SUMS_ACC]])
+// CHECK: %[[ZERO_RHS_SUMS_ACC:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[C0_I32]] :
+// CHECK-SAME: outs(%[[INIT_RHS_SUMS_ACC]] :
// CHECK: %[[RHS_SUMS:.+]] = linalg.generic
// CHECK-SAME: "reduction", "parallel"
// CHECK-SAME: ins(%[[RHS]] : tensor<?x?xi8>)
@@ -115,13 +123,17 @@
// CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%[[LHS]], %[[RHS]] : tensor<3x4xi8>, tensor<4x5xi8>) outs(%[[ACC]] : tensor<3x5xi32>)
// CHECK-DAG: %[[INIT_RESULT:.+]] = linalg.init_tensor
// CHECK-DAG: %[[INIT_LHS_SUMS_ACC:.+]] = linalg.init_tensor
-// CHECK: %[[ZERO_LHS_SUMS_ACC:.+]] = linalg.fill(%[[C0_I32]], %[[INIT_LHS_SUMS_ACC]])
+// CHECK: %[[ZERO_LHS_SUMS_ACC:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[C0_I32]] :
+// CHECK-SAME: outs(%[[INIT_LHS_SUMS_ACC]] :
// CHECK: %[[LHS_SUMS:.+]] = linalg.generic
// CHECK-SAME: "parallel", "reduction"
// CHECK-SAME: ins(%[[LHS]] : tensor<3x4xi8>)
// CHECK-SAME: outs(%[[ZERO_LHS_SUMS_ACC]] : tensor<3xi32>)
// CHECK: %[[INIT_RHS_SUMS_ACC:.+]] = linalg.init_tensor
-// CHECK: %[[ZERO_RHS_SUMS_ACC:.+]] = linalg.fill(%[[C0_I32]], %[[INIT_RHS_SUMS_ACC]])
+// CHECK: %[[ZERO_RHS_SUMS_ACC:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[C0_I32]] :
+// CHECK-SAME: outs(%[[INIT_RHS_SUMS_ACC]] :
// CHECK: %[[RHS_SUMS:.+]] = linalg.generic
// CHECK-SAME: "reduction", "parallel"
// CHECK-SAME: ins(%[[RHS]] : tensor<4x5xi8>)
diff --git a/iree/compiler/InputConversion/MHLO/BUILD b/iree/compiler/InputConversion/MHLO/BUILD
index 00b5217..e8093bc 100644
--- a/iree/compiler/InputConversion/MHLO/BUILD
+++ b/iree/compiler/InputConversion/MHLO/BUILD
@@ -93,6 +93,10 @@
"@llvm-project//mlir:Transforms",
"@mlir-hlo//:chlo_legalize_to_hlo",
"@mlir-hlo//:hlo",
+ "@mlir-hlo//:hlo_legalize_shape_ops_to_standard",
+ "@mlir-hlo//:hlo_legalize_to_arithmetic",
+ "@mlir-hlo//:hlo_legalize_to_lhlo",
+ "@mlir-hlo//:hlo_legalize_to_memref",
"@mlir-hlo//:legalize_control_flow",
"@mlir-hlo//:legalize_einsum_to_dot_general",
"@mlir-hlo//:legalize_gather_to_torch_index_select",
diff --git a/iree/compiler/InputConversion/MHLO/CMakeLists.txt b/iree/compiler/InputConversion/MHLO/CMakeLists.txt
index ccd8640..c153e45 100644
--- a/iree/compiler/InputConversion/MHLO/CMakeLists.txt
+++ b/iree/compiler/InputConversion/MHLO/CMakeLists.txt
@@ -83,7 +83,11 @@
MLIRTransforms
MhloDialect
MhloPasses
+ MhloShapeOpsToStandard
+ MhloToArithmeticConversion
+ MhloToLhloConversion
MhloToLinalg
+ MhloToMemrefConversion
MhloToStandard
iree::compiler::Dialect::Flow::IR
iree::compiler::Dialect::Flow::Transforms
diff --git a/iree/compiler/InputConversion/MHLO/MHLOToLinalgOnTensors.cpp b/iree/compiler/InputConversion/MHLO/MHLOToLinalgOnTensors.cpp
index 99fab42..ebc4504 100644
--- a/iree/compiler/InputConversion/MHLO/MHLOToLinalgOnTensors.cpp
+++ b/iree/compiler/InputConversion/MHLO/MHLOToLinalgOnTensors.cpp
@@ -84,7 +84,7 @@
rewriter.createOrFold<arith::AddIOp>(loc, resultDimSize, size);
}
sizes[dim] = resultDimSize;
- auto initTensor = rewriter.create<linalg::InitTensorOp>(
+ Value initTensor = rewriter.create<linalg::InitTensorOp>(
loc, resultType.getShape(), resultType.getElementType());
auto zeroAttr = rewriter.getZeroAttr(resultType.getElementType());
Value zero = rewriter.create<arith::ConstantOp>(loc, zeroAttr);
@@ -146,7 +146,7 @@
Value rhs) {
Value zero = b.create<arith::ConstantOp>(
loc, b.getZeroAttr(resultType.getElementType()));
- auto initTensor = b.create<linalg::InitTensorOp>(
+ Value initTensor = b.create<linalg::InitTensorOp>(
loc, /*dyn_size=*/ValueRange{}, resultType.getShape(),
resultType.getElementType());
Value zeroTensor =
diff --git a/iree/compiler/InputConversion/MHLO/test/fft.mlir b/iree/compiler/InputConversion/MHLO/test/fft.mlir
index b5ec7b6..367269b 100644
--- a/iree/compiler/InputConversion/MHLO/test/fft.mlir
+++ b/iree/compiler/InputConversion/MHLO/test/fft.mlir
@@ -14,12 +14,16 @@
// CHECK-DAG: %[[ImagMatrix:.+]] = arith.constant dense<"0x00000080{{.*}}"> : tensor<32x17xf32>
// CHECK-DAG: %[[Zero:.+]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[RealInit:.+]] = linalg.init_tensor [17] : tensor<17xf32>
-// CHECK: %[[RealFill:.+]] = linalg.fill(%[[Zero]], %[[RealInit]])
+// CHECK: %[[RealFill:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[Zero]] :
+// CHECK-SAME: outs(%[[RealInit]] :
// CHECK: %[[RealRes:.+]] = linalg.vecmat
// CHECK-SAME: ins(%[[Arg0]], %[[RealMatrix]] : tensor<32xf32>, tensor<32x17xf32>)
// CHECK-SAME: outs(%[[RealFill]] : tensor<17xf32>) -> tensor<17xf32>
// CHECK: %[[ImagInit:.+]] = linalg.init_tensor [17] : tensor<17xf32>
-// CHECK: %[[ImagFill:.+]] = linalg.fill(%[[Zero]], %[[ImagInit]])
+// CHECK: %[[ImagFill:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[Zero]] :
+// CHECK-SAME: outs(%[[ImagInit]] :
// CHECK: %[[ImagRes:.+]] = linalg.vecmat
// CHECK-SAME: ins(%[[Arg0]], %[[ImagMatrix]] : tensor<32xf32>, tensor<32x17xf32>)
// CHECK-SAME: outs(%[[ImagFill]] : tensor<17xf32>) -> tensor<17xf32>
@@ -41,12 +45,16 @@
// CHECK-DAG: %[[ImagMatrix:.+]] = arith.constant dense<"0x00000080{{.*}}"> : tensor<32x17xf32>
// CHECK-DAG: %[[Zero:.+]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[RealInit:.+]] = linalg.init_tensor [1, 17] : tensor<1x17xf32>
-// CHECK: %[[RealFill:.+]] = linalg.fill(%[[Zero]], %[[RealInit]])
+// CHECK: %[[RealFill:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[Zero]] :
+// CHECK-SAME: outs(%[[RealInit]] :
// CHECK: %[[RealRes:.+]] = linalg.matmul
// CHECK-SAME: ins(%[[Arg0]], %[[RealMatrix]] : tensor<1x32xf32>, tensor<32x17xf32>)
// CHECK-SAME: outs(%[[RealFill]] : tensor<1x17xf32>) -> tensor<1x17xf32>
// CHECK: %[[ImagInit:.+]] = linalg.init_tensor [1, 17] : tensor<1x17xf32>
-// CHECK: %[[ImagFill:.+]] = linalg.fill(%[[Zero]], %[[ImagInit]])
+// CHECK: %[[ImagFill:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[Zero]] :
+// CHECK-SAME: outs(%[[ImagInit]] :
// CHECK: %[[ImagRes:.+]] = linalg.matmul
// CHECK-SAME: ins(%[[Arg0]], %[[ImagMatrix]] : tensor<1x32xf32>, tensor<32x17xf32>)
// CHECK-SAME: outs(%[[ImagFill]] : tensor<1x17xf32>) -> tensor<1x17xf32>
diff --git a/iree/test/e2e/regression/fill_i64.mlir b/iree/test/e2e/regression/fill_i64.mlir
index 3bd8565..1bc2d21 100644
--- a/iree/test/e2e/regression/fill_i64.mlir
+++ b/iree/test/e2e/regression/fill_i64.mlir
@@ -10,12 +10,12 @@
%cv0 = arith.constant -1 : i64
%v0_init = linalg.init_tensor [%0, %1] : tensor<?x?xi64>
- %v0 = linalg.fill(%cv0, %v0_init) : i64, tensor<?x?xi64> -> tensor<?x?xi64>
+ %v0 = linalg.fill ins(%cv0 : i64) outs(%v0_init : tensor<?x?xi64>) -> tensor<?x?xi64>
// CHECK: 2x3xi64=[-1 -1 -1][-1 -1 -1]
%cv1 = arith.constant 9223372036854775807 : i64
%v1_init = linalg.init_tensor [%0, %1] : tensor<?x?xi64>
- %v1 = linalg.fill(%cv1, %v1_init) : i64, tensor<?x?xi64> -> tensor<?x?xi64>
+ %v1 = linalg.fill ins(%cv1 : i64) outs(%v1_init : tensor<?x?xi64>) -> tensor<?x?xi64>
// CHECK: 2x3xi64=[9223372036854775807 9223372036854775807 9223372036854775807][9223372036854775807 9223372036854775807 9223372036854775807]
return %v0, %v1 : tensor<?x?xi64>, tensor<?x?xi64>
diff --git a/iree/test/e2e/regression/linalg_ops.mlir b/iree/test/e2e/regression/linalg_ops.mlir
index 98173af..c9ec6e1 100644
--- a/iree/test/e2e/regression/linalg_ops.mlir
+++ b/iree/test/e2e/regression/linalg_ops.mlir
@@ -38,7 +38,7 @@
%bias = util.unfoldable_constant dense<1.0> : tensor<16xf32>
%init = linalg.init_tensor [1, 112, 112, 16] : tensor<1x112x112x16xf32>
%cst = arith.constant 0.0 : f32
- %fill = linalg.fill(%cst, %init) : f32, tensor<1x112x112x16xf32> -> tensor<1x112x112x16xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
%conv = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>)
diff --git a/iree/test/e2e/regression/linalg_quantized_matmul_vs_linalg_matmul.mlir b/iree/test/e2e/regression/linalg_quantized_matmul_vs_linalg_matmul.mlir
index 3378d41..58dfe83 100644
--- a/iree/test/e2e/regression/linalg_quantized_matmul_vs_linalg_matmul.mlir
+++ b/iree/test/e2e/regression/linalg_quantized_matmul_vs_linalg_matmul.mlir
@@ -21,7 +21,7 @@
// compute the sums along rows of %lhs.
%lhs_i32 = arith.extsi %lhs : tensor<3x4xi8> to tensor<3x4xi32>
%init_lhs_sums_uninitialized = linalg.init_tensor [3] : tensor<3xi32>
- %zero_lhs_sums = linalg.fill(%c_0, %init_lhs_sums_uninitialized) : i32, tensor<3xi32> -> tensor<3xi32>
+ %zero_lhs_sums = linalg.fill ins(%c_0 : i32) outs(%init_lhs_sums_uninitialized : tensor<3xi32>) -> tensor<3xi32>
%lhs_sums = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
@@ -36,7 +36,7 @@
// compute the sums along columns of %rhs.
%rhs_i32 = arith.extsi %rhs : tensor<4x5xi8> to tensor<4x5xi32>
%init_rhs_sums_uninitialized = linalg.init_tensor [5] : tensor<5xi32>
- %zero_rhs_sums = linalg.fill(%c_0, %init_rhs_sums_uninitialized) : i32, tensor<5xi32> -> tensor<5xi32>
+ %zero_rhs_sums = linalg.fill ins(%c_0 : i32) outs(%init_rhs_sums_uninitialized : tensor<5xi32>) -> tensor<5xi32>
%rhs_sums = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d1)>],
@@ -93,7 +93,7 @@
// compute the sums along rows of %lhs.
%lhs_i32 = arith.extsi %lhs : tensor<?x?xi8> to tensor<?x?xi32>
%init_lhs_sums_uninitialized = linalg.init_tensor [%m_size] : tensor<?xi32>
- %zero_lhs_sums = linalg.fill(%c_0, %init_lhs_sums_uninitialized) : i32, tensor<?xi32> -> tensor<?xi32>
+ %zero_lhs_sums = linalg.fill ins(%c_0 : i32) outs(%init_lhs_sums_uninitialized : tensor<?xi32>) -> tensor<?xi32>
%lhs_sums = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
@@ -108,7 +108,7 @@
// compute the sums along columns of %rhs.
%rhs_i32 = arith.extsi %rhs : tensor<?x?xi8> to tensor<?x?xi32>
%init_rhs_sums_uninitialized = linalg.init_tensor [%n_size] : tensor<?xi32>
- %zero_rhs_sums = linalg.fill(%c_0, %init_rhs_sums_uninitialized) : i32, tensor<?xi32> -> tensor<?xi32>
+ %zero_rhs_sums = linalg.fill ins(%c_0 : i32) outs(%init_rhs_sums_uninitialized : tensor<?xi32>) -> tensor<?xi32>
%rhs_sums = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d1)>],
@@ -192,7 +192,7 @@
%c_plus127 = arith.constant 127 : i32
%init_acc_uninitialized = linalg.init_tensor [3, 5] : tensor<3x5xi32>
- %zero_acc = linalg.fill(%c_0, %init_acc_uninitialized) : i32, tensor<3x5xi32> -> tensor<3x5xi32>
+ %zero_acc = linalg.fill ins(%c_0 : i32) outs(%init_acc_uninitialized : tensor<3x5xi32>) -> tensor<3x5xi32>
// Test special case: both zero points are 0
call @check_one_quantized_matmul_as_matmul_3x4x5(%lhs_3x4_1, %rhs_4x5_1, %c_0, %c_0, %zero_acc) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32, tensor<3x5xi32>) -> ()
// Test special cases: one of the zero points is 0
diff --git a/third_party/llvm-project b/third_party/llvm-project
index c38fadb..8361c5d 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit c38fadb7f97e8aca077df9c2a01ec43f8cb1f805
+Subproject commit 8361c5da30588d3d4a48eae648f53be1feb5cfad
diff --git a/third_party/mlir-hlo b/third_party/mlir-hlo
index f52e7dc..7727bff 160000
--- a/third_party/mlir-hlo
+++ b/third_party/mlir-hlo
@@ -1 +1 @@
-Subproject commit f52e7dc1210dead4d3ff416696cad1c794332959
+Subproject commit 7727bfff1a219c9cd60087a1ae0a4b7e52916f57